-
-
Notifications
You must be signed in to change notification settings - Fork 17.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Feature] improve CsvLoader & clean code (#3830)
* Improve CSV Loader * Improve S3 Loaders --------- Co-authored-by: Henry <[email protected]>
- Loading branch information
1 parent
cc87d85
commit 24eb437
Showing
7 changed files
with
229 additions
and
206 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
74 changes: 74 additions & 0 deletions
74
packages/components/nodes/documentloaders/Csv/CsvLoader.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import { TextLoader } from 'langchain/document_loaders/fs/text' | ||
import Papa from 'papaparse' | ||
|
||
type CSVLoaderOptions = { | ||
// Return specifific column from key (string) or index (integer) | ||
column?: string | number | ||
// Force separator (default: auto detect) | ||
separator?: string | ||
} | ||
|
||
/** | ||
* A class that extends the TextLoader class. It represents a document | ||
* loader that loads documents from a CSV file. It has a constructor that | ||
* takes a `filePathOrBlob` parameter representing the path to the CSV | ||
* file or a Blob object, and an optional `options` parameter of type | ||
* `CSVLoaderOptions` or a string representing the column to use as the | ||
* document's pageContent. | ||
*/ | ||
export class CSVLoader extends TextLoader { | ||
protected options: CSVLoaderOptions = {} | ||
|
||
constructor(filePathOrBlob: ConstructorParameters<typeof TextLoader>[0], options?: CSVLoaderOptions | string) { | ||
super(filePathOrBlob) | ||
|
||
if (typeof options === 'string') { | ||
this.options = { column: options } | ||
} else { | ||
this.options = options ?? this.options | ||
} | ||
} | ||
/** | ||
* A protected method that parses the raw CSV data and returns an array of | ||
* strings representing the pageContent of each document. It uses the | ||
* `papaparse` to parse the CSV data. If | ||
* the `column` option is specified, it checks if the column exists in the | ||
* CSV file and returns the values of that column as the pageContent. If | ||
* the `column` option is not specified, it converts each row of the CSV | ||
* data into key/value pairs and joins them with newline characters. | ||
* @param raw The raw CSV data to be parsed. | ||
* @returns An array of strings representing the pageContent of each document. | ||
*/ | ||
async parse(raw: string): Promise<string[]> { | ||
const { column, separator } = this.options | ||
|
||
const { | ||
data: parsed, | ||
meta: { fields = [] } | ||
} = Papa.parse<{ [K: string]: string }>(raw.trim(), { | ||
delimiter: separator, | ||
header: true | ||
}) | ||
|
||
if (column !== undefined) { | ||
if (!fields.length) { | ||
throw new Error(`Unable to resolve fields from header.`) | ||
} | ||
|
||
let searchIdx = column | ||
|
||
if (typeof column == 'number') { | ||
searchIdx = fields[column] | ||
} | ||
|
||
if (!fields.includes(searchIdx as string)) { | ||
throw new Error(`Column ${column} not found in CSV file.`) | ||
} | ||
|
||
// Note TextLoader will raise an exception if the value is null. | ||
return parsed.map((row) => row[searchIdx]) | ||
} | ||
|
||
return parsed.map((row) => fields.map((key) => `${key.trim() || '_0'}: ${row[key]?.trim()}`).join('\n')) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.