diff --git a/libs/langchain-community/src/document_loaders/fs/unstructured.ts b/libs/langchain-community/src/document_loaders/fs/unstructured.ts index 62e7053f42b2..48915c89fdc5 100644 --- a/libs/langchain-community/src/document_loaders/fs/unstructured.ts +++ b/libs/langchain-community/src/document_loaders/fs/unstructured.ts @@ -113,6 +113,7 @@ export type UnstructuredLoaderOptions = { combineUnderNChars?: number; newAfterNChars?: number; maxCharacters?: number; + extractImageBlockTypes?: string[]; }; export type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & { @@ -178,6 +179,8 @@ export class UnstructuredLoader extends BaseDocumentLoader { private maxCharacters?: number; + private extractImageBlockTypes?: string[]; + constructor( filepathOrBufferOptions: string | UnstructuredMemoryLoaderOptions, unstructuredOptions: UnstructuredLoaderOptions | string = {} @@ -221,6 +224,7 @@ export class UnstructuredLoader extends BaseDocumentLoader { this.combineUnderNChars = options.combineUnderNChars; this.newAfterNChars = options.newAfterNChars; this.maxCharacters = options.maxCharacters; + this.extractImageBlockTypes = options.extractImageBlockTypes; } } @@ -288,6 +292,13 @@ export class UnstructuredLoader extends BaseDocumentLoader { formData.append("max_characters", String(this.maxCharacters)); } + if (this.extractImageBlockTypes !== undefined) { + formData.append( + "extract_image_block_types", + JSON.stringify(this.extractImageBlockTypes) + ); + } + const headers = { "UNSTRUCTURED-API-KEY": this.apiKey ?? "", }; diff --git a/libs/langchain-community/src/document_loaders/tests/unstructured.int.test.ts b/libs/langchain-community/src/document_loaders/tests/unstructured.int.test.ts index b0b0712118a6..998c4df0fb99 100644 --- a/libs/langchain-community/src/document_loaders/tests/unstructured.int.test.ts +++ b/libs/langchain-community/src/document_loaders/tests/unstructured.int.test.ts @@ -71,11 +71,28 @@ test.skip("Test Unstructured base loader with fast strategy", async () => { const loader = new UnstructuredLoader(filePath, options); const docs = await loader.load(); - expect(docs.length).toBeGreaterThan(10); expect(typeof docs[0].pageContent).toBe("string"); }); +test.skip("Test Unstructured base loader with extractImageBlockTypes", async () => { + const filePath = path.resolve( + path.dirname(url.fileURLToPath(import.meta.url)), + "./example_data/1706.03762.pdf" + ); + + const options = { + apiKey: process.env.UNSTRUCTURED_API_KEY!, + extractImageBlockTypes: ["image"], + }; + + const loader = new UnstructuredLoader(filePath, options); + const docs = await loader.load(); + + expect(docs.length).toBeGreaterThan(10); + expect(docs.some((item) => item?.metadata?.category === "image")).toBe(true); +}); + test.skip("Test Unstructured directory loader", async () => { const directoryPath = path.resolve( path.dirname(url.fileURLToPath(import.meta.url)),