quarkiverse · jmartisk · Apr 16, 2024 · Apr 16, 2024
@@ -60,4 +60,13 @@ at startup, use the following properties:
   between adjacent documents. Default is 30.
 
 To control the number of retrieved documents, use
-`quarkus.langchain4j.easy-rag.max-results`. The default is 5.
+`quarkus.langchain4j.easy-rag.max-results`. The default is 5.
+
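+To control which files are ingested, use `quarkus.langchain4j.easy-rag.path-matcher`
+(a `java.nio.file` path matcher expression; the default is `glob:**`, meaning all files recursively).
+To ingest only files located directly in the configured directory, set `quarkus.langchain4j.easy-rag.recursive` to `false` (the default is `true`).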
+
+For finer-grained control of the Apache Tika parsers (for example, to turn
+off OCR capabilities), you can use a regular XML config file recognized by
+Tika (see https://tika.apache.org/2.9.2/configuring.html[Tika
+documentation]), and specify `-Dtika.config` to point at the file.
@@ -0,0 +1,46 @@
+package io.quarkiverse.langchain4j.test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.List;
+
+import jakarta.inject.Inject;
+
+import org.jboss.shrinkwrap.api.ShrinkWrap;
+import org.jboss.shrinkwrap.api.asset.StringAsset;
+import org.jboss.shrinkwrap.api.spec.JavaArchive;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.RegisterExtension;
+
+import dev.langchain4j.data.embedding.Embedding;
+import dev.langchain4j.data.segment.TextSegment;
+import dev.langchain4j.store.embedding.EmbeddingMatch;
+import dev.langchain4j.store.embedding.inmemory.InMemoryEmbeddingStore;
+import io.quarkus.test.QuarkusUnitTest;
+
+/**
+ * Verify usage of the `quarkus.langchain4j.easy-rag.recursive` property:
+ * with `recursive=false`, ingestion must not descend into subdirectories
+ * of the configured path.
+ */
+public class EasyRagNotRecursiveTest {
+
+    // Application configuration under test: ingest from the test documents
+    // directory with recursion switched off.
+    @RegisterExtension
+    static final QuarkusUnitTest unitTest = new QuarkusUnitTest()
+            .setArchiveProducer(() -> ShrinkWrap.create(JavaArchive.class)
+                    .addAsResource(new StringAsset("quarkus.langchain4j.easy-rag.path=src/test/resources/ragdocuments\n" +
+                            "quarkus.langchain4j.easy-rag.recursive=false\n"),
+                            "application.properties"));
+
+    @Inject
+    InMemoryEmbeddingStore<TextSegment> embeddingStore;
+
+    // All-zero query vector used only to pull segments out of the store.
+    // NOTE(review): 384 presumably matches the embedding model's dimension - confirm.
+    private static final Embedding DUMMY_EMBEDDING = new Embedding(new float[384]);
+
+    /**
+     * Asks the store for up to 3 matches and expects exactly one, whose text
+     * mentions "Charlie" (presumably the only document in the root of the
+     * test directory).
+     */
+    @Test
+    public void verifyOnlyTheRootDirectoryIsIngested() {
+        List<EmbeddingMatch<TextSegment>> relevant = embeddingStore.findRelevant(DUMMY_EMBEDDING, 3);
+        assertEquals(1, relevant.size());
+        assertTrue(relevant.get(0).embedded().text().contains("Charlie"));
+    }
+
+}
@@ -0,0 +1,45 @@
+package io.quarkiverse.langchain4j.test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.util.List;
+
+import jakarta.inject.Inject;
+
+import org.jboss.shrinkwrap.api.ShrinkWrap;
+import org.jboss.shrinkwrap.api.asset.StringAsset;
+import org.jboss.shrinkwrap.api.spec.JavaArchive;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.RegisterExtension;
+
+import dev.langchain4j.data.embedding.Embedding;
+import dev.langchain4j.data.segment.TextSegment;
+import dev.langchain4j.store.embedding.EmbeddingMatch;
+import dev.langchain4j.store.embedding.inmemory.InMemoryEmbeddingStore;
+import io.quarkus.test.QuarkusUnitTest;
+
+/**
+ * Verify usage of the `quarkus.langchain4j.easy-rag.path-matcher` property:
+ * with the matcher set to `glob:*.pdf`, only matching files from the
+ * configured path are expected to be ingested.
+ */
+public class EasyRagPathMatcherTest {
+
+    // Application configuration under test: ingest from the test documents
+    // directory, restricted by a PDF-only glob.
+    @RegisterExtension
+    static final QuarkusUnitTest unitTest = new QuarkusUnitTest()
+            .setArchiveProducer(() -> ShrinkWrap.create(JavaArchive.class)
+                    .addAsResource(new StringAsset("quarkus.langchain4j.easy-rag.path=src/test/resources/ragdocuments\n" +
+                            "quarkus.langchain4j.easy-rag.path-matcher=glob:*.pdf\n"),
+                            "application.properties"));
+
+    @Inject
+    InMemoryEmbeddingStore<TextSegment> embeddingStore;
+
+    // All-zero query vector used only to pull segments out of the store.
+    // NOTE(review): 384 presumably matches the embedding model's dimension - confirm.
+    private static final Embedding DUMMY_EMBEDDING = new Embedding(new float[384]);
+
+    /**
+     * Asks the store for up to 3 matches and expects exactly one, whose text
+     * mentions "Charlie" (presumably the content of the only matching PDF).
+     */
+    @Test
+    public void verifyPathMatchingOnlyPdf() {
+        List<EmbeddingMatch<TextSegment>> relevant = embeddingStore.findRelevant(DUMMY_EMBEDDING, 3);
+        assertEquals(1, relevant.size());
+        assertTrue(relevant.get(0).embedded().text().contains("Charlie"));
+    }
+
+}
@@ -15,6 +15,21 @@ public interface EasyRagConfig {
      */
     String path();
 
+    /**
+     * Matcher used for filtering which files from the directory should be ingested.
+     * The value is passed to {@link java.nio.file.FileSystem#getPathMatcher(String)} of the
+     * default file system, so it uses that method's `syntax:pattern` form.
+     * Example: `glob:**.txt` to recursively match all files with the `.txt` extension.
+     * The default is `glob:**`, recursively matching all files.
+     * The matcher is applied regardless of whether recursive ingestion is enabled.
+     */
+    @WithDefault("glob:**")
+    String pathMatcher();
+
+    /**
+     * Whether to recursively ingest documents from subdirectories of the configured path.
+     * When set to `false`, the non-recursive document loader is used instead.
+     * The default is `true`.
+     */
+    @WithDefault("true")
+    Boolean recursive();
+
     /**
      * Maximum segment size when splitting documents, in tokens.
      */

@@ -1,5 +1,7 @@
 package io.quarkiverse.langchain4j.easyrag.runtime;
 
+import java.nio.file.FileSystems;
+import java.nio.file.PathMatcher;
 import java.util.List;
 import java.util.function.Function;
 import java.util.function.Supplier;
@@ -31,8 +33,16 @@ public class EasyRagRecorder {
     public void ingest(EasyRagConfig config, BeanContainer beanContainer) {
         EmbeddingStore<TextSegment> embeddingStore = beanContainer.beanInstance(EmbeddingStore.class);
         EmbeddingModel embeddingModel = beanContainer.beanInstance(EmbeddingModel.class);
-        LOGGER.info("Ingesting documents from path: " + config.path());
-        List<Document> documents = FileSystemDocumentLoader.loadDocuments(config.path());
+
+        PathMatcher pathMatcher = FileSystems.getDefault().getPathMatcher(config.pathMatcher());
+        LOGGER.info("Ingesting documents from path: " + config.path() +
+                ", path matcher = " + config.pathMatcher() + ", recursive = " + config.recursive());
+        List<Document> documents = null;
+        if (config.recursive()) {
+            documents = FileSystemDocumentLoader.loadDocumentsRecursively(config.path(), pathMatcher);
+        } else {
+            documents = FileSystemDocumentLoader.loadDocuments(config.path(), pathMatcher);
+        }
         DocumentSplitter documentSplitter = DocumentSplitters.recursive(config.maxSegmentSize(),
                 config.maxOverlapSize(), new HuggingFaceTokenizer());
         List<Document> splitDocuments = documentSplitter