feat: add cql support for confluence extractor (#153)

robodev-r2d2 · NewDev16 · a-klos · web-flow · commit 3647bc2c972d · 2025-11-07T15:10:43.000+01:00
This pull request introduces support for Confluence Query Language (CQL)
filtering in the document extraction workflow, allowing users to specify
either a `space_key` or a `cql` query to select Confluence pages for
processing. The frontend and backend have been updated to handle these
new parameters, including user interface improvements and stricter
validation to ensure that at least one filtering option is provided.
Documentation and tests have also been added to clarify and verify the
new behavior.

**Backend changes for Confluence extraction:**

* The backend now accepts either a `space_key` or a `cql` parameter for
Confluence extraction, with validation to require at least one; empty
values for these parameters are ignored, and an error is raised if both
are missing.
[[1]](diffhunk://#diff-2b5524f0cb01b11e336def1a99356a243662de61a73be6dd5da1be89227cf112L57-R80)
[[2]](diffhunk://#diff-abd3edfc8fadc978097bb0fa2dbc6996a3bc15fa110f2269ae43541b9bf98c64L36-R40)
[[3]](diffhunk://#diff-abd3edfc8fadc978097bb0fa2dbc6996a3bc15fa110f2269ae43541b9bf98c64R128)
* The `ConfluenceParameters` model and extraction logic were updated to
support the optional `cql` parameter and propagate it to the loader.
[[1]](diffhunk://#diff-abd3edfc8fadc978097bb0fa2dbc6996a3bc15fa110f2269ae43541b9bf98c64L36-R40)
[[2]](diffhunk://#diff-abd3edfc8fadc978097bb0fa2dbc6996a3bc15fa110f2269ae43541b9bf98c64R62)
* The extraction process always sets `content_format` to `VIEW` for
consistency.
* Tests were added to verify CQL support and validation logic for
required parameters.

**Frontend changes for Confluence configuration:**

* The Confluence upload UI now includes fields for both `spaceKey` and
`cql` (both optional), updates the payload sent to the backend, and
clarifies placeholders and descriptions.
[[1]](diffhunk://#diff-a6fc8bcaabdced0bd0b5b642bd5a4aa9cb124a5bbebd0762e76f9dcb0df884c1R25)
[[2]](diffhunk://#diff-a6fc8bcaabdced0bd0b5b642bd5a4aa9cb124a5bbebd0762e76f9dcb0df884c1L78-R80)
[[3]](diffhunk://#diff-a6fc8bcaabdced0bd0b5b642bd5a4aa9cb124a5bbebd0762e76f9dcb0df884c1L185-R196)
[[4]](diffhunk://#diff-0f7547155cd6592b947aae6327e72dbe57073ae43aba24e82ad7ef78fee08153L12-R13)
[[5]](diffhunk://#diff-0f7547155cd6592b947aae6327e72dbe57073ae43aba24e82ad7ef78fee08153L58-R72)
* Localization strings were updated to describe the new CQL filtering
feature and improve user guidance.
[[1]](diffhunk://#diff-e485c1eda5b61acd7bba3807afc19b489ad515ba3a6feddd627596986245c334L13-R15)
[[2]](diffhunk://#diff-430c5bb0cfd37251a3388659a69ca7cff0726cd2cc40d592b79b55c9f644050dL15-R17)

**Documentation update:**

* The README was updated to document the new Confluence extraction
parameters and their behavior.

---------

Co-authored-by: Andreas Klos <aklos@outlook.de>
Co-authored-by: Andreas Klos <andreas.klos@stackit.cloud>
diff --git a/libs/README.md b/libs/README.md
@@ -321,6 +321,8 @@ The following types of information can be extracted:
 - `TABLE`: data in tabular form found in the document
 - `IMAGE`: image found in the document
 
+For Confluence sources, provide the instance `url` and API `token` and include either a `space_key` or a `cql` filter (empty values are ignored). Optional flags such as `include_attachments`, `keep_markdown_format`, and `keep_newlines` mirror the parameters supported by LangChain's `ConfluenceLoader`.
+
 For sitemap sources, additional parameters can be provided, e.g.:
 
 - `web_path`: The URL of the XML sitemap to crawl
diff --git a/libs/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py b/libs/extractor-api-lib/src/extractor_api_lib/impl/extractors/confluence_extractor.py
@@ -10,6 +10,8 @@
 from extractor_api_lib.impl.mapper.confluence_langchain_document2information_piece import (
     ConfluenceLangchainDocument2InformationPiece,
 )
+from langchain_community.document_loaders.confluence import ContentFormat
+
 
 logger = logging.getLogger(__name__)
 
@@ -54,11 +56,28 @@ async def aextract_content(
             A list of information pieces extracted from Confluence.
         """
         # Convert list of key value pairs to dict
-        confluence_loader_parameters = {
-            x.key: int(x.value) if x.value.isdigit() else x.value for x in extraction_parameters.kwargs
-        }
-        if not confluence_loader_parameters.get("max_pages") or isinstance(
-            confluence_loader_parameters.get("max_pages"), str
+        confluence_loader_parameters = {}
+        for key_value in extraction_parameters.kwargs or []:
+            if key_value is None or key_value.key is None:
+                continue
+
+            value = key_value.value
+            if isinstance(value, str):
+                value = value.strip()
+                if not value and key_value.key in {"space_key", "cql"}:
+                    # Skip empty optional parameters
+                    continue
+                if value.isdigit():
+                    value = int(value)
+
+            confluence_loader_parameters[key_value.key] = value
+
+        if "cql" not in confluence_loader_parameters and "space_key" not in confluence_loader_parameters:
+            raise ValueError("Either 'space_key' or 'cql' must be provided for Confluence extraction.")
+        if (
+            "max_pages" in confluence_loader_parameters
+            and not confluence_loader_parameters.get("max_pages")
+            or isinstance(confluence_loader_parameters.get("max_pages"), str)
         ):
             logging.warning(
                 "max_pages parameter is not set or invalid discarding it. ConfluenceLoader will use default value."
@@ -67,6 +86,7 @@ async def aextract_content(
         # Drop the document_name parameter as it is not used by the ConfluenceLoader
         if "document_name" in confluence_loader_parameters:
             confluence_loader_parameters.pop("document_name", None)
+        confluence_loader_parameters["content_format"] = ContentFormat.VIEW
         document_loader = ConfluenceLoader(**confluence_loader_parameters)
         documents = document_loader.load()
         return [self._mapper.map_document2informationpiece(x, extraction_parameters.document_name) for x in documents]
diff --git a/libs/extractor-api-lib/src/extractor_api_lib/models/confluence_parameters.py b/libs/extractor-api-lib/src/extractor_api_lib/models/confluence_parameters.py
@@ -33,7 +33,11 @@ class ConfluenceParameters(BaseModel):
 
     url: StrictStr = Field(description="url of the confluence space.")
     token: StrictStr = Field(description="api key to access confluence.")
-    space_key: StrictStr = Field(description="the space key of the confluence pages.")
+    space_key: Optional[StrictStr] = Field(default=None, description="the space key of the confluence pages.")
+    cql: Optional[StrictStr] = Field(
+        default=None,
+        description="Optional Confluence Query Language (CQL) expression used to filter pages.",
+    )
     include_attachments: Optional[StrictBool] = Field(
         default=False,
         description="whether to include file attachments (e.g., images, documents) in the parsed content. Default is `false`.",
@@ -55,6 +59,7 @@ class ConfluenceParameters(BaseModel):
         "url",
         "token",
         "space_key",
+        "cql",
         "include_attachments",
         "keep_markdown_format",
         "keep_newlines",
@@ -120,6 +125,7 @@ def from_dict(cls, obj: Dict) -> Self:
                 "url": obj.get("url"),
                 "token": obj.get("token"),
                 "space_key": obj.get("space_key"),
+                "cql": obj.get("cql"),
                 "include_attachments": (
                     obj.get("include_attachments") if obj.get("include_attachments") is not None else False
                 ),
diff --git a/libs/extractor-api-lib/tests/confluence_extractor_test.py b/libs/extractor-api-lib/tests/confluence_extractor_test.py
@@ -0,0 +1,68 @@
+"""Tests for the ConfluenceExtractor."""
+
+import pytest
+from unittest.mock import MagicMock, patch
+from langchain_core.documents import Document as LangchainDocument
+
+from extractor_api_lib.impl.extractors.confluence_extractor import ConfluenceExtractor
+from extractor_api_lib.models.extraction_parameters import ExtractionParameters
+from extractor_api_lib.models.key_value_pair import KeyValuePair
+from extractor_api_lib.models.dataclasses.internal_information_piece import InternalInformationPiece
+from extractor_api_lib.impl.types.content_type import ContentType
+
+
+@pytest.fixture
+def confluence_mapper():
+    """Return a mapper mock that produces predictable information pieces."""
+    mapper = MagicMock()
+    mapper.map_document2informationpiece.return_value = InternalInformationPiece(
+        type=ContentType.TEXT,
+        metadata={"document": "doc", "id": "id", "related": []},
+        page_content="content",
+    )
+    return mapper
+
+
+@pytest.mark.asyncio
+@patch("extractor_api_lib.impl.extractors.confluence_extractor.ConfluenceLoader")
+async def test_aextract_content_supports_cql(mock_loader_cls, confluence_mapper):
+    """Ensure the extractor forwards the CQL parameter to the loader."""
+    extractor = ConfluenceExtractor(mapper=confluence_mapper)
+    extraction_parameters = ExtractionParameters(
+        document_name="confluence_doc",
+        source_type="confluence",
+        kwargs=[
+            KeyValuePair(key="url", value="https://example.atlassian.net"),
+            KeyValuePair(key="token", value="token"),
+            KeyValuePair(key="cql", value="type=page"),
+        ],
+    )
+
+    mock_loader_instance = MagicMock()
+    mock_loader_instance.load.return_value = [LangchainDocument(page_content="content", metadata={"title": "Doc"})]
+    mock_loader_cls.return_value = mock_loader_instance
+
+    results = await extractor.aextract_content(extraction_parameters)
+
+    assert len(results) == 1
+    confluence_mapper.map_document2informationpiece.assert_called_once()
+    loader_kwargs = mock_loader_cls.call_args.kwargs
+    assert loader_kwargs["cql"] == "type=page"
+    assert "space_key" not in loader_kwargs
+
+
+@pytest.mark.asyncio
+async def test_aextract_content_requires_space_key_or_cql(confluence_mapper):
+    """The extractor must receive either a space key or a CQL expression."""
+    extractor = ConfluenceExtractor(mapper=confluence_mapper)
+    extraction_parameters = ExtractionParameters(
+        document_name="confluence_doc",
+        source_type="confluence",
+        kwargs=[
+            KeyValuePair(key="url", value="https://example.atlassian.net"),
+            KeyValuePair(key="token", value="token"),
+        ],
+    )
+
+    with pytest.raises(ValueError, match="Either 'space_key' or 'cql' must be provided for Confluence extraction."):
+        await extractor.aextract_content(extraction_parameters)
diff --git a/services/frontend/libs/admin-app/data-access/document.api.ts b/services/frontend/libs/admin-app/data-access/document.api.ts
@@ -9,7 +9,8 @@ axios.defaults.auth = {
 
 // confluence configuration interface
 export interface ConfluenceConfig {
-  spaceKey: string;
+  spaceKey?: string;
+  cql?: string;
   token: string;
   url: string;
   maxPages?: number;
@@ -55,11 +56,20 @@ export class DocumentAPI {
     static async loadConfluence(config: ConfluenceConfig): Promise<void> {
         try {
             // convert config to list of key/value items for backend
-            const payload = [
-                { key: 'url', value: config.url },
+            const payload: { key: string; value: string }[] = [
+                { key: 'url', value: config.url.trim() },
                 { key: 'token', value: config.token },
-                { key: 'space_key', value: config.spaceKey },
-            ] as { key: string; value: string }[];
+            ];
+
+            const spaceKey = config.spaceKey?.trim();
+            if (spaceKey) {
+                payload.push({ key: 'space_key', value: spaceKey });
+            }
+
+            const cql = config.cql?.trim();
+            if (cql) {
+                payload.push({ key: 'cql', value: cql });
+            }
 
             if (typeof config.maxPages === 'number') {
                 payload.push({ key: 'max_pages', value: String(config.maxPages) });
diff --git a/services/frontend/libs/admin-app/feature-document/DocumentUploadContainer.vue b/services/frontend/libs/admin-app/feature-document/DocumentUploadContainer.vue
@@ -22,6 +22,7 @@ const spaceKey = ref('');
 const confluenceToken = ref('');
 const confluenceUrl = ref('');
 const maxPages = ref<number>();
+const confluenceCql = ref('');
 
 // sitemap configuration refs
 const sitemapName = ref('');
@@ -75,7 +76,8 @@ const handleConfluenceUpload = () => {
         spaceKey: spaceKey.value,
         token: confluenceToken.value,
         url: confluenceUrl.value,
-        maxPages: maxPages.value
+        maxPages: maxPages.value,
+        cql: confluenceCql.value,
     });
 }
 
@@ -182,13 +184,16 @@ const getErrorMessage = (errorType: string) => {
                       <label for="confluenceName" class="sr-only"> Confluence Name</label>
                       <input v-model="confluenceName" type="text" placeholder="Name" class="input input-bordered w-full" />
                       <label for="spaceKey" class="sr-only">Space key</label>
-                      <input v-model="spaceKey" type="text" placeholder="Space key" class="input input-bordered w-full" />
+                      <input v-model="spaceKey" type="text" placeholder="Space key (optional)" class="input input-bordered w-full" />
+                      <label for="confluenceCql" class="sr-only">CQL</label>
+                      <input v-model="confluenceCql" type="text" placeholder="CQL query (optional)" class="input input-bordered w-full" />
                       <label for="confluenceToken" class="sr-only">Token</label>
                       <input v-model="confluenceToken" type="password" placeholder="Token" class="input input-bordered w-full" />
                       <label for="maxPages" class="sr-only">Max pages</label>
-                      <input v-model.number="maxPages" type="number" placeholder="Max number of pages" class="input input-bordered w-full" />
+                      <input v-model.number="maxPages" type="number" placeholder="Max number of pages (optional)" class="input input-bordered w-full" />
                     </div>
-                    <p class="text-xs opacity-50 mb-4">{{ t('documents.confluenceLoadDescription') }}</p>
+                    <p class="text-xs opacity-50">{{ t('documents.confluenceLoadDescription') }}</p>
+                    <p class="text-xs opacity-50 mb-4">{{ t('documents.confluenceQueryHint') }}</p>
                     <button class="btn btn-sm btn-accent" @click="handleConfluenceUpload">
                         {{ t('documents.loadConfluence') }}
                     </button>
diff --git a/services/frontend/libs/i18n/admin/de.json b/services/frontend/libs/i18n/admin/de.json
@@ -10,8 +10,9 @@
     "uploadingDocument": "Wird hochgeladen...",
     "fileUpload": "Datei-Upload",
     "confluenceUpload": "Confluence",
-    "confluenceLoadTitle": "Confluence-Seiten laden",
-    "confluenceLoadDescription": "Klicken Sie auf den Button unten, um Seiten aus Confluence zu laden",
+    "confluenceLoadTitle": "Confluence-Inhalte laden",
+    "confluenceLoadDescription": "Geben Sie Ihre Confluence-Zugangsdaten an und wählen Sie einen Space-Key oder einen CQL-Filter",
+    "confluenceQueryHint": "Geben Sie einen Space-Key an, um den gesamten Bereich zu laden, oder einen Confluence Query Language (CQL)-Ausdruck, um die Seiten zu filtern; mindestens eines der beiden Felder muss ausgefüllt sein",
     "loadConfluence": "Laden starten",
     "fileTypeNotAllowedTitle": "Dateityp nicht erlaubt",
     "fileTypeNotAllowedDescription": "Nur PDF-, DOCX-, PPTX- und XML-Dateien sind erlaubt",
diff --git a/services/frontend/libs/i18n/admin/en.json b/services/frontend/libs/i18n/admin/en.json
@@ -12,8 +12,9 @@
     "fileTypeNotAllowedDescription": "Only PDF, DOCX, PPTX, and XML files are allowed",
     "fileUpload": "File Upload",
     "confluenceUpload": "Confluence",
-    "confluenceLoadTitle": "Load all Confluence pages from a space",
-    "confluenceLoadDescription": "Click the button below to load pages from Confluence",
+    "confluenceLoadTitle": "Load Confluence content",
+    "confluenceLoadDescription": "Provide your Confluence credentials and choose a space key or CQL filter",
+    "confluenceQueryHint": "Enter a space key to load the whole space, or supply a Confluence Query Language (CQL) expression to filter pages; at least one of the two is required",
     "loadConfluence": "Load Confluence",
     "sitemapUpload": "Sitemap",
     "sitemapLoadTitle": "Load content from a sitemap",