diff --git a/confluence_vector_sync/azure_cognitive_search.py b/confluence_vector_sync/azure_cognitive_search.py index e5b2f0a..076ee3d 100644 --- a/confluence_vector_sync/azure_cognitive_search.py +++ b/confluence_vector_sync/azure_cognitive_search.py @@ -101,6 +101,7 @@ def create_item(self, item): "document_id": item["id"], "space": item["space"]["key"], "title": item["title"], + "titleVector": self.embedder.embed_query(item["title"]), "chunk": chunk_text, "chunkVector": self.embedder.embed_query(chunk_text), "last_modified_date": last_modified_date, @@ -142,9 +143,11 @@ def create_or_update_index(self): {"name": "space", "type": "Edm.String", "searchable": "true", "retrievable": "true", "filterable": "true"}, {"name": "title", "type": "Edm.String", "searchable": "true", "retrievable": "true"}, + {"name": "titleVector", "type": "Collection(Edm.Single)", "searchable": "true", "retrievable": "true", + "dimensions": 1536, "vectorSearchProfile": "default-vector-profile"}, {"name": "chunk", "type": "Edm.String", "searchable": "true", "retrievable": "true"}, {"name": "chunkVector", "type": "Collection(Edm.Single)", "searchable": "true", "retrievable": "true", - "dimensions": 1536, "vectorSearchConfiguration": "vectorConfig"}, + "dimensions": 1536, "vectorSearchProfile": "default-vector-profile"}, {"name": "last_modified_date", "type": "Edm.DateTimeOffset", "searchable": "false", "retrievable": "true", "filterable": "true"}, {"name": "last_indexed_date", "type": "Edm.DateTimeOffset", "searchable": "false", @@ -152,12 +155,19 @@ def create_or_update_index(self): {"name": "url", "type": "Edm.String", "searchable": "false", "retrievable": "true"} ], "vectorSearch": { - "algorithmConfigurations": [ + "algorithms": [ { - "name": "vectorConfig", + "name": "hnsw-config-1", "kind": "hnsw" } - ]}, + ], + "profiles": [ + { + "name": "default-vector-profile", + "algorithm": "hnsw-config-1", + } + ] + }, "semantic": { "configurations": [ { diff --git a/confluence_vector_sync/config.py b/confluence_vector_sync/config.py index b1c2cd5..035eef4 100644 --- a/confluence_vector_sync/config.py +++ b/confluence_vector_sync/config.py @@ -12,7 +12,7 @@ def get_config(): "azure_search_key": os.getenv("AZURE_SEARCH_KEY"), "azure_search_full_reindex": os.getenv("AZURE_SEARCH_FULL_REINDEX", "false").lower() == "true", "azure_search_embedding_model": os.getenv("AZURE_SEARCH_EMBEDDING_MODEL", "text-embedding-ada-002"), - "azure_search_api_version": os.getenv("AZURE_SEARCH_API_VERSION", "2023-07-01-Preview"), + "azure_search_api_version": os.getenv("AZURE_SEARCH_API_VERSION", "2023-11-01"), "azure_search_confluence_index": os.getenv("AZURE_SEARCH_CONFLUENCE_INDEX", "confluence"), "confluence_url": os.getenv("CONFLUENCE_URL"), "confluence_user_name": os.getenv("CONFLUENCE_USER_NAME"), diff --git a/confluence_vector_sync/sync.py b/confluence_vector_sync/sync.py index b9536a2..fd48a5c 100644 --- a/confluence_vector_sync/sync.py +++ b/confluence_vector_sync/sync.py @@ -22,8 +22,8 @@ def sync(): page["status"] in {"archived", "trashed", "deleted"}] # Create model of documents in search-index for all included confluence spaces search = search_from_config(config) - search.chunker = confluence search.create_or_update_index() + search.chunker = confluence search.index(changeset={"upsert": current, "remove": archived}) logging.info("Indexing complete") logging.debug(search.diagnostics)