diff --git a/targets/export-elasticsearch/request.http b/targets/export-elasticsearch/request.http index e6ab397b2..9bb820aad 100644 --- a/targets/export-elasticsearch/request.http +++ b/targets/export-elasticsearch/request.http @@ -43,7 +43,7 @@ GET http://localhost:8787/embedding/service-public/list GET http://localhost:8787/embedding/service-public/source ### -DELETE http://localhost:8787/embedding/service-public/delete +DELETE http://localhost:8787/embedding/service-public ### POST http://localhost:8787/embedding/contribution @@ -61,7 +61,7 @@ GET http://localhost:8787/embedding/contribution/list GET http://localhost:8787/embedding/contribution/source ### -DELETE http://localhost:8787/embedding/contribution/delete +DELETE http://localhost:8787/embedding/contribution ### POST http://localhost:8787/chat/service-public diff --git a/targets/export-elasticsearch/src/controllers/embedding.ts b/targets/export-elasticsearch/src/controllers/embedding.ts index e204eecaf..95c453ce5 100644 --- a/targets/export-elasticsearch/src/controllers/embedding.ts +++ b/targets/export-elasticsearch/src/controllers/embedding.ts @@ -96,7 +96,7 @@ export class EmbeddingController implements interfaces.Controller { } } - @httpDelete("/:slug/delete") + @httpDelete("/:slug") async delete( @requestParam("slug") slug: CollectionSlug ): Promise> { diff --git a/targets/export-elasticsearch/src/repositories/documents.ts b/targets/export-elasticsearch/src/repositories/documents.ts index 05d8e6767..20671dabe 100644 --- a/targets/export-elasticsearch/src/repositories/documents.ts +++ b/targets/export-elasticsearch/src/repositories/documents.ts @@ -4,48 +4,15 @@ import { injectable } from "inversify"; import { name } from "../utils"; import { getDocumentBySource } from "./graphql"; - -interface Document { - id: string; - cdtnId: string; - title: string; - slug: string; - source: string; - text: string; - isPublished: boolean; - isSearchable: boolean; - metaDescription: string; - document: { - raw: string; - url: string; - date: string; - description: string; - answers?: { - generic?: { - markdown: string; - }; - conventionAnswer?: { - markdown: string; - }; - }; - referencedTexts?: - | { - slug: string; - type: string; - title: string; - }[] - | null; - }; - __typename: string; -} +import { DocumentRepo } from "../type"; @injectable() @name("DocumentsRepository") export class DocumentsRepository { - public async getBySource(source: string): Promise { + public async getBySource(source: string): Promise { try { const res = await client - .query<{ documents: Document[] }>(getDocumentBySource, { + .query<{ documents: DocumentRepo[] }>(getDocumentBySource, { source, }) .toPromise(); diff --git a/targets/export-elasticsearch/src/services/embedding.ts b/targets/export-elasticsearch/src/services/embedding.ts index 38479288d..246f27d59 100644 --- a/targets/export-elasticsearch/src/services/embedding.ts +++ b/targets/export-elasticsearch/src/services/embedding.ts @@ -8,7 +8,14 @@ import { IEmbeddingFunction, OpenAIEmbeddingFunction, } from "chromadb"; -import { CollectionSlug } from "../type"; +import { CollectionSlug, DocumentRepo } from "../type"; + +interface ChromaGetResult { + text: string; + metadatas: Record; +} + +type ChromaGetResults = ChromaGetResult[]; @injectable() @name("EmbeddingService") @@ -27,32 +34,42 @@ export class EmbeddingService { } async ingestServicePublicDocuments() { - return await this.ingestDocuments( + await this.ingestDocuments( SOURCES.SHEET_SP, CollectionSlug.SERVICE_PUBLIC, (doc) => doc.text ); + return { result: "Documents ingested" }; } async ingestContributionDocuments() { - return await this.ingestDocuments( + await this.ingestDocuments( + SOURCES.CONTRIBUTIONS, + CollectionSlug.CONTRIBUTION + "-generic", + (r) => { + return r.document.answers?.generic?.markdown ?? ""; + } + ); + await this.ingestDocuments( SOURCES.CONTRIBUTIONS, - CollectionSlug.CONTRIBUTION, + CollectionSlug.CONTRIBUTION + "-idcc", + (r) => { + return r.document.answers?.conventionAnswer?.markdown ?? ""; + }, (r) => { - const idccNumber = r.slug.split("-")[0]; - const answer = - r.document.answers?.generic?.markdown + - "\n\n" + - r.document.answers?.conventionAnswer?.markdown; - return "Pour l'idcc numéro " + idccNumber + "\n\n" + answer + "\n\n"; + return { + idccNumber: r.slug.split("-")[0], + }; } ); + return { result: "Documents ingested" }; } async ingestDocuments( source: string, collectionName: string, - getText: (doc: any) => string + getText: (doc: DocumentRepo) => string, + getMetadata?: (doc: DocumentRepo) => Record ) { const results = await this.documentsRepository.getBySource(source); const collection = await this.client.getOrCreateCollection({ @@ -72,6 +89,9 @@ export class EmbeddingService { const metadatasSplits = textSplits.map(() => ({ title: r.title, metaDescription: r.metaDescription, + id, + numChunks: idSplits.length, + ...getMetadata?.(r), })); acc.ids.push(...idSplits); acc.documents.push(...textSplits); @@ -83,6 +103,8 @@ export class EmbeddingService { acc.metadatas.push({ title: r.title, metaDescription: r.metaDescription, + id: r.cdtnId, + ...getMetadata?.(r), }); return acc; }, @@ -99,29 +121,80 @@ export class EmbeddingService { console.error(e); } } - - return { result: "Documents ingested" }; } async getContributionDocuments(query: string) { - return await this.getDocuments(CollectionSlug.CONTRIBUTION, query); - } - - async getServicePublicDocuments(query: string) { - return await this.getDocuments(CollectionSlug.SERVICE_PUBLIC, query); - } - - async getDocuments(collectionName: string, query: string) { + // etape 1 : retrouver les 5 meilleurs elements const collection = await this.client.getOrCreateCollection({ - name: collectionName, + name: CollectionSlug.CONTRIBUTION + "-generic", embeddingFunction: this.embedder, }); const result = await collection.query({ queryTexts: [query], }); + // etape 2 : recuperer les parties découpées + // etape 3 : filer les infos liées à la cc return result; } + async getServicePublicDocuments(query: string): Promise { + try { + const result: ChromaGetResults = []; + const collection = await this.client.getOrCreateCollection({ + name: CollectionSlug.SERVICE_PUBLIC, + embeddingFunction: this.embedder, + }); + const queryTextsResult = await collection.query({ + queryTexts: [query], + nResults: 5, + }); + const metadataList = queryTextsResult.metadatas[0]!.reduce( + (acc: any[], m: any) => { + if (!acc.find((a) => a.metaDescription === m.metaDescription)) { + acc.push(m); + } + return acc; + }, + [] + ); + for (let i = 0; i < metadataList.length; i++) { + const metadata = metadataList[i]!; + const queryResult = await collection.query({ + queryTexts: [" "], + where: { + id: { + $eq: metadata.id as string, + }, + }, + }); + console.log(queryResult); + const ids = queryResult.ids[0]!; + const documents = queryResult.documents[0]!; + const text: string = documents + .map((_doc, j) => ({ + [`${ids[j]}`]: documents[j], + })) + .sort((a, b) => { + const aKey = Object.keys(a)[0]; + const bKey = Object.keys(b)[0]; + return aKey.localeCompare(bKey); + }) + .reduce((acc, curr) => { + const text = Object.values(curr); + return acc + text; + }, ""); + result.push({ + text, + metadatas: metadata, + }); + } + return result; + } catch (e: any) { + console.error(e); + return e.message; + } + } + async countAndPeekContributionDocuments() { return await this.countAndPeekDocuments(CollectionSlug.CONTRIBUTION); } diff --git a/targets/export-elasticsearch/src/type.ts b/targets/export-elasticsearch/src/type.ts index dd3753bfb..99a7c4ffa 100644 --- a/targets/export-elasticsearch/src/type.ts +++ b/targets/export-elasticsearch/src/type.ts @@ -2,3 +2,37 @@ export enum CollectionSlug { SERVICE_PUBLIC = "service-public", CONTRIBUTION = "contribution", } + +export interface DocumentRepo { + id: string; + cdtnId: string; + title: string; + slug: string; + source: string; + text: string; + isPublished: boolean; + isSearchable: boolean; + metaDescription: string; + document: { + raw: string; + url: string; + date: string; + description: string; + answers?: { + generic?: { + markdown: string; + }; + conventionAnswer?: { + markdown: string; + }; + }; + referencedTexts?: + | { + slug: string; + type: string; + title: string; + }[] + | null; + }; + __typename: string; +}