Skip to content

Commit

Permalink
fix: chunk
Browse files Browse the repository at this point in the history
  • Loading branch information
maxgfr committed Jul 25, 2023
1 parent d71cabe commit bbd557a
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 61 deletions.
4 changes: 2 additions & 2 deletions targets/export-elasticsearch/request.http
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ GET http://localhost:8787/embedding/service-public/list
GET http://localhost:8787/embedding/service-public/source

###
DELETE http://localhost:8787/embedding/service-public/delete
DELETE http://localhost:8787/embedding/service-public

###
POST http://localhost:8787/embedding/contribution
Expand All @@ -61,7 +61,7 @@ GET http://localhost:8787/embedding/contribution/list
GET http://localhost:8787/embedding/contribution/source

###
DELETE http://localhost:8787/embedding/contribution/delete
DELETE http://localhost:8787/embedding/contribution

###
POST http://localhost:8787/chat/service-public
Expand Down
2 changes: 1 addition & 1 deletion targets/export-elasticsearch/src/controllers/embedding.ts
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ export class EmbeddingController implements interfaces.Controller {
}
}

@httpDelete("/:slug/delete")
@httpDelete("/:slug")
async delete(
@requestParam("slug") slug: CollectionSlug
): Promise<Record<string, any>> {
Expand Down
39 changes: 3 additions & 36 deletions targets/export-elasticsearch/src/repositories/documents.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,48 +4,15 @@ import { injectable } from "inversify";

import { name } from "../utils";
import { getDocumentBySource } from "./graphql";

interface Document {
id: string;
cdtnId: string;
title: string;
slug: string;
source: string;
text: string;
isPublished: boolean;
isSearchable: boolean;
metaDescription: string;
document: {
raw: string;
url: string;
date: string;
description: string;
answers?: {
generic?: {
markdown: string;
};
conventionAnswer?: {
markdown: string;
};
};
referencedTexts?:
| {
slug: string;
type: string;
title: string;
}[]
| null;
};
__typename: string;
}
import { DocumentRepo } from "../type";

@injectable()
@name("DocumentsRepository")
export class DocumentsRepository {
public async getBySource(source: string): Promise<Document[]> {
public async getBySource(source: string): Promise<DocumentRepo[]> {
try {
const res = await client
.query<{ documents: Document[] }>(getDocumentBySource, {
.query<{ documents: DocumentRepo[] }>(getDocumentBySource, {
source,
})
.toPromise();
Expand Down
117 changes: 95 additions & 22 deletions targets/export-elasticsearch/src/services/embedding.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,14 @@ import {
IEmbeddingFunction,
OpenAIEmbeddingFunction,
} from "chromadb";
import { CollectionSlug } from "../type";
import { CollectionSlug, DocumentRepo } from "../type";

/**
 * One reassembled document as returned by the embedding lookups:
 * the concatenated chunk text plus the metadata entry it was found through.
 */
interface ChromaGetResult {
  /** Metadata entry of the matched chunk (free-form key/value bag). */
  metadatas: { [key: string]: any };
  /** Full text rebuilt from the document's chunks. */
  text: string;
}

/** List of reassembled documents. */
type ChromaGetResults = Array<ChromaGetResult>;

@injectable()
@name("EmbeddingService")
Expand All @@ -27,32 +34,42 @@ export class EmbeddingService {
}

async ingestServicePublicDocuments() {
return await this.ingestDocuments(
await this.ingestDocuments(
SOURCES.SHEET_SP,
CollectionSlug.SERVICE_PUBLIC,
(doc) => doc.text
);
return { result: "Documents ingested" };
}

async ingestContributionDocuments() {
return await this.ingestDocuments(
await this.ingestDocuments(
SOURCES.CONTRIBUTIONS,
CollectionSlug.CONTRIBUTION + "-generic",
(r) => {
return r.document.answers?.generic?.markdown ?? "";
}
);
await this.ingestDocuments(
SOURCES.CONTRIBUTIONS,
CollectionSlug.CONTRIBUTION,
CollectionSlug.CONTRIBUTION + "-idcc",
(r) => {
return r.document.answers?.conventionAnswer?.markdown ?? "";
},
(r) => {
const idccNumber = r.slug.split("-")[0];
const answer =
r.document.answers?.generic?.markdown +
"\n\n" +
r.document.answers?.conventionAnswer?.markdown;
return "Pour l'idcc numéro " + idccNumber + "\n\n" + answer + "\n\n";
return {
idccNumber: r.slug.split("-")[0],
};
}
);
return { result: "Documents ingested" };
}

async ingestDocuments(
source: string,
collectionName: string,
getText: (doc: any) => string
getText: (doc: DocumentRepo) => string,
getMetadata?: (doc: DocumentRepo) => Record<string, any>
) {
const results = await this.documentsRepository.getBySource(source);
const collection = await this.client.getOrCreateCollection({
Expand All @@ -72,6 +89,9 @@ export class EmbeddingService {
const metadatasSplits = textSplits.map(() => ({
title: r.title,
metaDescription: r.metaDescription,
id,
numChunks: idSplits.length,
...getMetadata?.(r),
}));
acc.ids.push(...idSplits);
acc.documents.push(...textSplits);
Expand All @@ -83,6 +103,8 @@ export class EmbeddingService {
acc.metadatas.push({
title: r.title,
metaDescription: r.metaDescription,
id: r.cdtnId,
...getMetadata?.(r),
});
return acc;
},
Expand All @@ -99,29 +121,80 @@ export class EmbeddingService {
console.error(e);
}
}

return { result: "Documents ingested" };
}

async getContributionDocuments(query: string) {
return await this.getDocuments(CollectionSlug.CONTRIBUTION, query);
}

async getServicePublicDocuments(query: string) {
return await this.getDocuments(CollectionSlug.SERVICE_PUBLIC, query);
}

async getDocuments(collectionName: string, query: string) {
// etape 1 : retrouver les 5 meilleurs elements
const collection = await this.client.getOrCreateCollection({
name: collectionName,
name: CollectionSlug.CONTRIBUTION + "-generic",
embeddingFunction: this.embedder,
});
const result = await collection.query({
queryTexts: [query],
});
// etape 2 : recuperer les parties découpées
// etape 3 : filer les infos liées à la cc
return result;
}

/**
 * Finds the service-public documents that best match `query` and returns
 * each of them as one reassembled text together with its metadata.
 *
 * Steps:
 *  1. Similarity search (top 5 chunks) on the service-public collection.
 *  2. Keep one metadata entry per distinct `metaDescription`, so a document
 *     that matched through several chunks is returned only once.
 *  3. For each kept entry, load every chunk whose metadata `id` equals the
 *     entry's `id` and concatenate the chunk texts in chunk-id order.
 *
 * @param query Free-text search string, embedded by the collection's
 *   embedding function.
 * @returns One `{ text, metadatas }` item per matched document. Returns an
 *   empty list when nothing matched or when the lookup failed; failures are
 *   logged and never thrown, so the declared return type always holds.
 */
async getServicePublicDocuments(query: string): Promise<ChromaGetResults> {
  try {
    const collection = await this.client.getOrCreateCollection({
      name: CollectionSlug.SERVICE_PUBLIC,
      embeddingFunction: this.embedder,
    });
    const nearest = await collection.query({
      queryTexts: [query],
      nResults: 5,
    });

    // NOTE(review): de-duplication is keyed on `metaDescription` while the
    // chunk lookup below is keyed on `id` — confirm which one is intended.
    const seenDescriptions = new Set<unknown>();
    const metadataList: Record<string, any>[] = [];
    for (const metadata of nearest.metadatas[0] ?? []) {
      if (metadata == null || seenDescriptions.has(metadata.metaDescription)) {
        continue;
      }
      seenDescriptions.add(metadata.metaDescription);
      metadataList.push(metadata);
    }

    const result: ChromaGetResults = [];
    for (const metadata of metadataList) {
      // A metadata-filtered `get` returns every chunk of the document without
      // embedding a dummy query string and without the nearest-neighbour
      // result cap that `query` applies.
      const chunks = await collection.get({
        where: {
          id: {
            $eq: metadata.id as string,
          },
        },
      });
      const text = chunks.ids
        .map((chunkId: string, j: number) => ({
          chunkId,
          chunk: chunks.documents[j] ?? "",
        }))
        // Numeric collation keeps a chunk suffixed "10" after "2"
        // (presumably ids look like `<id>-<index>` — TODO confirm against
        // the id-splitting code in ingestDocuments).
        .sort((a: { chunkId: string }, b: { chunkId: string }) =>
          a.chunkId.localeCompare(b.chunkId, undefined, { numeric: true })
        )
        .map((entry: { chunk: string }) => entry.chunk)
        .join("");
      result.push({
        text,
        metadatas: metadata,
      });
    }
    return result;
  } catch (e: unknown) {
    console.error(e);
    return [];
  }
}

/**
 * Delegates to `countAndPeekDocuments` for the contribution collection slug
 * and returns whatever that helper resolves to.
 *
 * NOTE(review): contribution ingestion in this file writes to
 * `CollectionSlug.CONTRIBUTION + "-generic"` and
 * `CollectionSlug.CONTRIBUTION + "-idcc"`; confirm that inspecting the bare
 * `CollectionSlug.CONTRIBUTION` collection is still what is wanted here.
 */
async countAndPeekContributionDocuments() {
  const slug = CollectionSlug.CONTRIBUTION;
  const overview = await this.countAndPeekDocuments(slug);
  return overview;
}
Expand Down
34 changes: 34 additions & 0 deletions targets/export-elasticsearch/src/type.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,37 @@ export enum CollectionSlug {
SERVICE_PUBLIC = "service-public",
CONTRIBUTION = "contribution",
}

/** A block of answer content carried as markdown. */
interface MarkdownAnswer {
  markdown: string;
}

/** Optional answers attached to a document. */
interface DocumentAnswers {
  generic?: MarkdownAnswer;
  conventionAnswer?: MarkdownAnswer;
}

/** A text referenced by a document. */
interface ReferencedText {
  slug: string;
  type: string;
  title: string;
}

/** Nested `document` payload of a {@link DocumentRepo} row. */
interface DocumentPayload {
  raw: string;
  url: string;
  date: string;
  description: string;
  answers?: DocumentAnswers;
  referencedTexts?: Array<ReferencedText> | null;
}

/**
 * Shape of one entry of the `documents` list returned by the documents
 * repository query (`getBySource`).
 */
export interface DocumentRepo {
  id: string;
  cdtnId: string;
  title: string;
  slug: string;
  source: string;
  text: string;
  isPublished: boolean;
  isSearchable: boolean;
  metaDescription: string;
  document: DocumentPayload;
  __typename: string;
}

0 comments on commit bbd557a

Please sign in to comment.