From 03ff7c13643ffab766474a33f315b23a0cb8d029 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 21 Jan 2025 10:27:58 +0100 Subject: [PATCH] #5241 - Improve assistant attribution functionality - Require less transfer thinking when referring to a chunk - Handle cases where a model insists on referring to chunks as "document XXX" --- .../documents/DocumentContextRetriever.java | 33 +++++-------- .../src/main/ts/src/AssistantPanel.svelte | 48 ++++++++++++------- 2 files changed, 42 insertions(+), 39 deletions(-) diff --git a/inception/inception-assistant/src/main/java/de/tudarmstadt/ukp/inception/assistant/documents/DocumentContextRetriever.java b/inception/inception-assistant/src/main/java/de/tudarmstadt/ukp/inception/assistant/documents/DocumentContextRetriever.java index 1e71dc9812..9fe2450c50 100644 --- a/inception/inception-assistant/src/main/java/de/tudarmstadt/ukp/inception/assistant/documents/DocumentContextRetriever.java +++ b/inception/inception-assistant/src/main/java/de/tudarmstadt/ukp/inception/assistant/documents/DocumentContextRetriever.java @@ -81,8 +81,8 @@ public List retrieve(ChatContext aAssistant, MTextMessage aMessage var body = new StringBuilder(); for (var chunk : chunks) { var reference = MReference.builder() // - //.withId(String.valueOf(references.size() + 1)) // - .withId(UUID.randomUUID().toString().substring(0,8)) // + // .withId(String.valueOf(references.size() + 1)) // + .withId(UUID.randomUUID().toString().substring(0, 8)) // .withDocumentId(chunk.documentId()) // .withDocumentName(chunk.documentName()) // .withBegin(chunk.begin()) // @@ -102,41 +102,32 @@ public List retrieve(ChatContext aAssistant, MTextMessage aMessage .withRole(SYSTEM).internal() // .withReferences(references.values()); - // Works good with qwen72b but not with granite 8b -// msg.withMessage(join("\n", asList( -// "The document retriever found the following relevant information in the documents of this project.", -// "", // -// body.toString(), "", -// "It is critical to mention the source of each document text in the form `{{ref::ref-id}}`."))); - - msg.withMessage(join("\n", asList( - """ + var instruction = """ Use the following documents from this project to respond. It is absolutely critital to mention the `{{ref::ref-id}}` after each individual information from a document. - Here is an example of how to include the ref-id: + Here is an example: Input: { + "id": "{{ref::917}}" "document": "The Eiffel Tower is located in Paris, France.", - "ref-id": "917" } { + "id": "{{ref::735}}" "document": "It is one of the most famous landmarks in the world.", - "ref-id": "735" } { + "id": "{{ref::582}}" "document": The Eiffel Tower was built from 1887 to 1889.", - "ref-id": "582" } - + Response: The Eiffel Tower is a famous landmark located in Paris, France {{ref::917}} {{ref::735}}. It was built from 1887 to 1889 {{ref::582}}. - + Now, use the same pattern to process the following document: - """, - "", // - body.toString()))); + """; + msg.withMessage(join("\n", asList(instruction, "", body.toString()))); return asList(msg.build()); } @@ -145,8 +136,8 @@ private void renderChunkJson(StringBuilder body, Chunk chunk, MReference aRefere { try { var data = new LinkedHashMap(); + data.put("id", "{{ref::" + aReference.id() + "}}"); data.put("document", chunk.text()); - data.put("ref-id", aReference.id()); data.entrySet().removeIf(e -> isBlank(e.getValue())); body.append(JSONUtil.toPrettyJsonString(data)); body.append("\n"); diff --git a/inception/inception-assistant/src/main/ts/src/AssistantPanel.svelte b/inception/inception-assistant/src/main/ts/src/AssistantPanel.svelte index 9811105d30..2fc05db91a 100644 --- a/inception/inception-assistant/src/main/ts/src/AssistantPanel.svelte +++ b/inception/inception-assistant/src/main/ts/src/AssistantPanel.svelte @@ -348,25 +348,37 @@ var pureHtml = DOMPurify.sanitize(rawHtml, { RETURN_DOM: false }); var refNum = 0; - // Replace all `{{ref::X}}` with the respective reference link - pureHtml = pureHtml.replace( - /\s*{{ref::([\w-]+)}}(\.*)/g, - (match, refId, dots) => { - const reference = message.references.find( - (ref) => ref.id === refId, - ); - if (reference) { - refNum++; - return `${dots}${refNum}`; - } + function replaceReferences(text, pattern) { + return text.replace( + pattern, + (match, refId, dots) => { + const reference = message.references.find( + (ref) => ref.id === refId, + ); + if (reference) { + refNum++; + return `${dots}${refNum}`; + } + + // If no matching reference is found, keep the original text + // console.trace( + // `Reference with id ${refId} not found in message ${message.id}` + // ); + return match; + }, + ); + } - // If no matching reference is found, keep the original text - // console.trace( - // `Reference with id ${refId} not found in message ${message.id}` - // ); - return match; - }, - ); + // Our canonical reference format + const refIdReplacementPattern = /\s*{{ref::([\w-]+)}}(\.*)/g + + // Some models (deepseek-r1) can't be bothered to properly use our reference syntax + // and keep referring to documents using the "Document XXXXXXXX" syntax... + const docIdReplacementPattern = /\s*[Dd]ocument[\s,]+([0-9a-f]{8})(\.*)/g + + // Replace all references with the respective reference link + pureHtml = replaceReferences(pureHtml, refIdReplacementPattern); + pureHtml = replaceReferences(pureHtml, docIdReplacementPattern); return pureHtml; }