Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Same citation numbers #109

Merged
merged 4 commits into from
Oct 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion api/src/stampy_chat/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def talk_to_robot_internal(index, query: str, mode: str, history: Prompt, sessio
top_k_blocks = get_top_k_blocks(index, query, k)

yield {
"state": "loading", "phase": "semantic",
"state": "citations",
"citations": [
{'title': block.title, 'author': block.authors, 'date': block.date, 'url': block.url}
for block in top_k_blocks
Expand Down
3 changes: 3 additions & 0 deletions api/src/stampy_chat/followups.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def search_authored(query: str):


def get_followups(query):
if not query.strip():
return []
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was causing errors: with an empty query the server returned an error response, which was then parsed as if it were a list of followups.


url = 'https://nlp.stampy.ai/api/search?query=' + quote(query)
response = requests.get(url).json()
return [Followup(entry['title'], entry['pageid'], entry['score']) for entry in response]
Expand Down
4 changes: 2 additions & 2 deletions api/tests/stampy_chat/test_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ def test_talk_to_robot_internal(history, context):
with patch('openai.ChatCompletion.create', return_value=chunks):
assert list(talk_to_robot_internal("index", "what is this about?", "default", history, 'session id')) == [
{'phase': 'semantic', 'state': 'loading'},
{'citations': [], 'phase': 'semantic', 'state': 'loading'},
{'citations': [], 'state': 'citations'},
{'phase': 'prompt', 'state': 'loading'},
{'phase': 'llm', 'state': 'loading'},
{'content': 'response 1', 'state': 'streaming'},
Expand Down Expand Up @@ -300,7 +300,7 @@ def test_talk_to_robot_internal_error(history, context):
with patch('openai.ChatCompletion.create', return_value=chunks):
assert list(talk_to_robot_internal("index", "what is this about?", "default", history, 'session id')) == [
{'phase': 'semantic', 'state': 'loading'},
{'citations': [], 'phase': 'semantic', 'state': 'loading'},
{'citations': [], 'state': 'citations'},
{'phase': 'prompt', 'state': 'loading'},
{'phase': 'llm', 'state': 'loading'},
{'content': 'response 1', 'state': 'streaming'},
Expand Down
58 changes: 18 additions & 40 deletions web/src/components/assistant.tsx
Original file line number Diff line number Diff line change
@@ -1,49 +1,27 @@
import { ProcessText, ShowCitation, ShowInTextCitation } from "./citations";
import { useState } from "react";
import { ShowCitation, CitationsBlock } from "./citations";
import { GlossarySpan } from "./glossary";
import type { Citation, AssistantEntry } from "../types";

export const ShowAssistantEntry: React.FC<{entry: AssistantEntry}> = ({entry}) => {
const in_text_citation_regex = /\[([0-9]+)\]/g;

let [response, cite_map] = ProcessText(entry.content, entry.base_count);

// ----------------- create the ordered citation array -----------------

const citations = new Map<number, Citation>();
cite_map.forEach((value, key) => {
const index = key.charCodeAt(0) - 'a'.charCodeAt(0);
if (index >= entry.citations.length) {
console.log("invalid citation index: " + index);
} else {
citations.set(value, entry.citations[index]!);
}
});
import type { Citation, AssistantEntry as AssistantType} from "../types";

export const AssistantEntry: React.FC<{entry: AssistantType}> = ({entry}) => {
return (
<div className="mt-3 mb-8">
{ // split into paragraphs
response.split("\n").map(paragraph => ( <p> {
paragraph.split(in_text_citation_regex).map((text, i) => {
if (i % 2 === 0) {
return <GlossarySpan content={text.trim()} />;
}
i = parseInt(text) - 1;
if (!citations.has(i)) return `[${text}]`;
const citation = citations.get(i)!;
return (
<ShowInTextCitation citation={citation} i={i} />
);
})
} </p>))
}
<ul className="mt-5">
{ // show citations
Array.from(citations.entries()).map(([i, citation]) => (
<li key={i}>
<ShowCitation citation={citation} i={i} />
</li>
{ entry.content.split("\n").map(paragraph => (
<CitationsBlock
text={paragraph}
citations={entry.citationsMap}
textRenderer={(t) => (<GlossarySpan content={t}/>)}
/>
))
}
<ul className="mt-5">
{ // show citations
Array.from(entry.citationsMap.values()).map(citation => (
<li key={citation.index}>
<ShowCitation citation={citation} />
</li>
))
}
</ul>
</div>
);
Expand Down
73 changes: 43 additions & 30 deletions web/src/components/citations.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@ import type { Citation } from "../types";
import { Colours, A } from "./html";


// todo: memoize this if too slow.
export const ProcessText: (text: string, base_count: number) => [string, Map<string, number>] = (text, base_count) => {

export const formatCitations: (text: string) => string = (text) => {
// ---------------------- normalize citation form ----------------------
// the general plan here is just to add parsing cases until we can respond
// well to almost everything the LLM emits. We won't ever reach five nines,
Expand Down Expand Up @@ -41,33 +39,29 @@ export const ProcessText: (text: string, base_count: number) => [string, Map<str
/\[\s*([a-z]+)\s*\]/g,
(_match: string, x: string) => `[${x}]`
)
return response;
}

// -------------- map citations from strings into numbers --------------

export const findCitations: (text: string, citations: Citations[]) => Map<string, Citation> = (text, citations) => {
// figure out what citations are in the response, and map them appropriately
const cite_map = new Map<string, number>();
let cite_count = 0;
const cite_map = new Map<string, Citation>();

// scan a regex for [x] over the response. If x isn't in the map, add it.
// (note: we're actually doing this twice - once on parsing, once on render.
// if that looks like a problem, we could swap from strings to custom ropes).
const regex = /\[([a-z]+)\]/g;
Aprillion marked this conversation as resolved.
Show resolved Hide resolved
let match;
let response_copy = ""
while ((match = regex.exec(response)) !== null) {
if (!cite_map.has(match[1]!)) {
cite_map.set(match[1]!, base_count + cite_count++);
while ((match = regex.exec(text)) !== null) {
const letter = match[1];
const citation = citations[letter.charCodeAt(0) - 'a'.charCodeAt(0)]
if (!cite_map.has(letter!)) {
cite_map.set(letter!, citation);
}
// replace [x] with [i]
response_copy += response.slice(response_copy.length, match.index) + `[${cite_map.get(match[1]!)! + 1}]`;
}

response = response_copy + response.slice(response_copy.length);

return [response, cite_map]
return cite_map
}

export const ShowCitation: React.FC<{citation: Citation, i: number}> = ({citation, i}) => {
export const ShowCitation: React.FC<{citation: Citation}> = ({citation}) => {

var c_str = citation.title;

Expand All @@ -82,22 +76,41 @@ export const ShowCitation: React.FC<{citation: Citation, i: number}> = ({citatio
: `https://duckduckgo.com/?q=${encodeURIComponent(citation.title)}`;

return (
<A className={Colours[i % Colours.length] + " border-2 flex items-center rounded my-2 text-sm no-underline w-fit"}
<A className={Colours[(citation.index - 1) % Colours.length] + " border-2 flex items-center rounded my-2 text-sm no-underline w-fit"}
href={url}>
<span className="mx-1"> [{i + 1}] </span>
<span className="mx-1"> [{citation.index}] </span>
<p className="mx-1 my-0"> {c_str} </p>
</A>
);
};

export const ShowInTextCitation: React.FC<{citation: Citation, i: number}> = ({citation, i}) => {
const url = citation.url && citation.url !== ""
? citation.url
: `https://duckduckgo.com/?q=${encodeURIComponent(citation.title)}`;
return (
<A className={Colours[i % Colours.length] + " border-2 rounded text-sm no-underline w-min px-0.5 pb-0.5 ml-1 mr-0.5"}
href={url}>
[{i + 1}]
</A>
);
export const CitationRef: React.FC<{citation: Citation}> = ({citation}) => {
const url = citation.url && citation.url !== ""
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I keep wanting to add a linter, but worry that it'll mess up other PRs... :/

? citation.url
: `https://duckduckgo.com/?q=${encodeURIComponent(citation.title)}`;
return (
<A className={Colours[(citation.index - 1) % Colours.length] + " border-2 rounded text-sm no-underline w-min px-0.5 pb-0.5 ml-1 mr-0.5"}
href={url}>
[{citation.index}]
</A>
);
};


export const CitationsBlock: React.FC<{text: string, citations: Map<string, Citation>, textRenderer: (t: str) => any}> = ({text, citations, textRenderer}) => {
Aprillion marked this conversation as resolved.
Show resolved Hide resolved
const regex = /\[([a-z]+)\]/g;
return (
<p> {
text.split(regex).map((part, i) => {
// When splitting, the even parts are basic text sections, while the odd ones are
// citations
if (i % 2 == 0) {
return textRenderer(part)
} else {
return (<CitationRef citation={citations.get(part)} />)
}
})
}
</p>
)
}
8 changes: 4 additions & 4 deletions web/src/components/entry.tsx
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import type {
Entry as EntryType,
AssistantEntry,
AssistantEntry as AssistantEntryType,
ErrorMessage,
StampyMessage,
UserEntry,
} from "../types";
import { ShowAssistantEntry } from "./assistant";
import { AssistantEntry } from "./assistant";
import { GlossarySpan } from "./glossary";
import Image from "next/image";
import logo from "../logo.svg";
Expand All @@ -30,10 +30,10 @@ export const Error = ({ entry }: { entry: ErrorMessage }) => {
);
};

export const Assistant = ({ entry }: { entry: AssistantEntry }) => {
export const Assistant = ({ entry }: { entry: AssistantEntryType }) => {
return (
<li>
<ShowAssistantEntry entry={entry} />
<AssistantEntry entry={entry} />
</li>
);
};
Expand Down
20 changes: 10 additions & 10 deletions web/src/hooks/useSearch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import type {
Followup,
SearchResult,
} from "../types";
import { formatCitations, findCitations } from '../components/citations';

const MAX_FOLLOWUPS = 4;
const DATA_HEADER = "data: "
Expand Down Expand Up @@ -50,21 +51,22 @@ export async function* iterateData(res: Response) {

export const extractAnswer = async (
res: Response,
baseReferencesIndex: number,
setCurrent: (e: CurrentSearch) => void
): Promise<SearchResult> => {
var result: AssistantEntry = {
role: "assistant",
content: "",
citations: [],
base_count: baseReferencesIndex,
citationsMap: Map<string, Citation>,
};
var followups: Followup[] = [];
for await (var data of iterateData(res)) {
switch (data.state) {
case "loading":
// display loading phases, once citations are available toss them
// into the current item.
setCurrent({ phase: data.phase, ...result });
break;

case "citations":
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This adds a separate "citations" state for handling citations, instead of bundling them into the "loading" state.

result = {
...result,
citations: data?.citations || result?.citations || [],
Expand All @@ -74,11 +76,12 @@ export const extractAnswer = async (

case "streaming":
// incrementally build up the response
const content = formatCitations((result?.content || "") + data.content);
result = {
content,
role: "assistant",
content: (result?.content || "") + data.content,
citations: result?.citations || [],
base_count: result?.base_count || baseReferencesIndex,
citationsMap: findCitations(content, result?.citations || []),
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The LLM returns citations like "bla bla bla [a] ble ble [b][c]". These letters then need to be mapped to the appropriate citation numbers. `citationsMap` is a mapping of {<letter>: <full citation object>}. It's possible to change from letters to numbers, as the format is defined entirely in the LLM prompt.

};
setCurrent({ phase: "streaming", ...result });
break;
Expand Down Expand Up @@ -118,7 +121,6 @@ export const queryLLM = async (
query: string,
mode: string,
history: HistoryEntry[],
baseReferencesIndex: number,
setCurrent: (e?: CurrentSearch) => void,
sessionId: string
): Promise<SearchResult> => {
Expand All @@ -130,7 +132,7 @@ export const queryLLM = async (
}

try {
return await extractAnswer(res, baseReferencesIndex, setCurrent);
return await extractAnswer(res, setCurrent);
} catch (e) {
return {
result: { role: "error", content: e ? e.toString() : "unknown error" },
Expand Down Expand Up @@ -191,7 +193,6 @@ export const runSearch = async (
query: string,
query_source: "search" | "followups",
mode: string,
baseReferencesIndex: number,
entries: Entry[],
setCurrent: (c: CurrentSearch) => void,
sessionId: string
Expand All @@ -208,7 +209,6 @@ export const runSearch = async (
query,
mode,
history,
baseReferencesIndex,
setCurrent,
sessionId
);
Expand Down
Loading