Skip to content

Commit

Permalink
Merge pull request #109 from StampyAI/same-citation-numbers
Browse files Browse the repository at this point in the history
Same citation numbers
  • Loading branch information
mruwnik committed Oct 1, 2023
2 parents bd8da81 + 748daca commit 40b518d
Show file tree
Hide file tree
Showing 10 changed files with 149 additions and 125 deletions.
2 changes: 1 addition & 1 deletion api/src/stampy_chat/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def talk_to_robot_internal(index, query: str, mode: str, history: Prompt, sessio
top_k_blocks = get_top_k_blocks(index, query, k)

yield {
"state": "loading", "phase": "semantic",
"state": "citations",
"citations": [
{'title': block.title, 'author': block.authors, 'date': block.date, 'url': block.url}
for block in top_k_blocks
Expand Down
3 changes: 3 additions & 0 deletions api/src/stampy_chat/followups.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def search_authored(query: str):


def get_followups(query, timeout=30):
    """Fetch follow-up question suggestions for `query` from the Stampy NLP search API.

    Args:
        query: the text to search for. A blank or whitespace-only query
            short-circuits to an empty list without making a request.
        timeout: seconds to wait for the search service before `requests`
            raises a timeout error. Without one, a stalled service would
            block the caller indefinitely.

    Returns:
        A list of `Followup` objects built from the `title`, `pageid` and
        `score` fields of each entry in the JSON response.
    """
    if not query.strip():
        return []

    url = 'https://nlp.stampy.ai/api/search?query=' + quote(query)
    response = requests.get(url, timeout=timeout).json()
    return [Followup(entry['title'], entry['pageid'], entry['score']) for entry in response]
Expand Down
4 changes: 2 additions & 2 deletions api/tests/stampy_chat/test_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ def test_talk_to_robot_internal(history, context):
with patch('openai.ChatCompletion.create', return_value=chunks):
assert list(talk_to_robot_internal("index", "what is this about?", "default", history, 'session id')) == [
{'phase': 'semantic', 'state': 'loading'},
{'citations': [], 'phase': 'semantic', 'state': 'loading'},
{'citations': [], 'state': 'citations'},
{'phase': 'prompt', 'state': 'loading'},
{'phase': 'llm', 'state': 'loading'},
{'content': 'response 1', 'state': 'streaming'},
Expand Down Expand Up @@ -300,7 +300,7 @@ def test_talk_to_robot_internal_error(history, context):
with patch('openai.ChatCompletion.create', return_value=chunks):
assert list(talk_to_robot_internal("index", "what is this about?", "default", history, 'session id')) == [
{'phase': 'semantic', 'state': 'loading'},
{'citations': [], 'phase': 'semantic', 'state': 'loading'},
{'citations': [], 'state': 'citations'},
{'phase': 'prompt', 'state': 'loading'},
{'phase': 'llm', 'state': 'loading'},
{'content': 'response 1', 'state': 'streaming'},
Expand Down
58 changes: 18 additions & 40 deletions web/src/components/assistant.tsx
Original file line number Diff line number Diff line change
@@ -1,49 +1,27 @@
import { ProcessText, ShowCitation, ShowInTextCitation } from "./citations";
import { useState } from "react";
import { ShowCitation, CitationsBlock } from "./citations";
import { GlossarySpan } from "./glossary";
import type { Citation, AssistantEntry } from "../types";

export const ShowAssistantEntry: React.FC<{entry: AssistantEntry}> = ({entry}) => {
const in_text_citation_regex = /\[([0-9]+)\]/g;

let [response, cite_map] = ProcessText(entry.content, entry.base_count);

// ----------------- create the ordered citation array -----------------

const citations = new Map<number, Citation>();
cite_map.forEach((value, key) => {
const index = key.charCodeAt(0) - 'a'.charCodeAt(0);
if (index >= entry.citations.length) {
console.log("invalid citation index: " + index);
} else {
citations.set(value, entry.citations[index]!);
}
});
import type { Citation, AssistantEntry as AssistantType} from "../types";

export const AssistantEntry: React.FC<{entry: AssistantType}> = ({entry}) => {
return (
<div className="mt-3 mb-8">
{ // split into paragraphs
response.split("\n").map(paragraph => ( <p> {
paragraph.split(in_text_citation_regex).map((text, i) => {
if (i % 2 === 0) {
return <GlossarySpan content={text.trim()} />;
}
i = parseInt(text) - 1;
if (!citations.has(i)) return `[${text}]`;
const citation = citations.get(i)!;
return (
<ShowInTextCitation citation={citation} i={i} />
);
})
} </p>))
}
<ul className="mt-5">
{ // show citations
Array.from(citations.entries()).map(([i, citation]) => (
<li key={i}>
<ShowCitation citation={citation} i={i} />
</li>
{ entry.content.split("\n").map(paragraph => (
<CitationsBlock
text={paragraph}
citations={entry.citationsMap}
textRenderer={(t) => (<GlossarySpan content={t}/>)}
/>
))
}
<ul className="mt-5">
{ // show citations
Array.from(entry.citationsMap.values()).map(citation => (
<li key={citation.index}>
<ShowCitation citation={citation} />
</li>
))
}
</ul>
</div>
);
Expand Down
73 changes: 43 additions & 30 deletions web/src/components/citations.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@ import type { Citation } from "../types";
import { Colours, A } from "./html";


// todo: memoize this if too slow.
export const ProcessText: (text: string, base_count: number) => [string, Map<string, number>] = (text, base_count) => {

export const formatCitations: (text: string) => string = (text) => {
// ---------------------- normalize citation form ----------------------
// the general plan here is just to add parsing cases until we can respond
// well to almost everything the LLM emits. We won't ever reach five nines,
Expand Down Expand Up @@ -41,33 +39,29 @@ export const ProcessText: (text: string, base_count: number) => [string, Map<str
/\[\s*([a-z]+)\s*\]/g,
(_match: string, x: string) => `[${x}]`
)
return response;
}

// -------------- map citations from strings into numbers --------------

export const findCitations: (text: string, citations: Citations[]) => Map<string, Citation> = (text, citations) => {
// figure out what citations are in the response, and map them appropriately
const cite_map = new Map<string, number>();
let cite_count = 0;
const cite_map = new Map<string, Citation>();

// scan a regex for [x] over the response. If x isn't in the map, add it.
// (note: we're actually doing this twice - once on parsing, once on render.
// if that looks like a problem, we could swap from strings to custom ropes).
const regex = /\[([a-z]+)\]/g;
let match;
let response_copy = ""
while ((match = regex.exec(response)) !== null) {
if (!cite_map.has(match[1]!)) {
cite_map.set(match[1]!, base_count + cite_count++);
while ((match = regex.exec(text)) !== null) {
const letter = match[1];
const citation = citations[letter.charCodeAt(0) - 'a'.charCodeAt(0)]
if (!cite_map.has(letter!)) {
cite_map.set(letter!, citation);
}
// replace [x] with [i]
response_copy += response.slice(response_copy.length, match.index) + `[${cite_map.get(match[1]!)! + 1}]`;
}

response = response_copy + response.slice(response_copy.length);

return [response, cite_map]
return cite_map
}

export const ShowCitation: React.FC<{citation: Citation, i: number}> = ({citation, i}) => {
export const ShowCitation: React.FC<{citation: Citation}> = ({citation}) => {

var c_str = citation.title;

Expand All @@ -82,22 +76,41 @@ export const ShowCitation: React.FC<{citation: Citation, i: number}> = ({citatio
: `https://duckduckgo.com/?q=${encodeURIComponent(citation.title)}`;

return (
<A className={Colours[i % Colours.length] + " border-2 flex items-center rounded my-2 text-sm no-underline w-fit"}
<A className={Colours[(citation.index - 1) % Colours.length] + " border-2 flex items-center rounded my-2 text-sm no-underline w-fit"}
href={url}>
<span className="mx-1"> [{i + 1}] </span>
<span className="mx-1"> [{citation.index}] </span>
<p className="mx-1 my-0"> {c_str} </p>
</A>
);
};

export const ShowInTextCitation: React.FC<{citation: Citation, i: number}> = ({citation, i}) => {
const url = citation.url && citation.url !== ""
? citation.url
: `https://duckduckgo.com/?q=${encodeURIComponent(citation.title)}`;
return (
<A className={Colours[i % Colours.length] + " border-2 rounded text-sm no-underline w-min px-0.5 pb-0.5 ml-1 mr-0.5"}
href={url}>
[{i + 1}]
</A>
);
/**
 * Inline citation marker: a small coloured link that displays `[index]`.
 *
 * Links to the citation's own URL, or to a DuckDuckGo search for its title
 * when the URL is missing or empty. The colour class is taken from `Colours`
 * at position `index - 1`, modulo the palette length.
 */
export const CitationRef: React.FC<{citation: Citation}> = ({citation}) => {
  const {index, title, url} = citation;
  // A missing or empty url falls back to a search for the title.
  const href = url || `https://duckduckgo.com/?q=${encodeURIComponent(title)}`;
  const colour = Colours[(index - 1) % Colours.length];
  const classes = colour + " border-2 rounded text-sm no-underline w-min px-0.5 pb-0.5 ml-1 mr-0.5";

  return (
    <A className={classes} href={href}>
      [{index}]
    </A>
  );
};


/**
 * Renders one paragraph of text, replacing in-text `[x]` markers (one or more
 * lowercase letters in square brackets) with citation links.
 *
 * @param text - the paragraph to render
 * @param citations - lookup from the marker's letters to the citation it refers to
 * @param textRenderer - renders the plain-text stretches between markers
 */
export const CitationsBlock: React.FC<{text: string, citations: Map<string, Citation>, textRenderer: (t: string) => React.ReactNode}> = ({text, citations, textRenderer}) => {
  const regex = /\[([a-z]+)\]/g;
  return (
    <p> {
      text.split(regex).map((part, i) => {
        // Splitting on a regex with a single capture group interleaves the
        // results: even indices are plain text, odd indices are the captured
        // citation keys.
        if (i % 2 === 0) {
          return textRenderer(part)
        }
        const citation = citations.get(part);
        // The marker may not correspond to a known citation (e.g. "[sic]", or
        // a letter past the end of the citations list). Show it literally
        // rather than handing `undefined` to CitationRef, which would throw.
        if (!citation) {
          return `[${part}]`
        }
        return (<CitationRef citation={citation} />)
      })
    }
    </p>
  )
}
8 changes: 4 additions & 4 deletions web/src/components/entry.tsx
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import type {
Entry as EntryType,
AssistantEntry,
AssistantEntry as AssistantEntryType,
ErrorMessage,
StampyMessage,
UserEntry,
} from "../types";
import { ShowAssistantEntry } from "./assistant";
import { AssistantEntry } from "./assistant";
import { GlossarySpan } from "./glossary";
import Image from "next/image";
import logo from "../logo.svg";
Expand All @@ -30,10 +30,10 @@ export const Error = ({ entry }: { entry: ErrorMessage }) => {
);
};

export const Assistant = ({ entry }: { entry: AssistantEntry }) => {
export const Assistant = ({ entry }: { entry: AssistantEntryType }) => {
return (
<li>
<ShowAssistantEntry entry={entry} />
<AssistantEntry entry={entry} />
</li>
);
};
Expand Down
20 changes: 10 additions & 10 deletions web/src/hooks/useSearch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import type {
Followup,
SearchResult,
} from "../types";
import { formatCitations, findCitations } from '../components/citations';

const MAX_FOLLOWUPS = 4;
const DATA_HEADER = "data: "
Expand Down Expand Up @@ -50,21 +51,22 @@ export async function* iterateData(res: Response) {

export const extractAnswer = async (
res: Response,
baseReferencesIndex: number,
setCurrent: (e: CurrentSearch) => void
): Promise<SearchResult> => {
var result: AssistantEntry = {
role: "assistant",
content: "",
citations: [],
base_count: baseReferencesIndex,
citationsMap: Map<string, Citation>,
};
var followups: Followup[] = [];
for await (var data of iterateData(res)) {
switch (data.state) {
case "loading":
// display loading phases, once citations are available toss them
// into the current item.
setCurrent({ phase: data.phase, ...result });
break;

case "citations":
result = {
...result,
citations: data?.citations || result?.citations || [],
Expand All @@ -74,11 +76,12 @@ export const extractAnswer = async (

case "streaming":
// incrementally build up the response
const content = formatCitations((result?.content || "") + data.content);
result = {
content,
role: "assistant",
content: (result?.content || "") + data.content,
citations: result?.citations || [],
base_count: result?.base_count || baseReferencesIndex,
citationsMap: findCitations(content, result?.citations || []),
};
setCurrent({ phase: "streaming", ...result });
break;
Expand Down Expand Up @@ -118,7 +121,6 @@ export const queryLLM = async (
query: string,
mode: string,
history: HistoryEntry[],
baseReferencesIndex: number,
setCurrent: (e?: CurrentSearch) => void,
sessionId: string
): Promise<SearchResult> => {
Expand All @@ -130,7 +132,7 @@ export const queryLLM = async (
}

try {
return await extractAnswer(res, baseReferencesIndex, setCurrent);
return await extractAnswer(res, setCurrent);
} catch (e) {
return {
result: { role: "error", content: e ? e.toString() : "unknown error" },
Expand Down Expand Up @@ -191,7 +193,6 @@ export const runSearch = async (
query: string,
query_source: "search" | "followups",
mode: string,
baseReferencesIndex: number,
entries: Entry[],
setCurrent: (c: CurrentSearch) => void,
sessionId: string
Expand All @@ -208,7 +209,6 @@ export const runSearch = async (
query,
mode,
history,
baseReferencesIndex,
setCurrent,
sessionId
);
Expand Down
Loading

0 comments on commit 40b518d

Please sign in to comment.