From 55d1b9f4d17699bb93fded3c7fa5cb7f819c8e00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Philipp=20Gill=C3=A9?=
Date: Sat, 2 Mar 2024 14:26:59 +0100
Subject: [PATCH] Improve example

- Switch models
- Switch question
- Add warm-up
- Make use of persistency
- Improve output/logs
- Improve chat input
---
 example/main.go | 174 ++++++++++++++++++++++++------------------------
 1 file changed, 86 insertions(+), 88 deletions(-)

diff --git a/example/main.go b/example/main.go
index 0ccded8..21fd8e3 100644
--- a/example/main.go
+++ b/example/main.go
@@ -3,37 +3,42 @@ package main
 import (
 	"context"
 	"encoding/json"
-	"fmt"
 	"io"
 	"log"
 	"net/http"
 	"os"
 	"runtime"
 	"strconv"
+	"strings"
 
 	"github.com/philippgille/chromem-go"
 	"github.com/sashabaranov/go-openai"
 )
 
 const (
-	question = "Wich smooth jazz album received a Grammy nomination in 2009? I want to know the album name and artist."
+	question = "How many Albatros L 74 planes were produced?"
 	// We use a local LLM running in ollama: https://ollama.com/
 	ollamaBaseURL = "http://localhost:11434/v1"
 	// We use a very small model that doesn't need much resources and is fast, but
-	// doesn't have much knowledge: https://ollama.com/library/tinyllama
-	ollamaModel = "tinyllama:1.1b"
+	// doesn't have much knowledge: https://ollama.com/library/gemma
+	// We found Gemma 2B to be superior to TinyLlama (1.1B), Stable LM 2 (1.6B)
+	// and Phi-2 (2.7B) for the retrieval augmented generation (RAG) use case.
+	ollamaModel = "gemma:2b"
 )
 
 func main() {
 	ctx := context.Background()
 
-	// First we ask an LLM a fairly specific question that it won't know the answer
-	// to.
+	// Warm up ollama, in case the model isn't loaded yet
+	log.Println("Warming up ollama...")
+	_ = askLLM(ctx, nil, "Hello!")
+
+	// First we ask an LLM a fairly specific question that it likely won't know
+	// the answer to.
+	log.Println("Question: " + question)
 	log.Println("Asking LLM...")
-	reply := askLLM(ctx, "", question)
-	fmt.Printf("\nInitial reply from the LLM:\n" +
-		"===========================\n\n" +
-		reply + "\n\n")
+	reply := askLLM(ctx, nil, question)
+	log.Printf("Initial reply from the LLM: \"" + reply + "\"\n")
 
 	// Now we use our vector database for retrieval augmented generation (RAG),
 	// which means we provide the LLM with relevant knowledge.
@@ -45,48 +50,51 @@ func main() {
 	if err != nil {
 		panic(err)
 	}
-	// Create collection.
+	// Create collection if it wasn't loaded from persistent storage yet.
 	// We don't pass any embedding function, leading to the default being used (OpenAI
 	// text-embedding-3-small), which requires the OPENAI_API_KEY environment variable
 	// to be set.
-	collection, err := db.CreateCollection("Wikipedia", nil, nil)
-	if err != nil {
-		panic(err)
-	}
-	// Add docs to the collection.
-	// Here we use a DBpedia sample, where each line contains the lead section/introduction
-	// to some Wikipedia article and its category.
-	f, err := os.Open("dbpedia_sample.jsonl")
+	collection, err := db.GetOrCreateCollection("Wikipedia", nil, nil)
 	if err != nil {
 		panic(err)
 	}
-	d := json.NewDecoder(f)
-	var ids []string
-	var metadatas []map[string]string
-	var texts []string
-	log.Println("Reading JSON lines...")
-	// In this example we just read the first 20 lines, but in a real-world scenario
-	// you'd read the entire file.
-	for i := 0; i < 20; i++ {
-		var article struct {
-			Text     string `json:"text"`
-			Category string `json:"category"`
+	// Add docs to the collection, if the collection was just created (and not
+	// loaded from persistent storage).
+	if collection.Count() == 0 {
+		// Here we use a DBpedia sample, where each line contains the lead section/introduction
+		// to some Wikipedia article and its category.
+		f, err := os.Open("dbpedia_sample.jsonl")
+		if err != nil {
+			panic(err)
+		}
+		d := json.NewDecoder(f)
+		var ids []string
+		var metadatas []map[string]string
+		var texts []string
+		log.Println("Reading JSON lines...")
+		for i := 1; ; i++ {
+			var article struct {
+				Text     string `json:"text"`
+				Category string `json:"category"`
+			}
+			err := d.Decode(&article)
+			if err == io.EOF {
+				break // reached end of file
+			} else if err != nil {
+				panic(err)
+			}
+
+			ids = append(ids, strconv.Itoa(i))
+			metadatas = append(metadatas, map[string]string{"category": article.Category})
+			texts = append(texts, article.Text)
 		}
-		err := d.Decode(&article)
-		if err == io.EOF {
-			break // reached end of file
-		} else if err != nil {
+		log.Println("Adding documents to chromem-go, including creating their embeddings via OpenAI API...")
+		err = collection.AddConcurrently(ctx, ids, nil, metadatas, texts, runtime.NumCPU())
+		if err != nil {
 			panic(err)
 		}
-
-		ids = append(ids, strconv.Itoa(i))
-		metadatas = append(metadatas, map[string]string{"category": article.Category})
-		texts = append(texts, article.Text)
-	}
-	log.Println("Adding documents to chromem-go...")
-	err = collection.AddConcurrently(ctx, ids, nil, metadatas, texts, runtime.NumCPU())
-	if err != nil {
-		panic(err)
+	} else {
+		log.Println("Not reading JSON lines because collection was loaded from persistent storage.")
 	}
 
 	// Search for documents similar to the one we added just by passing the original
@@ -103,68 +111,58 @@ func main() {
 
 	// Now we can ask the LLM again, augmenting the question with the knowledge we retrieved.
 	// In this example we just use both retrieved documents as context.
-	context := docRes[0].Document + "\n\n" + docRes[1].Document
+	contexts := []string{docRes[0].Document, docRes[1].Document}
 	log.Println("Asking LLM with augmented question...")
-	reply = askLLM(ctx, context, question)
-	fmt.Printf("\nReply after augmenting the question with knowledge:\n" +
-		"===================================================\n\n" +
-		reply + "\n\n")
+	reply = askLLM(ctx, contexts, question)
+	log.Printf("Reply after augmenting the question with knowledge: \"" + reply + "\"\n")
 
 	/* Output (can differ slightly on each run):
-
-	2024/02/17 15:25:04 Asking LLM...
-
-	Initial reply from the LLM:
-	===========================
-
-	"The Album That Received A Grammy Nominated In 2009" or "A Smooth Jazz Album That Was Nominated For The Grammy Award In 2009".
-
-	2024/02/17 15:25:06 Setting up chromem-go...
-	2024/02/17 15:25:06 Reading JSON lines...
-	2024/02/17 15:25:06 Adding documents to chromem-go...
-	2024/02/17 15:25:08 Querying chromem-go...
-	2024/02/17 15:25:08 Asking LLM with augmented question...
-
-	Reply after augmenting the question with knowledge:
-	===================================================
-
-	"The Spice of Life" by Earl Klugh. The nomination was for Best Pop Instrumental Album at the 51st Grammy Awards in 2009.
+	2024/03/02 14:52:40 Warming up ollama...
+	2024/03/02 14:52:42 Question: How many Albatros L 74 planes were produced?
+	2024/03/02 14:52:42 Asking LLM...
+	2024/03/02 14:52:45 Initial reply from the LLM: "I am unable to provide a specific number for the number of Albatros L 74 planes produced, as I do not have access to real-time information or comprehensive records."
+	2024/03/02 14:52:45 Setting up chromem-go...
+	2024/03/02 14:52:45 Reading JSON lines...
+	2024/03/02 14:52:45 Adding documents to chromem-go, including creating their embeddings via OpenAI API...
+	2024/03/02 14:52:55 Querying chromem-go...
+	2024/03/02 14:52:55 Asking LLM with augmented question...
+	2024/03/02 14:53:01 Reply after augmenting the question with knowledge: "Answer: Only two Albatros L 74 planes were produced."
 	*/
 }
 
-func askLLM(ctx context.Context, context, question string) string {
+func askLLM(ctx context.Context, contexts []string, question string) string {
 	// We use a local LLM running in ollama, which has an OpenAI-compatible API.
-	// openAIClient := openai.NewClient(os.Getenv("OPENAI_API_KEY"))
 	openAIClient := openai.NewClientWithConfig(openai.ClientConfig{
 		BaseURL:    ollamaBaseURL,
 		HTTPClient: http.DefaultClient,
 	})
 
-	res, err := openAIClient.CreateChatCompletion(ctx, openai.ChatCompletionRequest{
-		// Model: openai.GPT3Dot5Turbo,
-		Model:    ollamaModel,
-		Messages: []openai.ChatCompletionMessage{
-			{
-				Role:    openai.ChatMessageRoleSystem,
-				Content: "You are a helpful assistant. You answer the user's questions. Combine your knowledge with the context that the user might provide, as it's likely relevant to the user's question. If you are not sure, say that you don't know the answer.",
-			},
-			{
-				Role:    openai.ChatMessageRoleUser,
-				Content: "Context: " + context,
-			},
-			{
-				Role:    openai.ChatMessageRoleUser,
-				Content: "Question: " + question,
-			},
-			{
-				Role:    openai.ChatMessageRoleAssistant,
-				Content: "Based on your provided context and question, I think the answer is:",
-			},
+	messages := []openai.ChatCompletionMessage{
+		{
+			Role:    openai.ChatMessageRoleSystem,
+			Content: "You are a helpful assistant. You answer the user's questions in a concise manner. If you are not sure, say that you don't know the answer. If the user provides contexts, use them to answer their question.",
 		},
+	}
+	// Add contexts in reverse order, as many LLMs prioritize the latest message
+	// or rather forget about older ones (despite fitting into the LLM context).
+	for i := len(contexts) - 1; i >= 0; i-- {
+		messages = append(messages, openai.ChatCompletionMessage{
+			Role:    openai.ChatMessageRoleUser,
+			Content: "Context:" + contexts[i],
+		})
+	}
+	messages = append(messages, openai.ChatCompletionMessage{
+		Role:    openai.ChatMessageRoleUser,
+		Content: "Question: " + question,
	})
+	res, err := openAIClient.CreateChatCompletion(ctx, openai.ChatCompletionRequest{
+		Model:    ollamaModel,
+		Messages: messages,
 	})
 	if err != nil {
 		panic(err)
 	}
 	reply := res.Choices[0].Message.Content
+	reply = strings.TrimSpace(reply)
 	return reply
 }
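Note on the persistence setup this patch relies on: the chromem-go DB construction sits right above the second hunk and is therefore not visible in the diff. The sketch below shows how that surrounding setup can look so that GetOrCreateCollection and Count() behave as the patch expects. It is not part of the patch; the "./db" path and the NewPersistentDB call (including its signature) are assumptions for illustration, so check the chromem-go version in use for the exact API.

// Sketch only, not part of the patch. The "./db" path and the
// NewPersistentDB call/signature are assumptions for illustration.
package main

import (
	"log"

	"github.com/philippgille/chromem-go"
)

func setupCollection() *chromem.Collection {
	log.Println("Setting up chromem-go...")
	// A persistent DB writes embeddings to disk, so a later run can load
	// them instead of re-creating them via the OpenAI embeddings API.
	db, err := chromem.NewPersistentDB("./db", false) // assumed signature
	if err != nil {
		panic(err)
	}
	// Same call as in the patch: returns the persisted collection if it
	// exists, otherwise creates it with the default embedding function
	// (OpenAI text-embedding-3-small, which requires OPENAI_API_KEY).
	collection, err := db.GetOrCreateCollection("Wikipedia", nil, nil)
	if err != nil {
		panic(err)
	}
	return collection
}

With a setup along these lines, only the first run takes the collection.Count() == 0 branch and embeds the DBpedia sample; later runs skip the "Reading JSON lines..." step and log "Not reading JSON lines because collection was loaded from persistent storage." instead.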