From 968bf17b203e729f2a0dddd241939fa8868af5d0 Mon Sep 17 00:00:00 2001
From: reddyv
Date: Wed, 17 Jul 2024 15:37:16 -0400
Subject: [PATCH 1/3] update readme

---
 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3b2a5fa6..84706a4e 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,12 @@
 # Needle In A Haystack - Pressure Testing LLMs
 
+This repository is a fork of Greg Kamradt's code (https://twitter.com/GregKamradt). Its purpose is to add support for testing Google Gemini models.
+
+Original Repository: https://github.com/gkamradt/LLMTest_NeedleInAHaystack
+
 A simple 'needle in a haystack' analysis to test in-context retrieval ability of long context LLMs.
 
-Supported model providers: OpenAI, Anthropic, Cohere
+Supported model providers: **Google**, OpenAI, Anthropic, Cohere
 
 Get the behind the scenes on the [overview video](https://youtu.be/KwRRuiCCdmc).
 

From ad4124eaab807adc235cc124a0ecd259b201f905 Mon Sep 17 00:00:00 2001
From: reddyv
Date: Thu, 18 Jul 2024 09:57:41 -0400
Subject: [PATCH 2/3] add support for google gemini

---
 README.md                             |  17 ++-
 needlehaystack/evaluators/__init__.py |   1 +
 needlehaystack/evaluators/google.py   |  66 ++++++++++
 needlehaystack/providers/__init__.py  |   1 +
 needlehaystack/providers/google.py    | 171 ++++++++++++++++++++++++++
 needlehaystack/run.py                 |  10 +-
 requirements.txt                      |  15 ++-
 7 files changed, 268 insertions(+), 13 deletions(-)
 create mode 100644 needlehaystack/evaluators/google.py
 create mode 100644 needlehaystack/providers/google.py

diff --git a/README.md b/README.md
index 84706a4e..660835c0 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,16 @@
 # Needle In A Haystack - Pressure Testing LLMs
 
-This repository is a fork of Greg Kamradt's code (https://twitter.com/GregKamradt). Its purpose is to add support for testing Google Gemini models.
+This repository is a fork of Greg Kamradt's code (https://twitter.com/GregKamradt).
 
 Original Repository: https://github.com/gkamradt/LLMTest_NeedleInAHaystack
 
+This fork adds support for using **Google Gemini** models as the provider and/or evaluator.
+
+```zsh
+needlehaystack.run_test --provider google --evaluator google --model_name "gemini-1.5-pro" --document_depth_percents "[50]" --context_lengths "[200]"
+```
+
+### Original readme below
 A simple 'needle in a haystack' analysis to test in-context retrieval ability of long context LLMs.
 
 Supported model providers: **Google**, OpenAI, Anthropic, Cohere
@@ -36,7 +43,7 @@ source venv/bin/activate
 ### Environment Variables
 
 - `NIAH_MODEL_API_KEY` - API key for interacting with the model. Depending on the provider, this gets used appropriately with the correct sdk.
-- `NIAH_EVALUATOR_API_KEY` - API key to use if `openai` evaluation strategy is used.
+- `NIAH_EVALUATOR_API_KEY` - API key to use if `openai` or `google` evaluation strategy is used.
 
 ### Install Package
 
@@ -50,10 +57,10 @@ pip install needlehaystack
 
 Start using the package by calling the entry point `needlehaystack.run_test` from command line.
 
-You can then run the analysis on OpenAI, Anthropic, or Cohere models with the following command line arguments:
+You can then run the analysis on Google Gemini, OpenAI, Anthropic, or Cohere models with the following command line arguments:
 
-- `provider` - The provider of the model, available options are `openai`, `anthropic`, and `cohere`. Defaults to `openai`
-- `evaluator` - The evaluator, which can either be a `model` or `LangSmith`. See more on `LangSmith` below. If using a `model`, only `openai` is currently supported. Defaults to `openai`.
+- `provider` - The provider of the model, available options are `openai`, `anthropic`, `google`, and `cohere`. Defaults to `openai`
+- `evaluator` - The evaluator, which can be `openai`, `google`, or `LangSmith`. See more on `LangSmith` below. Defaults to `openai`.
 - `model_name` - Model name of the language model accessible by the provider. Defaults to `gpt-3.5-turbo-0125`
 - `evaluator_model_name` - Model name of the language model accessible by the evaluator. Defaults to `gpt-3.5-turbo-0125`
 

diff --git a/needlehaystack/evaluators/__init__.py b/needlehaystack/evaluators/__init__.py
index a9712d9f..259a9876 100644
--- a/needlehaystack/evaluators/__init__.py
+++ b/needlehaystack/evaluators/__init__.py
@@ -1,3 +1,4 @@
 from .evaluator import Evaluator
 from .openai import OpenAIEvaluator
 from .langsmith import LangSmithEvaluator
+from .google import GoogleEvaluator

diff --git a/needlehaystack/evaluators/google.py b/needlehaystack/evaluators/google.py
new file mode 100644
index 00000000..e5151aff
--- /dev/null
+++ b/needlehaystack/evaluators/google.py
@@ -0,0 +1,66 @@
+import os
+
+from .evaluator import Evaluator
+
+from langchain.evaluation import load_evaluator
+from langchain_google_genai import ChatGoogleGenerativeAI
+
+class GoogleEvaluator(Evaluator):
+    DEFAULT_MODEL_KWARGS: dict = dict(temperature=0)
+    CRITERIA = {"accuracy": """
+        Score 1: The answer is completely unrelated to the reference.
+        Score 3: The answer has minor relevance but does not align with the reference.
+        Score 5: The answer has moderate relevance but contains inaccuracies.
+        Score 7: The answer aligns with the reference but has minor omissions.
+        Score 10: The answer is completely accurate and aligns perfectly with the reference.
+        Only respond with a numerical score"""}
+
+    def __init__(self,
+                 model_name: str = "gemini-1.5-pro",
+                 model_kwargs: dict = DEFAULT_MODEL_KWARGS,
+                 true_answer: str = None,
+                 question_asked: str = None):
+        """
+        :param model_name: The name of the model.
+        :param model_kwargs: Model configuration. Default is {temperature: 0}
+        :param true_answer: The true answer to the question asked.
+        :param question_asked: The question asked to the model.
+        """
+
+        if not true_answer or not question_asked:
+            raise ValueError("true_answer and question_asked must be supplied at init.")
+
+        self.model_name = model_name
+        self.model_kwargs = model_kwargs
+        self.true_answer = true_answer
+        self.question_asked = question_asked
+
+        api_key = os.getenv('NIAH_EVALUATOR_API_KEY')
+        if not api_key:
+            raise ValueError("NIAH_EVALUATOR_API_KEY must be set in the environment to use the Google evaluator.")
+
+        self.api_key = api_key
+
+        self.evaluator = ChatGoogleGenerativeAI(model=self.model_name,
+                                                google_api_key=self.api_key,
+                                                **self.model_kwargs)
+
+    def evaluate_response(self, response: str) -> int:
+        evaluator = load_evaluator(
+            "labeled_score_string",
+            criteria=self.CRITERIA,
+            llm=self.evaluator,
+        )
+
+        eval_result = evaluator.evaluate_strings(
+            # The model's response
+            prediction=response,
+
+            # The actual answer
+            reference=self.true_answer,
+
+            # The question asked
+            input=self.question_asked,
+        )
+
+        return int(eval_result['score'])

diff --git a/needlehaystack/providers/__init__.py b/needlehaystack/providers/__init__.py
index 4e514513..7b4b4aba 100644
--- a/needlehaystack/providers/__init__.py
+++ b/needlehaystack/providers/__init__.py
@@ -1,4 +1,5 @@
 from .anthropic import Anthropic
 from .cohere import Cohere
+from .google import Google
 from .model import ModelProvider
 from .openai import OpenAI

diff --git a/needlehaystack/providers/google.py b/needlehaystack/providers/google.py
new file mode 100644
index 00000000..7747de45
--- /dev/null
+++ b/needlehaystack/providers/google.py
@@ -0,0 +1,171 @@
+import os
+from operator import itemgetter
+import pkg_resources
+import requests
+from typing import Optional
+
+import google.generativeai as genai
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain.prompts import PromptTemplate
+import sentencepiece
+
+from .model import ModelProvider
+
+
+class Google(ModelProvider):
+    """
+    A wrapper class for interacting with Google's Gemini API, providing methods to encode text, generate prompts,
+    evaluate models, and create LangChain runnables for language model interactions.
+
+    Attributes:
+        model_name (str): The name of the Google model to use for evaluations and interactions.
+        model: An instance of the Google Gemini client for API calls.
+        tokenizer: A tokenizer instance for encoding and decoding text to and from token representations.
+    """
+
+    DEFAULT_MODEL_KWARGS: dict = dict(max_output_tokens=300,
+                                      temperature=0)
+    VOCAB_FILE_URL = "https://raw.githubusercontent.com/google/gemma_pytorch/33b652c465537c6158f9a472ea5700e5e770ad3f/tokenizer/tokenizer.model"
+
+    def __init__(self,
+                 model_name: str = "gemini-1.5-pro",
+                 model_kwargs: dict = DEFAULT_MODEL_KWARGS,
+                 vocab_file_url: str = VOCAB_FILE_URL):
+        """
+        Initializes the Google model provider with a specific model.
+
+        Args:
+            model_name (str): The name of the Google model to use. Defaults to 'gemini-1.5-pro'.
+            model_kwargs (dict): Model configuration. Defaults to {max_output_tokens: 300, temperature: 0}.
+            vocab_file_url (str): SentencePiece model file that defines the tokenization vocabulary. Defaults to the Gemma
+                tokenizer https://github.com/google/gemma_pytorch/blob/main/tokenizer/tokenizer.model
+
+        Raises:
+            ValueError: If NIAH_MODEL_API_KEY is not found in the environment.
+        """
+        api_key = os.getenv('NIAH_MODEL_API_KEY')
+        if not api_key:
+            raise ValueError("NIAH_MODEL_API_KEY must be set in the environment.")
+
+        self.model_name = model_name
+        self.model_kwargs = model_kwargs
+        self.api_key = api_key
+        genai.configure(api_key=self.api_key)
+        self.model = genai.GenerativeModel(self.model_name)
+
+        local_vocab_file = 'tokenizer.model'
+        if not os.path.exists(local_vocab_file):
+            response = requests.get(vocab_file_url)  # Download tokenizer vocab file (4MB)
+            response.raise_for_status()
+
+            with open(local_vocab_file, 'wb') as f:
+                for chunk in response.iter_content():
+                    f.write(chunk)
+        self.tokenizer = sentencepiece.SentencePieceProcessor(local_vocab_file)
+
+        resource_path = pkg_resources.resource_filename('needlehaystack', 'providers/Anthropic_prompt.txt')
+
+        # Load the prompt structure for the model.
+        # The Anthropic prompt template is reused here; replace the file to adapt it for Gemini.
+        with open(resource_path, 'r') as file:
+            self.prompt_structure = file.read()
+
+    async def evaluate_model(self, prompt: str) -> str:
+        """
+        Evaluates a given prompt using the Google model and retrieves the model's response.
+
+        Args:
+            prompt (str): The prompt to send to the model.
+
+        Returns:
+            str: The content of the model's response to the prompt.
+        """
+        response = await self.model.generate_content_async(
+            prompt,
+            generation_config=self.model_kwargs
+        )
+
+        return response.text
+
+    def generate_prompt(self, context: str, retrieval_question: str) -> str:
+        """
+        Generates a structured prompt for querying the model, based on a given context and retrieval question.
+
+        Args:
+            context (str): The context or background information relevant to the question.
+            retrieval_question (str): The specific question to be answered by the model.
+
+        Returns:
+            str: The text prompt
+        """
+        return self.prompt_structure.format(
+            retrieval_question=retrieval_question,
+            context=context)
+
+    def encode_text_to_tokens(self, text: str) -> list[int]:
+        """
+        Encodes a given text string to a sequence of tokens using the model's tokenizer.
+
+        Args:
+            text (str): The text to encode.
+
+        Returns:
+            list[int]: A list of token IDs representing the encoded text.
+        """
+        return self.tokenizer.encode(text)
+
+    def decode_tokens(self, tokens: list[int], context_length: Optional[int] = None) -> str:
+        """
+        Decodes a sequence of tokens back into a text string using the model's tokenizer.
+
+        Args:
+            tokens (list[int]): The sequence of token IDs to decode.
+            context_length (Optional[int], optional): An optional length specifying the number of tokens to decode. If not provided, decodes all tokens.
+
+        Returns:
+            str: The decoded text string.
+        """
+        return self.tokenizer.decode(tokens[:context_length])
+
+    def get_langchain_runnable(self, context: str):
+        """
+        Creates a LangChain runnable that constructs a prompt based on a given context and a question,
+        queries the Google model, and returns the model's response. This method leverages the LangChain
+        library to build a sequence of operations: extracting input variables, generating a prompt,
+        querying the model, and processing the response.
+
+        Args:
+            context (str): The context or background information relevant to the user's question.
+                This context is provided to the model to aid in generating relevant and accurate responses.
+
+        Returns:
+            A LangChain runnable object that can be executed to obtain the model's response to a
+            dynamically provided question. The runnable encapsulates the entire process from prompt
+            generation to response retrieval.
+
+        Example:
+            To use the runnable:
+                - Define the context and question.
+                - Execute the runnable with these parameters to get the model's response.
+        """
+
+        template = """You are a helpful AI bot that answers questions for a user. Keep your response short and direct. \n
+        \n ------- \n
+        {context}
+        \n ------- \n
+        Here is the user question: \n --- --- --- \n {question} \n Don't give information outside the document or repeat your findings."""
+
+        prompt = PromptTemplate(
+            template=template,
+            input_variables=["context", "question"],
+        )
+        # Create a LangChain runnable
+        model = ChatGoogleGenerativeAI(temperature=0, model=self.model_name,
+                                       google_api_key=self.api_key)
+        chain = ({"context": lambda x: context,
+                  "question": itemgetter("question")}
+                 | prompt
+                 | model
+                 )
+        return chain

diff --git a/needlehaystack/run.py b/needlehaystack/run.py
index 8edbccba..f6a67354 100644
--- a/needlehaystack/run.py
+++ b/needlehaystack/run.py
@@ -5,8 +5,8 @@
 from jsonargparse import CLI
 
 from . import LLMNeedleHaystackTester, LLMMultiNeedleHaystackTester
-from .evaluators import Evaluator, LangSmithEvaluator, OpenAIEvaluator
-from .providers import Anthropic, ModelProvider, OpenAI, Cohere
+from .evaluators import Evaluator, LangSmithEvaluator, OpenAIEvaluator, GoogleEvaluator
+from .providers import Anthropic, ModelProvider, OpenAI, Cohere, Google
 
 load_dotenv()
 
@@ -65,6 +65,8 @@ def get_model_to_test(args: CommandArgs) -> ModelProvider:
             return Anthropic(model_name=args.model_name)
         case "cohere":
             return Cohere(model_name=args.model_name)
+        case "google":
+            return Google(model_name=args.model_name)
         case _:
             raise ValueError(f"Invalid provider: {args.provider}")
 
@@ -86,6 +88,10 @@ def get_evaluator(args: CommandArgs) -> Evaluator:
             return OpenAIEvaluator(model_name=args.evaluator_model_name,
                                    question_asked=args.retrieval_question,
                                    true_answer=args.needle)
+        case "google":
+            return GoogleEvaluator(model_name=args.evaluator_model_name,
+                                   question_asked=args.retrieval_question,
+                                   true_answer=args.needle)
         case "langsmith":
             return LangSmithEvaluator()
         case _:

diff --git a/requirements.txt b/requirements.txt
index 610cb52a..75788d35 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,6 +12,7 @@ distro==1.8.0
 filelock==3.13.1
 frozenlist==1.4.0
 fsspec==2023.10.0
+google-generativeai==0.7.2
 h11==0.14.0
 httpcore==1.0.2
 httpx==0.25.2
@@ -20,13 +21,14 @@ idna==3.6
 jsonargparse==4.27.5
 jsonpatch==1.33
 jsonpointer==2.4
-langchain==0.1.9
-langchain-community>=0.0.24
-langchain-core>=0.1.26
+langchain==0.2.9
+langchain-anthropic==0.1.13
+langchain-cohere==0.1.5
+langchain-community==0.2.7
+langchain-core==0.2.21
+langchain-google-genai==1.0.7
+langchain-openai==0.1.7
 langsmith>=0.1.8
-langchain_openai
-langchain_anthropic
-langchain_cohere
 marshmallow==3.20.1
 multidict==6.0.4
 mypy-extensions==1.0.0
@@ -50,3 +52,4 @@ typing_extensions==4.8.0
 urllib3==2.1.0
 yarl==1.9.3
 pytest==8.1.1
+sentencepiece==0.2.0

From de5dc4f8036928717c56a748ea92917864ea9037 Mon Sep 17 00:00:00 2001
From: reddyv
Date: Fri, 19 Jul 2024 10:57:46 -0400
Subject: [PATCH 3/3] update runtime arguments

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 660835c0..ff2d586e 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ Original Repository: https://github.com/gkamradt/LLMTest_NeedleInAHaystack
 This fork adds support for using **Google Gemini** models as the provider and/or evaluator.
 
 ```zsh
-needlehaystack.run_test --provider google --evaluator google --model_name "gemini-1.5-pro" --document_depth_percents "[50]" --context_lengths "[200]"
+needlehaystack.run_test --provider google --evaluator google --model_name "gemini-1.5-pro" --evaluator_model_name "gemini-1.5-pro" --document_depth_percents "[50]" --context_lengths "[200]"
 ```
 
 ### Original readme below
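
---

For anyone driving the fork from Python rather than the CLI, the sketch below wires the new `Google` provider and `GoogleEvaluator` together directly. It is a minimal sketch, not part of the patch series: it assumes the upstream `LLMNeedleHaystackTester` accepts the `model_to_test`, `evaluator`, `needle`, `retrieval_question`, `context_lengths`, and `document_depth_percents` keywords and exposes a `start_test()` method (as `needlehaystack/run.py` suggests when it builds these objects and forwards its arguments to the tester), and that both API-key environment variables are set. The needle/question pair is illustrative.

```python
import os

from needlehaystack import LLMNeedleHaystackTester
from needlehaystack.evaluators import GoogleEvaluator
from needlehaystack.providers import Google

# Both constructors read their keys from the environment, so fail fast if unset.
for var in ("NIAH_MODEL_API_KEY", "NIAH_EVALUATOR_API_KEY"):
    if not os.getenv(var):
        raise SystemExit(f"{var} must be set")

# Illustrative needle/question pair; any needle with a matching question works.
needle = "The best thing to do in San Francisco is eat a sandwich in Dolores Park."
question = "What is the best thing to do in San Francisco?"

# Gemini as the model under test, and Gemini again as the judge that
# scores the model's answers against the needle (see GoogleEvaluator).
model = Google(model_name="gemini-1.5-pro")
evaluator = GoogleEvaluator(model_name="gemini-1.5-pro",
                            question_asked=question,
                            true_answer=needle)

# Mirrors the CLI flags --context_lengths "[200]" and --document_depth_percents "[50]".
tester = LLMNeedleHaystackTester(model_to_test=model,
                                 evaluator=evaluator,
                                 needle=needle,
                                 retrieval_question=question,
                                 context_lengths=[200],
                                 document_depth_percents=[50])
tester.start_test()
```

This mirrors what `needlehaystack.run_test --provider google --evaluator google ...` does internally: `get_model_to_test` and `get_evaluator` in `run.py` construct the same two objects before handing them to the tester.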