Add open-source model support & random multi-needle insertion #42

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
59 changes: 57 additions & 2 deletions README.md
@@ -46,9 +46,9 @@ pip install needlehaystack

Start using the package by calling the entry point `needlehaystack.run_test` from the command line.

You can then run the analysis on OpenAI or Anthropic models with the following command line arguments:
You can then run the analysis on OpenAI, Anthropic, or HuggingFace models with the following command line arguments:

- `provider` - The provider of the model, available options are `openai` and `anthropic`. Defaults to `openai`
- `provider` - The provider of the model, available options are `openai`, `anthropic`, and `huggingface`. Defaults to `openai`
- `evaluator` - The evaluator, which can either be a `model` or `LangSmith`. See more on `LangSmith` below. If using a `model`, only `openai` is currently supported. Defaults to `openai`.
- `model_name` - Model name of the language model accessible by the provider. Defaults to `gpt-3.5-turbo-0125`
- `evaluator_model_name` - Model name of the language model accessible by the evaluator. Defaults to `gpt-3.5-turbo-0125`
@@ -69,6 +69,12 @@
The following command runs the test for the Anthropic model `claude-2.1` for a single context length of 2000 and a single document depth of 50%.

```zsh
needlehaystack.run_test --provider anthropic --model_name "claude-2.1" --document_depth_percents "[50]" --context_lengths "[2000]"
```

The following command runs the test for the HuggingFace model `mistralai/Mistral-7B-Instruct-v0.2` for a single context length of 2000 and a single document depth of 50%.

```zsh
needlehaystack.run_test --provider huggingface --model_name "mistralai/Mistral-7B-Instruct-v0.2" --document_depth_percents "[50]" --context_lengths "[2000]"
```

### For Contributors

1. Fork and clone the repository.
@@ -109,6 +115,7 @@ The package `needlehaystack` is available for import in your test cases. Develop
`LLMMultiNeedleHaystackTester` parameters:

- `multi_needle` - True or False, whether to run multi-needle
- `multi_needle_type` - The type of needle insertion method
- `needles` - List of needles to insert in the context

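A hypothetical sketch of how these parameters fit together when importing the package directly — the `OpenAI` constructor argument and the base-class requirements are assumptions inferred from the diff below, not documented behavior:

```python
from needlehaystack.providers import OpenAI
from needlehaystack.llm_multi_needle_haystack_tester import LLMMultiNeedleHaystackTester

# Assumed constructor argument; see the provider classes for the exact signature.
model = OpenAI(model_name="gpt-3.5-turbo-0125")

tester = LLMMultiNeedleHaystackTester(
    model_to_test=model,
    evaluator=None,                        # supply an Evaluator in practice
    needles=["needle one", "needle two"],
    multi_needle_type="random",            # "depth_percent" or "random"
    # The base LLMNeedleHaystackTester also requires these:
    needle="needle one",
    retrieval_question="What are the needles?",
)
```
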
Other Parameters:
@@ -127,10 +134,20 @@

<img src="img/Claude_2_1_testing.png" alt="GPT-4-128 Context Testing" width="800"/>

## MistralAI's Mistral-7B-Instruct-v0.2 (Run 03/22/2024)

<img src="img/Mistral_Instruct_testing.png" alt="Mistral-Instruct Context Testing" width="800">

## Multi Needle Evaluator

To enable multi-needle insertion into our context, use `--multi_needle True`.
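For example, a hypothetical invocation — this assumes `multi_needle_type` and `needles` are exposed as command-line flags in the same way as the other parameters:

```zsh
needlehaystack.run_test --multi_needle True --multi_needle_type random --needles '["needle one", "needle two"]' --document_depth_percents "[50]" --context_lengths "[2000]"
```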

There are two ways to insert the needles into the haystack:
- [depth_percent](#depth_percent)
- [random](#random)

### Depth_percent

This inserts the first needle at the specified `depth_percent`, then evenly distributes subsequent needles through the remaining context after this depth.

For even spacing, it calculates the `depth_percent_interval` as:
Expand Down Expand Up @@ -158,6 +175,44 @@ Needle 9: 40 + 8 * 6 = 88
Needle 10: 40 + 9 * 6 = 94
```
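
As a runnable sketch of the same calculation (values taken from the example above):

```python
# Even spacing: first needle at depth_percent, remaining needles spread
# over the rest of the context (values from the example above).
depth_percent = 40
num_needles = 10
depth_percent_interval = (100 - depth_percent) / num_needles  # (100 - 40) / 10 = 6.0

depths = [depth_percent + i * depth_percent_interval for i in range(num_needles)]
print(depths)  # [40.0, 46.0, 52.0, ..., 88.0, 94.0]
```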

### Random

This insertion method aims to preserve randomness while still distributing the needles evenly across the context. It works as follows:

1. Divide the context into as many equal-sized depth ranges as there are needles.
2. Insert each needle at a random position within its assigned range.

The insertion range for each needle is calculated as follows:

```
insertion_range = []
num_needles = len(self.needles)
range_interval = int(100 / num_needles)

for i in range(num_needles):
    insertion_range.append(range(i * range_interval, (i + 1) * range_interval))
```

With 10 needles, the first needle is placed at a random depth in the range [0, 10), the second in [10, 20), and so on.

The following example shows the insertion ranges for 10 needles.

```
num_needles = 10
range_interval = 100 / 10 = 10

Needle 1: [0, 10) -> random
Needle 2: [10, 20) -> random
Needle 3: [20, 30) -> random
Needle 4: [30, 40) -> random
Needle 5: [40, 50) -> random
Needle 6: [50, 60) -> random
Needle 7: [60, 70) -> random
Needle 8: [70, 80) -> random
Needle 9: [80, 90) -> random
Needle 10: [90, 100) -> random
```
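
The same calculation as a self-contained sketch, mirroring the `random.randrange` call used in the implementation below:

```python
import random

num_needles = 10
range_interval = int(100 / num_needles)  # 10

# Draw one random depth percent per needle, each from its own bucket.
insertion_percentages = [
    random.randrange(i * range_interval, (i + 1) * range_interval)
    for i in range(num_needles)
]
print(insertion_percentages)  # e.g. [3, 17, 24, 38, 41, 55, 68, 79, 83, 96]
```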

## LangSmith Evaluator

You can use LangSmith to orchestrate evals and store results.
Binary file added img/Mistral_Instruct_testing.png
78 changes: 61 additions & 17 deletions needlehaystack/llm_multi_needle_haystack_tester.py
@@ -3,6 +3,7 @@
import json
import os
import time
import random
from asyncio import Semaphore
from datetime import datetime, timezone

@@ -26,20 +27,27 @@ class LLMMultiNeedleHaystackTester(LLMNeedleHaystackTester):
"""
def __init__(self, *args,
needles=[],
provider: str = None,
model_to_test: ModelProvider = None,
evaluator: Evaluator = None,
print_ongoing_status = True,
eval_set = "multi-needle-eval-sf",
multi_needle_type = "depth_percent",
**kwargs):

if multi_needle_type not in ["depth_percent", "random"]:
raise ValueError("Invalid multi_needle_type")

super().__init__(*args, model_to_test=model_to_test, **kwargs)
self.provider = provider
self.needles = needles
self.evaluator = evaluator
self.model_to_test = model_to_test
self.eval_set = eval_set
self.model_name = self.model_to_test.model_name
self.print_ongoing_status = print_ongoing_status
self.insertion_percentages = []
self.type = multi_needle_type

async def insert_needles(self, context, depth_percent, context_length):
"""
@@ -65,33 +73,60 @@ async def insert_needles(self, context, depth_percent, context_length):
Returns:
str: The new context with needles inserted.
"""
tokens_context = self.model_to_test.encode_text_to_tokens(context)
if self.provider == "huggingface":
tokens_context = self.model_to_test.encode_text_to_tokens(context, no_bos=True)
else:
tokens_context = self.model_to_test.encode_text_to_tokens(context)
context_length -= self.final_context_length_buffer

# Calculate the total length of all needles in tokens
total_needles_length = sum(len(self.model_to_test.encode_text_to_tokens(needle)) for needle in self.needles)

if self.provider == "huggingface":
total_needles_length = sum(len(self.model_to_test.encode_text_to_tokens(needle, no_bos=True)) for needle in self.needles)
else:
total_needles_length = sum(len(self.model_to_test.encode_text_to_tokens(needle)) for needle in self.needles)

# Ensure context length accounts for needles
if len(tokens_context) + total_needles_length > context_length:
tokens_context = tokens_context[:context_length - total_needles_length]

# To evenly distribute the needles, we calculate the intervals they need to be inserted.
depth_percent_interval = (100 - depth_percent) / len(self.needles)

# Reset the insertion percentages list for the current context
self.insertion_percentages = []
# Insert the needles with 'depth_percent' method
if self.type == "depth_percent":
# To evenly distribute the needles, we calculate the intervals they need to be inserted.
depth_percent_interval = (100 - depth_percent) / len(self.needles)

# Reset the insertion percentages list for the current context
self.insertion_percentages = []

# Insert the needles with 'random' method
elif self.type == "random":
# Reset the insertion percentages list for the random insertion points
self.insertion_percentages = []

# To randomly and evenly distribute the needles, we calculate the insertion range and range intervals.
num_needles = len(self.needles)
range_interval = int(100 / num_needles)

# Generate random insertion points for each needle based on the range interval.
for i in range(num_needles):
self.insertion_percentages.append(random.randrange(i * range_interval, (i + 1) * range_interval))

# Insert needles at calculated points
for needle in self.needles:
for i in range(len(self.needles)):

tokens_needle = self.model_to_test.encode_text_to_tokens(needle)
if self.provider == "huggingface":
tokens_needle = self.model_to_test.encode_text_to_tokens(self.needles[i], no_bos=True)
else:
tokens_needle = self.model_to_test.encode_text_to_tokens(self.needles[i])

if depth_percent == 100:
# If your depth percent is 100 (which means your needle is the last thing in the doc), throw it at the end
tokens_context = tokens_context + tokens_needle
else:
# Go get the position (in terms of tokens) to insert your needle
insertion_point = int(len(tokens_context) * (depth_percent / 100))
if self.type == "random":
insertion_point = int(len(tokens_context) * (self.insertion_percentages[i] / 100))
elif self.type == "depth_percent":
insertion_point = int(len(tokens_context) * (depth_percent / 100))

# tokens_new_context represents the tokens before the needle
tokens_new_context = tokens_context[:insertion_point]
@@ -109,11 +144,16 @@ async def insert_needles(self, context, depth_percent, context_length):

# Log
insertion_percentage = (insertion_point / len(tokens_context)) * 100
self.insertion_percentages.append(insertion_percentage)
print(f"Inserted '{needle}' at {insertion_percentage:.2f}% of the context, total length now: {len(tokens_context)} tokens")

# Adjust depth for next needle
depth_percent += depth_percent_interval
if self.type == "random":
self.insertion_percentages[i] = insertion_percentage

if self.type == "depth_percent":
self.insertion_percentages.append(insertion_percentage)
# Adjust depth for next needle
depth_percent += depth_percent_interval

print(f"Inserted '{self.needles[i]}' at {insertion_percentage:.2f}% of the context, total length now: {len(tokens_context)} tokens")

new_context = self.model_to_test.decode_tokens(tokens_context)
return new_context
@@ -129,7 +169,11 @@ def encode_and_trim(self, context, context_length):
Returns:
str: The encoded and trimmed context.
"""
tokens = self.model_to_test.encode_text_to_tokens(context)
if self.provider == "huggingface":
tokens = self.model_to_test.encode_text_to_tokens(context, no_bos=True)
else:
tokens = self.model_to_test.encode_text_to_tokens(context)

if len(tokens) > context_length:
context = self.model_to_test.decode_tokens(tokens, context_length)
return context
@@ -211,7 +255,7 @@ async def evaluate_and_log(self, context_length, depth_percent):
print (f"Score: {score}")
print (f"Response: {response}\n")

context_file_location = f'{self.model_name.replace(".", "_")}_len_{context_length}_depth_{int(depth_percent*100)}'
context_file_location = f'{self.model_name.split("/")[-1].replace(".", "_")}_len_{context_length}_depth_{int(depth_percent*100)}'

if self.save_contexts:
results['file_name'] = context_file_location
17 changes: 13 additions & 4 deletions needlehaystack/llm_needle_haystack_tester.py
@@ -18,6 +18,7 @@ class LLMNeedleHaystackTester:
"""
def __init__(self,
model_to_test: ModelProvider = None,
provider: str = None,
evaluator: Evaluator = None,
needle = None,
haystack_dir = "PaulGrahamEssays",
@@ -68,6 +69,7 @@ def __init__(self,
if not needle or not haystack_dir or not retrieval_question:
raise ValueError("Needle, haystack, and retrieval_question must be provided.")

self.provider = provider
self.needle = needle
self.haystack_dir = haystack_dir
self.retrieval_question = retrieval_question
@@ -182,7 +184,7 @@ async def evaluate_and_log(self, context_length, depth_percent):
print (f"Score: {score}")
print (f"Response: {response}\n")

context_file_location = f'{self.model_name.replace(".", "_")}_len_{context_length}_depth_{int(depth_percent*100)}'
context_file_location = f'{self.model_name.split("/")[-1].replace(".", "_")}_len_{context_length}_depth_{int(depth_percent*100)}'

if self.save_contexts:
results['file_name'] = context_file_location
@@ -242,8 +244,12 @@ async def generate_context(self, context_length, depth_percent):
return context

def insert_needle(self, context, depth_percent, context_length):
tokens_needle = self.model_to_test.encode_text_to_tokens(self.needle)
tokens_context = self.model_to_test.encode_text_to_tokens(context)
if self.provider == "huggingface":
tokens_needle = self.model_to_test.encode_text_to_tokens(self.needle, no_bos=True)
tokens_context = self.model_to_test.encode_text_to_tokens(context, no_bos=True)
else:
tokens_needle = self.model_to_test.encode_text_to_tokens(self.needle)
tokens_context = self.model_to_test.encode_text_to_tokens(context)

# Reducing the context length by 150 buffer. This is to account for system message, the user question, and response.
context_length -= self.final_context_length_buffer
@@ -279,7 +285,10 @@ def insert_needle(self, context, depth_percent, context_length):
return new_context

def get_context_length_in_tokens(self, context):
return len(self.model_to_test.encode_text_to_tokens(context))
if self.provider == "huggingface":
return len(self.model_to_test.encode_text_to_tokens(context, no_bos=True))
else:
return len(self.model_to_test.encode_text_to_tokens(context))

def read_context_files(self):
context = ""
3 changes: 2 additions & 1 deletion needlehaystack/providers/__init__.py
@@ -1,3 +1,4 @@
from .anthropic import Anthropic
from .model import ModelProvider
from .openai import OpenAI
from .openai import OpenAI
from .huggingface import HuggingFace