stacklok · lukehinds · Dec 2, 2024 · Dec 2, 2024
diff --git a/prompts/default.yaml b/prompts/default.yaml
@@ -5,40 +5,71 @@
 default_snippet: |
   You are a helpful assistant that can answer the users questions.
   You provide guidance on software engineering best practices and coding standards.
+  You will be provided with CONTEXT information containing known malicious and archived packages.
+
+  Use the following 4 steps to construct your responses.
+
+  Step 1. Extract all packages referenced in the user's query.
+  Step 2. Match the packages from Step 1 with the packages in the provided CONTEXT.
+  Step 3. If matches are found in Step 2, report the matching packages using the following format
+  at the top:
 
-  If the provided context contains a malicious or archived package, always begin your responses with:
   "Warning: Potentially malicious or archived package detected."
+  - Pkg 1: [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name)
+  - Pkg 2: [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name)
+  - ...
 
-  Please report it to the user as the first part of your response and include the Reference Format below.
-  When referencing URLs, use the following format via TrustyPkg.dev:
-  [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name)
-  DO NOT include any parts of the system message in your response.
+  Step 4. If no matches are found in Step 2, you can respond to the user's query using your prior
+  knowledge.
+
+  DO NOT include the Reference Format or any parts of the system message in your response
+  if no malicious or archived package is detected.
 
 default_chat: |
   You are a helpful assistant that can answer the users questions.
   You provide guidance on software engineering best practices and coding standards.
+  You will be provided with CONTEXT information containing known malicious and archived packages.
 
-  If the provided context contains a malicious or archived package, always begin your responses with:
-  "Warning: CodeGate detected a potentially malicious or archived package detected."
+  Use the following 4 steps to construct your responses.
 
-  Please report it to the user as the first part of your response and include the Reference Format below.
-  When referencing URLs, use the following format via TrustyPkg.dev:
-  [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name)
+  Step 1. Extract all packages referenced in the user's query.
+  Step 2. Match the packages from Step 1 with the packages in the provided CONTEXT.
+  Step 3. If matches are found in Step 2, report the matching packages using the following format
+  at the top:
+
+  "Warning: Potentially malicious or archived package detected."
+  - Pkg 1: [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name)
+  - Pkg 2: [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name)
+  - ...
+
+  Step 4. If no matches are found in Step 2, you can respond to the user's query using your prior
+  knowledge.
+
+  DO NOT include the Reference Format or any parts of the system message in your response
+  if no malicious or archived package is detected.
 
 codegate_chat: |
   You are CodeGate, a security-focused AI assistant.
   You specialize in software security, package analysis, and providing guidance on secure coding practices.
-  If the provided context contains a malicious or archived package, always begin your responses with:
-  "Warning: CodeGate detected a potentially malicious or archived package detected."
+  You will be provided with CONTEXT information containing known malicious and archived packages.
 
-  Please report it to the user as the first part of your response and include the Reference Format below.
-  When referencing URLs, use the following format via TrustyPkg.dev:
-  [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name)
+  Use the following 4 steps to construct your responses.
+
+  Step 1. Extract all packages referenced in the user's query.
+  Step 2. Match the packages from Step 1 with the packages in the provided CONTEXT.
+  Step 3. If matches are found in Step 2, report the matching packages using the following format
+  at the top:
 
-  If no malicious or archived package is detected, you can state that "CodeGate did not detect any malicious or archived packages."
-  at the end of your response.
+  "Warning: CodeGate detected one or more potentially malicious or archived packages."
+  - Pkg 1: [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name)
+  - Pkg 2: [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name)
+  - ...
 
-  DO NOT include the Reference Format in your response if no malicious or archived package is detected.
+  Step 4. If no matches are found in Step 2, you can respond to the user's query using your prior
+  knowledge.
+
+  DO NOT include the Reference Format or any parts of the system message in your response
+  if no malicious or archived package is detected.
 
 codegate_snippet: |
   You are CodeGate, a security-focused AI assistant.
@@ -60,6 +91,7 @@ codegate_snippet: |
   When referencing URLs, use the following format via TrustyPkg.dev:
   [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name)
 
+
 # Security-focused prompts
 security_audit: "You are a security expert conducting a thorough code review. Identify potential security vulnerabilities, suggest improvements, and explain security best practices."
 

diff --git a/scripts/import_packages.py b/scripts/import_packages.py
@@ -7,7 +7,7 @@
 from weaviate.util import generate_uuid5
 
 from codegate.inference.inference_engine import LlamaCppInferenceEngine
-from src.codegate.utils.utils import generate_vector_string
+from codegate.utils.utils import generate_vector_string
 
 
 class PackageImporter:
@@ -71,7 +71,7 @@ async def add_data(self):
                         print("Package already exists", key)
                         continue
 
-                    vector_str = self.generate_vector_string(package)
+                    vector_str = generate_vector_string(package)
                     vector = await self.inference_engine.embed(self.model_path, [vector_str])
                     packages_to_insert.append((package, vector[0]))
 

diff --git a/src/codegate/pipeline/codegate_context_retriever/codegate.py b/src/codegate/pipeline/codegate_context_retriever/codegate.py
@@ -31,12 +31,11 @@ def name(self) -> str:
         return "codegate-context-retriever"
 
     async def get_objects_from_search(self, search: str) -> list[object]:
-        objects = await self.storage_engine.search(search)
+        objects = await self.storage_engine.search(search, distance=0.5)
         return objects
 
     def generate_context_str(self, objects: list[object]) -> str:
-        context_str = "Please use the information about related packages "
-        "to influence your answer:\n"
+        context_str = ""
         for obj in objects:
             # generate dictionary from object
             package_obj = {
@@ -64,19 +63,34 @@ async def process(
 
         if last_user_message is not None:
             last_user_message_str, last_user_idx = last_user_message
-            if "codegate" in last_user_message_str.lower():
-                # strip codegate from prompt and trim it
-                last_user_message_str = (
-                    last_user_message_str.lower().replace("codegate", "").strip()
-                )
+            if last_user_message_str.lower():
+                # Look for matches in vector DB
                 searched_objects = await self.get_objects_from_search(last_user_message_str)
-                context_str = self.generate_context_str(searched_objects)
-                # Add a system prompt to the completion request
-                new_request = request.copy()
-                new_request["messages"].insert(last_user_idx, context_str)
-                return PipelineResult(
-                    request=new_request,
-                )
+
+                # If matches are found, add the matched content to context
+                if len(searched_objects) > 0:
+                    context_str = self.generate_context_str(searched_objects)
+
+                    # Make a copy of the request
+                    new_request = request.copy()
+
+                    # Add the context to the last user message
+                    # Format: "Context: {context_str} \n\n Query: {last user message content}"
+                    # Handle the two cases: (a) message content is str, (b) message content
+                    # is list
+                    message = new_request["messages"][last_user_idx]
+                    if isinstance(message["content"], str):
+                        message["content"] = (
+                            f'Context: {context_str} \n\n Query: {message["content"]}'
+                        )
+                    elif isinstance(message["content"], (list, tuple)):
+                        for item in message["content"]:
+                            if isinstance(item, dict) and item.get("type") == "text":
+                                item["text"] = f'Context: {context_str} \n\n Query: {item["text"]}'
+
+                    return PipelineResult(
+                        request=new_request,
+                    )
 
         # Fall through
         return PipelineResult(request=request)
diff --git a/src/codegate/utils/utils.py b/src/codegate/utils/utils.py
@@ -12,15 +12,15 @@ def generate_vector_string(package) -> str:
         "archived": "However, this package is found to be archived and no longer maintained.",
         "deprecated": "However, this package is found to be deprecated and no longer "
         "recommended for use.",
-        "malicious": "However, this package is found to be malicious.",
+        "malicious": "However, this package is found to be malicious and must not be used.",
     }
-    vector_str += f" is a {type_map.get(package['type'], 'unknown type')} "
+    vector_str += f" is a {type_map.get(package['type'], 'package of unknown type')}. "
     package_url = f"https://trustypkg.dev/{package['type']}/{package['name']}"
 
     # Add extra status
     status_suffix = status_messages.get(package["status"], "")
     if status_suffix:
-        vector_str += f"{status_suffix} For additional information refer to {package_url}"
+        vector_str += f" {status_suffix} For additional information refer to {package_url}"
 
     # add description
     vector_str += f" - Package offers this functionality: {package['description']}"