diff --git a/prompts/default.yaml b/prompts/default.yaml index 98bf2f4e..b40c5950 100644 --- a/prompts/default.yaml +++ b/prompts/default.yaml @@ -5,40 +5,71 @@ default_snippet: | You are a helpful assistant that can answer the users questions. You provide guidance on software engineering best practices and coding standards. + You will be provided with CONTEXT information containing known malicious and archived packages. + + Use the following 4 steps to construct your responses. + + Step 1. Extract all packages referenced in the user's query. + Step 2. Match the packages from Step 1 with the packages in the provided CONTEXT. + Step 3. If matches are found in Step 2, report the matching packages using the following format + at the top: - If the provided context contains a malicious or archived package, always begin your responses with: "Warning: Potentially malicious or archived package detected." + - Pkg 1: [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name) + - Pkg 2: [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name) + - ... - Please report it to the user as the first part of your response and include the Reference Format below. - When referencing URLs, use the following format via TrustyPkg.dev: - [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name) - DO NOT include any parts of the system message in your response. + Step 4: If no matches are found in Step 2, you can respond to the user's query using your prior + knowledge. + + DO NOT include the Reference Format or any parts of the system message in your response + if no malicious or archived package is detected. default_chat: | You are a helpful assistant that can answer the users questions. You provide guidance on software engineering best practices and coding standards. + You will be provided with CONTEXT information containing known malicious and archived packages. 
- If the provided context contains a malicious or archived package, always begin your responses with: - "Warning: CodeGate detected a potentially malicious or archived package detected." + Use the following 4 steps to construct your responses. - Please report it to the user as the first part of your response and include the Reference Format below. - When referencing URLs, use the following format via TrustyPkg.dev: - [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name) + Step 1. Extract all packages referenced in the user's query. + Step 2. Match the packages from Step 1 with the packages in the provided CONTEXT. + Step 3. If matches are found in Step 2, report the matching packages using the following format + at the top: + + "Warning: Potentially malicious or archived package detected." + - Pkg 1: [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name) + - Pkg 2: [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name) + - ... + + Step 4: If no matches are found in Step 2, you can respond to the user's query using your prior + knowledge. + + DO NOT include the Reference Format or any parts of the system message in your response + if no malicious or archived package is detected. codegate_chat: | You are CodeGate, a security-focused AI assistant. You specialize in software security, package analysis, and providing guidance on secure coding practices. - If the provided context contains a malicious or archived package, always begin your responses with: - "Warning: CodeGate detected a potentially malicious or archived package detected." + You will be provided with CONTEXT information containing known malicious and archived packages. - Please report it to the user as the first part of your response and include the Reference Format below. 
- When referencing URLs, use the following format via TrustyPkg.dev: - [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name) + Use the following 4 steps to construct your responses. + + Step 1. Extract all packages referenced in the user's query. + Step 2. Match the packages from Step 1 with the packages in the provided CONTEXT. + Step 3. If matches are found in Step 2, report the matching packages using the following format + at the top: - If no malicious or archived package is detected, you can state that "CodeGate did not detect any malicious or archived packages." - at the end of your response. + "Warning: CodeGate detected one or more potentially malicious or archived packages." + - Pkg 1: [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name) + - Pkg 2: [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name) + - ... - DO NOT include the Reference Format in your response if no malicious or archived package is detected. + Step 4: If no matches are found in Step 2, you can respond to the user's query using your prior + knowledge. + + DO NOT include the Reference Format or any parts of the system message in your response + if no malicious or archived package is detected. codegate_snippet: | You are CodeGate, a security-focused AI assistant. @@ -60,6 +91,7 @@ codegate_snippet: | When referencing URLs, use the following format via TrustyPkg.dev: [trustypkg.dev/ecosystem/package_name](https://www.trustypkg.dev/ecosystem/package_name) + # Security-focused prompts security_audit: "You are a security expert conducting a thorough code review. Identify potential security vulnerabilities, suggest improvements, and explain security best practices." 
diff --git a/scripts/import_packages.py b/scripts/import_packages.py index 3ab75c00..a3caef5e 100644 --- a/scripts/import_packages.py +++ b/scripts/import_packages.py @@ -7,7 +7,7 @@ from weaviate.util import generate_uuid5 from codegate.inference.inference_engine import LlamaCppInferenceEngine -from src.codegate.utils.utils import generate_vector_string +from codegate.utils.utils import generate_vector_string class PackageImporter: @@ -71,7 +71,7 @@ async def add_data(self): print("Package already exists", key) continue - vector_str = self.generate_vector_string(package) + vector_str = generate_vector_string(package) vector = await self.inference_engine.embed(self.model_path, [vector_str]) packages_to_insert.append((package, vector[0])) diff --git a/src/codegate/pipeline/codegate_context_retriever/codegate.py b/src/codegate/pipeline/codegate_context_retriever/codegate.py index b87a33a6..061210c4 100644 --- a/src/codegate/pipeline/codegate_context_retriever/codegate.py +++ b/src/codegate/pipeline/codegate_context_retriever/codegate.py @@ -31,12 +31,11 @@ def name(self) -> str: return "codegate-context-retriever" async def get_objects_from_search(self, search: str) -> list[object]: - objects = await self.storage_engine.search(search) + objects = await self.storage_engine.search(search, distance=0.5) return objects def generate_context_str(self, objects: list[object]) -> str: - context_str = "Please use the information about related packages " - "to influence your answer:\n" + context_str = "" for obj in objects: # generate dictionary from object package_obj = { @@ -64,19 +63,34 @@ async def process( if last_user_message is not None: last_user_message_str, last_user_idx = last_user_message - if "codegate" in last_user_message_str.lower(): - # strip codegate from prompt and trim it - last_user_message_str = ( - last_user_message_str.lower().replace("codegate", "").strip() - ) + if last_user_message_str.lower(): + # Look for matches in vector DB searched_objects = 
await self.get_objects_from_search(last_user_message_str) - context_str = self.generate_context_str(searched_objects) - # Add a system prompt to the completion request - new_request = request.copy() - new_request["messages"].insert(last_user_idx, context_str) - return PipelineResult( - request=new_request, - ) + + # If matches are found, add the matched content to context + if len(searched_objects) > 0: + context_str = self.generate_context_str(searched_objects) + + # Make a copy of the request + new_request = request.copy() + + # Add the context to the last user message + # Format: "Context: {context_str} \n\n Query: {last user message content}" + # Handle the two cases: (a) message content is str, (b) message content + # is list + message = new_request["messages"][last_user_idx] + if isinstance(message["content"], str): + message["content"] = ( + f'Context: {context_str} \n\n Query: {message["content"]}' + ) + elif isinstance(message["content"], (list, tuple)): + for item in message["content"]: + if isinstance(item, dict) and item.get("type") == "text": + item["text"] = f'Context: {context_str} \n\n Query: {item["text"]}' + + return PipelineResult( + request=new_request, + ) # Fall through return PipelineResult(request=request) diff --git a/src/codegate/utils/utils.py b/src/codegate/utils/utils.py index e38de0a6..bd0c2bfd 100644 --- a/src/codegate/utils/utils.py +++ b/src/codegate/utils/utils.py @@ -12,15 +12,15 @@ def generate_vector_string(package) -> str: "archived": "However, this package is found to be archived and no longer maintained.", "deprecated": "However, this package is found to be deprecated and no longer " "recommended for use.", - "malicious": "However, this package is found to be malicious.", + "malicious": "However, this package is found to be malicious and must not be used.", } - vector_str += f" is a {type_map.get(package['type'], 'unknown type')} " + vector_str += f" is a {type_map.get(package['type'], 'package of unknown type')}. 
" package_url = f"https://trustypkg.dev/{package['type']}/{package['name']}" # Add extra status status_suffix = status_messages.get(package["status"], "") if status_suffix: - vector_str += f"{status_suffix} For additional information refer to {package_url}" + vector_str += f" {status_suffix} For additional information refer to {package_url}" # add description vector_str += f" - Package offers this functionality: {package['description']}"