feat: add local LLM support with llama.cpp (#8)
fynnfluegge authored Sep 13, 2023
1 parent a28ecaf commit 9032c35
Showing 16 changed files with 224 additions and 132 deletions.
56 changes: 46 additions & 10 deletions README.md
@@ -10,7 +10,11 @@

<div align="center">

Focus on writing your code, let AI write the documentation for you. With just a few keystrokes in your terminal.
Focus on writing your code, let AI write the documentation for you.

With just a few keystrokes in your terminal, using the OpenAI API or 100% local LLMs without any data leaks.

Powered by [langchain](https://github.com/langchain-ai/langchain), [llama.cpp](https://github.com/ggerganov/llama.cpp) and [treesitter](https://github.com/tree-sitter/tree-sitter).

![ezgif-4-53d6e634af](https://github.com/fynnfluegge/doc-comments.ai/assets/16321871/8f2756cb-36f9-43c6-94b1-658b89b49786)

@@ -22,15 +26,32 @@ Focus on writing your code, let AI write the documentation for you. With just a
- e.g. Javadoc, JSDoc, Docstring, Rustdoc
- ✍️ Create inline documentation comments in method bodies
- 🌳 Treesitter integration
- 💻 Local LLM support

> [!NOTE]
> Documentation will only be added to files without unstaged changes, so that nothing is overwritten.
## 🚀 Usage
- `aicomments <RELATIVE_FILE_PATH>`: Create documentation for any method in the file that doesn't have one yet.
- `aicomments <RELATIVE_FILE_PATH> --inline`: Also create documentation comments in the method body.
- `aicomments <RELATIVE_FILE_PATH> --gpt4`: Use the GPT-4 model (default is GPT-3.5).
- `aicomments <RELATIVE_FILE_PATH> --guided`: Guided mode, confirm documentation generation for each method.
Create documentation for any method in the file with the GPT-3.5 Turbo model:
```
aicomments <RELATIVE_FILE_PATH>
```
Also create documentation comments in the method body:
```
aicomments <RELATIVE_FILE_PATH> --inline
```
Use the GPT-4 model (default is GPT-3.5):
```
aicomments <RELATIVE_FILE_PATH> --gpt4
```
Guided mode, confirm documentation generation for each method:
```
aicomments <RELATIVE_FILE_PATH> --guided
```
Use a local LLM on your machine:
```
aicomments <RELATIVE_FILE_PATH> --local --model_path <RELATIVE_MODEL_PATH>
```
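For example, assuming a quantized model (e.g. in GGUF format) has already been downloaded into a local `models` folder (the file and model names below are only illustrative):
```
aicomments src/example.py --local --model_path models/llama-2-7b-chat.Q4_K_M.gguf
```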

## ⚙️ Supported Languages
- [x] Python
@@ -49,8 +70,8 @@ Focus on writing your code, let AI write the documentation for you.
- Python >= 3.9

## 🔧 Installation

Create your personal OpenAI Api key and add it as `$OPENAI_API_KEY` to your environment with:
### 1. OpenAI API usage
Create your personal OpenAI API key and add it as `$OPENAI_API_KEY` to your environment with:

```
export OPENAI_API_KEY=<YOUR_API_KEY>
@@ -62,9 +83,24 @@ Install with `pipx`:
pipx install doc-comments-ai
```

> [!NOTE]
> It is recommended to use `pipx` for installation; nonetheless, it is also possible to use `pip`.
## 🚨 Disclaimer
### 2. Local LLM usage
When using a local LLM, no API key is required. The recommended way to install is with `pip`, since `CMake` arguments need to be passed to the `llama.cpp` build for better performance, which is not possible with `pipx`.
You can also use the OpenAI API with this installation.

Your code won't be stored, but your code does leave your machine.
> [!WARNING]
> If you are sensitive about your global `pip` packages, you may consider checking out the repo and installing and running it manually with `poetry` or `conda`.
See the following instructions for passing the right `CMake` arguments for your machine: [installation-with-hardware-acceleration](https://github.com/abetlen/llama-cpp-python#installation-with-hardware-acceleration)
and install `llama-cpp-python` with your desired hardware acceleration, e.g. for Metal on a Mac run:
```
CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python
```
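Other accelerators follow the same pattern with a different `CMAKE_ARGS` value, e.g. cuBLAS on NVIDIA GPUs; consult the linked `llama-cpp-python` instructions for the exact flags that apply to your hardware:
```
CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
```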

To install `doc-comments.ai`, which will then use your previously installed `llama.cpp` build, run:
```
pip install doc-comments-ai
```
> [!IMPORTANT]
> The results when using a local LLM will be highly affected by the selected model. To get results similar to GPT-3.5/4 you need to select very large models, which require powerful hardware.
31 changes: 21 additions & 10 deletions doc_comments_ai/app.py
@@ -6,8 +6,7 @@

from doc_comments_ai import domain, llm, utils
from doc_comments_ai.llm import GptModel
from doc_comments_ai.treesitter.treesitter import (Treesitter, TreesitterNode,
get_source_from_node)
from doc_comments_ai.treesitter import Treesitter, TreesitterMethodNode


def run():
@@ -21,6 +20,16 @@ def run():

parser = argparse.ArgumentParser()
parser.add_argument("dir", nargs="?", default=os.getcwd())
parser.add_argument(
"--local",
action="store_true",
help="Uses the local version of the LLM model.",
)
parser.add_argument(
"--model_path",
type=str,
help="Path to the local model.",
)
parser.add_argument(
"--inline",
action="store_true",
@@ -53,7 +62,7 @@ def run():
if args.gpt4:
llm_wrapper = llm.LLM(model=GptModel.GPT_4)
else:
llm_wrapper = llm.LLM()
llm_wrapper = llm.LLM(local=args.local, model_path=args.model_path)

generated_doc_comments = {}

@@ -65,23 +74,25 @@ def run():
programming_language = utils.get_programming_language(file_extension)

treesitter_parser = Treesitter.create_treesitter(programming_language)
treesitterNodes: list[TreesitterNode] = treesitter_parser.parse(file_bytes)
treesitterNodes: list[TreesitterMethodNode] = treesitter_parser.parse(
file_bytes
)

for node in treesitterNodes:
method_name = utils.get_bold_text(node.name)

if args.guided:
print(f"Generate doc for {utils.get_bold_text(method_name)}? (y/n)")
if not input().lower() == "y":
continue

if node.doc_comment:
print(
f"⚠️ Method {method_name} already has a doc comment. Skipping..."
)
continue

method_source_code = get_source_from_node(node.node)
if args.guided:
print(f"Generate doc for {utils.get_bold_text(method_name)}? (y/n)")
if not input().lower() == "y":
continue

method_source_code = node.node.text.decode()

tokens = utils.count_tokens(method_source_code)
if tokens > 2048 and not args.gpt4:
34 changes: 28 additions & 6 deletions doc_comments_ai/llm.py
@@ -1,6 +1,8 @@
from enum import Enum

from langchain import LLMChain, PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate, LLMChain
from langchain.llms import LlamaCpp


class GptModel(Enum):
@@ -9,14 +11,32 @@ class GptModel(Enum):


class LLM:
def __init__(self, model: GptModel = GptModel.GPT_35):
def __init__(
self,
model: GptModel = GptModel.GPT_35,
local: bool = False,
model_path: str | None = None,
):
max_tokens = 2048 if model == GptModel.GPT_35 else 4096
self.llm = ChatOpenAI(temperature=0.9, max_tokens=max_tokens, model=model.value)
if local:
if model_path is None:
raise ValueError("model_path must be set in local mode.")

self.llm = LlamaCpp(
model_path=model_path,
temperature=0.9,
max_tokens=max_tokens,
verbose=False,
)
else:
self.llm = ChatOpenAI(
temperature=0.9, max_tokens=max_tokens, model=model.value
)
self.template = (
"I have this {language} method:\n{code}\nAdd a doc comment to the method. "
"Add a detailed doc comment to the following {language} method:\n{code}\n"
"The doc comment should describe what the method does. "
"{inline_comments} "
"Return the method implementaion with the doc comment as a markdown code block. "
"{inline_comments}"
"Don't include any explanations in your response."
)
self.prompt = PromptTemplate(
@@ -31,7 +51,9 @@ def generate_doc_comment(self, language, code, inline=False):
"""

if inline:
inline_comments = "Add inline comments to the code if necessary."
inline_comments = (
"Add inline comments to the method body where it makes sense."
)
else:
inline_comments = ""

1 change: 1 addition & 0 deletions doc_comments_ai/treesitter/__init__.py
@@ -0,0 +1 @@
from doc_comments_ai.treesitter.treesitter import Treesitter, TreesitterMethodNode
9 changes: 3 additions & 6 deletions doc_comments_ai/treesitter/treesitter.py
@@ -7,7 +7,7 @@
from doc_comments_ai.treesitter.treesitter_registry import TreesitterRegistry


class TreesitterNode:
class TreesitterMethodNode:
def __init__(
self,
name: "str | bytes | None",
@@ -16,6 +16,7 @@ def __init__(
):
self.name = name
self.doc_comment = doc_comment
self.method_source_code = node.text.decode()
self.node = node


@@ -29,7 +30,7 @@ def create_treesitter(language: Language) -> "Treesitter":
return TreesitterRegistry.create_treesitter(language)

@abstractmethod
def parse(self, file_bytes: bytes) -> list[TreesitterNode]:
def parse(self, file_bytes: bytes) -> list[TreesitterMethodNode]:
self.tree = self.parser.parse(file_bytes)
pass

Expand All @@ -47,7 +48,3 @@ def _query_method_name(self, node: tree_sitter.Node):
This function returns the name of a method node
"""
pass


def get_source_from_node(node: tree_sitter.Node) -> str:
return node.text.decode()
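A minimal sketch of how the renamed `TreesitterMethodNode` is consumed after this change, assuming the registry resolves the language to the matching parser as in the language-specific classes below; the file name is only illustrative:
```python
from doc_comments_ai.constants import Language
from doc_comments_ai.treesitter import Treesitter, TreesitterMethodNode

# Read a Go source file as bytes (the file name is hypothetical)
with open("main.go", "rb") as file:
    file_bytes = file.read()

parser = Treesitter.create_treesitter(Language.GO)
nodes: list[TreesitterMethodNode] = parser.parse(file_bytes)

for node in nodes:
    # Each node carries the method name, an optional existing doc comment
    # and the decoded method source code.
    print(node.name, node.doc_comment, node.method_source_code)
```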
15 changes: 7 additions & 8 deletions doc_comments_ai/treesitter/treesitter_go.py
@@ -1,25 +1,24 @@
import tree_sitter
from doc_comments_ai.treesitter.treesitter import (
Treesitter,
TreesitterNode,
get_source_from_node,
)

from doc_comments_ai.constants import Language
from doc_comments_ai.treesitter.treesitter import Treesitter, TreesitterMethodNode
from doc_comments_ai.treesitter.treesitter_registry import TreesitterRegistry


class TreesitterGo(Treesitter):
def __init__(self):
super().__init__(Language.GO)

def parse(self, file_bytes: bytes) -> list[TreesitterNode]:
def parse(self, file_bytes: bytes) -> list[TreesitterMethodNode]:
super().parse(file_bytes)
result = []
methods = self._query_all_methods(self.tree.root_node)
for method in methods:
method_name = self._query_method_name(method["method"])
doc_comment = method["doc_comment"]
result.append(TreesitterNode(method_name, doc_comment, method["method"]))
result.append(
TreesitterMethodNode(method_name, doc_comment, method["method"])
)
return result

def _query_method_name(self, node: tree_sitter.Node):
@@ -34,7 +33,7 @@ def _query_all_methods(self, node: tree_sitter.Node):
if node.type == "function_declaration":
doc_comment_node = None
if node.prev_named_sibling and node.prev_named_sibling.type == "comment":
doc_comment_node = get_source_from_node(node.prev_named_sibling)
doc_comment_node = node.prev_named_sibling.text.decode()
methods.append({"method": node, "doc_comment": doc_comment_node})
else:
for child in node.children:
15 changes: 7 additions & 8 deletions doc_comments_ai/treesitter/treesitter_java.py
@@ -1,25 +1,24 @@
import tree_sitter
from doc_comments_ai.treesitter.treesitter import (
Treesitter,
TreesitterNode,
get_source_from_node,
)

from doc_comments_ai.constants import Language
from doc_comments_ai.treesitter.treesitter import Treesitter, TreesitterMethodNode
from doc_comments_ai.treesitter.treesitter_registry import TreesitterRegistry


class TreesitterJava(Treesitter):
def __init__(self):
super().__init__(Language.JAVA)

def parse(self, file_bytes: bytes) -> list[TreesitterNode]:
def parse(self, file_bytes: bytes) -> list[TreesitterMethodNode]:
super().parse(file_bytes)
result = []
methods = self._query_all_methods(self.tree.root_node)
for method in methods:
method_name = self._query_method_name(method["method"])
doc_comment = method["doc_comment"]
result.append(TreesitterNode(method_name, doc_comment, method["method"]))
result.append(
TreesitterMethodNode(method_name, doc_comment, method["method"])
)
return result

def _query_method_name(self, node: tree_sitter.Node):
@@ -37,7 +36,7 @@ def _query_all_methods(self, node: tree_sitter.Node):
node.prev_named_sibling
and node.prev_named_sibling.type == "block_comment"
):
doc_comment_node = get_source_from_node(node.prev_named_sibling)
doc_comment_node = node.prev_named_sibling.text.decode()
methods.append({"method": node, "doc_comment": doc_comment_node})
else:
for child in node.children:
15 changes: 7 additions & 8 deletions doc_comments_ai/treesitter/treesitter_js.py
@@ -1,25 +1,24 @@
import tree_sitter
from doc_comments_ai.treesitter.treesitter import (
Treesitter,
TreesitterNode,
get_source_from_node,
)

from doc_comments_ai.constants import Language
from doc_comments_ai.treesitter.treesitter import Treesitter, TreesitterMethodNode
from doc_comments_ai.treesitter.treesitter_registry import TreesitterRegistry


class TreesitterJavascript(Treesitter):
def __init__(self):
super().__init__(Language.JAVASCRIPT)

def parse(self, file_bytes: bytes) -> list[TreesitterNode]:
def parse(self, file_bytes: bytes) -> list[TreesitterMethodNode]:
super().parse(file_bytes)
result = []
methods = self._query_all_methods(self.tree.root_node)
for method in methods:
method_name = self._query_method_name(method["method"])
doc_comment = method["doc_comment"]
result.append(TreesitterNode(method_name, doc_comment, method["method"]))
result.append(
TreesitterMethodNode(method_name, doc_comment, method["method"])
)
return result

def _query_method_name(self, node: tree_sitter.Node):
@@ -34,7 +33,7 @@ def _query_all_methods(self, node: tree_sitter.Node):
if node.type == "function_declaration":
doc_comment_node = None
if node.prev_named_sibling and node.prev_named_sibling.type == "comment":
doc_comment_node = get_source_from_node(node.prev_named_sibling)
doc_comment_node = node.prev_named_sibling.text.decode()
methods.append({"method": node, "doc_comment": doc_comment_node})
else:
for child in node.children: