refactor: additional clean-up and polish

Move from the large to the small embedding model (BAAI/bge-large-en-v1.5 → BAAI/bge-small-en-v1.5) and fix issues with cleaning volume mounts.
moderneinc · Dec 19, 2024 · e4dbfd2 · e4dbfd2
1 parent 2c3155c
commit e4dbfd2
Show file tree

Hide file tree

Showing 7 changed files with 41 additions and 26 deletions.
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -1,6 +1,6 @@
 {
-    "build": {
-        "dockerfile": "Dockerfile",
-        "context": ".."
-    }
+  "build": {
+    "dockerfile": "../Dockerfile",
+    "context": ".."
+  }
 }
diff --git a/.devcontainer/Dockerfile → Dockerfile b/.devcontainer/Dockerfile → Dockerfile
@@ -1,19 +1,17 @@
 FROM mcr.microsoft.com/devcontainers/python:3.12 AS base
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 
-FROM base AS dependencies
 WORKDIR /app
+
+FROM base AS dependencies
 COPY pyproject.toml .
+COPY uv.lock .
 RUN uv pip install --system -r pyproject.toml
 
-FROM dependencies AS application
+FROM dependencies AS models
 COPY scripts/download_model.py .
-
-FROM application AS models
 RUN uv run download_model.py
 
 FROM models AS final
 COPY scripts/* .
-
-RUN chmod +x entry-point.sh
-ENTRYPOINT ["./entry-point.sh"]
+COPY templates templates
diff --git a/scripts/analyze_logs.py b/scripts/analyze_logs.py
@@ -23,7 +23,7 @@ def main(url, repository_path, log_file, username, password, logs_dir, output_di
     analyzer.analyze_and_visualize_clusters()
 
 
-if __name__ == "__main__":
+def analyze_logs():
     parser = argparse.ArgumentParser(
         description="Download and unzip ingest samples. If no url is provided, it will prompt the user to select a file.",
     )
@@ -78,3 +78,7 @@ def main(url, repository_path, log_file, username, password, logs_dir, output_di
         args.logs,
         args.output_dir,
     )
+
+
+if __name__ == "__main__":
+    analyze_logs()
diff --git a/scripts/build_log_analyzer.py b/scripts/build_log_analyzer.py
@@ -42,8 +42,8 @@ def wrap_line(text, max_len=200, max_lines=8):
 
 class BuildLogAnalyzer:
     def __init__(self, output_dir="output"):
-        self.tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")
-        self.model = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5")
+        self.tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
+        self.model = AutoModel.from_pretrained("BAAI/bge-small-en-v1.5")
         self.model.eval()
         self.random_state = 42
         self.data_frames: pd.DataFrame | None = None
@@ -148,6 +148,7 @@ def _create_scatter_plot(self):
 
         with open(self.final_cluster_html_path, "w", encoding="utf-8") as f:
             f.write(html_content)
+        print(f"Scatter plot analysis saved to {self.final_cluster_html_path}")
 
     def _create_cluster_logs(self):
         df = self.data_frames
@@ -217,6 +218,7 @@ def create_dropdown_options(data, cluster_id):
             data.visible = i == 0
 
         fig.write_html(self.final_logs_html_path)
+        print(f"Cluster logs saved to {self.final_logs_html_path}")
 
     def analyze_and_visualize_clusters(self):
         self._embed_summaries_cluster()
@@ -262,7 +264,7 @@ def load_failure_logs(self):
         df = df[df["Solved"] == False]
         self.data_frames = df
         print(
-            "Succesfully loaded "
+            "Successfully loaded "
             + str(len(df))
             + " logs. There were "
             + str(number_of_logs - len(df))
@@ -414,5 +416,5 @@ def extract_failure_stacktraces(self):
                 print("Failure to extract log's stack trace from ", str(row["Path"]))
                 any_failures = True
         if not any_failures:
-            print("Succesfully extracted logs for", len(df), self.output_dir)
+            print("Successfully extracted logs for", len(df), self.output_dir)
         self.data_frames = df
diff --git a/scripts/download_model.py b/scripts/download_model.py
@@ -1,5 +1,13 @@
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "transformers",
+#     "torch",  # required by transformers for model handling
+# ]
+# ///
+
 from transformers import AutoModel, AutoTokenizer
 
 # Model will download the first time they are initialized
-tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")
-model = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5")
+tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
+model = AutoModel.from_pretrained("BAAI/bge-small-en-v1.5")
diff --git a/scripts/log_downloader.py b/scripts/log_downloader.py
@@ -1,4 +1,5 @@
 import os
+import sys
 import zipfile
 
 import requests
@@ -47,10 +48,10 @@ def _download_and_unzip_file(self, url, file_name):
         # Remove downloaded zip file
         os.remove(local_filename)
 
-    def _download_logs_interactive(self, path=None):
+    def _download_logs_interactive(self, path=""):
         print(
             f"Fetching files from {self.url}/api/storage/{self.repository_path}/{path}"
-            if path
+            if path != ""
             else f"Fetching files from {self.url}/api/storage/{self.repository_path}"
         )
         items = self._collect_items(
@@ -92,12 +93,13 @@ def _download_logs_interactive(self, path=None):
                 print(
                     "Invalid choice. Please rerun the script and select a number from the list."
                 )
+                self._download_logs_interactive(path)
         except ValueError:
             print("Invalid input. Please enter a number.")
 
-    def _collect_items(self, url, path=None):
+    def _collect_items(self, url, path=""):
         results = []
-        data = self._fetch_directory_contents(f"{url}/{path}" if path else url)
+        data = self._fetch_directory_contents(f"{url}/{path}" if path != "" else url)
         if data is None:
             return results
 
@@ -120,7 +122,7 @@ def _fetch_directory_contents(self, url):
             return response.json()
         except requests.RequestException as e:
             print(f"Failed to fetch directory contents from {url}: {e}")
-            return None
+            sys.exit(1)
 
     def _download_file(self, url, local_filename):
         try:
@@ -132,7 +134,7 @@ def _download_file(self, url, local_filename):
             return local_filename
         except requests.RequestException as e:
             print(f"Failed to download file from {url}: {e}")
-            return None
+            sys.exit(1)
 
     def _unzip_file(self, zip_path, extract_to):
         try:

diff --git a/scripts/utils.py b/scripts/utils.py
@@ -4,8 +4,9 @@
 
 def prepare_directory(directory):
     if os.path.exists(directory):
-        shutil.rmtree(directory)
-    os.makedirs(directory)
+        shutil.rmtree(directory, ignore_errors=True)
+    else:
+        os.makedirs(directory)
 
 
 def copy_directory(source, destination):