Skip to content

Commit

Permalink
refactor: additional clean-up and polish
Browse files Browse the repository at this point in the history
move from large to small model for embedding, fix issues with cleaning volume mounts
  • Loading branch information
sjungling committed Dec 19, 2024
1 parent 2c3155c commit e4dbfd2
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 26 deletions.
8 changes: 4 additions & 4 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"build": {
"dockerfile": "Dockerfile",
"context": ".."
}
"build": {
"dockerfile": "../Dockerfile",
"context": ".."
}
}
12 changes: 5 additions & 7 deletions .devcontainer/Dockerfile → Dockerfile
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
FROM mcr.microsoft.com/devcontainers/python:3.12 AS base
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

FROM base AS dependencies
WORKDIR /app

FROM base AS dependencies
COPY pyproject.toml .
COPY uv.lock .
RUN uv pip install --system -r pyproject.toml

FROM dependencies AS application
FROM dependencies AS models
COPY scripts/download_model.py .

FROM application AS models
RUN uv run download_model.py

FROM models AS final
COPY scripts/* .

RUN chmod +x entry-point.sh
ENTRYPOINT ["./entry-point.sh"]
COPY templates templates
6 changes: 5 additions & 1 deletion scripts/analyze_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def main(url, repository_path, log_file, username, password, logs_dir, output_di
analyzer.analyze_and_visualize_clusters()


if __name__ == "__main__":
def analyze_logs():
parser = argparse.ArgumentParser(
description="Download and unzip ingest samples. If no url is provided, it will prompt the user to select a file.",
)
Expand Down Expand Up @@ -78,3 +78,7 @@ def main(url, repository_path, log_file, username, password, logs_dir, output_di
args.logs,
args.output_dir,
)


if __name__ == "__main__":
analyze_logs()
10 changes: 6 additions & 4 deletions scripts/build_log_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ def wrap_line(text, max_len=200, max_lines=8):

class BuildLogAnalyzer:
def __init__(self, output_dir="output"):
self.tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")
self.model = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5")
self.tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
self.model = AutoModel.from_pretrained("BAAI/bge-small-en-v1.5")
self.model.eval()
self.random_state = 42
self.data_frames: pd.DataFrame | None = None
Expand Down Expand Up @@ -148,6 +148,7 @@ def _create_scatter_plot(self):

with open(self.final_cluster_html_path, "w", encoding="utf-8") as f:
f.write(html_content)
print(f"Scatter plot analysis saved to {self.final_cluster_html_path}")

def _create_cluster_logs(self):
df = self.data_frames
Expand Down Expand Up @@ -217,6 +218,7 @@ def create_dropdown_options(data, cluster_id):
data.visible = i == 0

fig.write_html(self.final_logs_html_path)
print(f"Cluster logs saved to {self.final_logs_html_path}")

def analyze_and_visualize_clusters(self):
self._embed_summaries_cluster()
Expand Down Expand Up @@ -262,7 +264,7 @@ def load_failure_logs(self):
df = df[df["Solved"] == False]
self.data_frames = df
print(
"Succesfully loaded "
"Successfully loaded "
+ str(len(df))
+ " logs. There were "
+ str(number_of_logs - len(df))
Expand Down Expand Up @@ -414,5 +416,5 @@ def extract_failure_stacktraces(self):
print("Failure to extract log's stack trace from ", str(row["Path"]))
any_failures = True
if not any_failures:
print("Succesfully extracted logs for", len(df), self.output_dir)
print("Successfully extracted logs for", len(df), self.output_dir)
self.data_frames = df
12 changes: 10 additions & 2 deletions scripts/download_model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "transformers",
# "torch", # required by transformers for model handling
# ]
# ///

from transformers import AutoModel, AutoTokenizer

# The tokenizer and model are downloaded the first time they are initialized
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")
model = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5")
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
model = AutoModel.from_pretrained("BAAI/bge-small-en-v1.5")
14 changes: 8 additions & 6 deletions scripts/log_downloader.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import sys
import zipfile

import requests
Expand Down Expand Up @@ -47,10 +48,10 @@ def _download_and_unzip_file(self, url, file_name):
# Remove downloaded zip file
os.remove(local_filename)

def _download_logs_interactive(self, path=None):
def _download_logs_interactive(self, path=""):
print(
f"Fetching files from {self.url}/api/storage/{self.repository_path}/{path}"
if path
if path != ""
else f"Fetching files from {self.url}/api/storage/{self.repository_path}"
)
items = self._collect_items(
Expand Down Expand Up @@ -92,12 +93,13 @@ def _download_logs_interactive(self, path=None):
print(
"Invalid choice. Please rerun the script and select a number from the list."
)
self._download_logs_interactive(path)
except ValueError:
print("Invalid input. Please enter a number.")

def _collect_items(self, url, path=None):
def _collect_items(self, url, path=""):
results = []
data = self._fetch_directory_contents(f"{url}/{path}" if path else url)
data = self._fetch_directory_contents(f"{url}/{path}" if path != "" else url)
if data is None:
return results

Expand All @@ -120,7 +122,7 @@ def _fetch_directory_contents(self, url):
return response.json()
except requests.RequestException as e:
print(f"Failed to fetch directory contents from {url}: {e}")
return None
sys.exit(1)

def _download_file(self, url, local_filename):
try:
Expand All @@ -132,7 +134,7 @@ def _download_file(self, url, local_filename):
return local_filename
except requests.RequestException as e:
print(f"Failed to download file from {url}: {e}")
return None
sys.exit(1)

def _unzip_file(self, zip_path, extract_to):
try:
Expand Down
5 changes: 3 additions & 2 deletions scripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@

def prepare_directory(directory):
if os.path.exists(directory):
shutil.rmtree(directory)
os.makedirs(directory)
shutil.rmtree(directory, ignore_errors=True)
else:
os.makedirs(directory)


def copy_directory(source, destination):
Expand Down

0 comments on commit e4dbfd2

Please sign in to comment.