diff --git a/.github/workflows/query_engine_test.yml b/.github/workflows/query_engine_test.yml
index 6a1188e..4c5171f 100644
--- a/.github/workflows/query_engine_test.yml
+++ b/.github/workflows/query_engine_test.yml
@@ -34,5 +34,5 @@ jobs:
           pip install -r engine/requirements.txt
       - name: Run tests
-        run: 
+        run:
           jac test engine/src/query_engine.jac
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b2b2b33..bfb0c2c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,12 +1,12 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v2.3.0
+    rev: v6.0.0
     hooks:
       - id: check-yaml
         args: [--allow-multiple-documents]
       - id: check-json
       - id: trailing-whitespace
-  - repo: https://github.com/psf/black
-    rev: 24.1.1
+  - repo: https://github.com/psf/black-pre-commit-mirror
+    rev: 25.9.0
     hooks:
       - id: black
\ No newline at end of file
diff --git a/app/scripts/runs2data.jac b/app/scripts/runs2data.jac
index 1ad51b2..6831d5b 100644
--- a/app/scripts/runs2data.jac
+++ b/app/scripts/runs2data.jac
@@ -5,7 +5,7 @@ import:py argparse;
 can convert_run(run: str, prompt_disc: str) {
     responses_files = [f for f in os.listdir(os.path.join("runs", run)) if f.endswith(".json")];
     data = {"run": run, "prompt_disc": prompt_disc, "outputs": {}};
-    
+
     for responses_file in responses_files {
         with open(os.path.join("runs", run, responses_file), "r") as f {
             model_data = json.load(f);
diff --git a/app/src/components/auto_evaluator/emb_sim_scorer.impl.jac b/app/src/components/auto_evaluator/emb_sim_scorer.impl.jac
index 880a473..527696b 100644
--- a/app/src/components/auto_evaluator/emb_sim_scorer.impl.jac
+++ b/app/src/components/auto_evaluator/emb_sim_scorer.impl.jac
@@ -146,7 +146,7 @@ import from collections {Counter}
     weights = np.ones(n_gram) / n_gram;
     p_ns = [];
-    n = min(len(reference_ngrams), len(candidate_ngrams)); 
+    n = min(len(reference_ngrams), len(candidate_ngrams));
     i = 0;
     while (i < n) {
         ref_ng = list(reference_ngrams[i]); # Convert generator to list if necessary
diff --git a/app/src/components/auto_evaluator/emb_sim_scorer.jac b/app/src/components/auto_evaluator/emb_sim_scorer.jac
index 92bf7f1..ccf234c 100644
--- a/app/src/components/auto_evaluator/emb_sim_scorer.jac
+++ b/app/src/components/auto_evaluator/emb_sim_scorer.jac
@@ -31,7 +31,7 @@ can emb_sim_scorer {
         if st.session_state['anchor_model'] not in model_list {
             st.session_state['anchor_model'] = model_list[0];
         }
-        
+
         (col1, col2, col3) = st.columns(3);
         anchor_model_selection = col1.selectbox("Select Anchor Model", options=model_list, key='anchor_model', index=model_list.index(st.session_state.get('anchor_model', model_list[0])));
         embedder_selection = col2.selectbox("Select Type of Embedder", options=['USE', 'USE_QA', 'SBERT'], key='embedder', index=['USE', 'USE_QA', 'SBERT', 'OPEN_AI_Embedder'].index(st.session_state.get('embedder', 'SBERT')));
@@ -47,7 +47,7 @@ can emb_sim_scorer {
         } except Exception as e{
             print(e);
             st.error('Error calculating embedding scores. Please try again.');
-        } 
+        }
     }
     if button_clicked {
         st.session_state['button_clicked'] = False;
diff --git a/app/src/components/dashboard/dashboard.impl.jac b/app/src/components/dashboard/dashboard.impl.jac
index 3a92df8..c522a3b 100644
--- a/app/src/components/dashboard/dashboard.impl.jac
+++ b/app/src/components/dashboard/dashboard.impl.jac
@@ -111,7 +111,7 @@ can get_outputs -> tuple {
     (full_outputs, partial_outputs) = get_outputs();
     with full_output_col {
         with st.container(border=True) {
-            
+
             st.metric("Number of Full Output", f"{len(full_outputs)}/{n_workers}");
         }
     }
diff --git a/app/src/components/dashboard/dashboard.jac b/app/src/components/dashboard/dashboard.jac
index b7553fa..f51586d 100644
--- a/app/src/components/dashboard/dashboard.jac
+++ b/app/src/components/dashboard/dashboard.jac
@@ -29,13 +29,13 @@ glob expand = True;
 can dashboard {
     :g: expand ;
-    
+
     if st.session_state.get("current_hv_config", None) {
         status_indicator();
-        
+
         # chart_type = st.selectbox("Select a chart type:", ("Area Chart", "Bar Chart", "Line Chart", "Altair Chart", "Plotly Figure", "Heat Map","Stacked Bar Chart","Histogram"));
         chart_type = st.selectbox("Select a chart type:", ("Disribution Plot", "Heat Map", "Stacked Bar Chart", "Histogram"));
-        
+
         # Conditional rendering based on the dropdown selection
         if chart_type == "Area Chart" {
             area_chart();
diff --git a/app/src/components/dashboard/plot_utils.jac b/app/src/components/dashboard/plot_utils.jac
index d609d7e..84d8f88 100644
--- a/app/src/components/dashboard/plot_utils.jac
+++ b/app/src/components/dashboard/plot_utils.jac
@@ -8,7 +8,7 @@ import:py json;
 can generate_stacked_bar_chart(model_performance: dict, criteria: list) {
     df_data = [];
-    
+
     for (model, crits) in model_performance.items() {
         for (crit, counts) in crits.items() {
             df_data.append({"model": model, "criterion": crit, "wins": counts["wins"], "ties": counts["ties"], "losses": counts["losses"]});
@@ -23,7 +23,7 @@ can generate_stacked_bar_chart(model_performance: dict, criteria: list) {
     }
     total_fig_height = fig_height_per_row * len(criteria);
     colors = {"wins": "green", "ties": "orange", "losses": "red"};
-    
+
     for (i, criterion) in enumerate(criteria) {
         criterion_data = df[df["criterion"] == criterion].sort_values("wins", ascending=False);
         for (j, outcome) in enumerate(["wins", "ties", "losses"]) {
@@ -31,7 +31,7 @@
         }
     }
     fig.update_layout(barmode="stack", title="Model Performance by Criterion", height=total_fig_height);
-    
+
     for (i, criterion) in enumerate(criteria) {
         fig.update_yaxes(title_text=criterion.capitalize(), row=i + 1, col=1, title_standoff=25);
     }
@@ -40,7 +40,7 @@
 can generate_heatmaps(placeholder: str, model_performance: dict, preference_matrix: dict, all_models: list, criteria: list) {
     rows_list = [];
-    
+
     for (model, crit_dict) in model_performance.items() {
         for (crit, win_tie_loss) in crit_dict.items() {
             rows_list.append({"model": model, "criterion": crit, "wins": win_tie_loss['wins'], "ties": win_tie_loss['ties'], "losses": win_tie_loss['losses']});
@@ -50,21 +50,21 @@ can generate_heatmaps(placeholder: str, model_performance: dict, preference_matr
     global_max = df[["wins", "ties", "losses"]].max().max();
     global_min = df[["wins", "ties", "losses"]].min().max();
     subplot_titles = [];
-    
+
     for crit in criteria {
         subplot_titles.append("Heatmap of Wins for " + crit);
         subplot_titles.append("Heatmap of Total Wins for " + crit);
     }
     fig = make_subplots(rows=len(criteria), cols=2, subplot_titles=subplot_titles, horizontal_spacing=0.15, specs=[[{}, {}] for _ in range(len(criteria))]);
     global_individual_max = 0;
-    
+
     for criterion in criteria {
         for model in all_models {
             max_wins = max(preference_matrix[criterion][model].values());
             global_individual_max = max(global_individual_max, max_wins);
         }
     }
-    
+
     for (i, criterion) in enumerate(criteria) {
         can model_sorting_criteria(model: str) {
             total = sum(preference_matrix[criterion][model].values());
diff --git a/app/src/components/generator/generator.jac b/app/src/components/generator/generator.jac
index d801025..340e99e 100644
--- a/app/src/components/generator/generator.jac
+++ b/app/src/components/generator/generator.jac
@@ -20,7 +20,7 @@ can generator {
     }
     st.header('Response Generator');
     st.caption("This helps you to generate the necessary response for the "
-        "given prompttemplate with given values for all the selected SLMs and " 
+        "given prompttemplate with given values for all the selected SLMs and "
         "propierity LLMs.");
     (selected_models, n_samples, temp) = model_settings();
     (prompt_template, arguments) = prompt_settings();
diff --git a/app/src/components/theme.jac b/app/src/components/theme.jac
index 2c21b03..d24540b 100644
--- a/app/src/components/theme.jac
+++ b/app/src/components/theme.jac
@@ -22,7 +22,7 @@ can initPage(page_title: str) -> None {
     favicon = Image.open(os.path.join(dir_root, "../assets/favicon.png"));
     st.set_page_config(page_title=page_title, page_icon=favicon);
     local_css(os.path.join(dir_root, "../assets/style.css"));
-    
+
     with open(os.path.join(dir_root, "../assets/theme.html")) as f {
         st.html(f.read(), height=80);
     }
diff --git a/app/src/components/utils.jac b/app/src/components/utils.jac
index c174383..7a528ce 100644
--- a/app/src/components/utils.jac
+++ b/app/src/components/utils.jac
@@ -18,7 +18,7 @@ can call_action(action: str, **kwargs: dict) -> dict {
 '''Check the Status of the Query Engine.'''
 can check_query_engine -> bool {
-    
+
     try {
         ret = requests.get(ACTION_SERVER_URL);
         return ret.status_code == 200;
@@ -29,7 +29,7 @@ can check_query_engine -> bool {
 '''Check the Status of the Ollama Server.'''
 can check_ollama_server -> bool {
-    
+
     try {
         ret = requests.get(OLLAMA_SERVER_URL);
         return ret.status_code == 200;
@@ -59,7 +59,7 @@ can check_engine_status {
 '''Load the Model in the Query Engine.'''
 can load_engine(provider_name: str, model_name: str, temperature: float, prompt_template: str) -> bool {
     config = {"provider_name": provider_name, "model_name": model_name, "temperature": temperature, "prompt_template": prompt_template};
-    
+
     try {
         call_action(action="load_engine", config=config);
         return True;
@@ -73,7 +73,7 @@ can load_engine(provider_name: str, model_name: str, temperature: float, prompt_
 can run_inference(model_name: str, num_samples: int, payload: dict) -> dict {
     outputs = [];
     full_prompt = None;
-    
+
     for i in range(num_samples) {
         try {
             start_time = time.time();
@@ -99,7 +99,7 @@ can map_prompt_names_to_ids(prompt_data_dir: str, prompt_info_file: str) {
     }
     prompt_info = [{prompt["prompt_id"]: prompt["prompt"]} for prompt in prompt_info];
     prompt_ids = {};
-    
+
     for filename in os.listdir(prompt_data_dir) {
         use_case_name = "_".join(filename.split(".")[0].split("_")[:-1]);
         file_path = os.path.join(prompt_data_dir, filename);
@@ -122,7 +122,7 @@ can generate_performance_data(formatted_output: dict, all_models: list, prompt_i
     }
     model_performance = {model: {criterion: {"wins": 0, "ties": 0, "losses": 0} for criterion in criteria} for model in all_models};
     preference_matrix = {criterion: {model: {other_model: 0 for other_model in all_models if other_model != model} for model in all_models} for criterion in criteria};
-    
+
     for outputs in formatted_output {
         if (prompt_id == "all_combined" or outputs["prompt_id"] == prompt_id) {
@@ -140,7 +140,7 @@ can generate_performance_data(formatted_output: dict, all_models: list, prompt_i
                     model_performance[model2][crit]["wins"]+=1;
                     preference_matrix[crit][model2][model1]+=1;
                 } else { # Ties
-                    
+
                     model_performance[model1][crit]["ties"]+=1;
                     model_performance[model2][crit]["ties"]+=1;
                 }
@@ -160,7 +160,7 @@ can format_responses_by_prompt(workers_data_dir: str, distribution_file: str, re
     with open(response_file, "r") as file {
         all_responses = json.load(file);
     }
-    
+
     for filename in os.listdir(workers_data_dir) {
         file_path = os.path.join(workers_data_dir, filename);
         if (os.path.isfile(file_path)) {
@@ -213,7 +213,7 @@ can convert_run(run: str) {
     os.makedirs("data", exist_ok=True);
     responses_files = [f for f in os.listdir(os.path.join(os.path.abspath("runs"), run)) if f.endswith(".json")];
     data = {"run": run, "prompt_disc": None, "outputs": {}};
-    
+
     for responses_file in responses_files {
         with open(os.path.join(os.path.abspath("runs"), run, responses_file), "r") as f {
             model_data = json.load(f);
diff --git a/app/src/tests/test_dashboard.jac b/app/src/tests/test_dashboard.jac
index 354c10d..80f4b36 100644
--- a/app/src/tests/test_dashboard.jac
+++ b/app/src/tests/test_dashboard.jac
@@ -43,7 +43,7 @@ import:jac from helpers, get_item_by_label;
 #     app.session_state.admin_privileges = True;
 #     app.run();
 #     dashboard_tab = get_item_by_label(app, "tab", "Dashboard");
-    
+
 #     # Assert error messages for missing configuration and results
 #     assert (dashboard_tab.error[0].value == "Human Evaluation config was not found. Initialize a Human Evaluation first.");
 #     assert (dashboard_tab.error[1].value == "Results were not found. Initialize a Human Evaluation first. If Initiated already, wait until the results are ready.");
diff --git a/app/src/tests/test_generator.jac b/app/src/tests/test_generator.jac
index 6d167eb..b62e464 100644
--- a/app/src/tests/test_generator.jac
+++ b/app/src/tests/test_generator.jac
@@ -15,7 +15,7 @@ test app_running {
 # test run_query_engine {
 #     :g: query_engine ;
-    
+
 #     query_engine = subprocess.Popen(["jac", "run", "src/query_engine.jac"]);
 #     time.sleep(10);
 #     response = requests.get("http://localhost:8000");
@@ -25,7 +25,7 @@ test app_running {
 # test run_ollama_server {
 #     :g: ollama_server ;
-    
+
 #     ollama_server = subprocess.Popen(["ollama", "serve"]);
 #     time.sleep(10);
 #     response = requests.get("http://localhost:11434");
diff --git a/app/src/tests/test_human_eval.jac b/app/src/tests/test_human_eval.jac
index 405fc36..46db860 100644
--- a/app/src/tests/test_human_eval.jac
+++ b/app/src/tests/test_human_eval.jac
@@ -19,7 +19,7 @@ test human_eval_without_config {
 test human_eval_with_config {
     shutil.unpack_archive(os.path.join(os.path.dirname(__file__), "fixtures", "config.zip"), ".");
     :g: app ;
-    
+
     app = AppTest.from_file("app.py").run(timeout=20);
     assert not app.exception;
     assert not app.error;
@@ -29,13 +29,13 @@ test human_eval_with_config {
 test human_eval_config_ab_testing {
     shutil.unpack_archive(os.path.join(os.path.dirname(__file__), "fixtures", "config.zip"), ".");
     :g: app ;
-    
+
     app = AppTest.from_file("app.py").run(timeout=20);
     get_item_by_label(app, "text_input", "Worker ID").set_value("test_worker_id").run();
     assert os.path.exists(os.path.join(".human_eval_config", "worker_count.txt"));
     assert not app.exception;
     assert app.session_state.worker_id == "test_worker_id";
-    
+
     # TODO: Perform one evaluation
     shutil.rmtree(".human_eval_config");
 }
@@ -43,13 +43,13 @@ test human_eval_config_ab_testing {
 test human_eval_config_criteria {
     shutil.unpack_archive(os.path.join(os.path.dirname(__file__), "fixtures", "criteria_config.zip"), ".");
     :g: app ;
-    
+
     app = AppTest.from_file("app.py").run(timeout=20);
     get_item_by_label(app, "text_input", "Worker ID").set_value("test_worker_id").run();
     assert os.path.exists(os.path.join(".human_eval_config", "worker_count.txt"));
     assert not app.exception;
     assert app.session_state.worker_id == "test_worker_id";
-    
+
     # TODO: Perform one evaluation
     shutil.rmtree(".human_eval_config");
 }
diff --git a/app/src/tests/test_llm_as_evaluator.jac b/app/src/tests/test_llm_as_evaluator.jac
index bfa5c9a..4a0e36f 100644
--- a/app/src/tests/test_llm_as_evaluator.jac
+++ b/app/src/tests/test_llm_as_evaluator.jac
@@ -32,14 +32,14 @@ test app_running {
 # test llm_as_evaluator_ab_testing {
 #     shutil.unpack_archive(os.path.join(os.path.dirname(__file__), "fixtures", "config.zip"), ".human_eval_config");
 #     :g: app ;
-    
+
 #     app = AppTest.from_file("app.py").run(timeout=20);
 #     app.session_state.admin_privileges = True;
 #     app.run();
 #     assert not app.exception;
 #     llm_as_evaluator_tab = get_item_by_label(app, "tab", "LLM as Evaluator");
 #     assert not llm_as_evaluator_tab.error;
-    
+
 #     # TODO: Run the LLM as Evaluator
 #     shutil.rmtree(".human_eval_config");
 #     shutil.rmtree("runs");
@@ -48,14 +48,14 @@ test app_running {
 # test llm_as_evaluator_criteria {
 #     shutil.unpack_archive(os.path.join(os.path.dirname(__file__), "fixtures", "criteria_config.zip"), ".human_eval_config");
 #     :g: app ;
-    
+
 #     app = AppTest.from_file("app.py").run(timeout=20);
 #     app.session_state.admin_privileges = True;
 #     app.run();
 #     assert not app.exception;
 #     llm_as_evaluator_tab = get_item_by_label(app, "tab", "LLM as Evaluator");
 #     assert not llm_as_evaluator_tab.error;
-    
+
 #     # TODO: Run the LLM as Evaluator
 #     shutil.rmtree(".human_eval_config");
 #     shutil.rmtree("runs");
diff --git a/app/src/tests/test_setup.jac b/app/src/tests/test_setup.jac
index a1a4eb7..927f622 100644
--- a/app/src/tests/test_setup.jac
+++ b/app/src/tests/test_setup.jac
@@ -43,7 +43,7 @@ test setup_humnan_eval_ab_testing {
     setup_tab.text_area("city_name_responses.json_prompt_simple_disc").set_value("This is a new simple prompt description");
     get_item_by_label(setup_tab, "button", "Create Evaluation Configuration").set_value(True).run();
     assert not app.exception;
-    
+
     assert os.path.exists(".human_eval_config");
     assert os.path.exists(os.path.join(".human_eval_config", "config.json"));
     assert os.path.exists(os.path.join(".human_eval_config", "distribution.json"));
diff --git a/engine/src/query_engine.test.jac b/engine/src/query_engine.test.jac
index 7439d26..f302122 100644
--- a/engine/src/query_engine.test.jac
+++ b/engine/src/query_engine.test.jac
@@ -4,7 +4,7 @@ import:py requests;
 test run_query_engine {
     :g: query_engine ;
-    
+
     query_engine = subprocess.Popen(["jac", "run", "src/query_engine.jac"]);
     time.sleep(10);
     response = requests.get("http://localhost:8000");
@@ -14,7 +14,7 @@ test run_query_engine {
 test run_ollama_server {
     :g: ollama_server ;
-    
+
     ollama_server = subprocess.Popen(["ollama", "serve"]);
     time.sleep(10);
     response = requests.get("http://localhost:11434");