2 changes: 1 addition & 1 deletion .github/workflows/query_engine_test.yml
@@ -34,5 +34,5 @@ jobs:
         pip install -r engine/requirements.txt

     - name: Run tests
-      run:
+      run:
         jac test engine/src/query_engine.jac
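The change above is whitespace-only: the run: line is rewritten without trailing spaces, almost certainly flagged by the trailing-whitespace hook added in the next file. For orientation, here is a minimal sketch of the kind of job this step sits in; the job name, runner, and checkout step are illustrative assumptions, and only the pip install and jac test commands are taken from the hunk:

# Hypothetical job layout; only the two run commands come from the diff above.
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install dependencies
        run: |
          pip install -r engine/requirements.txt
      - name: Run tests
        run:
          jac test engine/src/query_engine.jac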
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -1,12 +1,12 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v2.3.0
+    rev: v6.0.0
     hooks:
       - id: check-yaml
         args: [--allow-multiple-documents]
       - id: check-json
       - id: trailing-whitespace
-  - repo: https://github.com/psf/black
-    rev: 24.1.1
+  - repo: https://github.com/psf/black-pre-commit-mirror
+    rev: 25.9.0
     hooks:
       - id: black
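Read off the + side of the two hunks, the updated .pre-commit-config.yaml in full (indentation reconstructed, since the diff view flattens it):

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v6.0.0
    hooks:
      - id: check-yaml
        args: [--allow-multiple-documents]
      - id: check-json
      - id: trailing-whitespace
  - repo: https://github.com/psf/black-pre-commit-mirror
    rev: 25.9.0
    hooks:
      - id: black

Swapping psf/black for psf/black-pre-commit-mirror keeps the same black hook id but installs from pre-built wheels, which the Black project maintains precisely to speed up hook setup. After a rev bump like this, running pre-commit run --all-files re-applies every hook across the tree, which is presumably what produced the trailing-whitespace fixes in the remaining files of this PR.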
2 changes: 1 addition & 1 deletion app/scripts/runs2data.jac
@@ -5,7 +5,7 @@ import:py argparse;
 can convert_run(run: str, prompt_disc: str) {
     responses_files = [f for f in os.listdir(os.path.join("runs", run)) if f.endswith(".json")];
     data = {"run": run, "prompt_disc": prompt_disc, "outputs": {}};
-
+
     for responses_file in responses_files {
         with open(os.path.join("runs", run, responses_file), "r") as f {
             model_data = json.load(f);
2 changes: 1 addition & 1 deletion app/src/components/auto_evaluator/emb_sim_scorer.impl.jac
@@ -146,7 +146,7 @@ import from collections {Counter}
     weights = np.ones(n_gram) / n_gram;
     p_ns = [];

-    n = min(len(reference_ngrams), len(candidate_ngrams));
+    n = min(len(reference_ngrams), len(candidate_ngrams));
     i = 0;
     while (i < n) {
         ref_ng = list(reference_ngrams[i]); # Convert generator to list if necessary
4 changes: 2 additions & 2 deletions app/src/components/auto_evaluator/emb_sim_scorer.jac
@@ -31,7 +31,7 @@ can emb_sim_scorer {
     if st.session_state['anchor_model'] not in model_list {
         st.session_state['anchor_model'] = model_list[0];
     }
-
+
     (col1, col2, col3) = st.columns(3);
     anchor_model_selection = col1.selectbox("Select Anchor Model", options=model_list, key='anchor_model', index=model_list.index(st.session_state.get('anchor_model', model_list[0])));
     embedder_selection = col2.selectbox("Select Type of Embedder", options=['USE', 'USE_QA', 'SBERT'], key='embedder', index=['USE', 'USE_QA', 'SBERT', 'OPEN_AI_Embedder'].index(st.session_state.get('embedder', 'SBERT')));
@@ -47,7 +47,7 @@ can emb_sim_scorer {
         } except Exception as e{
             print(e);
             st.error('Error calculating embedding scores. Please try again.');
-        }
+        }
     }
     if button_clicked {
         st.session_state['button_clicked'] = False;
2 changes: 1 addition & 1 deletion app/src/components/dashboard/dashboard.impl.jac
@@ -111,7 +111,7 @@ can get_outputs -> tuple {
     (full_outputs, partial_outputs) = get_outputs();
     with full_output_col {
         with st.container(border=True) {
-
+
             st.metric("Number of Full Output", f"{len(full_outputs)}/{n_workers}");
         }
     }
6 changes: 3 additions & 3 deletions app/src/components/dashboard/dashboard.jac
@@ -29,13 +29,13 @@ glob expand = True;

 can dashboard {
     :g: expand ;
-
+
     if st.session_state.get("current_hv_config", None) {
         status_indicator();
-
+
         # chart_type = st.selectbox("Select a chart type:", ("Area Chart", "Bar Chart", "Line Chart", "Altair Chart", "Plotly Figure", "Heat Map","Stacked Bar Chart","Histogram"));
         chart_type = st.selectbox("Select a chart type:", ("Disribution Plot", "Heat Map", "Stacked Bar Chart", "Histogram"));
-
+
         # Conditional rendering based on the dropdown selection
         if chart_type == "Area Chart" {
             area_chart();
14 changes: 7 additions & 7 deletions app/src/components/dashboard/plot_utils.jac
@@ -8,7 +8,7 @@ import:py json;

 can generate_stacked_bar_chart(model_performance: dict, criteria: list) {
     df_data = [];
-
+
     for (model, crits) in model_performance.items() {
         for (crit, counts) in crits.items() {
             df_data.append({"model": model, "criterion": crit, "wins": counts["wins"], "ties": counts["ties"], "losses": counts["losses"]});
@@ -23,15 +23,15 @@ can generate_stacked_bar_chart(model_performance: dict, criteria: list) {
     }
     total_fig_height = fig_height_per_row * len(criteria);
     colors = {"wins": "green", "ties": "orange", "losses": "red"};
-
+
     for (i, criterion) in enumerate(criteria) {
         criterion_data = df[df["criterion"] == criterion].sort_values("wins", ascending=False);
         for (j, outcome) in enumerate(["wins", "ties", "losses"]) {
             fig.add_trace(go.Bar(x=criterion_data["model"], y=criterion_data[outcome], text=criterion_data[outcome], textposition="auto", name=outcome.capitalize(), marker_color=colors[outcome], showlegend=(i == 0)), row=i + 1, col=1);
         }
     }
     fig.update_layout(barmode="stack", title="Model Performance by Criterion", height=total_fig_height);
-
+
     for (i, criterion) in enumerate(criteria) {
         fig.update_yaxes(title_text=criterion.capitalize(), row=i + 1, col=1, title_standoff=25);
     }
@@ -40,7 +40,7 @@ can generate_stacked_bar_chart(model_performance: dict, criteria: list) {

 can generate_heatmaps(placeholder: str, model_performance: dict, preference_matrix: dict, all_models: list, criteria: list) {
     rows_list = [];
-
+
     for (model, crit_dict) in model_performance.items() {
         for (crit, win_tie_loss) in crit_dict.items() {
             rows_list.append({"model": model, "criterion": crit, "wins": win_tie_loss['wins'], "ties": win_tie_loss['ties'], "losses": win_tie_loss['losses']});
@@ -50,21 +50,21 @@ can generate_heatmaps(placeholder: str, model_performance: dict, preference_matr
     global_max = df[["wins", "ties", "losses"]].max().max();
     global_min = df[["wins", "ties", "losses"]].min().max();
     subplot_titles = [];
-
+
     for crit in criteria {
         subplot_titles.append("Heatmap of Wins for " + crit);
         subplot_titles.append("Heatmap of Total Wins for " + crit);
     }
     fig = make_subplots(rows=len(criteria), cols=2, subplot_titles=subplot_titles, horizontal_spacing=0.15, specs=[[{}, {}] for _ in range(len(criteria))]);
     global_individual_max = 0;
-
+
     for criterion in criteria {
         for model in all_models {
             max_wins = max(preference_matrix[criterion][model].values());
             global_individual_max = max(global_individual_max, max_wins);
         }
     }
-
+
     for (i, criterion) in enumerate(criteria) {
         can model_sorting_criteria(model: str) {
             total = sum(preference_matrix[criterion][model].values());
2 changes: 1 addition & 1 deletion app/src/components/generator/generator.jac
@@ -20,7 +20,7 @@ can generator {
     }
     st.header('Response Generator');
     st.caption("This helps you to generate the necessary response for the "
-               "given prompttemplate with given values for all the selected SLMs and "
+               "given prompttemplate with given values for all the selected SLMs and "
               "propierity LLMs.");
     (selected_models, n_samples, temp) = model_settings();
     (prompt_template, arguments) = prompt_settings();
2 changes: 1 addition & 1 deletion app/src/components/theme.jac
@@ -22,7 +22,7 @@ can initPage(page_title: str) -> None {
     favicon = Image.open(os.path.join(dir_root, "../assets/favicon.png"));
     st.set_page_config(page_title=page_title, page_icon=favicon);
     local_css(os.path.join(dir_root, "../assets/style.css"));
-
+
     with open(os.path.join(dir_root, "../assets/theme.html")) as f {
         st.html(f.read(), height=80);
     }
18 changes: 9 additions & 9 deletions app/src/components/utils.jac
@@ -18,7 +18,7 @@ can call_action(action: str, **kwargs: dict) -> dict {

 '''Check the Status of the Query Engine.'''
 can check_query_engine -> bool {
-
+
     try {
         ret = requests.get(ACTION_SERVER_URL);
         return ret.status_code == 200;
@@ -29,7 +29,7 @@ can check_query_engine -> bool {

 '''Check the Status of the Ollama Server.'''
 can check_ollama_server -> bool {
-
+
     try {
         ret = requests.get(OLLAMA_SERVER_URL);
         return ret.status_code == 200;
@@ -59,7 +59,7 @@ check_engine_status
 '''Load the Model in the Query Engine.'''
 can load_engine(provider_name: str, model_name: str, temperature: float, prompt_template: str) -> bool {
     config = {"provider_name": provider_name, "model_name": model_name, "temperature": temperature, "prompt_template": prompt_template};
-
+
     try {
         call_action(action="load_engine", config=config);
         return True;
@@ -73,7 +73,7 @@ can load_engine(provider_name: str, model_name: str, temperature: float, prompt_
 can run_inference(model_name: str, num_samples: int, payload: dict) -> dict {
     outputs = [];
     full_prompt = None;
-
+
     for i in range(num_samples) {
         try {
             start_time = time.time();
@@ -99,7 +99,7 @@ can map_prompt_names_to_ids(prompt_data_dir: str, prompt_info_file: str) {
     }
     prompt_info = [{prompt["prompt_id"]: prompt["prompt"]} for prompt in prompt_info];
     prompt_ids = {};
-
+
     for filename in os.listdir(prompt_data_dir) {
         use_case_name = "_".join(filename.split(".")[0].split("_")[:-1]);
         file_path = os.path.join(prompt_data_dir, filename);
@@ -122,7 +122,7 @@ can generate_performance_data(formatted_output: dict, all_models: list, prompt_i
     }
     model_performance = {model: {criterion: {"wins": 0, "ties": 0, "losses": 0} for criterion in criteria} for model in all_models};
     preference_matrix = {criterion: {model: {other_model: 0 for other_model in all_models if other_model != model} for model in all_models} for criterion in criteria};
-
+
     for outputs in formatted_output {
         if (prompt_id == "all_combined"
             or outputs["prompt_id"] == prompt_id) {
@@ -140,7 +140,7 @@ can generate_performance_data(formatted_output: dict, all_models: list, prompt_i
                     model_performance[model2][crit]["wins"]+=1;
                     preference_matrix[crit][model2][model1]+=1;
                 } else { # Ties
-
+
                     model_performance[model1][crit]["ties"]+=1;
                     model_performance[model2][crit]["ties"]+=1;
                 }
@@ -160,7 +160,7 @@ can format_responses_by_prompt(workers_data_dir: str, distribution_file: str, re
     with open(response_file, "r") as file {
         all_responses = json.load(file);
     }
-
+
     for filename in os.listdir(workers_data_dir) {
         file_path = os.path.join(workers_data_dir, filename);
         if (os.path.isfile(file_path)) {
@@ -213,7 +213,7 @@ can convert_run(run: str) {
     os.makedirs("data", exist_ok=True);
     responses_files = [f for f in os.listdir(os.path.join(os.path.abspath("runs"), run)) if f.endswith(".json")];
     data = {"run": run, "prompt_disc": None, "outputs": {}};
-
+
     for responses_file in responses_files {
         with open(os.path.join(os.path.abspath("runs"), run, responses_file), "r") as f {
             model_data = json.load(f);
2 changes: 1 addition & 1 deletion app/src/tests/test_dashboard.jac
@@ -43,7 +43,7 @@ import:jac from helpers, get_item_by_label;
 # app.session_state.admin_privileges = True;
 # app.run();
 # dashboard_tab = get_item_by_label(app, "tab", "Dashboard");
-
+
 # # Assert error messages for missing configuration and results
 # assert (dashboard_tab.error[0].value == "Human Evaluation config was not found. Initialize a Human Evaluation first.");
 # assert (dashboard_tab.error[1].value == "Results were not found. Initialize a Human Evaluation first. If Initiated already, wait until the results are ready.");
4 changes: 2 additions & 2 deletions app/src/tests/test_generator.jac
@@ -15,7 +15,7 @@ test app_running {

 # test run_query_engine {
 # :g: query_engine ;
-
+
 # query_engine = subprocess.Popen(["jac", "run", "src/query_engine.jac"]);
 # time.sleep(10);
 # response = requests.get("http://localhost:8000");
@@ -25,7 +25,7 @@

 # test run_ollama_server {
 # :g: ollama_server ;
-
+
 # ollama_server = subprocess.Popen(["ollama", "serve"]);
 # time.sleep(10);
 # response = requests.get("http://localhost:11434");
10 changes: 5 additions & 5 deletions app/src/tests/test_human_eval.jac
@@ -19,7 +19,7 @@ test human_eval_without_config {
 test human_eval_with_config {
     shutil.unpack_archive(os.path.join(os.path.dirname(__file__), "fixtures", "config.zip"), ".");
     :g: app ;
-
+
     app = AppTest.from_file("app.py").run(timeout=20);
     assert not app.exception;
     assert not app.error;
@@ -29,27 +29,27 @@ test human_eval_with_config {
 test human_eval_config_ab_testing {
     shutil.unpack_archive(os.path.join(os.path.dirname(__file__), "fixtures", "config.zip"), ".");
     :g: app ;
-
+
     app = AppTest.from_file("app.py").run(timeout=20);
     get_item_by_label(app, "text_input", "Worker ID").set_value("test_worker_id").run();
     assert os.path.exists(os.path.join(".human_eval_config", "worker_count.txt"));
     assert not app.exception;
     assert app.session_state.worker_id == "test_worker_id";
-
+
     # TODO: Perform one evaluation
     shutil.rmtree(".human_eval_config");
 }

 test human_eval_config_criteria {
     shutil.unpack_archive(os.path.join(os.path.dirname(__file__), "fixtures", "criteria_config.zip"), ".");
     :g: app ;
-
+
     app = AppTest.from_file("app.py").run(timeout=20);
     get_item_by_label(app, "text_input", "Worker ID").set_value("test_worker_id").run();
     assert os.path.exists(os.path.join(".human_eval_config", "worker_count.txt"));
     assert not app.exception;
     assert app.session_state.worker_id == "test_worker_id";
-
+
     # TODO: Perform one evaluation
     shutil.rmtree(".human_eval_config");
 }
8 changes: 4 additions & 4 deletions app/src/tests/test_llm_as_evaluator.jac
@@ -32,14 +32,14 @@ test app_running {
 # test llm_as_evaluator_ab_testing {
 # shutil.unpack_archive(os.path.join(os.path.dirname(__file__), "fixtures", "config.zip"), ".human_eval_config");
 # :g: app ;
-
+
 # app = AppTest.from_file("app.py").run(timeout=20);
 # app.session_state.admin_privileges = True;
 # app.run();
 # assert not app.exception;
 # llm_as_evaluator_tab = get_item_by_label(app, "tab", "LLM as Evaluator");
 # assert not llm_as_evaluator_tab.error;
-
+
 # # TODO: Run the LLM as Evaluator
 # shutil.rmtree(".human_eval_config");
 # shutil.rmtree("runs");
@@ -48,14 +48,14 @@ test app_running {
 # test llm_as_evaluator_criteria {
 # shutil.unpack_archive(os.path.join(os.path.dirname(__file__), "fixtures", "criteria_config.zip"), ".human_eval_config");
 # :g: app ;
-
+
 # app = AppTest.from_file("app.py").run(timeout=20);
 # app.session_state.admin_privileges = True;
 # app.run();
 # assert not app.exception;
 # llm_as_evaluator_tab = get_item_by_label(app, "tab", "LLM as Evaluator");
 # assert not llm_as_evaluator_tab.error;
-
+
 # # TODO: Run the LLM as Evaluator
 # shutil.rmtree(".human_eval_config");
 # shutil.rmtree("runs");
2 changes: 1 addition & 1 deletion app/src/tests/test_setup.jac
@@ -43,7 +43,7 @@ test setup_humnan_eval_ab_testing {
     setup_tab.text_area("city_name_responses.json_prompt_simple_disc").set_value("This is a new simple prompt description");
     get_item_by_label(setup_tab, "button", "Create Evaluation Configuration").set_value(True).run();
     assert not app.exception;
-
+
     assert os.path.exists(".human_eval_config");
     assert os.path.exists(os.path.join(".human_eval_config", "config.json"));
     assert os.path.exists(os.path.join(".human_eval_config", "distribution.json"));
4 changes: 2 additions & 2 deletions engine/src/query_engine.test.jac
@@ -4,7 +4,7 @@ import:py requests;

 test run_query_engine {
     :g: query_engine ;
-
+
     query_engine = subprocess.Popen(["jac", "run", "src/query_engine.jac"]);
     time.sleep(10);
     response = requests.get("http://localhost:8000");
@@ -14,7 +14,7 @@ test run_query_engine {

 test run_ollama_server {
     :g: ollama_server ;
-
+
     ollama_server = subprocess.Popen(["ollama", "serve"]);
     time.sleep(10);
     response = requests.get("http://localhost:11434");