From 92b98a6d78d02e46c3a4749ccc337f2844da29b4 Mon Sep 17 00:00:00 2001 From: Ian Date: Mon, 2 Oct 2023 17:39:22 -0700 Subject: [PATCH 1/8] allow rows outputs per metric type --- evaluation/steps/run_catwalk.py | 51 +++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py index 98ba5ff6e..7ba489284 100644 --- a/evaluation/steps/run_catwalk.py +++ b/evaluation/steps/run_catwalk.py @@ -257,12 +257,57 @@ def run( return tsv_outputs - def _write_to_gsheet(self, gsheet: str, rows: List[Dict]): + def _write_to_gsheet(self, gsheet: str, rows: List[Dict], sheet_title: str = "Sheet1"): import pygsheets + # make rows into dataframe + new_df = pd.DataFrame(rows) + client = pygsheets.authorize(service_account_json=os.environ["GDRIVE_SERVICE_ACCOUNT_JSON"]) sheet = client.open(gsheet) - worksheet = sheet[0] # TODO: pass in sheet title, etc. + + # make sheet if doesn't exist + if sheet_title in [s.title for s in sheet.worksheets()]: + worksheet = sheet.worksheet_by_title(sheet_title) + else: + sheet.add_worksheet(rows=new_df.shape[0], cols=new_df.shape[1], title=sheet_title) + worksheet = sheet.worksheet_by_title(sheet_title) current_df = worksheet.get_as_df() - new_df = pd.concat([current_df, pd.DataFrame(rows)]) + current_df = worksheet.get_as_df() + new_df = pd.concat([current_df, new_df]) worksheet.set_dataframe(new_df, (1, 1), nan="") + +@Step.register("write-outputs-as-rows-multiple-metrics") +class WriteOutputsAsRows(WriteOutputsAsRows): + VERSION = "001" + + def run( + self, models: List[str], outputs: List[Dict], prediction_kwargs: List[Dict], gsheet: Optional[str] = None + ) -> List: + per_metric_type_tsv_outputs = {} + for idx, d in enumerate(outputs): + model = models[idx] + pred_kwargs = copy.deepcopy(DEFAULT_PREDICTION_KWARGS) + pred_kwargs.update(prediction_kwargs[idx]) + tsv_outputs = [] + for metric_type_name, metrics_dict in d["metrics"].items(): + row = {} + row["date"] = datetime.now(tz=pytz.utc).strftime("%Y-%m-%d %H:%M:%S UTC") + row["model"] = model + row["full_model"] = f"lm::pretrained={model}" + row["task"] = d["task"] + row["processing_time"] = d["processing_time"] + row["num_instances"] = d["num_instances"] + row["tango_workspace"] = self.workspace.url + row["tango_step"] = self.unique_id + for metric_name in metrics_dict: + row[metric_name] = metrics_dict[metric_name] + + row.update(pred_kwargs) + per_metric_type_tsv_outputs[metric_type_name] = per_metric_type_tsv_outputs.get(metric_type_name, []) + [row] + + if gsheet: + for metric_type_name, tsv_outputs in per_metric_type_tsv_outputs.items(): + self._write_to_gsheet(gsheet, tsv_outputs, sheet_title=metric_type_name) + + return per_metric_type_tsv_outputs From 4036fca9c978123520e6a66ee7c1ba78b48b166b Mon Sep 17 00:00:00 2001 From: Ian Date: Mon, 2 Oct 2023 19:10:26 -0700 Subject: [PATCH 2/8] minimal subdomains create_fine_grained_pipeline --- evaluation/experiments/utils.libsonnet | 65 +++++++++++++++++++++++++- evaluation/steps/run_catwalk.py | 40 +++++++++++++++- 2 files changed, 103 insertions(+), 2 deletions(-) diff --git a/evaluation/experiments/utils.libsonnet b/evaluation/experiments/utils.libsonnet index bce239d4a..4ad4a069a 100644 --- a/evaluation/experiments/utils.libsonnet +++ b/evaluation/experiments/utils.libsonnet @@ -102,6 +102,7 @@ local outputs_step_name(config) = config.task_name + std.get(config.task_kwargs, "task_rename", ""); local outputs_ref(config) = {type: "ref", ref: outputs_step_name(config)}; +local processed_outputs_ref(config) = {type: "ref", ref: "processed_" + outputs_step_name(config)}; local create_outputs_steps(model_task_configs) = std.foldl( function(x, config) x + { @@ -119,12 +120,30 @@ local create_outputs_steps(model_task_configs) = std.foldl( {} ); +local create_process_outputs_steps(model_task_configs) = std.foldl( + function(x, config) x + { + ["processed_" + outputs_step_name(config)]: { + type: "process-outputs", + outputs: outputs_ref(config), + step_resources: { + gpu_count: 0 + } + }, + }, + model_task_configs, + {} +); local all_outputs(model_task_configs) = [ outputs_ref(config) for config in model_task_configs ]; +local all_processed_outputs(model_task_configs) = [ + processed_outputs_ref(config) + for config in model_task_configs +]; + local all_pred_kwargs(model_task_configs) = [ config.prediction_kwargs for config in model_task_configs @@ -149,6 +168,20 @@ local create_outputs_as_rows_steps(model_task_configs, gsheet) = } }; +local create_processed_outputs_as_rows_multiple_metrics_steps(model_task_configs, gsheet) = + { + "combine-all-outputs": { + type: "write-outputs-as-rows-multiple-metrics", + outputs: all_processed_outputs(model_task_configs), + models: all_models(model_task_configs), + prediction_kwargs: all_pred_kwargs(model_task_configs), + gsheet: gsheet, + step_resources: { + gpu_count: 0 + } + } + }; + local create_pipeline(models, task_sets, gsheet) = // Model steps @@ -175,9 +208,39 @@ local create_pipeline(models, task_sets, gsheet) = all_steps; +local create_fine_grained_pipeline(models, task_sets, gsheet) = + + // Model steps + local model_location_steps = create_model_location_steps(models); + local catwalk_model_steps = create_catwalk_model_steps(models); + + // Task steps + local task_configs = flatten_task_sets(task_sets); + local task_steps = create_task_steps(task_configs); + + // Prediction and metrics + local model_task_configs = model_task_cross_product(models, task_configs); + local outputs_steps = create_outputs_steps(model_task_configs); + + local processed_outputs_steps = create_process_outputs_steps(model_task_configs); + + // Aggregate results for each task set and model combination + local combine_all_outputs = create_processed_outputs_as_rows_multiple_metrics_steps(model_task_configs, gsheet); + + local all_steps = + model_location_steps + + catwalk_model_steps + + task_steps + + outputs_steps + + processed_outputs_steps + + combine_all_outputs; + + all_steps; + { - create_pipeline: create_pipeline + create_pipeline: create_pipeline, + create_fine_grained_pipeline: create_fine_grained_pipeline } /*local wandb_log_step = { diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py index 7ba489284..315d2a366 100644 --- a/evaluation/steps/run_catwalk.py +++ b/evaluation/steps/run_catwalk.py @@ -6,6 +6,7 @@ from pydoc import locate from typing import Any, Dict, List, Optional +import numpy as np import pandas as pd import pytz from catwalk.dependencies.lm_eval.utils import simple_parse_args_string @@ -128,6 +129,40 @@ def run( } +@Step.register("process-outputs") +class ProcessOutputs(Step): + VERSION = "002" + + def run( + self, + outputs: Dict, + **kwargs, + ) -> Dict: + task_name = outputs["task"] + new_metrics = {} + if "subdomain" in outputs["instance_predictions"][0]["instance"]: + new_metrics[f"ppl_token_{task_name}_subdomains"] = {} + sum_logits = {} + num_tokens = {} + for instance_prediction in outputs["instance_predictions"]: + subdomain = instance_prediction["instance"]["subdomain"] + sum_logits[subdomain] = ( + sum_logits.get(subdomain, 0) + instance_prediction["prediction"]["model_output"]["sum_logits"] + ) + num_tokens[subdomain] = ( + num_tokens.get(subdomain, 0) + instance_prediction["prediction"]["model_output"]["num_tokens"] + ) + + for subdomain in sum_logits: + new_metrics[f"ppl_token_{task_name}_subdomains"][subdomain] = np.exp( + -sum_logits[subdomain] / num_tokens[subdomain] + ) + + outputs["metrics"].update(new_metrics) + + return outputs + + @Step.register("predict-and-calculate-metrics") class PredictAndCalculateMetricsStep(Step): VERSION = "003" @@ -277,6 +312,7 @@ def _write_to_gsheet(self, gsheet: str, rows: List[Dict], sheet_title: str = "Sh new_df = pd.concat([current_df, new_df]) worksheet.set_dataframe(new_df, (1, 1), nan="") + @Step.register("write-outputs-as-rows-multiple-metrics") class WriteOutputsAsRows(WriteOutputsAsRows): VERSION = "001" @@ -304,7 +340,9 @@ def run( row[metric_name] = metrics_dict[metric_name] row.update(pred_kwargs) - per_metric_type_tsv_outputs[metric_type_name] = per_metric_type_tsv_outputs.get(metric_type_name, []) + [row] + per_metric_type_tsv_outputs[metric_type_name] = per_metric_type_tsv_outputs.get( + metric_type_name, [] + ) + [row] if gsheet: for metric_type_name, tsv_outputs in per_metric_type_tsv_outputs.items(): From 83cf0c3d4fae38dfa44f555c0f05b58afbd2c088 Mon Sep 17 00:00:00 2001 From: Ian Date: Mon, 2 Oct 2023 19:29:46 -0700 Subject: [PATCH 3/8] type hints grumble grumble --- evaluation/steps/run_catwalk.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py index 315d2a366..e29a7e5d7 100644 --- a/evaluation/steps/run_catwalk.py +++ b/evaluation/steps/run_catwalk.py @@ -141,9 +141,9 @@ def run( task_name = outputs["task"] new_metrics = {} if "subdomain" in outputs["instance_predictions"][0]["instance"]: - new_metrics[f"ppl_token_{task_name}_subdomains"] = {} - sum_logits = {} - num_tokens = {} + new_metrics[f"ppl_token_{task_name}_subdomains"]: Dict[str, float] = {} + sum_logits: Dict[str, float] = {} + num_tokens: Dict[str, int] = {} for instance_prediction in outputs["instance_predictions"]: subdomain = instance_prediction["instance"]["subdomain"] sum_logits[subdomain] = ( @@ -314,18 +314,18 @@ def _write_to_gsheet(self, gsheet: str, rows: List[Dict], sheet_title: str = "Sh @Step.register("write-outputs-as-rows-multiple-metrics") -class WriteOutputsAsRows(WriteOutputsAsRows): +class WriteOutputsAsRowsMultipleMetrics(WriteOutputsAsRows): VERSION = "001" def run( self, models: List[str], outputs: List[Dict], prediction_kwargs: List[Dict], gsheet: Optional[str] = None - ) -> List: - per_metric_type_tsv_outputs = {} + ) -> Dict[str, List[Dict]]: + per_metric_type_tsv_outputs: Dict[str, List[Dict]] = {} for idx, d in enumerate(outputs): model = models[idx] pred_kwargs = copy.deepcopy(DEFAULT_PREDICTION_KWARGS) pred_kwargs.update(prediction_kwargs[idx]) - tsv_outputs = [] + tsv_outputs: List[Dict] = [] for metric_type_name, metrics_dict in d["metrics"].items(): row = {} row["date"] = datetime.now(tz=pytz.utc).strftime("%Y-%m-%d %H:%M:%S UTC") From 859108b78a3d4d553acc8a15ecfd4844e106dac4 Mon Sep 17 00:00:00 2001 From: Ian Date: Mon, 2 Oct 2023 19:35:54 -0700 Subject: [PATCH 4/8] ugh mypy! --- evaluation/steps/run_catwalk.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py index e29a7e5d7..bbbba6a7b 100644 --- a/evaluation/steps/run_catwalk.py +++ b/evaluation/steps/run_catwalk.py @@ -139,9 +139,9 @@ def run( **kwargs, ) -> Dict: task_name = outputs["task"] - new_metrics = {} + new_metrics: Dict[str, float] = {} if "subdomain" in outputs["instance_predictions"][0]["instance"]: - new_metrics[f"ppl_token_{task_name}_subdomains"]: Dict[str, float] = {} + new_metrics[f"ppl_token_{task_name}_subdomains"] = {} sum_logits: Dict[str, float] = {} num_tokens: Dict[str, int] = {} for instance_prediction in outputs["instance_predictions"]: @@ -319,7 +319,7 @@ class WriteOutputsAsRowsMultipleMetrics(WriteOutputsAsRows): def run( self, models: List[str], outputs: List[Dict], prediction_kwargs: List[Dict], gsheet: Optional[str] = None - ) -> Dict[str, List[Dict]]: + ) -> Dict[str, List[Dict]]: # type: ignore per_metric_type_tsv_outputs: Dict[str, List[Dict]] = {} for idx, d in enumerate(outputs): model = models[idx] From c1c137bbd9a4e2127f9ec778b8d05b0d2fc0b377 Mon Sep 17 00:00:00 2001 From: Ian Date: Mon, 2 Oct 2023 19:39:30 -0700 Subject: [PATCH 5/8] more mypy --- evaluation/steps/run_catwalk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py index bbbba6a7b..92ca19ccd 100644 --- a/evaluation/steps/run_catwalk.py +++ b/evaluation/steps/run_catwalk.py @@ -139,7 +139,7 @@ def run( **kwargs, ) -> Dict: task_name = outputs["task"] - new_metrics: Dict[str, float] = {} + new_metrics: Dict[str, Dict] = {} if "subdomain" in outputs["instance_predictions"][0]["instance"]: new_metrics[f"ppl_token_{task_name}_subdomains"] = {} sum_logits: Dict[str, float] = {} From b1e8ffe7d1abf0dbf408288b1620c438314da495 Mon Sep 17 00:00:00 2001 From: Ian Date: Fri, 6 Oct 2023 12:07:09 -0700 Subject: [PATCH 6/8] turn write_to_gsheet into its own function --- evaluation/steps/run_catwalk.py | 47 ++++++++++++++++----------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py index 90c6a036a..db041c1b4 100644 --- a/evaluation/steps/run_catwalk.py +++ b/evaluation/steps/run_catwalk.py @@ -301,38 +301,18 @@ def run( tsv_outputs.append(row) if gsheet: - self._write_to_gsheet(gsheet, tsv_outputs) + write_to_gsheet(gsheet, tsv_outputs) return tsv_outputs - def _write_to_gsheet(self, gsheet: str, rows: List[Dict], sheet_title: str = "Sheet1"): - import pygsheets - - # make rows into dataframe - new_df = pd.DataFrame(rows) - - client = pygsheets.authorize(service_account_json=os.environ["GDRIVE_SERVICE_ACCOUNT_JSON"]) - sheet = client.open(gsheet) - - # make sheet if doesn't exist - if sheet_title in [s.title for s in sheet.worksheets()]: - worksheet = sheet.worksheet_by_title(sheet_title) - else: - sheet.add_worksheet(rows=new_df.shape[0], cols=new_df.shape[1], title=sheet_title) - worksheet = sheet.worksheet_by_title(sheet_title) - current_df = worksheet.get_as_df() - current_df = worksheet.get_as_df() - new_df = pd.concat([current_df, new_df]) - worksheet.set_dataframe(new_df, (1, 1), nan="") - @Step.register("write-outputs-as-rows-multiple-metrics") -class WriteOutputsAsRowsMultipleMetrics(WriteOutputsAsRows): +class WriteOutputsAsRowsMultipleMetrics(Step): VERSION = "001" def run( self, models: List[str], outputs: List[Dict], prediction_kwargs: List[Dict], gsheet: Optional[str] = None - ) -> Dict[str, List[Dict]]: # type: ignore + ) -> Dict[str, List[Dict]]: per_metric_type_tsv_outputs: Dict[str, List[Dict]] = {} for idx, d in enumerate(outputs): model = models[idx] @@ -359,6 +339,25 @@ def run( if gsheet: for metric_type_name, tsv_outputs in per_metric_type_tsv_outputs.items(): - self._write_to_gsheet(gsheet, tsv_outputs, sheet_title=metric_type_name) + write_to_gsheet(gsheet, tsv_outputs, sheet_title=metric_type_name) return per_metric_type_tsv_outputs + +def write_to_gsheet(gsheet: str, rows: List[Dict], sheet_title: str = "Sheet1"): + import pygsheets + + # make rows into dataframe + new_df = pd.DataFrame(rows) + + client = pygsheets.authorize(service_account_json=os.environ["GDRIVE_SERVICE_ACCOUNT_JSON"]) + sheet = client.open(gsheet) + + # make sheet if doesn't exist + if sheet_title in [s.title for s in sheet.worksheets()]: + worksheet = sheet.worksheet_by_title(sheet_title) + else: + sheet.add_worksheet(rows=new_df.shape[0], cols=new_df.shape[1], title=sheet_title) + worksheet = sheet.worksheet_by_title(sheet_title) + current_df = worksheet.get_as_df() + new_df = pd.concat([current_df, new_df]) + worksheet.set_dataframe(new_df, (1, 1), nan="") \ No newline at end of file From 6bdd5c136fe2be1d7ceb4ba6fbfa5f46698117e8 Mon Sep 17 00:00:00 2001 From: Ian Date: Fri, 6 Oct 2023 12:10:43 -0700 Subject: [PATCH 7/8] style fixes --- evaluation/steps/run_catwalk.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py index db041c1b4..90214371a 100644 --- a/evaluation/steps/run_catwalk.py +++ b/evaluation/steps/run_catwalk.py @@ -343,6 +343,7 @@ def run( return per_metric_type_tsv_outputs + def write_to_gsheet(gsheet: str, rows: List[Dict], sheet_title: str = "Sheet1"): import pygsheets @@ -360,4 +361,4 @@ def write_to_gsheet(gsheet: str, rows: List[Dict], sheet_title: str = "Sheet1"): worksheet = sheet.worksheet_by_title(sheet_title) current_df = worksheet.get_as_df() new_df = pd.concat([current_df, new_df]) - worksheet.set_dataframe(new_df, (1, 1), nan="") \ No newline at end of file + worksheet.set_dataframe(new_df, (1, 1), nan="") From 56d708abbffa91867a2a02e87d0ce25c413c656a Mon Sep 17 00:00:00 2001 From: Ian Date: Fri, 6 Oct 2023 13:19:20 -0700 Subject: [PATCH 8/8] add support for model_kwargs to fine-grained --- evaluation/steps/run_catwalk.py | 1 + 1 file changed, 1 insertion(+) diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py index d018441e8..ccea68096 100644 --- a/evaluation/steps/run_catwalk.py +++ b/evaluation/steps/run_catwalk.py @@ -328,6 +328,7 @@ def run( row = {} row["date"] = datetime.now(tz=pytz.utc).strftime("%Y-%m-%d %H:%M:%S UTC") row["model"] = model + row["model_kwargs"] = d["model_kwargs"] row["full_model"] = f"lm::pretrained={model}" row["task"] = d["task"] row["processing_time"] = d["processing_time"]