From 92b98a6d78d02e46c3a4749ccc337f2844da29b4 Mon Sep 17 00:00:00 2001
From: Ian <magnusson.i@northeastern.edu>
Date: Mon, 2 Oct 2023 17:39:22 -0700
Subject: [PATCH 1/8] allow row outputs per metric type

---
 evaluation/steps/run_catwalk.py | 51 +++++++++++++++++++++++++++++++--
 1 file changed, 48 insertions(+), 3 deletions(-)

diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py
index 98ba5ff6e..7ba489284 100644
--- a/evaluation/steps/run_catwalk.py
+++ b/evaluation/steps/run_catwalk.py
@@ -257,12 +257,57 @@ def run(
 
         return tsv_outputs
 
-    def _write_to_gsheet(self, gsheet: str, rows: List[Dict]):
+    def _write_to_gsheet(self, gsheet: str, rows: List[Dict], sheet_title: str = "Sheet1"):
         import pygsheets
 
+        # make rows into dataframe
+        new_df = pd.DataFrame(rows)
+
         client = pygsheets.authorize(service_account_json=os.environ["GDRIVE_SERVICE_ACCOUNT_JSON"])
         sheet = client.open(gsheet)
-        worksheet = sheet[0]  # TODO: pass in sheet title, etc.
+
+        # make sheet if doesn't exist
+        if sheet_title in [s.title for s in sheet.worksheets()]:
+            worksheet = sheet.worksheet_by_title(sheet_title)
+        else:
+            sheet.add_worksheet(rows=new_df.shape[0], cols=new_df.shape[1], title=sheet_title)
+            worksheet = sheet.worksheet_by_title(sheet_title)
         current_df = worksheet.get_as_df()
-        new_df = pd.concat([current_df, pd.DataFrame(rows)])
+        current_df = worksheet.get_as_df()
+        new_df = pd.concat([current_df, new_df])
         worksheet.set_dataframe(new_df, (1, 1), nan="")
+
+@Step.register("write-outputs-as-rows-multiple-metrics")
+class WriteOutputsAsRows(WriteOutputsAsRows):
+    VERSION = "001"
+
+    def run(
+        self, models: List[str], outputs: List[Dict], prediction_kwargs: List[Dict], gsheet: Optional[str] = None
+    ) -> List:
+        per_metric_type_tsv_outputs = {}
+        for idx, d in enumerate(outputs):
+            model = models[idx]
+            pred_kwargs = copy.deepcopy(DEFAULT_PREDICTION_KWARGS)
+            pred_kwargs.update(prediction_kwargs[idx])
+            tsv_outputs = []
+            for metric_type_name, metrics_dict in d["metrics"].items():
+                row = {}
+                row["date"] = datetime.now(tz=pytz.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
+                row["model"] = model
+                row["full_model"] = f"lm::pretrained={model}"
+                row["task"] = d["task"]
+                row["processing_time"] = d["processing_time"]
+                row["num_instances"] = d["num_instances"]
+                row["tango_workspace"] = self.workspace.url
+                row["tango_step"] = self.unique_id
+                for metric_name in metrics_dict:
+                    row[metric_name] = metrics_dict[metric_name]
+
+                row.update(pred_kwargs)
+                per_metric_type_tsv_outputs[metric_type_name] = per_metric_type_tsv_outputs.get(metric_type_name, []) + [row]
+
+        if gsheet:
+            for metric_type_name, tsv_outputs in per_metric_type_tsv_outputs.items():
+                self._write_to_gsheet(gsheet, tsv_outputs, sheet_title=metric_type_name)
+
+        return per_metric_type_tsv_outputs

From 4036fca9c978123520e6a66ee7c1ba78b48b166b Mon Sep 17 00:00:00 2001
From: Ian <magnusson.i@northeastern.edu>
Date: Mon, 2 Oct 2023 19:10:26 -0700
Subject: [PATCH 2/8] add minimal per-subdomain metrics via create_fine_grained_pipeline

---
 evaluation/experiments/utils.libsonnet | 65 +++++++++++++++++++++++++-
 evaluation/steps/run_catwalk.py        | 40 +++++++++++++++-
 2 files changed, 103 insertions(+), 2 deletions(-)

diff --git a/evaluation/experiments/utils.libsonnet b/evaluation/experiments/utils.libsonnet
index bce239d4a..4ad4a069a 100644
--- a/evaluation/experiments/utils.libsonnet
+++ b/evaluation/experiments/utils.libsonnet
@@ -102,6 +102,7 @@ local outputs_step_name(config) =
     config.task_name + std.get(config.task_kwargs, "task_rename", "");
 
 local outputs_ref(config) = {type: "ref", ref: outputs_step_name(config)};
+local processed_outputs_ref(config) = {type: "ref", ref: "processed_" + outputs_step_name(config)};
 
 local create_outputs_steps(model_task_configs) = std.foldl(
     function(x, config) x + {
@@ -119,12 +120,30 @@ local create_outputs_steps(model_task_configs) = std.foldl(
     {}
 );
 
+local create_process_outputs_steps(model_task_configs) = std.foldl(
+    function(x, config) x + {
+        ["processed_" + outputs_step_name(config)]: {
+            type: "process-outputs",
+            outputs: outputs_ref(config),
+            step_resources: {
+                gpu_count: 0
+            }
+        },
+    },
+    model_task_configs,
+    {}
+);
 
 local all_outputs(model_task_configs) = [
     outputs_ref(config)
     for config in model_task_configs
 ];
 
+local all_processed_outputs(model_task_configs) = [
+    processed_outputs_ref(config)
+    for config in model_task_configs
+];
+
 local all_pred_kwargs(model_task_configs) = [
     config.prediction_kwargs
     for config in model_task_configs
@@ -149,6 +168,20 @@ local create_outputs_as_rows_steps(model_task_configs, gsheet) =
         }
     };
 
+local create_processed_outputs_as_rows_multiple_metrics_steps(model_task_configs, gsheet) =
+    {
+        "combine-all-outputs": {
+            type: "write-outputs-as-rows-multiple-metrics",
+            outputs: all_processed_outputs(model_task_configs),
+            models: all_models(model_task_configs),
+            prediction_kwargs: all_pred_kwargs(model_task_configs),
+            gsheet: gsheet,
+            step_resources: {
+                gpu_count: 0
+            }
+        }
+    };
+
 local create_pipeline(models, task_sets, gsheet) =
 
     // Model steps
@@ -175,9 +208,39 @@ local create_pipeline(models, task_sets, gsheet) =
 
     all_steps;
 
+local create_fine_grained_pipeline(models, task_sets, gsheet) =
+
+    // Model steps
+    local model_location_steps = create_model_location_steps(models);
+    local catwalk_model_steps = create_catwalk_model_steps(models);
+
+    // Task steps
+    local task_configs = flatten_task_sets(task_sets);
+    local task_steps = create_task_steps(task_configs);
+
+    // Prediction and metrics
+    local model_task_configs = model_task_cross_product(models, task_configs);
+    local outputs_steps = create_outputs_steps(model_task_configs);
+
+    local processed_outputs_steps = create_process_outputs_steps(model_task_configs);
+
+    // Aggregate results for each task set and model combination
+    local combine_all_outputs = create_processed_outputs_as_rows_multiple_metrics_steps(model_task_configs, gsheet);
+
+    local all_steps =
+        model_location_steps +
+        catwalk_model_steps +
+        task_steps +
+        outputs_steps +
+        processed_outputs_steps +
+        combine_all_outputs;
+
+    all_steps;
+
 
 {
-    create_pipeline: create_pipeline
+    create_pipeline: create_pipeline,
+    create_fine_grained_pipeline: create_fine_grained_pipeline
 }
 
 /*local wandb_log_step = {
diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py
index 7ba489284..315d2a366 100644
--- a/evaluation/steps/run_catwalk.py
+++ b/evaluation/steps/run_catwalk.py
@@ -6,6 +6,7 @@
 from pydoc import locate
 from typing import Any, Dict, List, Optional
 
+import numpy as np
 import pandas as pd
 import pytz
 from catwalk.dependencies.lm_eval.utils import simple_parse_args_string
@@ -128,6 +129,40 @@ def run(
 }
 
 
+@Step.register("process-outputs")
+class ProcessOutputs(Step):
+    VERSION = "002"
+
+    def run(
+        self,
+        outputs: Dict,
+        **kwargs,
+    ) -> Dict:
+        task_name = outputs["task"]
+        new_metrics = {}
+        if "subdomain" in outputs["instance_predictions"][0]["instance"]:
+            new_metrics[f"ppl_token_{task_name}_subdomains"] = {}
+            sum_logits = {}
+            num_tokens = {}
+            for instance_prediction in outputs["instance_predictions"]:
+                subdomain = instance_prediction["instance"]["subdomain"]
+                sum_logits[subdomain] = (
+                    sum_logits.get(subdomain, 0) + instance_prediction["prediction"]["model_output"]["sum_logits"]
+                )
+                num_tokens[subdomain] = (
+                    num_tokens.get(subdomain, 0) + instance_prediction["prediction"]["model_output"]["num_tokens"]
+                )
+
+            for subdomain in sum_logits:
+                new_metrics[f"ppl_token_{task_name}_subdomains"][subdomain] = np.exp(
+                    -sum_logits[subdomain] / num_tokens[subdomain]
+                )
+
+        outputs["metrics"].update(new_metrics)
+
+        return outputs
+
+
 @Step.register("predict-and-calculate-metrics")
 class PredictAndCalculateMetricsStep(Step):
     VERSION = "003"
@@ -277,6 +312,7 @@ def _write_to_gsheet(self, gsheet: str, rows: List[Dict], sheet_title: str = "Sh
         new_df = pd.concat([current_df, new_df])
         worksheet.set_dataframe(new_df, (1, 1), nan="")
 
+
 @Step.register("write-outputs-as-rows-multiple-metrics")
 class WriteOutputsAsRows(WriteOutputsAsRows):
     VERSION = "001"
@@ -304,7 +340,9 @@ def run(
                     row[metric_name] = metrics_dict[metric_name]
 
                 row.update(pred_kwargs)
-                per_metric_type_tsv_outputs[metric_type_name] = per_metric_type_tsv_outputs.get(metric_type_name, []) + [row]
+                per_metric_type_tsv_outputs[metric_type_name] = per_metric_type_tsv_outputs.get(
+                    metric_type_name, []
+                ) + [row]
 
         if gsheet:
             for metric_type_name, tsv_outputs in per_metric_type_tsv_outputs.items():

From 83cf0c3d4fae38dfa44f555c0f05b58afbd2c088 Mon Sep 17 00:00:00 2001
From: Ian <magnusson.i@northeastern.edu>
Date: Mon, 2 Oct 2023 19:29:46 -0700
Subject: [PATCH 3/8] add type hints; rename subclass to WriteOutputsAsRowsMultipleMetrics

---
 evaluation/steps/run_catwalk.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py
index 315d2a366..e29a7e5d7 100644
--- a/evaluation/steps/run_catwalk.py
+++ b/evaluation/steps/run_catwalk.py
@@ -141,9 +141,9 @@ def run(
         task_name = outputs["task"]
         new_metrics = {}
         if "subdomain" in outputs["instance_predictions"][0]["instance"]:
-            new_metrics[f"ppl_token_{task_name}_subdomains"] = {}
-            sum_logits = {}
-            num_tokens = {}
+            new_metrics[f"ppl_token_{task_name}_subdomains"]: Dict[str, float] = {}
+            sum_logits: Dict[str, float] = {}
+            num_tokens: Dict[str, int] = {}
             for instance_prediction in outputs["instance_predictions"]:
                 subdomain = instance_prediction["instance"]["subdomain"]
                 sum_logits[subdomain] = (
@@ -314,18 +314,18 @@ def _write_to_gsheet(self, gsheet: str, rows: List[Dict], sheet_title: str = "Sh
 
 
 @Step.register("write-outputs-as-rows-multiple-metrics")
-class WriteOutputsAsRows(WriteOutputsAsRows):
+class WriteOutputsAsRowsMultipleMetrics(WriteOutputsAsRows):
     VERSION = "001"
 
     def run(
         self, models: List[str], outputs: List[Dict], prediction_kwargs: List[Dict], gsheet: Optional[str] = None
-    ) -> List:
-        per_metric_type_tsv_outputs = {}
+    ) -> Dict[str, List[Dict]]:
+        per_metric_type_tsv_outputs: Dict[str, List[Dict]] = {}
         for idx, d in enumerate(outputs):
             model = models[idx]
             pred_kwargs = copy.deepcopy(DEFAULT_PREDICTION_KWARGS)
             pred_kwargs.update(prediction_kwargs[idx])
-            tsv_outputs = []
+            tsv_outputs: List[Dict] = []
             for metric_type_name, metrics_dict in d["metrics"].items():
                 row = {}
                 row["date"] = datetime.now(tz=pytz.utc).strftime("%Y-%m-%d %H:%M:%S UTC")

From 859108b78a3d4d553acc8a15ecfd4844e106dac4 Mon Sep 17 00:00:00 2001
From: Ian <magnusson.i@northeastern.edu>
Date: Mon, 2 Oct 2023 19:35:54 -0700
Subject: [PATCH 4/8] fix mypy errors in type annotations

---
 evaluation/steps/run_catwalk.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py
index e29a7e5d7..bbbba6a7b 100644
--- a/evaluation/steps/run_catwalk.py
+++ b/evaluation/steps/run_catwalk.py
@@ -139,9 +139,9 @@ def run(
         **kwargs,
     ) -> Dict:
         task_name = outputs["task"]
-        new_metrics = {}
+        new_metrics: Dict[str, float] = {}
         if "subdomain" in outputs["instance_predictions"][0]["instance"]:
-            new_metrics[f"ppl_token_{task_name}_subdomains"]: Dict[str, float] = {}
+            new_metrics[f"ppl_token_{task_name}_subdomains"] = {}
             sum_logits: Dict[str, float] = {}
             num_tokens: Dict[str, int] = {}
             for instance_prediction in outputs["instance_predictions"]:
@@ -319,7 +319,7 @@ class WriteOutputsAsRowsMultipleMetrics(WriteOutputsAsRows):
 
     def run(
         self, models: List[str], outputs: List[Dict], prediction_kwargs: List[Dict], gsheet: Optional[str] = None
-    ) -> Dict[str, List[Dict]]:
+    ) -> Dict[str, List[Dict]]:  # type: ignore
         per_metric_type_tsv_outputs: Dict[str, List[Dict]] = {}
         for idx, d in enumerate(outputs):
             model = models[idx]

From c1c137bbd9a4e2127f9ec778b8d05b0d2fc0b377 Mon Sep 17 00:00:00 2001
From: Ian <magnusson.i@northeastern.edu>
Date: Mon, 2 Oct 2023 19:39:30 -0700
Subject: [PATCH 5/8] fix new_metrics annotation for mypy

---
 evaluation/steps/run_catwalk.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py
index bbbba6a7b..92ca19ccd 100644
--- a/evaluation/steps/run_catwalk.py
+++ b/evaluation/steps/run_catwalk.py
@@ -139,7 +139,7 @@ def run(
         **kwargs,
     ) -> Dict:
         task_name = outputs["task"]
-        new_metrics: Dict[str, float] = {}
+        new_metrics: Dict[str, Dict] = {}
         if "subdomain" in outputs["instance_predictions"][0]["instance"]:
             new_metrics[f"ppl_token_{task_name}_subdomains"] = {}
             sum_logits: Dict[str, float] = {}

From b1e8ffe7d1abf0dbf408288b1620c438314da495 Mon Sep 17 00:00:00 2001
From: Ian <magnusson.i@northeastern.edu>
Date: Fri, 6 Oct 2023 12:07:09 -0700
Subject: [PATCH 6/8] turn write_to_gsheet into its own function

---
 evaluation/steps/run_catwalk.py | 47 ++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py
index 90c6a036a..db041c1b4 100644
--- a/evaluation/steps/run_catwalk.py
+++ b/evaluation/steps/run_catwalk.py
@@ -301,38 +301,18 @@ def run(
             tsv_outputs.append(row)
 
         if gsheet:
-            self._write_to_gsheet(gsheet, tsv_outputs)
+            write_to_gsheet(gsheet, tsv_outputs)
 
         return tsv_outputs
 
-    def _write_to_gsheet(self, gsheet: str, rows: List[Dict], sheet_title: str = "Sheet1"):
-        import pygsheets
-
-        # make rows into dataframe
-        new_df = pd.DataFrame(rows)
-
-        client = pygsheets.authorize(service_account_json=os.environ["GDRIVE_SERVICE_ACCOUNT_JSON"])
-        sheet = client.open(gsheet)
-
-        # make sheet if doesn't exist
-        if sheet_title in [s.title for s in sheet.worksheets()]:
-            worksheet = sheet.worksheet_by_title(sheet_title)
-        else:
-            sheet.add_worksheet(rows=new_df.shape[0], cols=new_df.shape[1], title=sheet_title)
-            worksheet = sheet.worksheet_by_title(sheet_title)
-        current_df = worksheet.get_as_df()
-        current_df = worksheet.get_as_df()
-        new_df = pd.concat([current_df, new_df])
-        worksheet.set_dataframe(new_df, (1, 1), nan="")
-
 
 @Step.register("write-outputs-as-rows-multiple-metrics")
-class WriteOutputsAsRowsMultipleMetrics(WriteOutputsAsRows):
+class WriteOutputsAsRowsMultipleMetrics(Step):
     VERSION = "001"
 
     def run(
         self, models: List[str], outputs: List[Dict], prediction_kwargs: List[Dict], gsheet: Optional[str] = None
-    ) -> Dict[str, List[Dict]]:  # type: ignore
+    ) -> Dict[str, List[Dict]]:
         per_metric_type_tsv_outputs: Dict[str, List[Dict]] = {}
         for idx, d in enumerate(outputs):
             model = models[idx]
@@ -359,6 +339,25 @@ def run(
 
         if gsheet:
             for metric_type_name, tsv_outputs in per_metric_type_tsv_outputs.items():
-                self._write_to_gsheet(gsheet, tsv_outputs, sheet_title=metric_type_name)
+                write_to_gsheet(gsheet, tsv_outputs, sheet_title=metric_type_name)
 
         return per_metric_type_tsv_outputs
+
+def write_to_gsheet(gsheet: str, rows: List[Dict], sheet_title: str = "Sheet1"):
+    import pygsheets
+
+    # make rows into dataframe
+    new_df = pd.DataFrame(rows)
+
+    client = pygsheets.authorize(service_account_json=os.environ["GDRIVE_SERVICE_ACCOUNT_JSON"])
+    sheet = client.open(gsheet)
+
+    # make sheet if doesn't exist
+    if sheet_title in [s.title for s in sheet.worksheets()]:
+        worksheet = sheet.worksheet_by_title(sheet_title)
+    else:
+        sheet.add_worksheet(rows=new_df.shape[0], cols=new_df.shape[1], title=sheet_title)
+        worksheet = sheet.worksheet_by_title(sheet_title)
+    current_df = worksheet.get_as_df()
+    new_df = pd.concat([current_df, new_df])
+    worksheet.set_dataframe(new_df, (1, 1), nan="")
\ No newline at end of file

From 6bdd5c136fe2be1d7ceb4ba6fbfa5f46698117e8 Mon Sep 17 00:00:00 2001
From: Ian <magnusson.i@northeastern.edu>
Date: Fri, 6 Oct 2023 12:10:43 -0700
Subject: [PATCH 7/8] style fixes

---
 evaluation/steps/run_catwalk.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py
index db041c1b4..90214371a 100644
--- a/evaluation/steps/run_catwalk.py
+++ b/evaluation/steps/run_catwalk.py
@@ -343,6 +343,7 @@ def run(
 
         return per_metric_type_tsv_outputs
 
+
 def write_to_gsheet(gsheet: str, rows: List[Dict], sheet_title: str = "Sheet1"):
     import pygsheets
 
@@ -360,4 +361,4 @@ def write_to_gsheet(gsheet: str, rows: List[Dict], sheet_title: str = "Sheet1"):
         worksheet = sheet.worksheet_by_title(sheet_title)
     current_df = worksheet.get_as_df()
     new_df = pd.concat([current_df, new_df])
-    worksheet.set_dataframe(new_df, (1, 1), nan="")
\ No newline at end of file
+    worksheet.set_dataframe(new_df, (1, 1), nan="")

From 56d708abbffa91867a2a02e87d0ce25c413c656a Mon Sep 17 00:00:00 2001
From: Ian <magnusson.i@northeastern.edu>
Date: Fri, 6 Oct 2023 13:19:20 -0700
Subject: [PATCH 8/8] add support for model_kwargs to fine-grained output rows

---
 evaluation/steps/run_catwalk.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/evaluation/steps/run_catwalk.py b/evaluation/steps/run_catwalk.py
index d018441e8..ccea68096 100644
--- a/evaluation/steps/run_catwalk.py
+++ b/evaluation/steps/run_catwalk.py
@@ -328,6 +328,7 @@ def run(
                 row = {}
                 row["date"] = datetime.now(tz=pytz.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
                 row["model"] = model
+                row["model_kwargs"] = d["model_kwargs"]
                 row["full_model"] = f"lm::pretrained={model}"
                 row["task"] = d["task"]
                 row["processing_time"] = d["processing_time"]