Add Llama-3.1 experiments #35

Merged (23 commits, Jan 21, 2025)

Commits
2cd1a2f
Add llama3.1 setup
ljvmiranda921 Nov 22, 2024
5f46dfb
Update EasyLM version
ljvmiranda921 Nov 22, 2024
5906461
Add HF_TOKEN for TPU run
ljvmiranda921 Nov 22, 2024
7320599
Fix Llama-3.1 naming
ljvmiranda921 Nov 25, 2024
4450aa3
Update the name of the logger
ljvmiranda921 Jan 7, 2025
aea92b9
Modernized TPU setup scripts
ljvmiranda921 Jan 7, 2025
ae23985
Add Dockerfile and convert script for llama31
ljvmiranda921 Jan 7, 2025
47b1a3a
Use auto version of rewardbench
ljvmiranda921 Jan 9, 2025
05eeea7
Use meta-llama-3.1 chat template for rewardbench
ljvmiranda921 Jan 9, 2025
ee047ef
Migrate from cirrascale
ljvmiranda921 Jan 9, 2025
129fdff
Use tokenizer rather than some chat_template
ljvmiranda921 Jan 9, 2025
9583d02
Do not include adhoc_scripts anymore
ljvmiranda921 Jan 14, 2025
53e1970
Incorporate optimal simulated experiments
ljvmiranda921 Jan 14, 2025
867db06
Attempt parallelization
ljvmiranda921 Jan 16, 2025
3a8f012
Add 1M simulations experiment
ljvmiranda921 Jan 16, 2025
66ff967
Better logging and get max workers always
ljvmiranda921 Jan 16, 2025
d54a30f
Merge pull request #36 from allenai/scale
ljvmiranda921 Jan 17, 2025
89aa4b9
No need for adhoc_scripts ignore
ljvmiranda921 Jan 17, 2025
66304fd
Better comments
ljvmiranda921 Jan 17, 2025
f8d29c2
Update TPU setup
ljvmiranda921 Jan 17, 2025
ee86c59
Run isort on the codebase
ljvmiranda921 Jan 17, 2025
d720a14
Download scaling results
ljvmiranda921 Jan 21, 2025
7716e90
Add scripts for plotting simulation scaling (#37)
ljvmiranda921 Jan 21, 2025
188 changes: 188 additions & 0 deletions beaker/simulate_1m.yml
@@ -0,0 +1,188 @@
version: v2
budget: ai2/oe-adapt
description: "Get features using datamodel approach given many simulations"
tasks:
- name: simulate-large-1e6
image:
beaker: ljm/human-datamodel
command: ["python", "-m", "scripts.sample_best_subset"]
arguments:
- --input_path
- /source/features.jsonl
- --output_dir
- /output/
- --model_path
- /model/model.pkl
- --sampling_method
- optimal_simulated
- --budgets
- 0.25
- 0.50
- 0.75
- --n_simulations
- 1000000
- --response_a_col
- response_a
- --response_b_col
- response_b
datasets:
- mountPath: /source/
source:
beaker: ljm/helpsteer2-pref-feats
- mountPath: /model/
source:
beaker: ljm/hybrid-prefs-multipref-quad-model
result:
path: /output
resources:
gpuCount: 2
context:
priority: normal
preemptible: true
constraints:
cluster:
- ai2/ceres-cirrascale
# - ai2/saturn-cirrascale
envVars:
- name: OPENAI_API_KEY
secret: OPENAI_API_KEY
- name: TOKENIZERS_PARALLELISM
value: "false"
- name: simulate-large-1e5
image:
beaker: ljm/human-datamodel
command: ["python", "-m", "scripts.sample_best_subset"]
arguments:
- --input_path
- /source/features.jsonl
- --output_dir
- /output/
- --model_path
- /model/model.pkl
- --sampling_method
- optimal_simulated
- --budgets
- 0.25
- 0.50
- 0.75
- --n_simulations
- 100000
- --response_a_col
- response_a
- --response_b_col
- response_b
datasets:
- mountPath: /source/
source:
beaker: ljm/helpsteer2-pref-feats
- mountPath: /model/
source:
beaker: ljm/hybrid-prefs-multipref-quad-model
result:
path: /output
resources:
gpuCount: 1
context:
priority: normal
preemptible: true
constraints:
cluster:
- ai2/ceres-cirrascale
# - ai2/saturn-cirrascale
envVars:
- name: OPENAI_API_KEY
secret: OPENAI_API_KEY
- name: TOKENIZERS_PARALLELISM
value: "false"
- name: simulate-large-1e4
image:
beaker: ljm/human-datamodel
command: ["python", "-m", "scripts.sample_best_subset"]
arguments:
- --input_path
- /source/features.jsonl
- --output_dir
- /output/
- --model_path
- /model/model.pkl
- --sampling_method
- optimal_simulated
- --budgets
- 0.25
- 0.50
- 0.75
- --n_simulations
- 10000
- --response_a_col
- response_a
- --response_b_col
- response_b
datasets:
- mountPath: /source/
source:
beaker: ljm/helpsteer2-pref-feats
- mountPath: /model/
source:
beaker: ljm/hybrid-prefs-multipref-quad-model
result:
path: /output
resources:
gpuCount: 1
context:
priority: normal
preemptible: true
constraints:
cluster:
- ai2/ceres-cirrascale
# - ai2/saturn-cirrascale
envVars:
- name: OPENAI_API_KEY
secret: OPENAI_API_KEY
- name: TOKENIZERS_PARALLELISM
value: "false"
- name: simulate-large-1e3
image:
beaker: ljm/human-datamodel
command: ["python", "-m", "scripts.sample_best_subset"]
arguments:
- --input_path
- /source/features.jsonl
- --output_dir
- /output/
- --model_path
- /model/model.pkl
- --sampling_method
- optimal_simulated
- --budgets
- 0.25
- 0.50
- 0.75
- --n_simulations
- 1000
- --response_a_col
- response_a
- --response_b_col
- response_b
datasets:
- mountPath: /source/
source:
beaker: ljm/helpsteer2-pref-feats
- mountPath: /model/
source:
beaker: ljm/hybrid-prefs-multipref-quad-model
result:
path: /output
resources:
gpuCount: 1
context:
priority: normal
preemptible: true
constraints:
cluster:
- ai2/ceres-cirrascale
# - ai2/saturn-cirrascale
envVars:
- name: OPENAI_API_KEY
secret: OPENAI_API_KEY
- name: TOKENIZERS_PARALLELISM
value: "false"
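
The four tasks in beaker/simulate_1m.yml differ only in --n_simulations (1e6 down to 1e3) and in gpuCount for the largest run; the image, arguments, mounts, cluster constraints, and env vars are repeated verbatim. As a minimal sketch (not part of this PR, assuming only PyYAML is available), the repeated blocks could be generated from one template so the variants cannot drift apart:

import yaml  # assumption: PyYAML is installed

def make_task(n_simulations: int, gpu_count: int) -> dict:
    """Build one simulate-large-* task, mirroring the YAML above."""
    exp = f"1e{len(str(n_simulations)) - 1}"  # 1_000_000 -> "1e6", 1_000 -> "1e3"
    return {
        "name": f"simulate-large-{exp}",
        "image": {"beaker": "ljm/human-datamodel"},
        "command": ["python", "-m", "scripts.sample_best_subset"],
        "arguments": [
            "--input_path", "/source/features.jsonl",
            "--output_dir", "/output/",
            "--model_path", "/model/model.pkl",
            "--sampling_method", "optimal_simulated",
            "--budgets", 0.25, 0.50, 0.75,
            "--n_simulations", n_simulations,
            "--response_a_col", "response_a",
            "--response_b_col", "response_b",
        ],
        "datasets": [
            {"mountPath": "/source/", "source": {"beaker": "ljm/helpsteer2-pref-feats"}},
            {"mountPath": "/model/", "source": {"beaker": "ljm/hybrid-prefs-multipref-quad-model"}},
        ],
        "result": {"path": "/output"},
        "resources": {"gpuCount": gpu_count},
        "context": {"priority": "normal", "preemptible": True},
        "constraints": {"cluster": ["ai2/ceres-cirrascale"]},
        "envVars": [
            {"name": "OPENAI_API_KEY", "secret": "OPENAI_API_KEY"},
            {"name": "TOKENIZERS_PARALLELISM", "value": "false"},
        ],
    }

spec = {
    "version": "v2",
    "budget": "ai2/oe-adapt",
    "description": "Get features using datamodel approach given many simulations",
    "tasks": [make_task(n, g) for n, g in [(1_000_000, 2), (100_000, 1), (10_000, 1), (1_000, 1)]],
}
print(yaml.safe_dump(spec, sort_keys=False))

Defining the spec this way keeps the shared mounts, constraints, and env vars in a single place.
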
3 changes: 2 additions & 1 deletion evals/convert.Dockerfile
@@ -54,7 +54,8 @@ ENV PATH="/usr/local/google-cloud-sdk/bin:${PATH}"

# Clone EasyLM repository
RUN git clone https://github.com/hamishivi/EasyLM.git . && \
git checkout bc241782b67bbe926e148ec9d2046d76b7ba58c8
# git checkout bc241782b67bbe926e148ec9d2046d76b7ba58c8
git checkout dbf2212c1775b2762f7108d62c8c8b01b52ea4aa

COPY ai2-allennlp-79f5e3a8e95a.json /root/.config/gcloud/application_default_credentials.json
# Set environment variable for Google Cloud
23 changes: 14 additions & 9 deletions evals/convert_to_hf.py
@@ -125,7 +125,10 @@ def main():
)
pytorch_dir = Path(args.pytorch_dir)
for params_path in params_paths:
experiment_name = params_path.parent.stem.split("--")[0]
if "llama" in str(params_path):
experiment_name = params_path.parts[-2].replace(".", "-").split("--")[0]
else:
experiment_name = params_path.parent.stem.split("--")[0]
if args.prefix:
experiment_name = f"{args.prefix}-{experiment_name}"
output_dir = pytorch_dir / experiment_name
@@ -250,28 +253,30 @@ def create_beaker_experiment_spec(
tasks=[
TaskSpec(
name=f"evaluate-{experiment_name}",
image=ImageSource(beaker="nathanl/rb_v16"),
image=ImageSource(beaker="nathanl/rewardbench_auto"),
constraints=Constraints(
cluster=[
"ai2/allennlp-cirrascale",
# "ai2/jupiter-cirrascale-2",
"ai2/saturn-cirrascale",
"ai2/ceres-cirrascale",
"ai2/jupiter-cirrascale-2",
]
),
context=TaskContext(priority="normal", preemptible=True),
result=ResultSpec(path="/output"),
command=["/bin/sh", "-c"],
arguments=[
"python scripts/run_rm.py --model /reward_model --tokenizer /reward_model --batch_size 8 --chat_template tulu --trust_remote_code --do_not_save"
"python scripts/run_rm.py --model /reward_model --tokenizer /reward_model --batch_size 8 --trust_remote_code --do_not_save"
],
datasets=[
DataMount(
source=DataSource(beaker=dataset_name),
mount_path="/reward_model",
),
DataMount(
source=DataSource(host_path="/net/nfs.cirrascale"),
mount_path="/net/nfs.cirrascale",
),
# There's no more NFS but we'll keep this here for posterity
# DataMount(
# source=DataSource(host_path="/net/nfs.cirrascale"),
# mount_path="/net/nfs.cirrascale",
# ),
],
resources=TaskResources(gpu_count=1),
env_vars=[
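
The new experiment_name branch in convert_to_hf.py exists because Llama-3.1 checkpoint directories contain a dot: Path.parent.stem strips everything from the last dot onward as a file suffix, so a directory named "llama3.1-..." collapses to "llama3". Taking parts[-2] and replacing dots with dashes keeps the full name. A small worked example (the checkpoint path is made up; only the string handling mirrors the diff):

from pathlib import Path

# Hypothetical checkpoint path, for illustration only.
params_path = Path("bucket/llama3.1-8b-hs2p-256--abc123/streaming_params")

# Old behaviour: .stem treats ".1-8b-hs2p-256--abc123" as a suffix and drops it.
print(params_path.parent.stem.split("--")[0])  # -> "llama3"

# New behaviour for llama paths: keep the directory name, swap dots for dashes.
if "llama" in str(params_path):
    experiment_name = params_path.parts[-2].replace(".", "-").split("--")[0]
else:
    experiment_name = params_path.parent.stem.split("--")[0]
print(experiment_name)  # -> "llama3-1-8b-hs2p-256"
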
40 changes: 40 additions & 0 deletions evals/templates/template-llama31.yml
@@ -0,0 +1,40 @@
version: v2
budget: ai2/oe-adapt
description: "Convert model to pytorch and launch a rewardbench eval job"
tasks:
- name: template
image:
beaker: ljm/easylm-convert-llama
command: ["python", "convert_to_hf.py"]
arguments:
- --gcs_bucket
- ljm-dev
- --batch_size
- 1
- --tokenizer_path
- meta-llama/Llama-3.1-8B
- --model_size
- 8b31
result:
path: /output
resources:
gpuCount: 1
context:
priority: normal
preemptible: true
constraints:
cluster:
- ai2/ceres-cirrascale
- ai2/saturn-cirrascale
- ai2/jupiter-cirrascale-2
envVars:
- name: OPENAI_API_KEY
secret: OPENAI_API_KEY
- name: GOOGLE_SERVICE_ACCOUNT
secret: GOOGLE_SERVICE_ACCOUNT
- name: BEAKER_TOKEN
secret: BEAKER_TOKEN
- name: TOKENIZERS_PARALLELISM
value: "false"
- name: HF_TOKEN
secret: HF_TOKEN
9 changes: 7 additions & 2 deletions scripts/create_tpu_single.sh
@@ -2,8 +2,13 @@ tpu_name=$1
type=$2
zone=$3
echo "Creating TPU: $tpu_name (type: $type zone: $zone)"
while ! gcloud alpha compute tpus tpu-vm create $tpu_name --accelerator-type=$type --zone=$zone --project=ai2-tpu --version=v2-alpha --preemptible; do sleep 60; done
while ! gcloud alpha compute tpus tpu-vm create $tpu_name --accelerator-type=$type --zone=$zone --project=ai2-tpu --version=v2-alpha; do sleep 60; done
gcloud alpha compute tpus tpu-vm ssh $tpu_name --zone=$zone --project=ai2-tpu --worker=all --command="git clone https://github.com/hamishivi/easylm.git"
gcloud alpha compute tpus tpu-vm ssh $tpu_name --zone=$zone --project=ai2-tpu --worker=all --command="cd easylm; git checkout bc241782b67bbe926e148ec9d2046d76b7ba58c8 .; ./scripts/tpu_vm_setup.sh"
gcloud alpha compute tpus tpu-vm ssh $tpu_name --zone=$zone --project=ai2-tpu --worker=all --command="cd easylm; git checkout dbf2212c1775b2762f7108d62c8c8b01b52ea4aa .; ./scripts/tpu_vm_setup.sh"
# gcloud alpha compute tpus tpu-vm ssh $tpu_name --zone=$zone --project=ai2-tpu --worker=all --command="cd easylm; git checkout bc241782b67bbe926e148ec9d2046d76b7ba58c8 .; ./scripts/tpu_vm_setup.sh"
gcloud alpha compute tpus tpu-vm ssh $tpu_name --zone=$zone --project=ai2-tpu --worker=all --command="python3 -m pip install wandb --upgrade"
gcloud alpha compute tpus tpu-vm ssh $tpu_name --zone=$zone --project=ai2-tpu --worker=all --command="python3 -m wandb login $WANDB_TOKEN"
gcloud alpha compute tpus tpu-vm ssh $tpu_name --zone=$zone --project=ai2-tpu --worker=all --command="python3 -m pip install -U 'huggingface_hub[cli]'"
gcloud alpha compute tpus tpu-vm ssh $tpu_name --zone=$zone --project=ai2-tpu --worker=all --command="echo 'export PATH=\$PATH:~/.local/bin' >> ~/.bashrc"
gcloud alpha compute tpus tpu-vm ssh $tpu_name --zone=$zone --project=ai2-tpu --worker=all --command="source ~/.bashrc"
gcloud alpha compute tpus tpu-vm ssh $tpu_name --zone=$zone --project=ai2-tpu --worker=all --command="~/.local/bin/huggingface-cli login --token $HF_TOKEN"
11 changes: 11 additions & 0 deletions scripts/experiment_optimal_simulated.sh
@@ -0,0 +1,11 @@
for i in 256 512 1024 2048 4096 8192; do
python3 -m scripts.sample_best_subset \
--input_path data/hs2p_all_features/features.jsonl \
--output_dir data/hs2p_best_mixes_optimal_simulated_$i \
--model_path data/multipref_quadratic_model/model.pkl \
--sampling_method optimal_simulated \
--budgets 0.25 0.50 0.75 \
--n_simulations $i \
--response_a_col response_a \
--response_b_col response_b
done
5 changes: 5 additions & 0 deletions scripts/experiment_scaling_download.sh
@@ -0,0 +1,5 @@
for i in 256 512 1024 2048 4096 8192; do
python3 scripts/fetch_evals_rewardbench.py \
--output_path data/hs2p-$i-results-llama.csv \
--experiment_prefix rm-eval-hs2p-$i-llama3
done
23 changes: 19 additions & 4 deletions scripts/get_count_feats.py
@@ -1,9 +1,11 @@
import argparse
import json
import logging
import os
import random
import sys
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Optional

@@ -94,7 +96,8 @@ def generate_instances(
tags = []
uuids = [uuid.uuid4().hex for _ in range(len(budgets))]
budget_instances: dict[str, dict[str, int]] = {}
for id, budget in tqdm(zip(uuids, budgets), total=len(budgets)):

def process_budget(id: str, budget: int) -> str:
instances_to_swap = run_knapsack(capacity=budget, items=feat_instance_map)

tag = f"ID__{id}__SWAPS_{budget}"
@@ -111,7 +114,7 @@ def generate_instances(
)
df_swapped["is_swapped"] = df["id"].apply(lambda x: x in instances_to_swap)
annotations = df_swapped.to_dict(orient="records")
converted_annotations = []
converted_annotations: list[dict[str, Optional[str]]] = []
for annotation in annotations:
if "model_a" not in annotation:
annotation["model_a"] = ""
@@ -134,7 +137,7 @@ def generate_instances(
f.write(json.dumps(annotation) + "\n")

# Save the budget
budget_instance_map = {}
budget_instance_map: dict[str, int] = {}
swapped_ids = [eg["id"] for eg in converted_annotations if eg["is_swapped"]]
swapped_df = df[df["id"].isin(swapped_ids)].reset_index(drop=True)
for feature_str in all_features:
@@ -148,7 +151,19 @@ def generate_instances(
budget_instances[tag] = budget_instance_map

# Save the tag file to create the experiments.txt later
tags.append(f"{swaps_outfile.stem}::{counts_outfile.stem}")
return f"{swaps_outfile.stem}::{counts_outfile.stem}"

with tqdm(total=len(budgets)) as pbar:
with ThreadPoolExecutor(max_workers=None) as executor:
n_workers = executor._max_workers
logging.info(f"Running simulation on {n_workers} workers")
futures = {
executor.submit(process_budget, id, budget): id
for id, budget in zip(uuids, budgets)
}
for future in as_completed(futures):
tags.append(future.result())
pbar.update(1)

experiments_file = output_dir / "experiments.txt"
with experiments_file.open("w") as f:
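
The main change to get_count_feats.py replaces the sequential per-budget loop with a thread pool: each budget is submitted to a ThreadPoolExecutor, tags are collected with as_completed, and a tqdm bar advances as futures finish. Below is a self-contained sketch of just that wiring; process_budget here is a dummy stand-in for the real knapsack-and-swap work:

import logging
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm

logging.basicConfig(level=logging.INFO)

def process_budget(id: str, budget: int) -> str:
    # Stand-in for the real work: run the knapsack, swap preferences, write files.
    return f"swaps_{id}_{budget}::counts_{id}_{budget}"

budgets = [64, 128, 256, 512]
uuids = [uuid.uuid4().hex for _ in budgets]

tags = []
with tqdm(total=len(budgets)) as pbar:
    with ThreadPoolExecutor(max_workers=None) as executor:
        # max_workers=None defaults to min(32, os.cpu_count() + 4); the diff
        # reads the private _max_workers attribute only to log that number.
        logging.info(f"Running simulation on {executor._max_workers} workers")
        futures = {
            executor.submit(process_budget, id, budget): id
            for id, budget in zip(uuids, budgets)
        }
        for future in as_completed(futures):
            tags.append(future.result())
            pbar.update(1)
print(tags)

Threads only help here to the extent the per-budget work releases the GIL (I/O, pandas or NumPy internals); for pure-Python CPU-bound work a ProcessPoolExecutor would be the usual alternative.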