diff --git a/classifiers/scripts/fineweb_full.sh b/classifiers/scripts/fineweb_full.sh
new file mode 100644
index 00000000..a74d07f7
--- /dev/null
+++ b/classifiers/scripts/fineweb_full.sh
@@ -0,0 +1,82 @@
+#! /bin/bash
+#
+# Submit a multi-node Beaker job (through gantry) that runs
+# dolma_classifiers.inference with the FineWeb-Edu classifier over the
+# documents matched by DOCUMENTS.
+#
+# Requires the gantry, beaker, jq, md5sum and awk commands on PATH.
+
+# Abort on the first failing command, unset variable or failed pipeline stage
+# instead of submitting a half-configured job.
+set -euo pipefail
+
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/full/*.jsonl.zstd'
+
+NUM_NODES=4
+MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=1024
+PRIORITY="urgent"
+
+# Generate a hash for the run name by combining model name and documents, so
+# the same model/data pair always maps to the same run name.
+RUN_HASH=$(printf '%s' "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"
+
+# Set the run name as an environment variable
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# Look up the Beaker account name before building the gantry command line, so
+# that a failed lookup stops the script rather than passing an empty value.
+BEAKER_USER_ID=$(beaker account whoami --format json | jq -cr '.[0].name')
+
+# Command line handed to /bin/bash -c inside the job.
+#   - Values known at submission time are expanded here and wrapped in single
+#     quotes, so the job's shell never globs the '*' in DOCUMENTS.
+#   - \${BEAKER_...} variables are escaped so that they are expanded inside
+#     the job, not on the submitting machine.
+REMOTE_COMMAND="huggingface-cli download '${MODEL_NAME}' && torchrun \
+    --nnodes '${NUM_NODES}:${NUM_NODES}' \
+    --nproc-per-node 8 \
+    --rdzv_id 12347 \
+    --rdzv_backend static \
+    --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" \
+    --node_rank \"\${BEAKER_REPLICA_RANK}\" \
+    --rdzv_conf 'read_timeout=3600' \
+    -m dolma_classifiers.inference \
+    --source-prefix '${DOCUMENTS}' \
+    --batch-size '${BATCH_SIZE}' \
+    --use-wandb \
+    --wandb-project 'dolma-classifiers' \
+    --wandb-entity ai2-llm \
+    --model-name '${MODEL_NAME}' \
+    --num-workers 8 \
+    --prefetch-factor 8"
+
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env BEAKER_USER_ID="${BEAKER_USER_ID}" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "${REMOTE_COMMAND}"
diff --git a/classifiers/scripts/nvidia-deberta-full.sh b/classifiers/scripts/nvidia-deberta-full.sh
new file mode 100644
index 00000000..d58ad44f
--- /dev/null
+++ b/classifiers/scripts/nvidia-deberta-full.sh
@@ -0,0 +1,93 @@
+#! /bin/bash
+#
+# Submit a multi-node Beaker job (through gantry) that runs
+# dolma_classifiers.inference with the NVIDIA DeBERTa quality classifier over
+# the documents matched by DOCUMENTS.
+#
+# Requires the gantry, beaker, jq, md5sum and awk commands on PATH.
+
+# Abort on the first failing command, unset variable or failed pipeline stage
+# instead of submitting a half-configured job.
+set -euo pipefail
+
+DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100*/*.jsonl.zstd'
+
+# Alternative, narrower document selections (uncomment one to override):
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0000*.jsonl.zstd'
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0001*.jsonl.zstd'
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0002*.jsonl.zstd'
+
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0000*.jsonl.zstd'
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0001*.jsonl.zstd'
+# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0002*.jsonl.zstd'
+
+NUM_NODES=8
+# NUM_NODES=1
+MODEL_NAME="nvidia/quality-classifier-deberta"
+CLUSTER="ai2/jupiter*"
+BATCH_SIZE=512
+PRIORITY="urgent"
+
+# Generate a hash for the run name by combining model name and documents, so
+# the same model/data pair always maps to the same run name.
+RUN_HASH=$(printf '%s' "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
+RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"
+
+# Set the run name as an environment variable
+export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"
+
+# Look up the Beaker account name before building the gantry command line, so
+# that a failed lookup stops the script rather than passing an empty value.
+BEAKER_USER_ID=$(beaker account whoami --format json | jq -cr '.[0].name')
+
+# Command line handed to /bin/bash -c inside the job.
+#   - Values known at submission time are expanded here and wrapped in single
+#     quotes, so the job's shell never globs the '*' in DOCUMENTS.
+#   - \${BEAKER_...} variables are escaped so that they are expanded inside
+#     the job, not on the submitting machine.
+REMOTE_COMMAND="huggingface-cli download '${MODEL_NAME}' && torchrun \
+    --nnodes '${NUM_NODES}:${NUM_NODES}' \
+    --nproc-per-node 8 \
+    --rdzv_id 12347 \
+    --rdzv_backend static \
+    --rdzv_endpoint \"\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400\" \
+    --node_rank \"\${BEAKER_REPLICA_RANK}\" \
+    --rdzv_conf 'read_timeout=3600' \
+    -m dolma_classifiers.inference \
+    --source-prefix '${DOCUMENTS}' \
+    --batch-size '${BATCH_SIZE}' \
+    --use-wandb \
+    --wandb-project 'dolma-classifiers' \
+    --wandb-entity ai2-llm \
+    --model-name '${MODEL_NAME}' \
+    --num-workers 4 \
+    --model-compile \
+    --max-length 1024"
+
+gantry run \
+    --task-name "${RUN_NAME}" \
+    --description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
+    --allow-dirty \
+    --workspace ai2/davidw-oe-annealing \
+    --beaker-image 'petew/olmo-torch23-gantry' \
+    --timeout -1 \
+    --show-logs \
+    --host-networking \
+    --venv 'base' \
+    --priority "${PRIORITY}" \
+    --leader-selection \
+    --gpus 8 \
+    --replicas "${NUM_NODES}" \
+    --preemptible \
+    --cluster "${CLUSTER}" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env BEAKER_USER_ID="${BEAKER_USER_ID}" \
+    --env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --install "pip install -e classifiers/" \
+    --yes \
+    -- /bin/bash -c "${REMOTE_COMMAND}"