Skip to content

Commit

Permalink
full
Browse files Browse the repository at this point in the history
  • Loading branch information
soldni committed Oct 26, 2024
1 parent a80609f commit 9135994
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 0 deletions.
45 changes: 45 additions & 0 deletions classifiers/scripts/fineweb_full.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#! /bin/bash

DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/full/*.jsonl.zstd'

NUM_NODES=4
MODEL_NAME="HuggingFaceFW/fineweb-edu-classifier"
CLUSTER="ai2/jupiter*"
BATCH_SIZE=1024
PRIORITY="urgent"

# Generate a hash for the run name by combining model name and documents
RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="fineweb_classifier_${RUN_HASH:0:8}"

# Set the run name as an environment variable
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"


gantry run \
--task-name "${RUN_NAME}" \
--description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
--allow-dirty \
--workspace ai2/davidw-oe-annealing \
--beaker-image 'petew/olmo-torch23-gantry' \
--timeout -1 \
--show-logs \
--host-networking \
--venv 'base' \
--priority "${PRIORITY}" \
--leader-selection \
--gpus 8 \
--replicas ${NUM_NODES} \
--preemptible \
--cluster "${CLUSTER}" \
--budget ai2/oe-data \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
--env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
--env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
--shared-memory 10GiB \
--install "pip install -e classifiers/" \
--yes \
-- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 8 --prefetch-factor 8"
57 changes: 57 additions & 0 deletions classifiers/scripts/nvidia-deberta-full.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#! /bin/bash

DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100*/*.jsonl.zstd'


# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0000*.jsonl.zstd'
# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0001*.jsonl.zstd'
# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b/*_dclm_shard_0002*.jsonl.zstd'

# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0000*.jsonl.zstd'
# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0001*.jsonl.zstd'
# DOCUMENTS='s3://ai2-llm/pretraining-data/sources/dclm/v0/documents/100b-extras/*_dclm_shard_0002*.jsonl.zstd'


NUM_NODES=8
# NUM_NODES=1
MODEL_NAME="nvidia/quality-classifier-deberta"
CLUSTER="ai2/jupiter*"
BATCH_SIZE=512
PRIORITY="urgent"
# PRIORITY="urgent"

# Generate a hash for the run name by combining model name and documents
RUN_HASH=$(echo -n "${MODEL_NAME}${DOCUMENTS}" | md5sum | awk '{print $1}')
RUN_NAME="nvidia_deberta_${RUN_HASH:0:8}"

# Set the run name as an environment variable
export BEAKER_EXPERIMENT_NAME="${RUN_NAME}"


gantry run \
--task-name "${RUN_NAME}" \
--description "Score ${DOCUMENTS} with ${MODEL_NAME}" \
--allow-dirty \
--workspace ai2/davidw-oe-annealing \
--beaker-image 'petew/olmo-torch23-gantry' \
--timeout -1 \
--show-logs \
--host-networking \
--venv 'base' \
--priority "${PRIORITY}" \
--leader-selection \
--gpus 8 \
--replicas ${NUM_NODES} \
--preemptible \
--cluster "${CLUSTER}" \
--budget ai2/oe-data \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
--env-secret AWS_ACCESS_KEY_ID=lucas-AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=lucas-AWS_SECRET_ACCESS_KEY \
--env-secret WANDB_API_KEY=lucas-WANDB_API_KEY \
--shared-memory 10GiB \
--install "pip install -e classifiers/" \
--yes \
-- /bin/bash -c "huggingface-cli download ${MODEL_NAME} && torchrun --nnodes "${NUM_NODES}:${NUM_NODES}" --nproc-per-node 8 --rdzv_id 12347 --rdzv_backend static --rdzv_endpoint "\${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" --node_rank "\${BEAKER_REPLICA_RANK}" --rdzv_conf 'read_timeout=3600' -m dolma_classifiers.inference --source-prefix ${DOCUMENTS} --batch-size ${BATCH_SIZE} --use-wandb --wandb-project 'dolma-classifiers' --wandb-entity ai2-llm --model-name ${MODEL_NAME} --num-workers 4 --model-compile --max-length 1024"

0 comments on commit 9135994

Please sign in to comment.