Skip to content

Commit

Permalink
wips
Browse files Browse the repository at this point in the history
  • Loading branch information
soldni committed Dec 3, 2024
1 parent 97998b5 commit 0b07b45
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 0 deletions.
48 changes: 48 additions & 0 deletions configs/peteish-anneal/digits.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/env bash

# Tokenize every dolmino-mix-1124 collection with the sigdig (significant-digits)
# tokenizer. Each array entry is a glob over raw document files; globs are kept
# unexpanded in the array (single assignment, quoted) and expanded only where
# intended below.
collections=(
    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/dclm/*/*.json.zst"
    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/flan/*.json.gz"
    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/codesearchnet-owmfilter/*/*.jsonl.gz"
    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/dolmino_math_synth/basic_math/*TRAIN.jsonl"
    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/dolmino_math_synth/gsm8k-synth/resample_v1_6x/*.jsonl"
    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/dolmino_math_synth/gsm_mind/*/*.jsonl"
    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/gsm8k/*/train/*.jsonl.zst"
    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/mathcoder2-synthmath/ajibawa-2023/*.jsonl"
    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/mathcoder2-synthmath/m-a-p_Matrix/*/*.jsonl"
    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/metamath-owmfilter/*.jsonl.gz"
    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/tinyGSM-MIND/*/*.jsonl.gz"
    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/math/tulu_math/*/*.jsonl"
    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/pes2o/*.json.gz"
    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/stackexchange/*.json.gz"
    "${HOME}/ai2-llm/pretraining-data/sources/dolmino-mix-1124/documents/wiki/*.json.gz"
)
tokenizer="allenai/dolma2-tokenizer-sigdig"

for path in "${collections[@]}"; do
    # Derive the collection name from the path segment after /documents/,
    # dropping the glob tail and any leading/trailing slashes.
    name=$(echo "${path}" | sed -E 's|.*/documents/([^*]+).*|\1|' | sed 's:^/::; s:/$::')
    destination="${HOME}/ai2-llm/preprocessed/dolmino-mix-1124/${tokenizer}/${name}"

    echo "Tokenizing $path to $destination"
    # $path is deliberately unquoted here so the shell expands the glob.
    echo "Number of files: $(ls -1 $path 2>/dev/null | wc -l)"

    # dclm is the largest collection; give it most of the machine's cores.
    # Everything else gets a fixed modest worker count.
    if [[ "$name" == *"dclm"* ]]; then
        processes=$(($(nproc) - 4))
    else
        processes=20
    fi

    # Echo + stop-on-error only around the actual tokenization command, so the
    # bookkeeping above stays quiet and a failed glob lookup doesn't kill the loop.
    set -ex
    dolma tokens \
        --documents "${path}" \
        --destination "${destination}" \
        --tokenizer.name_or_path "${tokenizer}" \
        --tokenizer.eos_token_id 100257 \
        --tokenizer.pad_token_id 100277 \
        --no-tokenizer.segment_before_tokenization \
        --tokenizer.encode_special_tokens \
        --processes "${processes}" \
        --seed 3920 \
        --max_size 1073741824 \
        --sample_ring_prop \
        --dtype uint32
    set +ex
done
30 changes: 30 additions & 0 deletions configs/peteish-anneal/mmlu-web/decontaminate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

# Decontaminate the mmlu-web DCLM subset: tag paragraphs that overlap the
# oe-eval train/dev/test 8-gram bloom filter, then remove flagged documents
# with a dolma mix pass driven by remove_all_train.yaml (next to this script).

set -ex

SCRIPT_PATH=$(realpath "$0")

# Local scratch copy and S3 source of the pre-built bloom filter
# (8-gram, stride 1, built 2024-10-18 from train/dev/test eval splits).
bloom_filter_file=/tmp/oe-eval-data-dedupe_ngrams_8_1-train_dev_test.bin
remote_bloom_filter_file=s3://ai2-llm/bloom-filters/oe-eval-data-dedupe_ngrams_8_1-20241018-train_dev_test.bin

aws s3 cp "${remote_bloom_filter_file}" "${bloom_filter_file}"
# Estimated document count the filter was sized for; paired with the desired
# false-positive rate below.
size=331605257

# Tagging pass only: the filter is opened read-only, so eval n-grams are
# matched against the documents but nothing new is inserted.
dolma dedupe \
    --documents \
        "${HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web_minhash_dedup/documents/*.json.zst" \
    --dedupe.name dedupe_ngrams_8_1_all_train \
    --dedupe.paragraphs.attribute_name dedupe_ngrams_8_1_all_train \
    --dedupe.paragraphs.by_ngram.ngram_length 8 \
    --dedupe.paragraphs.by_ngram.skip_short_paragraphs \
    --dedupe.paragraphs.by_ngram.stride 1 \
    --dedupe.paragraphs.by_ngram.overlap_threshold 0 \
    --dedupe.skip_empty \
    --bloom_filter.file "${bloom_filter_file}" \
    --bloom_filter.read_only \
    --bloom_filter.estimated_doc_count "${size}" \
    --bloom_filter.desired_false_positive_rate 0.001 \
    --processes "$(($(nproc) - 4))"


# Removal pass: drop documents whose tagged overlap exceeds the threshold
# configured in remove_all_train.yaml.
dolma -c "$(dirname "${SCRIPT_PATH}")/remove_all_train.yaml" mix --processes "$(($(nproc) - 4))"
13 changes: 13 additions & 0 deletions configs/peteish-anneal/mmlu-web/remove_all_train.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Dolma mix config: drop documents from the minhash-deduped mmlu-web DCLM
# subset that were flagged by the dedupe_ngrams_8_1_all_train tagging pass
# (see decontaminate.sh). ${oc.env:HOME} is resolved by OmegaConf from the
# environment at load time.
streams:
  - name: dclm
    documents: &documents
      - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web_minhash_dedup/documents/*.json.zst
    attributes: &attributes
      - dedupe_ngrams_8_1_all_train
    output:
      max_size_in_bytes: 200_000_000
      path: ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web_minhash_dedup_decontam/documents
    filter:
      # Exclude a document when any tagged span has a score (third tuple
      # element) of at least 0.1 — i.e. any non-trivial eval-data overlap.
      exclude:
        - ([.attributes.dedupe_ngrams_8_1_all_train[] | select(.[2] >= 0.1)] | length != 0)
      syntax: jq
16 changes: 16 additions & 0 deletions configs/peteish-anneal/mmlu-web/tokenize.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Dolma tokenization config for the decontaminated mmlu-web DCLM subset,
# using the standard dolma2 tokenizer. ${oc.env:HOME} is resolved by
# OmegaConf from the environment at load time.
destination: ${oc.env:HOME}/ai2-llm/preprocessed/dclm/v0_mmlu_web_minhash_dedup_decontam/allenai/dolma2-tokenizer
documents:
  - ${oc.env:HOME}/ai2-llm/pretraining-data/sources/dclm/v0_mmlu_web_minhash_dedup_decontam/documents/*

processes: 128
seed: 3920
max_size: 4_294_967_296  # 4 GiB per output shard
dtype: uint32

tokenizer:
  name_or_path: allenai/dolma2-tokenizer
  bos_token_id: null
  eos_token_id: 100257
  pad_token_id: 100277
  segment_before_tokenization: false
  encode_special_tokens: true

0 comments on commit 0b07b45

Please sign in to comment.