-
Notifications
You must be signed in to change notification settings - Fork 451
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #223 from allenai/soldni/olmo-mixing
V1 Mixing tools and changes
- Loading branch information
Showing
27 changed files
with
1,549 additions
and
72 deletions.
There are no files selected for viewing
24 changes: 24 additions & 0 deletions
24
pretrain_data/mixer/config/ablations/dedupers/c4-v0-dedup.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
{ | ||
"documents": [ | ||
"pretraining-data/sources/c4/v0/documents/train/*.gz" | ||
], | ||
"work_dir": { | ||
"input": "/data2/c4/deduper/input", | ||
"output": "/data2/c4/deduper/output" | ||
}, | ||
"dedupe": { | ||
"name": "decontamination", | ||
"paragraphs": { | ||
"attribute_name": "bff_duplicate_paragraph_spans_decontamination" | ||
}, | ||
"skip_empty": true | ||
}, | ||
"bloom_filter": { | ||
"file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin", | ||
"size_in_bytes": 8388608, | ||
"read_only": true, | ||
"estimated_doc_count": 3898706, | ||
"desired_false_positive_rate": 0.001 | ||
}, | ||
"processes": 120 | ||
} |
83 changes: 83 additions & 0 deletions
83
pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-head.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
{ | ||
"streams": [ | ||
{ | ||
"name": "cc_en_head", | ||
"documents": [ | ||
"pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_head/*.json.gz" | ||
], | ||
"attributes": [ | ||
"decontamination", | ||
"dedupe_paragraphs", | ||
"gopher_rules", | ||
"hatespeech_nsfw_cc_v3", | ||
"pii_detection", | ||
"random" | ||
], | ||
"output": { | ||
"path": "pretraining-data/sources/olmo-mix/v1-no-removal/documents/common-crawl/cc_en_head", | ||
"max_size_in_bytes": 4294967296, | ||
"discard_fields": [ | ||
"attributes", | ||
"metadata", | ||
"added", | ||
"created" | ||
] | ||
}, | ||
"filter": { | ||
"include": [], | ||
"exclude": [ | ||
"$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] < 0.8)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] > 0.9)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] > 0.3)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > 0.3)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] > 0.2)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] > 0.18)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] > 0.16)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] > 0.15)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] > 0.14)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] > 0.13)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] > 0.12)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]", | ||
"$.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]", | ||
"$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]", | ||
"$.attributes[?(@.random__random_number_v1__random[0][2] < 0.5104606781)]" | ||
] | ||
}, | ||
"span_replacement": [ | ||
{ | ||
"span": "$.attributes.bff_duplicate_paragraph_spans", | ||
"min_score": 0.5, | ||
"replacement": "" | ||
}, | ||
{ | ||
"span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS", | ||
"min_score": 0.5, | ||
"replacement": " |||EMAIL_ADDRESS||| " | ||
}, | ||
{ | ||
"span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER", | ||
"min_score": 0.5, | ||
"replacement": " |||PHONE_NUMBER||| " | ||
}, | ||
{ | ||
"span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS", | ||
"min_score": 0.5, | ||
"replacement": " |||IP_ADDRESS||| " | ||
} | ||
] | ||
} | ||
], | ||
"work_dir": { | ||
"input": "/tmp/olmo-mix-v1/input", | ||
"output": "/tmp/olmo-mix-v1/output" | ||
}, | ||
"processes": 128 | ||
} |
83 changes: 83 additions & 0 deletions
83
pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-middle.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
{ | ||
"streams": [ | ||
{ | ||
"name": "cc_en_middle", | ||
"documents": [ | ||
"pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_middle/*.json.gz" | ||
], | ||
"attributes": [ | ||
"decontamination", | ||
"dedupe_paragraphs", | ||
"gopher_rules", | ||
"hatespeech_nsfw_cc_v3", | ||
"pii_detection", | ||
"random" | ||
], | ||
"output": { | ||
"path": "pretraining-data/sources/olmo-mix/v1-no-removal/documents/common-crawl/cc_en_middle", | ||
"max_size_in_bytes": 4294967296, | ||
"discard_fields": [ | ||
"attributes", | ||
"metadata", | ||
"added", | ||
"created" | ||
] | ||
}, | ||
"filter": { | ||
"include": [], | ||
"exclude": [ | ||
"$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] < 0.8)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] > 0.9)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] > 0.3)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > 0.3)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] > 0.2)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] > 0.18)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] > 0.16)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] > 0.15)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] > 0.14)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] > 0.13)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] > 0.12)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]", | ||
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]", | ||
"$.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]", | ||
"$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]", | ||
"$.attributes[?(@.random__random_number_v1__random[0][2] < 0.5104606781)]" | ||
] | ||
}, | ||
"span_replacement": [ | ||
{ | ||
"span": "$.attributes.bff_duplicate_paragraph_spans", | ||
"min_score": 0.5, | ||
"replacement": "" | ||
}, | ||
{ | ||
"span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS", | ||
"min_score": 0.5, | ||
"replacement": " |||EMAIL_ADDRESS||| " | ||
}, | ||
{ | ||
"span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER", | ||
"min_score": 0.5, | ||
"replacement": " |||PHONE_NUMBER||| " | ||
}, | ||
{ | ||
"span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS", | ||
"min_score": 0.5, | ||
"replacement": " |||IP_ADDRESS||| " | ||
} | ||
] | ||
} | ||
], | ||
"work_dir": { | ||
"input": "/tmp/olmo-mix-v1/input", | ||
"output": "/tmp/olmo-mix-v1/output" | ||
}, | ||
"processes": 128 | ||
} |
Oops, something went wrong.