Skip to content

Commit

Permalink
Merge pull request #223 from allenai/soldni/olmo-mixing
Browse files Browse the repository at this point in the history
V1 Mixing tools and changes
  • Loading branch information
soldni authored Jul 8, 2023
2 parents e64cf42 + 70620ab commit 87f6a79
Show file tree
Hide file tree
Showing 27 changed files with 1,549 additions and 72 deletions.
24 changes: 24 additions & 0 deletions pretrain_data/mixer/config/ablations/dedupers/c4-v0-dedup.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"documents": [
"pretraining-data/sources/c4/v0/documents/train/*.gz"
],
"work_dir": {
"input": "/data2/c4/deduper/input",
"output": "/data2/c4/deduper/output"
},
"dedupe": {
"name": "decontamination",
"paragraphs": {
"attribute_name": "bff_duplicate_paragraph_spans_decontamination"
},
"skip_empty": true
},
"bloom_filter": {
"file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin",
"size_in_bytes": 8388608,
"read_only": true,
"estimated_doc_count": 3898706,
"desired_false_positive_rate": 0.001
},
"processes": 120
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
{
"streams": [
{
"name": "cc_en_head",
"documents": [
"pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_head/*.json.gz"
],
"attributes": [
"decontamination",
"dedupe_paragraphs",
"gopher_rules",
"hatespeech_nsfw_cc_v3",
"pii_detection",
"random"
],
"output": {
"path": "pretraining-data/sources/olmo-mix/v1-no-removal/documents/common-crawl/cc_en_head",
"max_size_in_bytes": 4294967296,
"discard_fields": [
"attributes",
"metadata",
"added",
"created"
]
},
"filter": {
"include": [],
"exclude": [
"$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]",
"$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]",
"$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]",
"$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]",
"$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] < 0.8)]",
"$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] > 0.9)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] > 0.3)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > 0.3)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] > 0.2)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] > 0.18)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] > 0.16)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] > 0.15)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] > 0.14)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] > 0.13)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] > 0.12)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]",
"[email protected][?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]",
"$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]",
"$.attributes[?(@.random__random_number_v1__random[0][2] < 0.5104606781)]"
]
},
"span_replacement": [
{
"span": "$.attributes.bff_duplicate_paragraph_spans",
"min_score": 0.5,
"replacement": ""
},
{
"span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS",
"min_score": 0.5,
"replacement": " |||EMAIL_ADDRESS||| "
},
{
"span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER",
"min_score": 0.5,
"replacement": " |||PHONE_NUMBER||| "
},
{
"span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS",
"min_score": 0.5,
"replacement": " |||IP_ADDRESS||| "
}
]
}
],
"work_dir": {
"input": "/tmp/olmo-mix-v1/input",
"output": "/tmp/olmo-mix-v1/output"
},
"processes": 128
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
{
"streams": [
{
"name": "cc_en_middle",
"documents": [
"pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_middle/*.json.gz"
],
"attributes": [
"decontamination",
"dedupe_paragraphs",
"gopher_rules",
"hatespeech_nsfw_cc_v3",
"pii_detection",
"random"
],
"output": {
"path": "pretraining-data/sources/olmo-mix/v1-no-removal/documents/common-crawl/cc_en_middle",
"max_size_in_bytes": 4294967296,
"discard_fields": [
"attributes",
"metadata",
"added",
"created"
]
},
"filter": {
"include": [],
"exclude": [
"$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]",
"$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]",
"$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]",
"$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]",
"$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] < 0.8)]",
"$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] > 0.9)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] > 0.3)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > 0.3)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] > 0.2)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] > 0.18)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] > 0.16)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] > 0.15)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] > 0.14)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] > 0.13)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] > 0.12)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]",
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]",
"[email protected][?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]",
"$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]",
"$.attributes[?(@.random__random_number_v1__random[0][2] < 0.5104606781)]"
]
},
"span_replacement": [
{
"span": "$.attributes.bff_duplicate_paragraph_spans",
"min_score": 0.5,
"replacement": ""
},
{
"span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS",
"min_score": 0.5,
"replacement": " |||EMAIL_ADDRESS||| "
},
{
"span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER",
"min_score": 0.5,
"replacement": " |||PHONE_NUMBER||| "
},
{
"span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS",
"min_score": 0.5,
"replacement": " |||IP_ADDRESS||| "
}
]
}
],
"work_dir": {
"input": "/tmp/olmo-mix-v1/input",
"output": "/tmp/olmo-mix-v1/output"
},
"processes": 128
}
Loading

0 comments on commit 87f6a79

Please sign in to comment.