From b5e74a87a730b1f880d4ee9756f603159662a5ca Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Tue, 13 Jun 2023 14:15:31 -0500 Subject: [PATCH 01/11] mixing --- .../ablations/dedupers/c4-v0-dedup.json | 24 +++++++++ pretrain_data/mixer/config/olmo-train/c4.json | 53 +------------------ pretrain_data/mixing-log.md | 32 +++++++---- scripts/prepare_memmap_dataset.py | 40 +++++++++++--- 4 files changed, 81 insertions(+), 68 deletions(-) create mode 100644 pretrain_data/mixer/config/ablations/dedupers/c4-v0-dedup.json diff --git a/pretrain_data/mixer/config/ablations/dedupers/c4-v0-dedup.json b/pretrain_data/mixer/config/ablations/dedupers/c4-v0-dedup.json new file mode 100644 index 000000000..35ac9093a --- /dev/null +++ b/pretrain_data/mixer/config/ablations/dedupers/c4-v0-dedup.json @@ -0,0 +1,24 @@ +{ + "documents": [ + "pretraining-data/sources/c4/v0/documents/train/*.gz" + ], + "work_dir": { + "input": "/data2/c4/deduper/input", + "output": "/data2/c4/deduper/output" + }, + "dedupe": { + "name": "decontamination", + "paragraphs": { + "attribute_name": "bff_duplicate_paragraph_spans_decontamination" + }, + "skip_empty": true + }, + "bloom_filter": { + "file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin", + "size_in_bytes": 8388608, + "read_only": true, + "estimated_doc_count": 3898706, + "desired_false_positive_rate": 0.001 + }, + "processes": 120 +} diff --git a/pretrain_data/mixer/config/olmo-train/c4.json b/pretrain_data/mixer/config/olmo-train/c4.json index 57854fe4a..3c6f96bd0 100644 --- a/pretrain_data/mixer/config/olmo-train/c4.json +++ b/pretrain_data/mixer/config/olmo-train/c4.json @@ -23,59 +23,10 @@ "filter": { "include": [], "exclude": [ - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]", - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && 
@.gopher_rules__gopher_v1__word_count[0][2] > 100000)]", - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]", - "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]", - "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]", - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] < 0.8)]", - "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]", - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] > 0.9)]", - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] > 0.3)]", - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]", - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > 0.3)]", - 
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] > 0.2)]", - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] > 0.18)]", - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] > 0.16)]", - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] > 0.15)]", - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] > 0.14)]", - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] > 0.13)]", - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] > 0.12)]", - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && 
@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]", - "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]", - - "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]", - - "$.attributes[?(@.olmo_mix_v1_taggers__pii_regex_with_counts_v2__doc[0][2] > 5)]" + "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]" ] }, - "span_replacement": [ - { - "span": "$.attributes.olmo_mix_v1_taggers__jigsaw_hatespeech_sentence_v2____label__toxic", - "min_score": 0.4, - "replacement": "" - }, - { - "span": "$.attributes.olmo_mix_v1_taggers__jigsaw_nsfw_sencence_v2____label__nsfw", - "min_score": 0.4, - "replacement": "" - }, - { - "span": "$.attributes.olmo_mix_v1_taggers__pii_regex_with_counts_v2__EMAIL_ADDRESS", - "min_score": 0.5, - "replacement": " |||EMAIL_ADDRESS||| " - }, - { - "span": "$.attributes.olmo_mix_v1_taggers__pii_regex_with_counts_v2__PHONE_NUMBER", - "min_score": 0.5, - "replacement": " |||PHONE_NUMBER||| " - }, - { - "span": "$.attributes.olmo_mix_v1_taggers__pii_regex_with_counts_v2__IP_ADDRESS", - "min_score": 0.5, - "replacement": " |||IP_ADDRESS||| " - } - ] + "span_replacement": [] } ], "work_dir": { diff --git a/pretrain_data/mixing-log.md b/pretrain_data/mixing-log.md index cb36a022a..ad910ca4f 100644 --- a/pretrain_data/mixing-log.md +++ b/pretrain_data/mixing-log.md @@ -1,4 +1,4 @@ -# OLMO Mixing Log +# OLMO Mixing Log Tagged Wikipedia using following command @@ -17,7 +17,7 @@ ai2_llm_filters \ -p 96 \ --reuse-existing $HOME/wikipedia/meta \ --local-read-cache 
$HOME/wikipedia/cache \ - --skip-on-failure + --skip-on-failure ``` Tagged C4 with the following. Using both `v0` and `v0-c4-cleaned`. The `c4-cleaned` shouldn't have much of a diff, but it's good for consistency. @@ -34,7 +34,7 @@ ai2_llm_filters \ gopher_v1 \ -p 96 \ --reuse-existing $HOME/c4-v0-c4-cleaned/meta \ - --local-read-cache $HOME/c4-v0-c4-cleaned/cache + --local-read-cache $HOME/c4-v0-c4-cleaned/cache ``` ```shell @@ -48,7 +48,7 @@ ai2_llm_filters \ gopher_v1 \ -p 96 \ --reuse-existing $HOME/c4-v0/meta \ - --local-read-cache $HOME/c4-v0/cache + --local-read-cache $HOME/c4-v0/cache ``` @@ -99,13 +99,13 @@ ai2_llm_filters \ --local-read-cache $HOME/v1-c4-cleaned/cc_en_head_download ``` -Created configurations +Created configurations ```shell python /Users/lucas/Code/LLM/pretrain_data/mixer/scripts/partition_deduper.py -w 100 -o pretrain_data ``` -Output: +Output: ```text pretrain_data/mixer/config/pdedup_c1_v1_c4-cleaned/0.json: 1105.45 GB, 756 files. @@ -149,7 +149,7 @@ Books: python scripts/prepare_memmap_dataset.py \ s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/books \ --safe-mode \ - --output s3://ai2-llm/preprocessed/sources/olmo-mix/v1/gpt-neox-20b-pii-special/books \ + --output s3://ai2-llm/preprocessed/olmo-mix/v1/gpt-neox-20b-pii-special/books \ --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \ --workers 120 ``` @@ -160,7 +160,7 @@ Semantic Scholar: python scripts/prepare_memmap_dataset.py \ s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/s2 \ --safe-mode \ - --output s3://ai2-llm/preprocessed/sources/olmo-mix/v1/gpt-neox-20b-pii-special/s2 \ + --output s3://ai2-llm/preprocessed/olmo-mix/v1/gpt-neox-20b-pii-special/s2 \ --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \ --workers 120 ``` @@ -171,7 +171,19 @@ Wikipedia: python scripts/prepare_memmap_dataset.py \ s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/wiki \ --safe-mode \ - --output 
s3://ai2-llm/preprocessed/sources/olmo-mix/v1/gpt-neox-20b-pii-special/wiki \ + --output s3://ai2-llm/preprocessed/olmo-mix/v1/gpt-neox-20b-pii-special/wiki \ --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \ --workers 120 -``` \ No newline at end of file +``` + +C4: + +```shell +python scripts/prepare_memmap_dataset.py \ + s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/c4 \ + --safe-mode \ + --output s3://ai2-llm/preprocessed/olmo-mix/v1/gpt-neox-20b-pii-special/c4 \ + --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \ + --workers 120 \ + --cache-dir /data2/llm-preprocessed +``` diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py index f5d3d6180..5b7bb5fbf 100644 --- a/scripts/prepare_memmap_dataset.py +++ b/scripts/prepare_memmap_dataset.py @@ -70,7 +70,12 @@ class InputDocumentSpec(msgspec.Struct): text: str -def tokenize_file(tokenizer: Tokenizer, path: str, safe_mode: bool = False) -> Generator[List[int], None, None]: +def tokenize_file( + tokenizer: Tokenizer, + path: str, + safe_mode: bool = False, + cache_dir: Optional[str] = None, +) -> Generator[List[int], None, None]: """Tokenize a file of documents using the provided tokenizer; file is expected to be a gzipped JSON lines file, each containing a field named `text`. 
""" @@ -80,7 +85,7 @@ def tokenize_file(tokenizer: Tokenizer, path: str, safe_mode: bool = False) -> G with ExitStack() as stack: if safe_mode: - caching_path = cached_path(path) + caching_path = cached_path(path, cache_dir=cache_dir) input_stream = stack.enter_context(gzip.open(caching_path, mode="rt")) else: input_file = stack.enter_context(stream_file_for_read(path, mode="rb")) @@ -228,7 +233,8 @@ def fill_memmap( sample_rate: float = 1.0, random_seed: int = 3920, repeat_sequence: int = 1, -): + cache_dir: Optional[str] = None, +) -> int: """Write a memmap file from a file of documents.""" # set the seed in case we need to sample @@ -243,13 +249,16 @@ def fill_memmap( # we increment this every time we create a new memmap file file_index = 0 + # total number of tokens written + total_tokens = 0 + # make sure path is a list path_or_paths = [path_or_paths] if isinstance(path_or_paths, str) else path_or_paths with ExitStack() as stack: it = itertools.chain.from_iterable( # repeat the sequence if necessary - tokenize_file(tokenizer=tokenizer, path=path, safe_mode=safe_mode) + tokenize_file(tokenizer=tokenizer, path=path, safe_mode=safe_mode, cache_dir=cache_dir) for _ in range(repeat_sequence) for path in path_or_paths ) @@ -261,6 +270,9 @@ def fill_memmap( # flush any 10k lines or so; improves stability flush = line_no % 10_000 == 0 + # increment the total number of tokens written + total_tokens += len(token_ids) + # if leftovers_to_write is not None it means that either memmap is None or it's full, # so we will need to create a new one later leftovers_to_write = memmap.write(token_ids, flush=flush) if memmap is not None else token_ids @@ -282,6 +294,8 @@ def fill_memmap( # close the last memmap stack.pop_all().close() + return total_tokens + def make_source_and_target( src: Tuple[str, ...], @@ -343,6 +357,12 @@ def make_source_and_target( @click.option("--random-seed", type=int, default=3920) @click.option("--repeat-sequence", type=click.IntRange(min=1), default=1) 
@click.option("--paths-per-worker", type=click.IntRange(min=1), default=1) +@click.option( + "--cache-dir", + type=str, + default=None, + help="Cache directory for the tokenizer; use system default if not specified" +) @click.option( "--max-tokens", default=512 * 1024 * 1024, @@ -368,6 +388,7 @@ def main( repeat_sequence: int = 1, paths_per_worker: int = 1, max_workers: int = 1, + cache_dir: Optional[str] = None, ): print("=== CONFIGURATION ===") print(f"src: {src}") @@ -383,6 +404,7 @@ def main( print(f"repeat_sequence: {repeat_sequence}") print(f"paths_per_worker: {paths_per_worker}") print(f"max_workers: {max_workers}") + print(f"cache_dir: {cache_dir}") print("=====================") dtype = np.dtype(dtype_str) @@ -401,17 +423,20 @@ def main( sample_rate=sample_rate, random_seed=random_seed, repeat_sequence=repeat_sequence, + cache_dir=cache_dir, ) + total_tokens_written = 0 + if debug: log.info("Running in debug mode. Only one process will be used.") for src_path, dst_path in zip(exploded_src, exploded_dst): - fill_memmap_fn(path_or_paths=src_path, memmap_path=dst_path) + total_tokens_written += fill_memmap_fn(path_or_paths=src_path, memmap_path=dst_path) else: # Now tokenizer all documents again and populate the memmap array. We do this in parallel. workers_cnt = min(max_workers or os.cpu_count() or 1, len(exploded_src)) with concurrent.futures.ProcessPoolExecutor(max_workers=workers_cnt) as executor: - futures: List[Future[None]] = [] + futures: List[Future[int]] = [] for src_path, dst_path in zip(exploded_src, exploded_dst): future = executor.submit(fill_memmap_fn, path_or_paths=src_path, memmap_path=dst_path) futures.append(future) @@ -421,9 +446,10 @@ def main( description="Filling memmap arrays...", total=len(futures), ): - future.result() + total_tokens_written += future.result() log.info(f"Done! 
File(s) written to {output}") + log.info(f"Total tokens written: {total_tokens_written:,}") if validate: log.info("Validating...") From a11918c16dc4548657cfc5b99b360398a4a63d0b Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Wed, 14 Jun 2023 14:30:40 -0500 Subject: [PATCH 02/11] wip --- .../config/olmo-train/common-crawl-head.json | 25 +++++++++++++++-- .../olmo-train/common-crawl-middle.json | 2 +- .../config/olmo-train/common-crawl-tail.json | 2 +- .../mixer/config/olmo-train/stack.json | 24 ++++++++++++++++ pretrain_data/mixing-log.md | 28 +++++++++++++++++++ 5 files changed, 77 insertions(+), 4 deletions(-) create mode 100644 pretrain_data/mixer/config/olmo-train/stack.json diff --git a/pretrain_data/mixer/config/olmo-train/common-crawl-head.json b/pretrain_data/mixer/config/olmo-train/common-crawl-head.json index 541fb3784..233e027ae 100644 --- a/pretrain_data/mixer/config/olmo-train/common-crawl-head.json +++ b/pretrain_data/mixer/config/olmo-train/common-crawl-head.json @@ -3,7 +3,7 @@ { "name": "cc_en_head", "documents": [ - "pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_head/cc_en_head-0005.json.gz" + "pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_head/*.json.gz" ], "attributes": [ "decontamination", @@ -25,9 +25,30 @@ "filter": { "include": [], "exclude": [ + "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]", + "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]", + "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]", + "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && 
@.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]", + "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] < 0.8)]", + "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] > 0.9)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] > 0.3)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > 0.3)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] > 0.2)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram && 
@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] > 0.18)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] > 0.16)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] > 0.15)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] > 0.14)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] > 0.13)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] > 0.12)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]", + 
"$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]", - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]" + "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]" ] }, "span_replacement": [ diff --git a/pretrain_data/mixer/config/olmo-train/common-crawl-middle.json b/pretrain_data/mixer/config/olmo-train/common-crawl-middle.json index 011494016..f2e0069ef 100644 --- a/pretrain_data/mixer/config/olmo-train/common-crawl-middle.json +++ b/pretrain_data/mixer/config/olmo-train/common-crawl-middle.json @@ -46,7 +46,7 @@ "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]", "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]", "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]", - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]" + "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]" ] }, "span_replacement": [ diff --git a/pretrain_data/mixer/config/olmo-train/common-crawl-tail.json 
b/pretrain_data/mixer/config/olmo-train/common-crawl-tail.json index b7aec37a7..0b38254ac 100644 --- a/pretrain_data/mixer/config/olmo-train/common-crawl-tail.json +++ b/pretrain_data/mixer/config/olmo-train/common-crawl-tail.json @@ -46,7 +46,7 @@ "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]", "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]", "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]", - "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]" + "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]" ] }, "span_replacement": [ diff --git a/pretrain_data/mixer/config/olmo-train/stack.json b/pretrain_data/mixer/config/olmo-train/stack.json new file mode 100644 index 000000000..b8cfa0719 --- /dev/null +++ b/pretrain_data/mixer/config/olmo-train/stack.json @@ -0,0 +1,24 @@ +{ + "streams": [ + { + "name": "stack-v2-mixer-train", + "documents": [ + "pretraining-data/sources/stack-dedup/v2-mixer-train/documents/*.gz" + ], + "output": { + "path": "pretraining-data/sources/olmo-mix/v1/documents/stack", + "max_size_in_bytes": 3894967296 + }, + "attributes": [], + "filter": { + "include": [], + "exclude": [] + } + } + ], + "work_dir": { + "input": "/tmp/stack/mixer/input", + "output": "/tmp/stack/mixer/output" + }, + "processes": 120 +} diff --git 
a/pretrain_data/mixing-log.md b/pretrain_data/mixing-log.md index ad910ca4f..cc5e5dd44 100644 --- a/pretrain_data/mixing-log.md +++ b/pretrain_data/mixing-log.md @@ -187,3 +187,31 @@ python scripts/prepare_memmap_dataset.py \ --workers 120 \ --cache-dir /data2/llm-preprocessed ``` + +Stack: + +```shell +python scripts/prepare_memmap_dataset.py \ + s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/stack \ + --safe-mode \ + --output s3://ai2-llm/preprocessed/olmo-mix/v1/gpt-neox-20b-pii-special/stack \ + --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \ + --workers 120 \ + --cache-dir /data2/llm-preprocessed +``` + + +## Calculating Size + +- Wiki: 6.21 GB -> 3,635,728,771 tokens (585m tokens per GB) +- Books: 7.08 GB -> 4,755,860,202 tokens (672m tokens per GB) +- S2: 160.01 GB -> 56,783,583,427 tokens (355m tokens per GB) +- C4: 323.95 GB -> 174,398,315,760 tokens (538m tokens per GB) +- Stack: 724.28 GB -> 430,067,843,952 tokens (593m tokens per GB) + + +Over a single cc + +2.87 GB -> 1.76 GB (38.7% reduction; 61.3% of the original size retained) + +9.48 TB -> 5.81 TB -> 3,450,675,200,000 tokens? 
(3.45 T) From c01e593f68db794ec9c6f8e0c183792bbe565e79 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Thu, 15 Jun 2023 11:21:21 -0500 Subject: [PATCH 03/11] commit config --- .../olmo-train-sample/common-crawl-head.json | 28 +++++ .../common-crawl-middle.json | 28 +++++ .../olmo-train-sample/common-crawl-tail.json | 28 +++++ .../mixer/config/olmo-train-sample/stack.json | 28 +++++ .../config/pdedup_c1_v1_c4-cleaned/rest.json | 23 ++++ pretrain_data/mixing-log.md | 106 +++++++++++++++++- 6 files changed, 240 insertions(+), 1 deletion(-) create mode 100644 pretrain_data/mixer/config/olmo-train-sample/common-crawl-head.json create mode 100644 pretrain_data/mixer/config/olmo-train-sample/common-crawl-middle.json create mode 100644 pretrain_data/mixer/config/olmo-train-sample/common-crawl-tail.json create mode 100644 pretrain_data/mixer/config/olmo-train-sample/stack.json create mode 100644 pretrain_data/mixer/config/pdedup_c1_v1_c4-cleaned/rest.json diff --git a/pretrain_data/mixer/config/olmo-train-sample/common-crawl-head.json b/pretrain_data/mixer/config/olmo-train-sample/common-crawl-head.json new file mode 100644 index 000000000..a66b99d87 --- /dev/null +++ b/pretrain_data/mixer/config/olmo-train-sample/common-crawl-head.json @@ -0,0 +1,28 @@ +{ + "streams": [ + { + "name": "cc_en_head", + "documents": [ + "pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_head/*.gz" + ], + "output": { + "path": "pretraining-data/sources/olmo-mix/v1-sample/documents/cc_en_head", + "max_size_in_bytes": 3894967296 + }, + "attributes": [ + "random" + ], + "filter": { + "include": [ + "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.5104606781)]" + ], + "exclude": [] + } + } + ], + "work_dir": { + "input": "/tmp/cc-head-sample/mixer/input", + "output": "/tmp/cc-head-sample/mixer/output" + }, + "processes": 120 +} diff --git a/pretrain_data/mixer/config/olmo-train-sample/common-crawl-middle.json 
b/pretrain_data/mixer/config/olmo-train-sample/common-crawl-middle.json new file mode 100644 index 000000000..d8c294435 --- /dev/null +++ b/pretrain_data/mixer/config/olmo-train-sample/common-crawl-middle.json @@ -0,0 +1,28 @@ +{ + "streams": [ + { + "name": "cc_en_middle", + "documents": [ + "pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_middle/*.gz" + ], + "output": { + "path": "pretraining-data/sources/olmo-mix/v1-sample/documents/cc_en_middle", + "max_size_in_bytes": 3894967296 + }, + "attributes": [ + "random" + ], + "filter": { + "include": [ + "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.5104606781)]" + ], + "exclude": [] + } + } + ], + "work_dir": { + "input": "/data2/cc-middle-sample/mixer/input", + "output": "/data2/cc-middle-sample/mixer/output" + }, + "processes": 120 +} diff --git a/pretrain_data/mixer/config/olmo-train-sample/common-crawl-tail.json b/pretrain_data/mixer/config/olmo-train-sample/common-crawl-tail.json new file mode 100644 index 000000000..ad83b5585 --- /dev/null +++ b/pretrain_data/mixer/config/olmo-train-sample/common-crawl-tail.json @@ -0,0 +1,28 @@ +{ + "streams": [ + { + "name": "cc_en_tail", + "documents": [ + "pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_tail/*.gz" + ], + "output": { + "path": "pretraining-data/sources/olmo-mix/v1-sample/documents/cc_en_tail", + "max_size_in_bytes": 3894967296 + }, + "attributes": [ + "random" + ], + "filter": { + "include": [ + "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.5104606781)]" + ], + "exclude": [] + } + } + ], + "work_dir": { + "input": "/tmp/cc-middle-sample/mixer/input", + "output": "/tmp/cc-middle-sample/mixer/output" + }, + "processes": 120 +} diff --git a/pretrain_data/mixer/config/olmo-train-sample/stack.json b/pretrain_data/mixer/config/olmo-train-sample/stack.json new file mode 100644 index 000000000..f55d2118c --- /dev/null +++ b/pretrain_data/mixer/config/olmo-train-sample/stack.json @@ -0,0 +1,28 @@ +{ + 
"streams": [ + { + "name": "stack-v2-mixer-train", + "documents": [ + "pretraining-data/sources/olmo-mix/v1/documents/stack/*.gz" + ], + "output": { + "path": "pretraining-data/sources/olmo-mix/v1-sample/documents/stack", + "max_size_in_bytes": 3894967296 + }, + "attributes": [ + "random" + ], + "filter": { + "include": [ + "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.6975641061)]" + ], + "exclude": [] + } + } + ], + "work_dir": { + "input": "/tmp/stack-sample/mixer/input", + "output": "/tmp/stack-sample/mixer/output" + }, + "processes": 120 +} diff --git a/pretrain_data/mixer/config/pdedup_c1_v1_c4-cleaned/rest.json b/pretrain_data/mixer/config/pdedup_c1_v1_c4-cleaned/rest.json new file mode 100644 index 000000000..2c312d5b2 --- /dev/null +++ b/pretrain_data/mixer/config/pdedup_c1_v1_c4-cleaned/rest.json @@ -0,0 +1,23 @@ +{ + "documents": [ + "pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/cc_en_tail-1005.json.gz" + ], + "work_dir": { + "input": "/tmp/v1-c4-cleaned/7.input", + "output": "/tmp/v1-c4-cleaned/7.output" + }, + "dedupe": { + "name": "dedupe_paragraphs", + "paragraphs": { + "attribute_name": "bff_duplicate_paragraph_spans" + } + }, + "bloom_filter": { + "file": "/tmp/v1-c4-cleaned/7.bloom", + "size_in_bytes": 0, + "read_only": true, + "estimated_doc_count": 30000000000, + "desired_false_positive_rate": 1e-06 + }, + "processes": 128 +} diff --git a/pretrain_data/mixing-log.md b/pretrain_data/mixing-log.md index cc5e5dd44..c3042e843 100644 --- a/pretrain_data/mixing-log.md +++ b/pretrain_data/mixing-log.md @@ -210,8 +210,112 @@ python scripts/prepare_memmap_dataset.py \ - Stack: 724.28 GB -> 430,067,843,952 tokens (593m tokens per GB) +Sampled + +- Wiki: 6.21 GB -> 3,635,728,771 tokens (585m tokens per GB) +- Books: 7.08 GB -> 4,755,860,202 tokens (672m tokens per GB) +- S2: 160.01 GB -> 56,783,583,427 tokens (355m tokens per GB) +- C4: 323.95 GB -> 174,398,315,760 tokens (538m tokens per GB) +- Stack: 593.4 GB -> 
430,067,843,952 tokens (593m tokens per GB) +- Common Crawl: 2.61 TB -> 1,500,000,000,000 tokens (574m tokens per GB) + + Over a single cc 2.87 GB -> 1.76 GB (38.7% reduction; 61.3% of the original size retained) -9.48 TB -> 5.81 TB -> 3,450,675,200,000 tokens? (3.45 T) +9.48 TB -> 4.89 TB -> 2.861T tokens + + +Random tagger: + +```shell +ai2_llm_filters \ + -d 'olmo-mix/v1' \ + -n random \ + -t random_number_v1 \ + -p 120 \ + --reuse-existing $HOME/olmo_mix/meta \ + --files-regex-pattern 'stack-v2-mixer-train' \ + --local-read-cache $HOME/olmo_mix/cache +``` + +```shell +ai2_llm_filters \ + -d 'olmo-mix/v1' \ + -n random \ + -t random_number_v1 \ + -p 120 \ + --reuse-existing /data2/olmo_mix/meta \ + --files-regex-pattern 'cc_en_head' \ + --local-read-cache /data2/olmo_mix/cache +``` + +```shell +ai2_llm_filters \ + -d 'olmo-mix/v1' \ + -n random \ + -t random_number_v1 \ + -p 120 \ + --reuse-existing /tmp/olmo_mix/meta \ + --files-regex-pattern 'cc_en_middle' \ + --local-read-cache /tmp/olmo_mix/cache +``` + +```shell +ai2_llm_filters \ + -d 'olmo-mix/v1' \ + -n random \ + -t random_number_v1 \ + -p 120 \ + --reuse-existing /tmp/olmo_mix/meta \ + --files-regex-pattern 'reddit-ablation-base' \ + --local-read-cache /tmp/olmo_mix/cache +``` + + +```shell +python scripts/prepare_memmap_dataset.py \ + s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample/documents/stack \ + --safe-mode \ + --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack \ + --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \ + --workers 120 \ + --cache-dir /tmp/llm-preprocessed +``` + + +```shell +python scripts/prepare_memmap_dataset.py \ + s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample/documents/cc_en_head \ + --safe-mode \ + --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head \ + --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \ + --workers 120 \ + --cache-dir /tmp/llm-preprocessed/cc_en_head +``` + +```shell 
+python scripts/prepare_memmap_dataset.py \ + s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample/documents/cc_en_middle \ + --safe-mode \ + --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle \ + --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \ + --workers 120 \ + --cache-dir /tmp/llm-preprocessed +``` + +```shell +python scripts/prepare_memmap_dataset.py \ + s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample/documents/cc_en_tail \ + --safe-mode \ + --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail \ + --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \ + --workers 120 \ + --cache-dir /tmp/llm-preprocessed +``` + + +```shell +~/target/release/mixer pretrain_data/mixer/config/olmo-train/common-crawl-tail.json && ai2_llm_filters -d 'olmo-mix/v1' -n random -t random_number_v1 -p 120 --reuse-existing /tmp/olmo_mix/meta --files-regex-pattern 'cc_en_tail' --local-read-cache /tmp/olmo_mix/cache && ~/target/release/mixer pretrain_data/mixer/config/olmo-train-sample/common-crawl-tail.json +``` From 3ad1ee47cdbb5cf058dbb8a8abbd0502032126da Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Mon, 19 Jun 2023 21:50:16 -0700 Subject: [PATCH 04/11] new paths --- .../config/olmo-train/common-crawl-head.json | 2 +- pretrain_data/mixing-log.md | 43 +++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/pretrain_data/mixer/config/olmo-train/common-crawl-head.json b/pretrain_data/mixer/config/olmo-train/common-crawl-head.json index 233e027ae..e24986562 100644 --- a/pretrain_data/mixer/config/olmo-train/common-crawl-head.json +++ b/pretrain_data/mixer/config/olmo-train/common-crawl-head.json @@ -25,7 +25,7 @@ "filter": { "include": [], "exclude": [ - "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]", + 
"$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]", "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]", "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]", "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]", diff --git a/pretrain_data/mixing-log.md b/pretrain_data/mixing-log.md index c3042e843..65de88323 100644 --- a/pretrain_data/mixing-log.md +++ b/pretrain_data/mixing-log.md @@ -319,3 +319,46 @@ python scripts/prepare_memmap_dataset.py \ ```shell ~/target/release/mixer pretrain_data/mixer/config/olmo-train/common-crawl-tail.json && ai2_llm_filters -d 'olmo-mix/v1' -n random -t random_number_v1 -p 120 --reuse-existing /tmp/olmo_mix/meta --files-regex-pattern 'cc_en_tail' --local-read-cache /tmp/olmo_mix/cache && ~/target/release/mixer pretrain_data/mixer/config/olmo-train-sample/common-crawl-tail.json ``` + +## Gopher-like + + +```shell +python scripts/prepare_memmap_dataset.py \ + s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample-small/documents/stack \ + --safe-mode \ + --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample-small/gpt-neox-20b-pii-special/stack \ + --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \ + --workers 120 \ + --cache-dir /tmp/llm-preprocessed +``` + +```shell +python scripts/prepare_memmap_dataset.py \ + s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_head \ + --safe-mode \ + --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample-small/gpt-neox-20b-pii-special/common-crawl/cc_en_head \ + --tokenizer 
"allenai/eleuther-ai-gpt-neox-20b-pii-special" \ + --workers 120 \ + --cache-dir /tmp/llm-preprocessed/cc_en_head +``` + +```shell +python scripts/prepare_memmap_dataset.py \ + s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_middle \ + --safe-mode \ + --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample-small/gpt-neox-20b-pii-special/common-crawl/cc_en_middle \ + --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \ + --workers 120 \ + --cache-dir /tmp/llm-preprocessed +``` + +```shell +python scripts/prepare_memmap_dataset.py \ + s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_tail \ + --safe-mode \ + --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample-small/gpt-neox-20b-pii-special/common-crawl/cc_en_tail \ + --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \ + --workers 120 \ + --cache-dir /tmp/llm-preprocessed +``` From 77005d5446b90a8f084f88642ebc29345c89ec0c Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Mon, 19 Jun 2023 21:50:19 -0700 Subject: [PATCH 05/11] new paths --- .../common-crawl-head.json | 83 +++++++++++++++++++ .../common-crawl-middle.json | 83 +++++++++++++++++++ .../common-crawl-tail.json | 83 +++++++++++++++++++ .../common-crawl-head.json | 28 +++++++ .../common-crawl-middle.json | 28 +++++++ .../common-crawl-tail.json | 28 +++++++ .../config/olmo-train-sample-small/stack.json | 28 +++++++ .../config/olmo-train-sample/stack-half.json | 28 +++++++ 8 files changed, 389 insertions(+) create mode 100644 pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-head.json create mode 100644 pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-middle.json create mode 100644 pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-tail.json create mode 100644 pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-head.json create mode 100644 pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-middle.json create mode 100644 
pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-tail.json create mode 100644 pretrain_data/mixer/config/olmo-train-sample-small/stack.json create mode 100644 pretrain_data/mixer/config/olmo-train-sample/stack-half.json diff --git a/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-head.json b/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-head.json new file mode 100644 index 000000000..fbf7adc48 --- /dev/null +++ b/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-head.json @@ -0,0 +1,83 @@ +{ + "streams": [ + { + "name": "cc_en_head", + "documents": [ + "pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_head/*.json.gz" + ], + "attributes": [ + "decontamination", + "dedupe_paragraphs", + "gopher_rules", + "hatespeech_nsfw_cc_v3", + "pii_detection", + "random" + ], + "output": { + "path": "pretraining-data/sources/olmo-mix/v1-no-removal/documents/common-crawl/cc_en_head", + "max_size_in_bytes": 4294967296, + "discard_fields": [ + "attributes", + "metadata", + "added", + "created" + ] + }, + "filter": { + "include": [], + "exclude": [ + "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]", + "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]", + "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]", + "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]", + "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]", + 
"$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] < 0.8)]", + "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] > 0.9)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] > 0.3)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > 0.3)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] > 0.2)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] > 0.18)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram && 
@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] > 0.16)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] > 0.15)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] > 0.14)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] > 0.13)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] > 0.12)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]", + "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]", + "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && 
@.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]", + "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.5104606781)]" + ] + }, + "span_replacement": [ + { + "span": "$.attributes.bff_duplicate_paragraph_spans", + "min_score": 0.5, + "replacement": "" + }, + { + "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS", + "min_score": 0.5, + "replacement": " |||EMAIL_ADDRESS||| " + }, + { + "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER", + "min_score": 0.5, + "replacement": " |||PHONE_NUMBER||| " + }, + { + "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS", + "min_score": 0.5, + "replacement": " |||IP_ADDRESS||| " + } + ] + } + ], + "work_dir": { + "input": "/tmp/olmo-mix-v1/input", + "output": "/tmp/olmo-mix-v1/output" + }, + "processes": 128 +} diff --git a/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-middle.json b/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-middle.json new file mode 100644 index 000000000..4e9144c71 --- /dev/null +++ b/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-middle.json @@ -0,0 +1,83 @@ +{ + "streams": [ + { + "name": "cc_en_middle", + "documents": [ + "pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_middle/*.json.gz" + ], + "attributes": [ + "decontamination", + "dedupe_paragraphs", + "gopher_rules", + "hatespeech_nsfw_cc_v3", + "pii_detection", + "random" + ], + "output": { + "path": "pretraining-data/sources/olmo-mix/v1-no-removal/documents/common-crawl/cc_en_middle", + "max_size_in_bytes": 4294967296, + "discard_fields": [ + "attributes", + "metadata", + "added", + "created" + ] + }, + "filter": { + "include": [], + "exclude": [ + "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]", + 
"$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]", + "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]", + "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]", + "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] < 0.8)]", + "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] > 0.9)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] > 0.3)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && 
@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > 0.3)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] > 0.2)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] > 0.18)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] > 0.16)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] > 0.15)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] > 0.14)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] > 0.13)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] > 0.12)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && 
@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]", + "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]", + "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]", + "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.5104606781)]" + ] + }, + "span_replacement": [ + { + "span": "$.attributes.bff_duplicate_paragraph_spans", + "min_score": 0.5, + "replacement": "" + }, + { + "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS", + "min_score": 0.5, + "replacement": " |||EMAIL_ADDRESS||| " + }, + { + "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER", + "min_score": 0.5, + "replacement": " |||PHONE_NUMBER||| " + }, + { + "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS", + "min_score": 0.5, + "replacement": " |||IP_ADDRESS||| " + } + ] + } + ], + "work_dir": { + "input": "/tmp/olmo-mix-v1/input", + "output": "/tmp/olmo-mix-v1/output" + }, + "processes": 128 +} diff --git a/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-tail.json b/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-tail.json new file mode 100644 index 000000000..1bb1fcbee --- /dev/null +++ b/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-tail.json @@ -0,0 +1,83 @@ +{ + "streams": [ + { + "name": "cc_en_tail", + 
"documents": [ + "pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/*.json.gz" + ], + "attributes": [ + "decontamination", + "dedupe_paragraphs", + "gopher_rules", + "hatespeech_nsfw_cc_v3", + "pii_detection", + "random" + ], + "output": { + "path": "pretraining-data/sources/olmo-mix/v1-no-removal/documents/common-crawl/cc_en_tail", + "max_size_in_bytes": 4294967296, + "discard_fields": [ + "attributes", + "metadata", + "added", + "created" + ] + }, + "filter": { + "include": [], + "exclude": [ + "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]", + "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]", + "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]", + "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]", + "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] < 0.8)]", + "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] && 
@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] > 0.9)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] > 0.3)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > 0.3)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] > 0.2)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] > 0.18)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] > 0.16)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] > 0.15)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] && 
@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] > 0.14)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] > 0.13)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] > 0.12)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]", + "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]", + "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]", + "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]", + "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.5104606781)]" + ] + }, + "span_replacement": [ + { + "span": "$.attributes.bff_duplicate_paragraph_spans", + "min_score": 0.5, + "replacement": "" + }, + { + "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS", + "min_score": 0.5, + "replacement": " |||EMAIL_ADDRESS||| " + }, + { + "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER", + "min_score": 0.5, + 
"replacement": " |||PHONE_NUMBER||| " + }, + { + "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS", + "min_score": 0.5, + "replacement": " |||IP_ADDRESS||| " + } + ] + } + ], + "work_dir": { + "input": "/tmp/olmo-mix-v1/input", + "output": "/tmp/olmo-mix-v1/output" + }, + "processes": 128 +} diff --git a/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-head.json b/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-head.json new file mode 100644 index 000000000..80e555026 --- /dev/null +++ b/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-head.json @@ -0,0 +1,28 @@ +{ + "streams": [ + { + "name": "cc_en_head", + "documents": [ + "pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_head/*.gz" + ], + "output": { + "path": "pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_head", + "max_size_in_bytes": 3894967296 + }, + "attributes": [ + "random" + ], + "filter": { + "include": [ + "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.4334)]" + ], + "exclude": [] + } + } + ], + "work_dir": { + "input": "/tmp/cc-head-sample/mixer/input", + "output": "/tmp/cc-head-sample/mixer/output" + }, + "processes": 120 +} diff --git a/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-middle.json b/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-middle.json new file mode 100644 index 000000000..b1b146686 --- /dev/null +++ b/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-middle.json @@ -0,0 +1,28 @@ +{ + "streams": [ + { + "name": "cc_en_middle", + "documents": [ + "pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_middle/*.gz" + ], + "output": { + "path": "pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_middle", + "max_size_in_bytes": 3894967296 + }, + "attributes": [ + "random" + ], + "filter": { + "include": [ + "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.4334)]" + ], + 
"exclude": [] + } + } + ], + "work_dir": { + "input": "/tmp/cc-middle-sample/mixer/input", + "output": "/tmp/cc-middle-sample/mixer/output" + }, + "processes": 120 +} diff --git a/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-tail.json b/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-tail.json new file mode 100644 index 000000000..928ffa820 --- /dev/null +++ b/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-tail.json @@ -0,0 +1,28 @@ +{ + "streams": [ + { + "name": "cc_en_tail", + "documents": [ + "pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_tail/*.gz" + ], + "output": { + "path": "pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_tail", + "max_size_in_bytes": 3894967296 + }, + "attributes": [ + "random" + ], + "filter": { + "include": [ + "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.4334)]" + ], + "exclude": [] + } + } + ], + "work_dir": { + "input": "/tmp/cc-tail-sample/mixer/input", + "output": "/tmp/cc-tail-sample/mixer/output" + }, + "processes": 120 +} diff --git a/pretrain_data/mixer/config/olmo-train-sample-small/stack.json b/pretrain_data/mixer/config/olmo-train-sample-small/stack.json new file mode 100644 index 000000000..de182c09b --- /dev/null +++ b/pretrain_data/mixer/config/olmo-train-sample-small/stack.json @@ -0,0 +1,28 @@ +{ + "streams": [ + { + "name": "stack-v2-mixer-train", + "documents": [ + "pretraining-data/sources/olmo-mix/v1/documents/stack/*.gz" + ], + "output": { + "path": "pretraining-data/sources/olmo-mix/v1-sample-small/documents/stack", + "max_size_in_bytes": 3894967296 + }, + "attributes": [ + "random" + ], + "filter": { + "include": [ + "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.1429)]" + ], + "exclude": [] + } + } + ], + "work_dir": { + "input": "/tmp/stack-sample/mixer/input", + "output": "/tmp/stack-sample/mixer/output" + }, + "processes": 120 +} diff --git 
a/pretrain_data/mixer/config/olmo-train-sample/stack-half.json b/pretrain_data/mixer/config/olmo-train-sample/stack-half.json new file mode 100644 index 000000000..b78d55d82 --- /dev/null +++ b/pretrain_data/mixer/config/olmo-train-sample/stack-half.json @@ -0,0 +1,28 @@ +{ + "streams": [ + { + "name": "stack-v2-mixer-train", + "documents": [ + "pretraining-data/sources/olmo-mix/v1/documents/stack/*.gz" + ], + "output": { + "path": "pretraining-data/sources/olmo-mix/v1-sample-stack-half/documents/stack", + "max_size_in_bytes": 3894967296 + }, + "attributes": [ + "random" + ], + "filter": { + "include": [ + "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.348782053)]" + ], + "exclude": [] + } + } + ], + "work_dir": { + "input": "/tmp/stack-sample-2x/mixer/input", + "output": "/tmp/stack-sample-2x/mixer/output" + }, + "processes": 120 +} From 1dc41f872afc673ce12bae298078c3c6ac6d606f Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Wed, 28 Jun 2023 11:41:06 -0700 Subject: [PATCH 06/11] mixing --- pretrain_data/mixing-log.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pretrain_data/mixing-log.md b/pretrain_data/mixing-log.md index 65de88323..1fb568720 100644 --- a/pretrain_data/mixing-log.md +++ b/pretrain_data/mixing-log.md @@ -335,9 +335,9 @@ python scripts/prepare_memmap_dataset.py \ ```shell python scripts/prepare_memmap_dataset.py \ - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_head \ + s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-no-removal/documents/common-crawl/cc_en_head \ --safe-mode \ - --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample-small/gpt-neox-20b-pii-special/common-crawl/cc_en_head \ + --output s3://ai2-llm/preprocessed/olmo-mix/v1-no-removal/gpt-neox-20b-pii-special/common-crawl/cc_en_head \ --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \ --workers 120 \ --cache-dir /tmp/llm-preprocessed/cc_en_head @@ -345,9 +345,9 @@ python 
scripts/prepare_memmap_dataset.py \ ```shell python scripts/prepare_memmap_dataset.py \ - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_middle \ + s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-no-removal/documents/common-crawl/cc_en_middle \ --safe-mode \ - --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample-small/gpt-neox-20b-pii-special/common-crawl/cc_en_middle \ + --output s3://ai2-llm/preprocessed/olmo-mix/v1-no-removal/gpt-neox-20b-pii-special/common-crawl/cc_en_middle \ --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \ --workers 120 \ --cache-dir /tmp/llm-preprocessed @@ -355,9 +355,9 @@ python scripts/prepare_memmap_dataset.py \ ```shell python scripts/prepare_memmap_dataset.py \ - s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_tail \ + s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-no-removal/documents/common-crawl/cc_en_tail \ --safe-mode \ - --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample-small/gpt-neox-20b-pii-special/common-crawl/cc_en_tail \ + --output s3://ai2-llm/preprocessed/olmo-mix/v1-no-removal/gpt-neox-20b-pii-special/common-crawl/cc_en_tail \ --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \ --workers 120 \ --cache-dir /tmp/llm-preprocessed From 9965b4c28c1e3c1e34f3e5a6988a3621d8806343 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 30 Jun 2023 20:38:28 -0700 Subject: [PATCH 07/11] added field of study! 
--- pretrain_data/s2/v3-fos/README.md | 22 +++ .../s2/v3-fos/process_corpus/s2ag.sql | 134 +++++++++++++++++ .../s2/v3-fos/process_corpus/s2orc.sql | 142 ++++++++++++++++++ 3 files changed, 298 insertions(+) create mode 100644 pretrain_data/s2/v3-fos/README.md create mode 100644 pretrain_data/s2/v3-fos/process_corpus/s2ag.sql create mode 100644 pretrain_data/s2/v3-fos/process_corpus/s2orc.sql diff --git a/pretrain_data/s2/v3-fos/README.md b/pretrain_data/s2/v3-fos/README.md new file mode 100644 index 000000000..07723e2dc --- /dev/null +++ b/pretrain_data/s2/v3-fos/README.md @@ -0,0 +1,22 @@ +# S2 Corpus v3 + +> *Author*: Luca Soldaini [@soldni](https://github.com/soldni) + + +Version 3 of the S2ORC corpus improves over version 2 by removing abstracts from sources that +are not high-quality. For example, we remove abstracts that have originated exclusively +from the Microsoft Academic Graph (MAG). + +We identified the following sources to be of lower quality (see the source filter in `process_corpus/s2ag.sql`): Unpaywall, MergedPDFExtraction, ScienceParseMerged, Anansi, ScienceParsePlus, Adhoc, ScienceParse, Crawler, and MAG. + +If an abstract is exclusively from one of these sources, we remove it from the corpus.
+ + +## Dataset Statistics + +| Dataset | Split | # Documents | # Words | +|:-------:|:-----:|------------:|---------------:| +| s2ag | train | 30,569,017 | 5,920,099,207 | +| s2ag | valid | 109,709 | 24,029,459 | +| s2orc | train | 8,242,162 | 36,088,195,908 | +| s2orc | valid | 51,323 | 255,139,074 | diff --git a/pretrain_data/s2/v3-fos/process_corpus/s2ag.sql b/pretrain_data/s2/v3-fos/process_corpus/s2ag.sql new file mode 100644 index 000000000..d269608df --- /dev/null +++ b/pretrain_data/s2/v3-fos/process_corpus/s2ag.sql @@ -0,0 +1,134 @@ +UNLOAD ( + WITH filtered_corpus AS ( + SELECT + id, + source, + added, + created, + metadata, + cast(id AS INT) as corpusid, + (metadata.title || CHR(10) || CHR(10) || metadata.abstract) AS text, + IF( + metadata.year < 2022 + OR ( + metadata.year = 2022 AND + date(from_iso8601_timestamp(created)) < date('2022-12-01') + ), + 'train', + 'valid' + ) AS split + FROM ( + SELECT + *, + ARRAY_MAX( + TRANSFORM ( + regexp_extract_all(metadata.abstract, '\b([A-Za-z]\s)([a-z]\s)*[A-Za-z]\b'), + x -> length(x) + ) || 0 + ) AS max_single_letter_sequence, + FILTER( + metadata.sources, + x -> NOT REGEXP_LIKE( + x, + '^(Unpaywall|MergedPDFExtraction|ScienceParseMerged|Anansi|ScienceParsePlus|Adhoc|ScienceParse|Crawler|MAG)$' + ) + ) AS filtered_sources + FROM "temp_lucas"."llm_s2ag_v0" + WHERE + (metadata.title_language = 'en' OR metadata.title_perplexity > -20) + AND metadata.abstract_language = 'en' + AND metadata.abstract_perplexity > -20 + AND metadata.title_count >= 3 + AND metadata.abstract_count >= 50 + AND metadata.abstract_count <= 1000 + AND metadata.year >= 1970 + AND ( + REGEXP_LIKE( + metadata.top_frequencies[1].token, + '^[A-Za-z][A-Za-z]+$' + ) + OR ( + metadata.top_frequencies[1].token = 'a' + AND REGEXP_LIKE( + metadata.top_frequencies[2].token, + '^[A-Za-z][A-Za-z]+$' + ) + ) + ) + ) + WHERE ( + ( + CARDINALITY(filtered_sources) > 0 AND + max_single_letter_sequence < 4 + ) OR ( + max_single_letter_sequence > 0 AND +
CARDINALITY(filtered_sources) = 0 + ) + ) + ), + filtered_corpus_with_fos AS ( + SELECT + cr.id, + cr.source, + cr.added, + cr.created, + cr.text, + cr.split, + CAST( + ROW( + cr.metadata.year, + cr.metadata.title, + cr.metadata.abstract, + cr.metadata.sha1, + cr.metadata.sources, + cr.metadata.title_language, + cr.metadata.abstract_language, + cr.metadata.title_perplexity, + cr.metadata.abstract_perplexity, + cr.metadata.title_count, + cr.metadata.abstract_count, + cr.metadata.top_frequencies, + COALESCE(pq.s2FieldsOfStudy, ARRAY[]), + COALESCE(pq.fieldsOfStudy, ARRAY[]) + ) + AS + ROW( + year BIGINT, + title VARCHAR, + abstract VARCHAR, + sha1 VARCHAR, + sources ARRAY, + title_language VARCHAR, + abstract_language VARCHAR, + title_perplexity DOUBLE, + abstract_perplexity DOUBLE, + title_count BIGINT, + abstract_count BIGINT, + top_frequencies ARRAY, + s2FieldsOfStudy ARRAY, + extFieldsOfStudy ARRAY + ) + ) AS metadata + from espresso.pq_paper as pq + INNER JOIN filtered_corpus as cr + ON pq.corpusid = cr.corpusid + ) + SELECT + id, + ARRAY_AGG(source)[1] AS source, + 'v3-fos' AS version, + ARRAY_AGG(added)[1] AS added, + ARRAY_AGG(created)[1] AS created, + ARRAY_AGG(text)[1] AS text, + ARRAY_AGG(metadata)[1] AS metadata, + ARRAY_AGG(split)[1] AS split, + CAST(id AS INT) % 10 AS part_id + FROM filtered_corpus_with_fos + GROUP BY id +) +TO 's3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag' +WITH ( + format='JSON', + compression='GZIP', + partitioned_by = ARRAY['split', 'part_id'] +) diff --git a/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql b/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql new file mode 100644 index 000000000..bda0e817d --- /dev/null +++ b/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql @@ -0,0 +1,142 @@ +UNLOAD ( + WITH s2orc_stats AS ( + SELECT + id, + source, + added, + created, + metadata, + FILTER( + metadata.paragraphs, + x -> x.perplexity >= -20 + ) as valid_paragraphs, + ( + REGEXP_LIKE( + 
metadata.top_frequencies[1].token, + '^[A-Za-z][a-z]+$' + ) AND ( + ( + metadata.count > 500 AND + ( + CAST(metadata.top_frequencies[1].count AS DOUBLE) / metadata.count + ) <= 0.075 + ) OR ( + metadata.count <= 500 AND + ( + CAST(metadata.top_frequencies[1].count AS DOUBLE) / metadata.count + ) <= 0.3 + ) + ) + ) AS valid_top_word, + ARRAY_SORT( + TRANSFORM( + MAP_ENTRIES( + TRANSFORM_VALUES( + -- from table to map + MULTIMAP_FROM_ENTRIES( + -- from list to table + TRANSFORM( + -- extract rows to count + metadata.paragraphs, + x -> ROW(x.language, 1) + ) + ), + -- merge counts + (k, v) -> REDUCE(v, 0, (s, x) -> s + x, s -> s) + ) + ), + x -> CAST(x AS ROW(lang varchar, cnt int)) + ), + (x, y) -> IF(x.cnt < y.cnt, 1, IF(x.cnt = y.cnt, 0, -1)) + )[1].lang AS language + FROM "temp_lucas"."llm_s2orc_v0" + ), + filtered_corpus AS ( + SELECT + id, + source, + added, + created, + metadata, + cast(id AS INT) as corpusid, + ( + metadata.title || CHR(10) || CHR(10) || + metadata.abstract || CHR(10) || CHR(10) || + ARRAY_JOIN(TRANSFORM(valid_paragraphs, x -> x.text), CHR(10)) + ) as text, + IF( + metadata.year < 2022 + OR ( + metadata.year = 2022 AND + date(from_iso8601_timestamp(created)) < date('2022-12-01') + ), + 'train', + 'valid' + ) AS split + FROM s2orc_stats + WHERE + language = 'en' + AND metadata.count < 50000 + AND metadata.count > 500 + AND valid_top_word + AND cardinality(valid_paragraphs) >= 5 + AND metadata.title IS NOT NULL + AND metadata.abstract is not NULL + AND metadata.year >= 1970 + ), + filtered_corpus_with_fos AS ( + SELECT + cr.id, + cr.source, + cr.added, + cr.created, + cr.text, + cr.split, + CAST( + ROW( + cr.metadata.year, + cr.metadata.title, + cr.metadata.abstract, + cr.metadata.sha1, + cr.metadata.paragraphs, + cr.metadata.count, + cr.metadata.top_frequencies, + COALESCE(pq.s2FieldsOfStudy, ARRAY[]), + COALESCE(pq.fieldsOfStudy, ARRAY[]) + ) + AS + ROW( + year BIGINT, + title VARCHAR, + abstract VARCHAR, + sha1 VARCHAR, + paragraphs ARRAY, + count BIGINT, + top_frequencies
ARRAY, + s2FieldsOfStudy ARRAY, + extFieldsOfStudy ARRAY + ) + ) AS metadata + from espresso.pq_paper as pq + INNER JOIN filtered_corpus as cr + ON pq.corpusid = cr.corpusid + ) + SELECT + id, + ARRAY_AGG(source)[1] AS source, + 'v3-fos' AS version, + ARRAY_AGG(added)[1] AS added, + ARRAY_AGG(created)[1] AS created, + ARRAY_AGG(text)[1] AS text, + ARRAY_AGG(metadata)[1] AS metadata, + ARRAY_AGG(split)[1] AS split, + CAST(id AS INT) % 10 AS part_id + FROM filtered_corpus + GROUP BY id +) +TO 's3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc' +WITH ( + format='JSON', + compression='GZIP', + partitioned_by = ARRAY['split', 'part_id'] +) From 17cc37ab3d0c99c6eedf64715bf46d74b968094b Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Wed, 5 Jul 2023 12:00:31 -0700 Subject: [PATCH 08/11] fixed sql query --- pretrain_data/s2/v3-fos/process_corpus/s2orc.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql b/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql index bda0e817d..b59ae9af1 100644 --- a/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql +++ b/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql @@ -131,7 +131,7 @@ UNLOAD ( ARRAY_AGG(metadata)[1] AS metadata, ARRAY_AGG(split)[1] AS split, CAST(id AS INT) % 10 AS part_id - FROM filtered_corpus + FROM filtered_corpus_with_fos GROUP BY id ) TO 's3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc' From fec0bf0139ce97ab1e308640d2c2372f010f9e7e Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 7 Jul 2023 18:46:31 -0700 Subject: [PATCH 09/11] downloading --- pretrain_data/stackexchange/README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 pretrain_data/stackexchange/README.md diff --git a/pretrain_data/stackexchange/README.md b/pretrain_data/stackexchange/README.md new file mode 100644 index 000000000..e69de29bb From 1f9c31c15afd831ef9fd9abe1838487e1adcd3ed Mon Sep 17 00:00:00 2001 From: 
Luca Soldaini Date: Fri, 7 Jul 2023 18:46:44 -0700 Subject: [PATCH 10/11] downloading --- .../s2/v3-fos/process_corpus/s2orc.sql | 19 +- pretrain_data/stackexchange/download.sh | 46 +++ pretrain_data/stackexchange/names.txt | 367 ++++++++++++++++++ 3 files changed, 427 insertions(+), 5 deletions(-) create mode 100644 pretrain_data/stackexchange/download.sh create mode 100644 pretrain_data/stackexchange/names.txt diff --git a/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql b/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql index b59ae9af1..00d306355 100644 --- a/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql +++ b/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql @@ -84,6 +84,15 @@ UNLOAD ( AND metadata.abstract is not NULL AND metadata.year >= 1970 ), + filtered_espresso AS ( + SELECT + pq.corpusid, + COALESCE(pq.s2FieldsOfStudy, ARRAY[]) as s2FieldsOfStudy, + COALESCE(pq.fieldsOfStudy, ARRAY[]) as fieldsOfStudy + from espresso.pq_paper as pq + INNER JOIN filtered_corpus as cr + ON pq.corpusid = cr.corpusid + ), filtered_corpus_with_fos AS ( SELECT cr.id, @@ -98,11 +107,11 @@ UNLOAD ( cr.metadata.title, cr.metadata.abstract, cr.metadata.sha1, - cr.metadata.paragraphs, + -- cr.metadata.paragraphs, cr.metadata.count, cr.metadata.top_frequencies, - COALESCE(pq.s2FieldsOfStudy, ARRAY[]), - COALESCE(pq.fieldsOfStudy, ARRAY[]) + pq.s2FieldsOfStudy, + pq.fieldsOfStudy ) AS ROW( @@ -110,14 +119,14 @@ UNLOAD ( title VARCHAR, abstract VARCHAR, sha1 VARCHAR, - paragraphs ARRAY, + -- paragraphs ARRAY, count BIGINT, top_frequencies ARRAY, s2FieldsOfStudy ARRAY, extFieldsOfStudy ARRAY ) ) AS metadata - from espresso.pq_paper as pq + from filtered_espresso as pq INNER JOIN filtered_corpus as cr ON pq.corpusid = cr.corpusid ) diff --git a/pretrain_data/stackexchange/download.sh b/pretrain_data/stackexchange/download.sh new file mode 100644 index 000000000..fe25ad5a8 --- /dev/null +++ b/pretrain_data/stackexchange/download.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +# Author: 
Luca Soldaini +# Email: luca@soldaini.net + +# get script directory +SOURCE="${BASH_SOURCE[0]}" +while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink + SCRIPT_DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" + SOURCE="$(readlink "$SOURCE")" + # if $SOURCE was a relative symlink, we need to resolve it + # relative to the path where the symlink file was located + [[ $SOURCE != /* ]] && SOURCE="$SCRIPT_DIR/$SOURCE" +done +SCRIPT_DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )" + +names_file="${SCRIPT_DIR}/names.txt" +num_processes=8 + +process_file() { + local name="$1" + local s3_prefix="s3://ai2-llm/pretraining-data/sources/stackexchange/raw/" + local url_prefix="https://archive.org/download/stackexchange/" + + # Skip archives already in the s3 bucket so the script can be re-run safely + if aws s3 ls "${s3_prefix}${name}" > /dev/null 2>&1; then + echo "File ${name} exists in s3 bucket." + else + echo "File ${name} does not exist in s3 bucket. Downloading and uploading..." + wget -P "/tmp" "${url_prefix}${name}" || rm -f "/tmp/${name}" + + # Check if file downloaded successfully (a failed wget removes its partial file above) + if [ -f "/tmp/${name}" ]; then + echo "Successfully downloaded ${name}. Uploading to s3..." + aws s3 cp "/tmp/${name}" "${s3_prefix}${name}" + rm -f "/tmp/${name}" # remove the downloaded file from /tmp, where wget saved it + else + echo "Failed to download ${name}." + fi + fi +} + +export -f process_file + +# Use xargs to pass each line in names.txt to process_file function and run in parallel.
+cat $names_file | xargs -I {} -P $num_processes bash -c "process_file {}" diff --git a/pretrain_data/stackexchange/names.txt b/pretrain_data/stackexchange/names.txt new file mode 100644 index 000000000..82e96b900 --- /dev/null +++ b/pretrain_data/stackexchange/names.txt @@ -0,0 +1,367 @@ +3dprinting.meta.stackexchange.com.7z +3dprinting.stackexchange.com.7z +academia.meta.stackexchange.com.7z +academia.stackexchange.com.7z +ai.meta.stackexchange.com.7z +ai.stackexchange.com.7z +android.meta.stackexchange.com.7z +android.stackexchange.com.7z +anime.meta.stackexchange.com.7z +anime.stackexchange.com.7z +apple.meta.stackexchange.com.7z +apple.stackexchange.com.7z +arduino.meta.stackexchange.com.7z +arduino.stackexchange.com.7z +askubuntu.com.7z +astronomy.meta.stackexchange.com.7z +astronomy.stackexchange.com.7z +aviation.meta.stackexchange.com.7z +aviation.stackexchange.com.7z +avp.meta.stackexchange.com.7z +avp.stackexchange.com.7z +beer.meta.stackexchange.com.7z +beer.stackexchange.com.7z +bicycles.meta.stackexchange.com.7z +bicycles.stackexchange.com.7z +bioacoustics.meta.stackexchange.com.7z +bioacoustics.stackexchange.com.7z +bioinformatics.meta.stackexchange.com.7z +bioinformatics.stackexchange.com.7z +biology.meta.stackexchange.com.7z +biology.stackexchange.com.7z +bitcoin.meta.stackexchange.com.7z +bitcoin.stackexchange.com.7z +blender.meta.stackexchange.com.7z +blender.stackexchange.com.7z +boardgames.meta.stackexchange.com.7z +boardgames.stackexchange.com.7z +bricks.meta.stackexchange.com.7z +bricks.stackexchange.com.7z +buddhism.meta.stackexchange.com.7z +buddhism.stackexchange.com.7z +cardano.meta.stackexchange.com.7z +cardano.stackexchange.com.7z +chemistry.meta.stackexchange.com.7z +chemistry.stackexchange.com.7z +chess.meta.stackexchange.com.7z +chess.stackexchange.com.7z +chinese.meta.stackexchange.com.7z +chinese.stackexchange.com.7z +christianity.meta.stackexchange.com.7z +christianity.stackexchange.com.7z +civicrm.meta.stackexchange.com.7z 
+civicrm.stackexchange.com.7z +codegolf.meta.stackexchange.com.7z +codegolf.stackexchange.com.7z +codereview.meta.stackexchange.com.7z +codereview.stackexchange.com.7z +coffee.meta.stackexchange.com.7z +coffee.stackexchange.com.7z +cogsci.meta.stackexchange.com.7z +cogsci.stackexchange.com.7z +computergraphics.meta.stackexchange.com.7z +computergraphics.stackexchange.com.7z +conlang.meta.stackexchange.com.7z +conlang.stackexchange.com.7z +cooking.meta.stackexchange.com.7z +cooking.stackexchange.com.7z +craftcms.meta.stackexchange.com.7z +craftcms.stackexchange.com.7z +crafts.meta.stackexchange.com.7z +crafts.stackexchange.com.7z +crypto.meta.stackexchange.com.7z +crypto.stackexchange.com.7z +cs.meta.stackexchange.com.7z +cs.stackexchange.com.7z +cseducators.meta.stackexchange.com.7z +cseducators.stackexchange.com.7z +cstheory.meta.stackexchange.com.7z +cstheory.stackexchange.com.7z +datascience.meta.stackexchange.com.7z +datascience.stackexchange.com.7z +dba.meta.stackexchange.com.7z +dba.stackexchange.com.7z +devops.meta.stackexchange.com.7z +devops.stackexchange.com.7z +diy.meta.stackexchange.com.7z +diy.stackexchange.com.7z +drones.meta.stackexchange.com.7z +drones.stackexchange.com.7z +drupal.meta.stackexchange.com.7z +drupal.stackexchange.com.7z +dsp.meta.stackexchange.com.7z +dsp.stackexchange.com.7z +earthscience.meta.stackexchange.com.7z +earthscience.stackexchange.com.7z +ebooks.meta.stackexchange.com.7z +ebooks.stackexchange.com.7z +economics.meta.stackexchange.com.7z +economics.stackexchange.com.7z +electronics.meta.stackexchange.com.7z +electronics.stackexchange.com.7z +elementaryos.meta.stackexchange.com.7z +elementaryos.stackexchange.com.7z +ell.meta.stackexchange.com.7z +ell.stackexchange.com.7z +emacs.meta.stackexchange.com.7z +emacs.stackexchange.com.7z +engineering.meta.stackexchange.com.7z +engineering.stackexchange.com.7z +english.meta.stackexchange.com.7z +english.stackexchange.com.7z +eosio.meta.stackexchange.com.7z +eosio.stackexchange.com.7z 
+es.meta.stackoverflow.com.7z +es.stackoverflow.com.7z +esperanto.meta.stackexchange.com.7z +esperanto.stackexchange.com.7z +ethereum.meta.stackexchange.com.7z +ethereum.stackexchange.com.7z +expatriates.meta.stackexchange.com.7z +expatriates.stackexchange.com.7z +expressionengine.meta.stackexchange.com.7z +expressionengine.stackexchange.com.7z +fitness.meta.stackexchange.com.7z +fitness.stackexchange.com.7z +freelancing.meta.stackexchange.com.7z +freelancing.stackexchange.com.7z +french.meta.stackexchange.com.7z +french.stackexchange.com.7z +gamedev.meta.stackexchange.com.7z +gamedev.stackexchange.com.7z +gaming.meta.stackexchange.com.7z +gaming.stackexchange.com.7z +gardening.meta.stackexchange.com.7z +gardening.stackexchange.com.7z +genealogy.meta.stackexchange.com.7z +genealogy.stackexchange.com.7z +german.meta.stackexchange.com.7z +german.stackexchange.com.7z +gis.meta.stackexchange.com.7z +gis.stackexchange.com.7z +graphicdesign.meta.stackexchange.com.7z +graphicdesign.stackexchange.com.7z +ham.meta.stackexchange.com.7z +ham.stackexchange.com.7z +hardwarerecs.meta.stackexchange.com.7z +hardwarerecs.stackexchange.com.7z +health.meta.stackexchange.com.7z +health.stackexchange.com.7z +hermeneutics.meta.stackexchange.com.7z +hermeneutics.stackexchange.com.7z +hinduism.meta.stackexchange.com.7z +hinduism.stackexchange.com.7z +history.meta.stackexchange.com.7z +history.stackexchange.com.7z +homebrew.meta.stackexchange.com.7z +homebrew.stackexchange.com.7z +hsm.meta.stackexchange.com.7z +hsm.stackexchange.com.7z +interpersonal.meta.stackexchange.com.7z +interpersonal.stackexchange.com.7z +iot.meta.stackexchange.com.7z +iot.stackexchange.com.7z +iota.meta.stackexchange.com.7z +iota.stackexchange.com.7z +islam.meta.stackexchange.com.7z +islam.stackexchange.com.7z +italian.meta.stackexchange.com.7z +italian.stackexchange.com.7z +ja.meta.stackoverflow.com.7z +ja.stackoverflow.com.7z +japanese.meta.stackexchange.com.7z +japanese.stackexchange.com.7z 
+joomla.meta.stackexchange.com.7z +joomla.stackexchange.com.7z +judaism.meta.stackexchange.com.7z +judaism.stackexchange.com.7z +korean.meta.stackexchange.com.7z +korean.stackexchange.com.7z +languagelearning.meta.stackexchange.com.7z +languagelearning.stackexchange.com.7z +latin.meta.stackexchange.com.7z +latin.stackexchange.com.7z +law.meta.stackexchange.com.7z +law.stackexchange.com.7z +lifehacks.meta.stackexchange.com.7z +lifehacks.stackexchange.com.7z +linguistics.meta.stackexchange.com.7z +linguistics.stackexchange.com.7z +literature.meta.stackexchange.com.7z +literature.stackexchange.com.7z +magento.meta.stackexchange.com.7z +magento.stackexchange.com.7z +martialarts.meta.stackexchange.com.7z +martialarts.stackexchange.com.7z +materials.meta.stackexchange.com.7z +materials.stackexchange.com.7z +math.meta.stackexchange.com.7z +math.stackexchange.com.7z +matheducators.meta.stackexchange.com.7z +matheducators.stackexchange.com.7z +mathematica.meta.stackexchange.com.7z +mathematica.stackexchange.com.7z +mathoverflow.net.7z +mechanics.meta.stackexchange.com.7z +mechanics.stackexchange.com.7z +meta.askubuntu.com.7z +meta.mathoverflow.net.7z +meta.serverfault.com.7z +meta.stackexchange.com.7z +meta.stackoverflow.com.7z +meta.superuser.com.7z +moderators.meta.stackexchange.com.7z +moderators.stackexchange.com.7z +monero.meta.stackexchange.com.7z +monero.stackexchange.com.7z +money.meta.stackexchange.com.7z +money.stackexchange.com.7z +movies.meta.stackexchange.com.7z +movies.stackexchange.com.7z +music.meta.stackexchange.com.7z +music.stackexchange.com.7z +musicfans.meta.stackexchange.com.7z +musicfans.stackexchange.com.7z +mythology.meta.stackexchange.com.7z +mythology.stackexchange.com.7z +networkengineering.meta.stackexchange.com.7z +networkengineering.stackexchange.com.7z +opendata.meta.stackexchange.com.7z +opendata.stackexchange.com.7z +opensource.meta.stackexchange.com.7z +opensource.stackexchange.com.7z +or.meta.stackexchange.com.7z +or.stackexchange.com.7z 
+outdoors.meta.stackexchange.com.7z +outdoors.stackexchange.com.7z +parenting.meta.stackexchange.com.7z +parenting.stackexchange.com.7z +patents.meta.stackexchange.com.7z +patents.stackexchange.com.7z +pets.meta.stackexchange.com.7z +pets.stackexchange.com.7z +philosophy.meta.stackexchange.com.7z +philosophy.stackexchange.com.7z +photo.meta.stackexchange.com.7z +photo.stackexchange.com.7z +physics.meta.stackexchange.com.7z +physics.stackexchange.com.7z +pm.meta.stackexchange.com.7z +pm.stackexchange.com.7z +poker.meta.stackexchange.com.7z +poker.stackexchange.com.7z +politics.meta.stackexchange.com.7z +politics.stackexchange.com.7z +portuguese.meta.stackexchange.com.7z +portuguese.stackexchange.com.7z +proofassistants.meta.stackexchange.com.7z +proofassistants.stackexchange.com.7z +pt.meta.stackoverflow.com.7z +pt.stackoverflow.com.7z +puzzling.meta.stackexchange.com.7z +puzzling.stackexchange.com.7z +quant.meta.stackexchange.com.7z +quant.stackexchange.com.7z +quantumcomputing.meta.stackexchange.com.7z +quantumcomputing.stackexchange.com.7z +raspberrypi.meta.stackexchange.com.7z +raspberrypi.stackexchange.com.7z +retrocomputing.meta.stackexchange.com.7z +retrocomputing.stackexchange.com.7z +reverseengineering.meta.stackexchange.com.7z +reverseengineering.stackexchange.com.7z +robotics.meta.stackexchange.com.7z +robotics.stackexchange.com.7z +rpg.meta.stackexchange.com.7z +rpg.stackexchange.com.7z +ru.meta.stackoverflow.com.7z +ru.stackoverflow.com.7z +rus.meta.stackexchange.com.7z +rus.stackexchange.com.7z +russian.meta.stackexchange.com.7z +russian.stackexchange.com.7z +salesforce.meta.stackexchange.com.7z +salesforce.stackexchange.com.7z +scicomp.meta.stackexchange.com.7z +scicomp.stackexchange.com.7z +scifi.meta.stackexchange.com.7z +scifi.stackexchange.com.7z +security.meta.stackexchange.com.7z +security.stackexchange.com.7z +serverfault.com.7z +sharepoint.meta.stackexchange.com.7z +sharepoint.stackexchange.com.7z +sitecore.meta.stackexchange.com.7z 
+sitecore.stackexchange.com.7z +skeptics.meta.stackexchange.com.7z +skeptics.stackexchange.com.7z +softwareengineering.meta.stackexchange.com.7z +softwareengineering.stackexchange.com.7z +softwarerecs.meta.stackexchange.com.7z +softwarerecs.stackexchange.com.7z +solana.meta.stackexchange.com.7z +solana.stackexchange.com.7z +sound.meta.stackexchange.com.7z +sound.stackexchange.com.7z +space.meta.stackexchange.com.7z +space.stackexchange.com.7z +spanish.meta.stackexchange.com.7z +spanish.stackexchange.com.7z +sports.meta.stackexchange.com.7z +sports.stackexchange.com.7z +sqa.meta.stackexchange.com.7z +sqa.stackexchange.com.7z +stackapps.com.7z +stackoverflow.com-Badges.7z +stackoverflow.com-Comments.7z +stackoverflow.com-PostHistory.7z +stackoverflow.com-PostLinks.7z +stackoverflow.com-Posts.7z +stackoverflow.com-Tags.7z +stackoverflow.com-Users.7z +stackoverflow.com-Votes.7z +stats.meta.stackexchange.com.7z +stats.stackexchange.com.7z +stellar.meta.stackexchange.com.7z +stellar.stackexchange.com.7z +substrate.meta.stackexchange.com.7z +substrate.stackexchange.com.7z +superuser.com.7z +sustainability.meta.stackexchange.com.7z +sustainability.stackexchange.com.7z +tex.meta.stackexchange.com.7z +tex.stackexchange.com.7z +tezos.meta.stackexchange.com.7z +tezos.stackexchange.com.7z +tor.meta.stackexchange.com.7z +tor.stackexchange.com.7z +travel.meta.stackexchange.com.7z +travel.stackexchange.com.7z +tridion.meta.stackexchange.com.7z +tridion.stackexchange.com.7z +ukrainian.meta.stackexchange.com.7z +ukrainian.stackexchange.com.7z +unix.meta.stackexchange.com.7z +unix.stackexchange.com.7z +ux.meta.stackexchange.com.7z +ux.stackexchange.com.7z +vegetarianism.meta.stackexchange.com.7z +vegetarianism.stackexchange.com.7z +vi.meta.stackexchange.com.7z +vi.stackexchange.com.7z +webapps.meta.stackexchange.com.7z +webapps.stackexchange.com.7z +webmasters.meta.stackexchange.com.7z +webmasters.stackexchange.com.7z +windowsphone.meta.stackexchange.com.7z 
+windowsphone.stackexchange.com.7z +woodworking.meta.stackexchange.com.7z +woodworking.stackexchange.com.7z +wordpress.meta.stackexchange.com.7z +wordpress.stackexchange.com.7z +workplace.meta.stackexchange.com.7z +workplace.stackexchange.com.7z +worldbuilding.meta.stackexchange.com.7z +worldbuilding.stackexchange.com.7z +writers.meta.stackexchange.com.7z +writers.stackexchange.com.7z From 70620ab362be1afbd2fe6e70ed079c0b44ad3bb2 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Sat, 8 Jul 2023 10:17:26 -0700 Subject: [PATCH 11/11] style --- scripts/prepare_memmap_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py index 5b7bb5fbf..7a802ff4c 100644 --- a/scripts/prepare_memmap_dataset.py +++ b/scripts/prepare_memmap_dataset.py @@ -361,7 +361,7 @@ def make_source_and_target( "--cache-dir", type=str, default=None, - help="Cache directory for the tokenizer; use system default if not specified" + help="Cache directory for the tokenizer; use system default if not specified", ) @click.option( "--max-tokens",