From b5e74a87a730b1f880d4ee9756f603159662a5ca Mon Sep 17 00:00:00 2001
From: Luca Soldaini <lucas@allenai.org>
Date: Tue, 13 Jun 2023 14:15:31 -0500
Subject: [PATCH 01/11] mixing

---
 .../ablations/dedupers/c4-v0-dedup.json       | 24 +++++++++
 pretrain_data/mixer/config/olmo-train/c4.json | 53 +------------------
 pretrain_data/mixing-log.md                   | 32 +++++++----
 scripts/prepare_memmap_dataset.py             | 40 +++++++++++---
 4 files changed, 81 insertions(+), 68 deletions(-)
 create mode 100644 pretrain_data/mixer/config/ablations/dedupers/c4-v0-dedup.json

diff --git a/pretrain_data/mixer/config/ablations/dedupers/c4-v0-dedup.json b/pretrain_data/mixer/config/ablations/dedupers/c4-v0-dedup.json
new file mode 100644
index 000000000..35ac9093a
--- /dev/null
+++ b/pretrain_data/mixer/config/ablations/dedupers/c4-v0-dedup.json
@@ -0,0 +1,24 @@
+{
+  "documents": [
+    "pretraining-data/sources/c4/v0/documents/train/*.gz"
+  ],
+  "work_dir": {
+    "input": "/data2/c4/deduper/input",
+    "output": "/data2/c4/deduper/output"
+  },
+  "dedupe": {
+    "name": "decontamination",
+    "paragraphs": {
+      "attribute_name": "bff_duplicate_paragraph_spans_decontamination"
+    },
+    "skip_empty": true
+  },
+  "bloom_filter": {
+    "file": "/tmp/decontamination/deduper_decontamination_lucas_20230525.bin",
+    "size_in_bytes": 8388608,
+    "read_only": true,
+    "estimated_doc_count": 3898706,
+    "desired_false_positive_rate": 0.001
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/olmo-train/c4.json b/pretrain_data/mixer/config/olmo-train/c4.json
index 57854fe4a..3c6f96bd0 100644
--- a/pretrain_data/mixer/config/olmo-train/c4.json
+++ b/pretrain_data/mixer/config/olmo-train/c4.json
@@ -23,59 +23,10 @@
       "filter": {
         "include": [],
         "exclude": [
-          "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] < 0.8)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] > 0.9)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] > 0.3)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > 0.3)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] > 0.2)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] > 0.18)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] > 0.16)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] > 0.15)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] > 0.14)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] > 0.13)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] > 0.12)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]",
-          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]",
-
-          "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]",
-
-          "$.attributes[?(@.olmo_mix_v1_taggers__pii_regex_with_counts_v2__doc[0][2] > 5)]"
+          "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]"
         ]
       },
-      "span_replacement": [
-        {
-          "span": "$.attributes.olmo_mix_v1_taggers__jigsaw_hatespeech_sentence_v2____label__toxic",
-          "min_score": 0.4,
-          "replacement": ""
-        },
-        {
-          "span": "$.attributes.olmo_mix_v1_taggers__jigsaw_nsfw_sencence_v2____label__nsfw",
-          "min_score": 0.4,
-          "replacement": ""
-        },
-        {
-          "span": "$.attributes.olmo_mix_v1_taggers__pii_regex_with_counts_v2__EMAIL_ADDRESS",
-          "min_score": 0.5,
-          "replacement": " |||EMAIL_ADDRESS||| "
-        },
-        {
-          "span": "$.attributes.olmo_mix_v1_taggers__pii_regex_with_counts_v2__PHONE_NUMBER",
-          "min_score": 0.5,
-          "replacement": " |||PHONE_NUMBER||| "
-        },
-        {
-          "span": "$.attributes.olmo_mix_v1_taggers__pii_regex_with_counts_v2__IP_ADDRESS",
-          "min_score": 0.5,
-          "replacement": " |||IP_ADDRESS||| "
-        }
-      ]
+      "span_replacement": []
     }
   ],
   "work_dir": {
diff --git a/pretrain_data/mixing-log.md b/pretrain_data/mixing-log.md
index cb36a022a..ad910ca4f 100644
--- a/pretrain_data/mixing-log.md
+++ b/pretrain_data/mixing-log.md
@@ -1,4 +1,4 @@
-# OLMO Mixing Log 
+# OLMO Mixing Log
 
 
 Tagged Wikipedia using following command
@@ -17,7 +17,7 @@ ai2_llm_filters \
     -p 96 \
     --reuse-existing $HOME/wikipedia/meta \
     --local-read-cache $HOME/wikipedia/cache  \
-    --skip-on-failure 
+    --skip-on-failure
 ```
 
 Tagged C4 with the following. Using both `v0` and `v0-c4-cleaned`. The `c4-cleaned` shouldn't have much of a diff, but it's good for consistency.
@@ -34,7 +34,7 @@ ai2_llm_filters \
         gopher_v1 \
     -p 96 \
     --reuse-existing $HOME/c4-v0-c4-cleaned/meta \
-    --local-read-cache $HOME/c4-v0-c4-cleaned/cache 
+    --local-read-cache $HOME/c4-v0-c4-cleaned/cache
 ```
 
 ```shell
@@ -48,7 +48,7 @@ ai2_llm_filters \
         gopher_v1 \
     -p 96 \
     --reuse-existing $HOME/c4-v0/meta \
-    --local-read-cache $HOME/c4-v0/cache 
+    --local-read-cache $HOME/c4-v0/cache
 ```
 
 
@@ -99,13 +99,13 @@ ai2_llm_filters \
     --local-read-cache $HOME/v1-c4-cleaned/cc_en_head_download
 ```
 
-Created configurations 
+Created configurations
 
 ```shell
 python /Users/lucas/Code/LLM/pretrain_data/mixer/scripts/partition_deduper.py -w 100 -o pretrain_data
 ```
 
-Output: 
+Output:
 
 ```text
 pretrain_data/mixer/config/pdedup_c1_v1_c4-cleaned/0.json: 1105.45 GB, 756 files.
@@ -149,7 +149,7 @@ Books:
 python scripts/prepare_memmap_dataset.py  \
     s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/books \
     --safe-mode \
-    --output s3://ai2-llm/preprocessed/sources/olmo-mix/v1/gpt-neox-20b-pii-special/books \
+    --output s3://ai2-llm/preprocessed/olmo-mix/v1/gpt-neox-20b-pii-special/books \
     --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \
     --workers 120
 ```
@@ -160,7 +160,7 @@ Semantic Scholar:
 python scripts/prepare_memmap_dataset.py  \
     s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/s2 \
     --safe-mode \
-    --output s3://ai2-llm/preprocessed/sources/olmo-mix/v1/gpt-neox-20b-pii-special/s2 \
+    --output s3://ai2-llm/preprocessed/olmo-mix/v1/gpt-neox-20b-pii-special/s2 \
     --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \
     --workers 120
 ```
@@ -171,7 +171,19 @@ Wikipedia:
 python scripts/prepare_memmap_dataset.py  \
     s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/wiki \
     --safe-mode \
-    --output s3://ai2-llm/preprocessed/sources/olmo-mix/v1/gpt-neox-20b-pii-special/wiki \
+    --output s3://ai2-llm/preprocessed/olmo-mix/v1/gpt-neox-20b-pii-special/wiki \
     --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \
     --workers 120
-```
\ No newline at end of file
+```
+
+C4:
+
+```shell
+python scripts/prepare_memmap_dataset.py  \
+    s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/c4 \
+    --safe-mode \
+    --output s3://ai2-llm/preprocessed/olmo-mix/v1/gpt-neox-20b-pii-special/c4 \
+    --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \
+    --workers 120 \
+    --cache-dir /data2/llm-preprocessed
+```
diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py
index f5d3d6180..5b7bb5fbf 100644
--- a/scripts/prepare_memmap_dataset.py
+++ b/scripts/prepare_memmap_dataset.py
@@ -70,7 +70,12 @@ class InputDocumentSpec(msgspec.Struct):
     text: str
 
 
-def tokenize_file(tokenizer: Tokenizer, path: str, safe_mode: bool = False) -> Generator[List[int], None, None]:
+def tokenize_file(
+    tokenizer: Tokenizer,
+    path: str,
+    safe_mode: bool = False,
+    cache_dir: Optional[str] = None,
+) -> Generator[List[int], None, None]:
     """Tokenize a file of documents using the provided tokenizer; file is expected to be a gzipped JSON lines
     file, each containing a field named `text`.
     """
@@ -80,7 +85,7 @@ def tokenize_file(tokenizer: Tokenizer, path: str, safe_mode: bool = False) -> G
 
     with ExitStack() as stack:
         if safe_mode:
-            caching_path = cached_path(path)
+            caching_path = cached_path(path, cache_dir=cache_dir)
             input_stream = stack.enter_context(gzip.open(caching_path, mode="rt"))
         else:
             input_file = stack.enter_context(stream_file_for_read(path, mode="rb"))
@@ -228,7 +233,8 @@ def fill_memmap(
     sample_rate: float = 1.0,
     random_seed: int = 3920,
     repeat_sequence: int = 1,
-):
+    cache_dir: Optional[str] = None,
+) -> int:
     """Write a memmap file from a file of documents."""
 
     # set the seed in case we need to sample
@@ -243,13 +249,16 @@ def fill_memmap(
     # we increment this every time we create a new memmap file
     file_index = 0
 
+    # total number of tokens written
+    total_tokens = 0
+
     # make sure path is a list
     path_or_paths = [path_or_paths] if isinstance(path_or_paths, str) else path_or_paths
 
     with ExitStack() as stack:
         it = itertools.chain.from_iterable(
             # repeat the sequence if necessary
-            tokenize_file(tokenizer=tokenizer, path=path, safe_mode=safe_mode)
+            tokenize_file(tokenizer=tokenizer, path=path, safe_mode=safe_mode, cache_dir=cache_dir)
             for _ in range(repeat_sequence)
             for path in path_or_paths
         )
@@ -261,6 +270,9 @@ def fill_memmap(
             # flush any 10k lines or so; improves stability
             flush = line_no % 10_000 == 0
 
+            # increment the total number of tokens written
+            total_tokens += len(token_ids)
+
             # if leftovers_to_write is not None it means that either memmap is None or it's full,
             # so we will need to create a new one later
             leftovers_to_write = memmap.write(token_ids, flush=flush) if memmap is not None else token_ids
@@ -282,6 +294,8 @@ def fill_memmap(
         # close the last memmap
         stack.pop_all().close()
 
+    return total_tokens
+
 
 def make_source_and_target(
     src: Tuple[str, ...],
@@ -343,6 +357,12 @@ def make_source_and_target(
 @click.option("--random-seed", type=int, default=3920)
 @click.option("--repeat-sequence", type=click.IntRange(min=1), default=1)
 @click.option("--paths-per-worker", type=click.IntRange(min=1), default=1)
+@click.option(
+    "--cache-dir",
+    type=str,
+    default=None,
+    help="Cache directory for the tokenizer; use system default if not specified"
+)
 @click.option(
     "--max-tokens",
     default=512 * 1024 * 1024,
@@ -368,6 +388,7 @@ def main(
     repeat_sequence: int = 1,
     paths_per_worker: int = 1,
     max_workers: int = 1,
+    cache_dir: Optional[str] = None,
 ):
     print("=== CONFIGURATION ===")
     print(f"src:              {src}")
@@ -383,6 +404,7 @@ def main(
     print(f"repeat_sequence:  {repeat_sequence}")
     print(f"paths_per_worker: {paths_per_worker}")
     print(f"max_workers:      {max_workers}")
+    print(f"cache_dir:        {cache_dir}")
     print("=====================")
 
     dtype = np.dtype(dtype_str)
@@ -401,17 +423,20 @@ def main(
         sample_rate=sample_rate,
         random_seed=random_seed,
         repeat_sequence=repeat_sequence,
+        cache_dir=cache_dir,
     )
 
+    total_tokens_written = 0
+
     if debug:
         log.info("Running in debug mode. Only one process will be used.")
         for src_path, dst_path in zip(exploded_src, exploded_dst):
-            fill_memmap_fn(path_or_paths=src_path, memmap_path=dst_path)
+            total_tokens_written += fill_memmap_fn(path_or_paths=src_path, memmap_path=dst_path)
     else:
         # Now tokenizer all documents again and populate the memmap array. We do this in parallel.
         workers_cnt = min(max_workers or os.cpu_count() or 1, len(exploded_src))
         with concurrent.futures.ProcessPoolExecutor(max_workers=workers_cnt) as executor:
-            futures: List[Future[None]] = []
+            futures: List[Future[int]] = []
             for src_path, dst_path in zip(exploded_src, exploded_dst):
                 future = executor.submit(fill_memmap_fn, path_or_paths=src_path, memmap_path=dst_path)
                 futures.append(future)
@@ -421,9 +446,10 @@ def main(
                     description="Filling memmap arrays...",
                     total=len(futures),
                 ):
-                    future.result()
+                    total_tokens_written += future.result()
 
     log.info(f"Done! File(s) written to {output}")
+    log.info(f"Total tokens written: {total_tokens_written:,}")
 
     if validate:
         log.info("Validating...")

From a11918c16dc4548657cfc5b99b360398a4a63d0b Mon Sep 17 00:00:00 2001
From: Luca Soldaini <lucas@allenai.org>
Date: Wed, 14 Jun 2023 14:30:40 -0500
Subject: [PATCH 02/11] wip

---
 .../config/olmo-train/common-crawl-head.json  | 25 +++++++++++++++--
 .../olmo-train/common-crawl-middle.json       |  2 +-
 .../config/olmo-train/common-crawl-tail.json  |  2 +-
 .../mixer/config/olmo-train/stack.json        | 24 ++++++++++++++++
 pretrain_data/mixing-log.md                   | 28 +++++++++++++++++++
 5 files changed, 77 insertions(+), 4 deletions(-)
 create mode 100644 pretrain_data/mixer/config/olmo-train/stack.json

diff --git a/pretrain_data/mixer/config/olmo-train/common-crawl-head.json b/pretrain_data/mixer/config/olmo-train/common-crawl-head.json
index 541fb3784..233e027ae 100644
--- a/pretrain_data/mixer/config/olmo-train/common-crawl-head.json
+++ b/pretrain_data/mixer/config/olmo-train/common-crawl-head.json
@@ -3,7 +3,7 @@
     {
       "name": "cc_en_head",
       "documents": [
-        "pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_head/cc_en_head-0005.json.gz"
+        "pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_head/*.json.gz"
       ],
       "attributes": [
         "decontamination",
@@ -25,9 +25,30 @@
       "filter": {
         "include": [],
         "exclude": [
+                    "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] < 0.8)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] > 0.9)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] > 0.3)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > 0.3)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] > 0.2)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] > 0.18)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] > 0.16)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] > 0.15)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] > 0.14)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] > 0.13)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] > 0.12)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]",
+
           "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]",
 
-          "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]"
+          "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]"
         ]
       },
       "span_replacement": [
diff --git a/pretrain_data/mixer/config/olmo-train/common-crawl-middle.json b/pretrain_data/mixer/config/olmo-train/common-crawl-middle.json
index 011494016..f2e0069ef 100644
--- a/pretrain_data/mixer/config/olmo-train/common-crawl-middle.json
+++ b/pretrain_data/mixer/config/olmo-train/common-crawl-middle.json
@@ -46,7 +46,7 @@
           "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]",
           "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]",
           "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]",
-          "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]"
+          "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]"
         ]
       },
       "span_replacement": [
diff --git a/pretrain_data/mixer/config/olmo-train/common-crawl-tail.json b/pretrain_data/mixer/config/olmo-train/common-crawl-tail.json
index b7aec37a7..0b38254ac 100644
--- a/pretrain_data/mixer/config/olmo-train/common-crawl-tail.json
+++ b/pretrain_data/mixer/config/olmo-train/common-crawl-tail.json
@@ -46,7 +46,7 @@
           "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]",
           "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]",
           "$@.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]",
-          "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]"
+          "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]"
         ]
       },
       "span_replacement": [
diff --git a/pretrain_data/mixer/config/olmo-train/stack.json b/pretrain_data/mixer/config/olmo-train/stack.json
new file mode 100644
index 000000000..b8cfa0719
--- /dev/null
+++ b/pretrain_data/mixer/config/olmo-train/stack.json
@@ -0,0 +1,24 @@
+{
+  "streams": [
+    {
+      "name": "stack-v2-mixer-train",
+      "documents": [
+        "pretraining-data/sources/stack-dedup/v2-mixer-train/documents/*.gz"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/olmo-mix/v1/documents/stack",
+        "max_size_in_bytes": 3894967296
+      },
+      "attributes": [],
+      "filter": {
+        "include": [],
+        "exclude": []
+      }
+    }
+  ],
+  "work_dir": {
+    "input": "/tmp/stack/mixer/input",
+    "output": "/tmp/stack/mixer/output"
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixing-log.md b/pretrain_data/mixing-log.md
index ad910ca4f..cc5e5dd44 100644
--- a/pretrain_data/mixing-log.md
+++ b/pretrain_data/mixing-log.md
@@ -187,3 +187,31 @@ python scripts/prepare_memmap_dataset.py  \
     --workers 120 \
     --cache-dir /data2/llm-preprocessed
 ```
+
+Stack:
+
+```shell
+python scripts/prepare_memmap_dataset.py  \
+    s3://ai2-llm/pretraining-data/sources/olmo-mix/v1/documents/stack \
+    --safe-mode \
+    --output s3://ai2-llm/preprocessed/olmo-mix/v1/gpt-neox-20b-pii-special/stack \
+    --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \
+    --workers 120 \
+    --cache-dir /data2/llm-preprocessed
+```
+
+
+## Calculating Size
+
+- Wiki: 6.21 GB -> 3,635,728,771 tokens (585m tokens per GB)
+- Books: 7.08 GB -> 4,755,860,202 tokens (672m tokens per GB)
+- S2: 160.01 GB -> 56,783,583,427 tokens (355m tokens per GB)
+- C4: 323.95 GB -> 174,398,315,760 tokens (538m tokens per GB)
+- Stack: 724.28 GB -> 430,067,843,952 tokens (593m tokens per GB)
+
+
+Over a single cc
+
+2.87 GB -> 1.76 GB (38.7% reduction; 61.3% of the original size remains)
+
+9.48 TB -> 5.81 TB -> 3,450,675,200,000 tokens? (3.45 T)

From c01e593f68db794ec9c6f8e0c183792bbe565e79 Mon Sep 17 00:00:00 2001
From: Luca Soldaini <lucas@allenai.org>
Date: Thu, 15 Jun 2023 11:21:21 -0500
Subject: [PATCH 03/11] commit config

---
 .../olmo-train-sample/common-crawl-head.json  |  28 +++++
 .../common-crawl-middle.json                  |  28 +++++
 .../olmo-train-sample/common-crawl-tail.json  |  28 +++++
 .../mixer/config/olmo-train-sample/stack.json |  28 +++++
 .../config/pdedup_c1_v1_c4-cleaned/rest.json  |  23 ++++
 pretrain_data/mixing-log.md                   | 106 +++++++++++++++++-
 6 files changed, 240 insertions(+), 1 deletion(-)
 create mode 100644 pretrain_data/mixer/config/olmo-train-sample/common-crawl-head.json
 create mode 100644 pretrain_data/mixer/config/olmo-train-sample/common-crawl-middle.json
 create mode 100644 pretrain_data/mixer/config/olmo-train-sample/common-crawl-tail.json
 create mode 100644 pretrain_data/mixer/config/olmo-train-sample/stack.json
 create mode 100644 pretrain_data/mixer/config/pdedup_c1_v1_c4-cleaned/rest.json

diff --git a/pretrain_data/mixer/config/olmo-train-sample/common-crawl-head.json b/pretrain_data/mixer/config/olmo-train-sample/common-crawl-head.json
new file mode 100644
index 000000000..a66b99d87
--- /dev/null
+++ b/pretrain_data/mixer/config/olmo-train-sample/common-crawl-head.json
@@ -0,0 +1,28 @@
+{
+  "streams": [
+    {
+      "name": "cc_en_head",
+      "documents": [
+        "pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_head/*.gz"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/olmo-mix/v1-sample/documents/cc_en_head",
+        "max_size_in_bytes": 3894967296
+      },
+      "attributes": [
+        "random"
+      ],
+      "filter": {
+        "include": [
+          "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.5104606781)]"
+        ],
+        "exclude": []
+      }
+    }
+  ],
+  "work_dir": {
+    "input": "/tmp/cc-head-sample/mixer/input",
+    "output": "/tmp/cc-head-sample/mixer/output"
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/olmo-train-sample/common-crawl-middle.json b/pretrain_data/mixer/config/olmo-train-sample/common-crawl-middle.json
new file mode 100644
index 000000000..d8c294435
--- /dev/null
+++ b/pretrain_data/mixer/config/olmo-train-sample/common-crawl-middle.json
@@ -0,0 +1,28 @@
+{
+  "streams": [
+    {
+      "name": "cc_en_middle",
+      "documents": [
+        "pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_middle/*.gz"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/olmo-mix/v1-sample/documents/cc_en_middle",
+        "max_size_in_bytes": 3894967296
+      },
+      "attributes": [
+        "random"
+      ],
+      "filter": {
+        "include": [
+          "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.5104606781)]"
+        ],
+        "exclude": []
+      }
+    }
+  ],
+  "work_dir": {
+    "input": "/data2/cc-middle-sample/mixer/input",
+    "output": "/data2/cc-middle-sample/mixer/output"
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/olmo-train-sample/common-crawl-tail.json b/pretrain_data/mixer/config/olmo-train-sample/common-crawl-tail.json
new file mode 100644
index 000000000..ad83b5585
--- /dev/null
+++ b/pretrain_data/mixer/config/olmo-train-sample/common-crawl-tail.json
@@ -0,0 +1,28 @@
+{
+  "streams": [
+    {
+      "name": "cc_en_tail",
+      "documents": [
+        "pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_tail/*.gz"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/olmo-mix/v1-sample/documents/cc_en_tail",
+        "max_size_in_bytes": 3894967296
+      },
+      "attributes": [
+        "random"
+      ],
+      "filter": {
+        "include": [
+          "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.5104606781)]"
+        ],
+        "exclude": []
+      }
+    }
+  ],
+  "work_dir": {
+    "input": "/tmp/cc-tail-sample/mixer/input",
+    "output": "/tmp/cc-tail-sample/mixer/output"
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/olmo-train-sample/stack.json b/pretrain_data/mixer/config/olmo-train-sample/stack.json
new file mode 100644
index 000000000..f55d2118c
--- /dev/null
+++ b/pretrain_data/mixer/config/olmo-train-sample/stack.json
@@ -0,0 +1,28 @@
+{
+  "streams": [
+    {
+      "name": "stack-v2-mixer-train",
+      "documents": [
+        "pretraining-data/sources/olmo-mix/v1/documents/stack/*.gz"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/olmo-mix/v1-sample/documents/stack",
+        "max_size_in_bytes": 3894967296
+      },
+      "attributes": [
+        "random"
+      ],
+      "filter": {
+        "include": [
+          "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.6975641061)]"
+        ],
+        "exclude": []
+      }
+    }
+  ],
+  "work_dir": {
+    "input": "/tmp/stack-sample/mixer/input",
+    "output": "/tmp/stack-sample/mixer/output"
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/pdedup_c1_v1_c4-cleaned/rest.json b/pretrain_data/mixer/config/pdedup_c1_v1_c4-cleaned/rest.json
new file mode 100644
index 000000000..2c312d5b2
--- /dev/null
+++ b/pretrain_data/mixer/config/pdedup_c1_v1_c4-cleaned/rest.json
@@ -0,0 +1,23 @@
+{
+    "documents": [
+        "pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/cc_en_tail-1005.json.gz"
+    ],
+    "work_dir": {
+        "input": "/tmp/v1-c4-cleaned/7.input",
+        "output": "/tmp/v1-c4-cleaned/7.output"
+    },
+    "dedupe": {
+        "name": "dedupe_paragraphs",
+        "paragraphs": {
+            "attribute_name": "bff_duplicate_paragraph_spans"
+        }
+    },
+    "bloom_filter": {
+        "file": "/tmp/v1-c4-cleaned/7.bloom",
+        "size_in_bytes": 0,
+        "read_only": true,
+        "estimated_doc_count": 30000000000,
+        "desired_false_positive_rate": 1e-06
+    },
+    "processes": 128
+}
diff --git a/pretrain_data/mixing-log.md b/pretrain_data/mixing-log.md
index cc5e5dd44..c3042e843 100644
--- a/pretrain_data/mixing-log.md
+++ b/pretrain_data/mixing-log.md
@@ -210,8 +210,112 @@ python scripts/prepare_memmap_dataset.py  \
 - Stack: 724.28 GB -> 430,067,843,952 tokens (593m tokens per GB)
 
 
+Sampled
+
+- Wiki: 6.21 GB -> 3,635,728,771 tokens (585m tokens per GB)
+- Books: 7.08 GB -> 4,755,860,202 tokens (672m tokens per GB)
+- S2: 160.01 GB -> 56,783,583,427 tokens (355m tokens per GB)
+- C4: 323.95 GB -> 174,398,315,760 tokens (538m tokens per GB)
+- Stack: 593.4 GB -> 430,067,843,952 tokens (593m tokens per GB)
+- Common Crawl: 2.61 TB -> 1,500,000,000,000 tokens (574m tokens per GB)
+
+
 Over a single cc
 
 2.87 GB -> 1.76 GB (61.3% reduction)
 
-9.48 TB -> 5.81 TB -> 3,450,675,200,000,000 tokens? (3.45 T)
+9.48 TB -> 4.89 TB -> 2.861T tokens
+
+
+Random tagger:
+
+```shell
+ai2_llm_filters \
+    -d 'olmo-mix/v1' \
+    -n random \
+    -t random_number_v1 \
+    -p 120 \
+    --reuse-existing $HOME/olmo_mix/meta \
+    --files-regex-pattern 'stack-v2-mixer-train' \
+    --local-read-cache $HOME/olmo_mix/cache
+```
+
+```shell
+ai2_llm_filters \
+    -d 'olmo-mix/v1' \
+    -n random \
+    -t random_number_v1 \
+    -p 120 \
+    --reuse-existing /data2/olmo_mix/meta \
+    --files-regex-pattern 'cc_en_head' \
+    --local-read-cache /data2/olmo_mix/cache
+```
+
+```shell
+ai2_llm_filters \
+    -d 'olmo-mix/v1' \
+    -n random \
+    -t random_number_v1 \
+    -p 120 \
+    --reuse-existing /tmp/olmo_mix/meta \
+    --files-regex-pattern 'cc_en_middle' \
+    --local-read-cache /tmp/olmo_mix/cache
+```
+
+```shell
+ai2_llm_filters \
+    -d 'olmo-mix/v1' \
+    -n random \
+    -t random_number_v1 \
+    -p 120 \
+    --reuse-existing /tmp/olmo_mix/meta \
+    --files-regex-pattern 'reddit-ablation-base' \
+    --local-read-cache /tmp/olmo_mix/cache
+```
+
+
+```shell
+python scripts/prepare_memmap_dataset.py  \
+    s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample/documents/stack \
+    --safe-mode \
+    --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/stack \
+    --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \
+    --workers 120 \
+    --cache-dir /tmp/llm-preprocessed
+```
+
+
+```shell
+python scripts/prepare_memmap_dataset.py  \
+    s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample/documents/cc_en_head \
+    --safe-mode \
+    --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_head \
+    --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \
+    --workers 120 \
+    --cache-dir /tmp/llm-preprocessed/cc_en_head
+```
+
+```shell
+python scripts/prepare_memmap_dataset.py  \
+    s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample/documents/cc_en_middle \
+    --safe-mode \
+    --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_middle \
+    --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \
+    --workers 120 \
+    --cache-dir /tmp/llm-preprocessed
+```
+
+```shell
+python scripts/prepare_memmap_dataset.py  \
+    s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample/documents/cc_en_tail \
+    --safe-mode \
+    --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample/gpt-neox-20b-pii-special/common-crawl/cc_en_tail \
+    --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \
+    --workers 120 \
+    --cache-dir /tmp/llm-preprocessed
+```
+
+
+```shell
+~/target/release/mixer pretrain_data/mixer/config/olmo-train/common-crawl-tail.json && ai2_llm_filters -d 'olmo-mix/v1' -n random -t random_number_v1 -p 120 --reuse-existing /tmp/olmo_mix/meta --files-regex-pattern 'cc_en_tail' --local-read-cache /tmp/olmo_mix/cache && ~/target/release/mixer pretrain_data/mixer/config/olmo-train-sample/common-crawl-tail.json
+```

From 3ad1ee47cdbb5cf058dbb8a8abbd0502032126da Mon Sep 17 00:00:00 2001
From: Luca Soldaini <lucas@allenai.org>
Date: Mon, 19 Jun 2023 21:50:16 -0700
Subject: [PATCH 04/11] new paths

---
 .../config/olmo-train/common-crawl-head.json  |  2 +-
 pretrain_data/mixing-log.md                   | 43 +++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/pretrain_data/mixer/config/olmo-train/common-crawl-head.json b/pretrain_data/mixer/config/olmo-train/common-crawl-head.json
index 233e027ae..e24986562 100644
--- a/pretrain_data/mixer/config/olmo-train/common-crawl-head.json
+++ b/pretrain_data/mixer/config/olmo-train/common-crawl-head.json
@@ -25,7 +25,7 @@
       "filter": {
         "include": [],
         "exclude": [
-                    "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]",
           "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]",
           "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]",
           "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]",
diff --git a/pretrain_data/mixing-log.md b/pretrain_data/mixing-log.md
index c3042e843..65de88323 100644
--- a/pretrain_data/mixing-log.md
+++ b/pretrain_data/mixing-log.md
@@ -319,3 +319,46 @@ python scripts/prepare_memmap_dataset.py  \
 ```shell
 ~/target/release/mixer pretrain_data/mixer/config/olmo-train/common-crawl-tail.json && ai2_llm_filters -d 'olmo-mix/v1' -n random -t random_number_v1 -p 120 --reuse-existing /tmp/olmo_mix/meta --files-regex-pattern 'cc_en_tail' --local-read-cache /tmp/olmo_mix/cache && ~/target/release/mixer pretrain_data/mixer/config/olmo-train-sample/common-crawl-tail.json
 ```
+
+## Gopher-like
+
+
+```shell
+python scripts/prepare_memmap_dataset.py  \
+    s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample-small/documents/stack \
+    --safe-mode \
+    --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample-small/gpt-neox-20b-pii-special/stack \
+    --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \
+    --workers 120 \
+    --cache-dir /tmp/llm-preprocessed
+```
+
+```shell
+python scripts/prepare_memmap_dataset.py  \
+    s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_head \
+    --safe-mode \
+    --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample-small/gpt-neox-20b-pii-special/common-crawl/cc_en_head \
+    --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \
+    --workers 120 \
+    --cache-dir /tmp/llm-preprocessed/cc_en_head
+```
+
+```shell
+python scripts/prepare_memmap_dataset.py  \
+    s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_middle \
+    --safe-mode \
+    --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample-small/gpt-neox-20b-pii-special/common-crawl/cc_en_middle \
+    --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \
+    --workers 120 \
+    --cache-dir /tmp/llm-preprocessed
+```
+
+```shell
+python scripts/prepare_memmap_dataset.py  \
+    s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_tail \
+    --safe-mode \
+    --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample-small/gpt-neox-20b-pii-special/common-crawl/cc_en_tail \
+    --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \
+    --workers 120 \
+    --cache-dir /tmp/llm-preprocessed
+```

From 77005d5446b90a8f084f88642ebc29345c89ec0c Mon Sep 17 00:00:00 2001
From: Luca Soldaini <lucas@allenai.org>
Date: Mon, 19 Jun 2023 21:50:19 -0700
Subject: [PATCH 05/11] new paths

---
 .../common-crawl-head.json                    | 83 +++++++++++++++++++
 .../common-crawl-middle.json                  | 83 +++++++++++++++++++
 .../common-crawl-tail.json                    | 83 +++++++++++++++++++
 .../common-crawl-head.json                    | 28 +++++++
 .../common-crawl-middle.json                  | 28 +++++++
 .../common-crawl-tail.json                    | 28 +++++++
 .../config/olmo-train-sample-small/stack.json | 28 +++++++
 .../config/olmo-train-sample/stack-half.json  | 28 +++++++
 8 files changed, 389 insertions(+)
 create mode 100644 pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-head.json
 create mode 100644 pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-middle.json
 create mode 100644 pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-tail.json
 create mode 100644 pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-head.json
 create mode 100644 pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-middle.json
 create mode 100644 pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-tail.json
 create mode 100644 pretrain_data/mixer/config/olmo-train-sample-small/stack.json
 create mode 100644 pretrain_data/mixer/config/olmo-train-sample/stack-half.json

diff --git a/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-head.json b/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-head.json
new file mode 100644
index 000000000..fbf7adc48
--- /dev/null
+++ b/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-head.json
@@ -0,0 +1,83 @@
+{
+  "streams": [
+    {
+      "name": "cc_en_head",
+      "documents": [
+        "pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_head/*.json.gz"
+      ],
+      "attributes": [
+        "decontamination",
+        "dedupe_paragraphs",
+        "gopher_rules",
+        "hatespeech_nsfw_cc_v3",
+        "pii_detection",
+        "random"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/olmo-mix/v1-no-removal/documents/common-crawl/cc_en_head",
+        "max_size_in_bytes": 4294967296,
+        "discard_fields": [
+          "attributes",
+          "metadata",
+          "added",
+          "created"
+        ]
+      },
+      "filter": {
+        "include": [],
+        "exclude": [
+          "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] < 0.8)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] > 0.9)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] > 0.3)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > 0.3)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] > 0.2)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] > 0.18)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] > 0.16)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] > 0.15)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] > 0.14)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] > 0.13)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] > 0.12)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]",
+          "$.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]",
+          "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]",
+          "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.5104606781)]"
+        ]
+      },
+      "span_replacement": [
+        {
+          "span": "$.attributes.bff_duplicate_paragraph_spans",
+          "min_score": 0.5,
+          "replacement": ""
+        },
+        {
+          "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS",
+          "min_score": 0.5,
+          "replacement": " |||EMAIL_ADDRESS||| "
+        },
+        {
+          "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER",
+          "min_score": 0.5,
+          "replacement": " |||PHONE_NUMBER||| "
+        },
+        {
+          "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS",
+          "min_score": 0.5,
+          "replacement": " |||IP_ADDRESS||| "
+        }
+      ]
+    }
+  ],
+  "work_dir": {
+    "input": "/tmp/olmo-mix-v1/input",
+    "output": "/tmp/olmo-mix-v1/output"
+  },
+  "processes": 128
+}
diff --git a/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-middle.json b/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-middle.json
new file mode 100644
index 000000000..4e9144c71
--- /dev/null
+++ b/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-middle.json
@@ -0,0 +1,83 @@
+{
+  "streams": [
+    {
+      "name": "cc_en_middle",
+      "documents": [
+        "pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_middle/*.json.gz"
+      ],
+      "attributes": [
+        "decontamination",
+        "dedupe_paragraphs",
+        "gopher_rules",
+        "hatespeech_nsfw_cc_v3",
+        "pii_detection",
+        "random"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/olmo-mix/v1-no-removal/documents/common-crawl/cc_en_middle",
+        "max_size_in_bytes": 4294967296,
+        "discard_fields": [
+          "attributes",
+          "metadata",
+          "added",
+          "created"
+        ]
+      },
+      "filter": {
+        "include": [],
+        "exclude": [
+          "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] < 0.8)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] > 0.9)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] > 0.3)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > 0.3)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] > 0.2)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] > 0.18)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] > 0.16)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] > 0.15)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] > 0.14)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] > 0.13)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] > 0.12)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]",
+          "$.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]",
+          "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]",
+          "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.5104606781)]"
+        ]
+      },
+      "span_replacement": [
+        {
+          "span": "$.attributes.bff_duplicate_paragraph_spans",
+          "min_score": 0.5,
+          "replacement": ""
+        },
+        {
+          "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS",
+          "min_score": 0.5,
+          "replacement": " |||EMAIL_ADDRESS||| "
+        },
+        {
+          "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER",
+          "min_score": 0.5,
+          "replacement": " |||PHONE_NUMBER||| "
+        },
+        {
+          "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS",
+          "min_score": 0.5,
+          "replacement": " |||IP_ADDRESS||| "
+        }
+      ]
+    }
+  ],
+  "work_dir": {
+    "input": "/tmp/olmo-mix-v1/input",
+    "output": "/tmp/olmo-mix-v1/output"
+  },
+  "processes": 128
+}
diff --git a/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-tail.json b/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-tail.json
new file mode 100644
index 000000000..1bb1fcbee
--- /dev/null
+++ b/pretrain_data/mixer/config/olmo-train-no-removal/common-crawl-tail.json
@@ -0,0 +1,83 @@
+{
+  "streams": [
+    {
+      "name": "cc_en_tail",
+      "documents": [
+        "pretraining-data/sources/common-crawl/v1-c4-cleaned/documents/cc_en_tail/*.json.gz"
+      ],
+      "attributes": [
+        "decontamination",
+        "dedupe_paragraphs",
+        "gopher_rules",
+        "hatespeech_nsfw_cc_v3",
+        "pii_detection",
+        "random"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/olmo-mix/v1-no-removal/documents/common-crawl/cc_en_tail",
+        "max_size_in_bytes": 4294967296,
+        "discard_fields": [
+          "attributes",
+          "metadata",
+          "added",
+          "created"
+        ]
+      },
+      "filter": {
+        "include": [],
+        "exclude": [
+          "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] < 50)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__word_count && @.gopher_rules__gopher_v1__word_count[0] && @.gopher_rules__gopher_v1__word_count[0][2] > 100000)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] < 3)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__median_word_length && @.gopher_rules__gopher_v1__median_word_length[0] && @.gopher_rules__gopher_v1__median_word_length[0][2] > 10)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__symbol_to_word_ratio && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0] && @.gopher_rules__gopher_v1__symbol_to_word_ratio[0][2] > 0.1)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0] && @.gopher_rules__gopher_v1__fraction_of_words_with_alpha_character[0][2] < 0.8)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__required_word_count && @.gopher_rules__gopher_v1__required_word_count[0] && @.gopher_rules__gopher_v1__required_word_count[0][2] < 2)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0] && @.gopher_rules__gopher_v1__fraction_of_lines_starting_with_bullet_point[0][2] > 0.9)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0] && @.gopher_rules__gopher_v1__fraction_of_lines_ending_with_ellipsis[0][2] > 0.3)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_duplicate_lines[0][2] > 0.3)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_lines[0][2] > 0.3)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_2gram[0][2] > 0.2)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_3gram[0][2] > 0.18)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_most_common_4gram[0][2] > 0.16)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_5grams[0][2] > 0.15)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_6grams[0][2] > 0.14)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_7grams[0][2] > 0.13)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_8grams[0][2] > 0.12)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_9grams[0][2] > 0.11)]",
+          "$.attributes[?(@.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0] && @.gopher_rules__gopher_v1__fraction_of_characters_in_duplicate_10grams[0][2] > 0.10)]",
+          "$.attributes[?(@.bff_duplicate_paragraph_spans_decontamination && @.bff_duplicate_paragraph_spans_decontamination[0] && @.bff_duplicate_paragraph_spans_decontamination[0][2] >= 1.0)]",
+          "$.attributes[?(@.pii_detection__pii_regex_with_counts_fast_v2__doc_count && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0] && @.pii_detection__pii_regex_with_counts_fast_v2__doc_count[0][2] > 5)]",
+          "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.5104606781)]"
+        ]
+      },
+      "span_replacement": [
+        {
+          "span": "$.attributes.bff_duplicate_paragraph_spans",
+          "min_score": 0.5,
+          "replacement": ""
+        },
+        {
+          "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__EMAIL_ADDRESS",
+          "min_score": 0.5,
+          "replacement": " |||EMAIL_ADDRESS||| "
+        },
+        {
+          "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__PHONE_NUMBER",
+          "min_score": 0.5,
+          "replacement": " |||PHONE_NUMBER||| "
+        },
+        {
+          "span": "$.attributes.pii_detection__pii_regex_with_counts_fast_v2__IP_ADDRESS",
+          "min_score": 0.5,
+          "replacement": " |||IP_ADDRESS||| "
+        }
+      ]
+    }
+  ],
+  "work_dir": {
+    "input": "/tmp/olmo-mix-v1/input",
+    "output": "/tmp/olmo-mix-v1/output"
+  },
+  "processes": 128
+}
diff --git a/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-head.json b/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-head.json
new file mode 100644
index 000000000..80e555026
--- /dev/null
+++ b/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-head.json
@@ -0,0 +1,28 @@
+{
+  "streams": [
+    {
+      "name": "cc_en_head",
+      "documents": [
+        "pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_head/*.gz"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_head",
+        "max_size_in_bytes": 3894967296
+      },
+      "attributes": [
+        "random"
+      ],
+      "filter": {
+        "include": [
+          "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.4334)]"
+        ],
+        "exclude": []
+      }
+    }
+  ],
+  "work_dir": {
+    "input": "/tmp/cc-head-sample/mixer/input",
+    "output": "/tmp/cc-head-sample/mixer/output"
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-middle.json b/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-middle.json
new file mode 100644
index 000000000..b1b146686
--- /dev/null
+++ b/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-middle.json
@@ -0,0 +1,28 @@
+{
+  "streams": [
+    {
+      "name": "cc_en_middle",
+      "documents": [
+        "pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_middle/*.gz"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_middle",
+        "max_size_in_bytes": 3894967296
+      },
+      "attributes": [
+        "random"
+      ],
+      "filter": {
+        "include": [
+          "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.4334)]"
+        ],
+        "exclude": []
+      }
+    }
+  ],
+  "work_dir": {
+    "input": "/tmp/cc-middle-sample/mixer/input",
+    "output": "/tmp/cc-middle-sample/mixer/output"
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-tail.json b/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-tail.json
new file mode 100644
index 000000000..928ffa820
--- /dev/null
+++ b/pretrain_data/mixer/config/olmo-train-sample-small/common-crawl-tail.json
@@ -0,0 +1,28 @@
+{
+  "streams": [
+    {
+      "name": "cc_en_tail",
+      "documents": [
+        "pretraining-data/sources/olmo-mix/v1/documents/common-crawl/cc_en_tail/*.gz"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_tail",
+        "max_size_in_bytes": 3894967296
+      },
+      "attributes": [
+        "random"
+      ],
+      "filter": {
+        "include": [
+          "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.4334)]"
+        ],
+        "exclude": []
+      }
+    }
+  ],
+  "work_dir": {
+    "input": "/tmp/cc-tail-sample/mixer/input",
+    "output": "/tmp/cc-tail-sample/mixer/output"
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/olmo-train-sample-small/stack.json b/pretrain_data/mixer/config/olmo-train-sample-small/stack.json
new file mode 100644
index 000000000..de182c09b
--- /dev/null
+++ b/pretrain_data/mixer/config/olmo-train-sample-small/stack.json
@@ -0,0 +1,28 @@
+{
+  "streams": [
+    {
+      "name": "stack-v2-mixer-train",
+      "documents": [
+        "pretraining-data/sources/olmo-mix/v1/documents/stack/*.gz"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/olmo-mix/v1-sample-small/documents/stack",
+        "max_size_in_bytes": 3894967296
+      },
+      "attributes": [
+        "random"
+      ],
+      "filter": {
+        "include": [
+          "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.1429)]"
+        ],
+        "exclude": []
+      }
+    }
+  ],
+  "work_dir": {
+    "input": "/tmp/stack-sample/mixer/input",
+    "output": "/tmp/stack-sample/mixer/output"
+  },
+  "processes": 120
+}
diff --git a/pretrain_data/mixer/config/olmo-train-sample/stack-half.json b/pretrain_data/mixer/config/olmo-train-sample/stack-half.json
new file mode 100644
index 000000000..b78d55d82
--- /dev/null
+++ b/pretrain_data/mixer/config/olmo-train-sample/stack-half.json
@@ -0,0 +1,28 @@
+{
+  "streams": [
+    {
+      "name": "stack-v2-mixer-train",
+      "documents": [
+        "pretraining-data/sources/olmo-mix/v1/documents/stack/*.gz"
+      ],
+      "output": {
+        "path": "pretraining-data/sources/olmo-mix/v1-sample-stack-half/documents/stack",
+        "max_size_in_bytes": 3894967296
+      },
+      "attributes": [
+        "random"
+      ],
+      "filter": {
+        "include": [
+          "$.attributes[?(@.random__random_number_v1__random[0][2] < 0.348782053)]"
+        ],
+        "exclude": []
+      }
+    }
+  ],
+  "work_dir": {
+    "input": "/tmp/stack-sample-2x/mixer/input",
+    "output": "/tmp/stack-sample-2x/mixer/output"
+  },
+  "processes": 120
+}

From 1dc41f872afc673ce12bae298078c3c6ac6d606f Mon Sep 17 00:00:00 2001
From: Luca Soldaini <lucas@allenai.org>
Date: Wed, 28 Jun 2023 11:41:06 -0700
Subject: [PATCH 06/11] mixing

---
 pretrain_data/mixing-log.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pretrain_data/mixing-log.md b/pretrain_data/mixing-log.md
index 65de88323..1fb568720 100644
--- a/pretrain_data/mixing-log.md
+++ b/pretrain_data/mixing-log.md
@@ -335,9 +335,9 @@ python scripts/prepare_memmap_dataset.py  \
 
 ```shell
 python scripts/prepare_memmap_dataset.py  \
-    s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_head \
+    s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-no-removal/documents/common-crawl/cc_en_head \
     --safe-mode \
-    --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample-small/gpt-neox-20b-pii-special/common-crawl/cc_en_head \
+    --output s3://ai2-llm/preprocessed/olmo-mix/v1-no-removal/gpt-neox-20b-pii-special/common-crawl/cc_en_head \
     --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \
     --workers 120 \
     --cache-dir /tmp/llm-preprocessed/cc_en_head
@@ -345,9 +345,9 @@ python scripts/prepare_memmap_dataset.py  \
 
 ```shell
 python scripts/prepare_memmap_dataset.py  \
-    s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_middle \
+    s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-no-removal/documents/common-crawl/cc_en_middle \
     --safe-mode \
-    --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample-small/gpt-neox-20b-pii-special/common-crawl/cc_en_middle \
+    --output s3://ai2-llm/preprocessed/olmo-mix/v1-no-removal/gpt-neox-20b-pii-special/common-crawl/cc_en_middle \
     --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \
     --workers 120 \
     --cache-dir /tmp/llm-preprocessed
@@ -355,9 +355,9 @@ python scripts/prepare_memmap_dataset.py  \
 
 ```shell
 python scripts/prepare_memmap_dataset.py  \
-    s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-sample-small/documents/cc_en_tail \
+    s3://ai2-llm/pretraining-data/sources/olmo-mix/v1-no-removal/documents/common-crawl/cc_en_tail \
     --safe-mode \
-    --output s3://ai2-llm/preprocessed/olmo-mix/v1-sample-small/gpt-neox-20b-pii-special/common-crawl/cc_en_tail \
+    --output s3://ai2-llm/preprocessed/olmo-mix/v1-no-removal/gpt-neox-20b-pii-special/common-crawl/cc_en_tail \
     --tokenizer "allenai/eleuther-ai-gpt-neox-20b-pii-special" \
     --workers 120 \
     --cache-dir /tmp/llm-preprocessed

From 9965b4c28c1e3c1e34f3e5a6988a3621d8806343 Mon Sep 17 00:00:00 2001
From: Luca Soldaini <lucas@allenai.org>
Date: Fri, 30 Jun 2023 20:38:28 -0700
Subject: [PATCH 07/11] added field of study!

---
 pretrain_data/s2/v3-fos/README.md             |  22 +++
 .../s2/v3-fos/process_corpus/s2ag.sql         | 134 +++++++++++++++++
 .../s2/v3-fos/process_corpus/s2orc.sql        | 142 ++++++++++++++++++
 3 files changed, 298 insertions(+)
 create mode 100644 pretrain_data/s2/v3-fos/README.md
 create mode 100644 pretrain_data/s2/v3-fos/process_corpus/s2ag.sql
 create mode 100644 pretrain_data/s2/v3-fos/process_corpus/s2orc.sql

diff --git a/pretrain_data/s2/v3-fos/README.md b/pretrain_data/s2/v3-fos/README.md
new file mode 100644
index 000000000..07723e2dc
--- /dev/null
+++ b/pretrain_data/s2/v3-fos/README.md
@@ -0,0 +1,22 @@
+# S2 Corpus v3
+
+> *Author*: Luca Soldaini [@soldni](https://github.com/soldni)
+
+
+Version 3 of the S2ORC corpus improves over version 2 by removing abstracts from sources
+that are not high-quality. For example, we remove abstracts that originated exclusively
+from the Microsoft Academic Graph (MAG), as they tend to be of lower quality.
+
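+We identified the following sources to be of lower quality: `Unpaywall`, `MergedPDFExtraction`, `ScienceParseMerged`, `Anansi`, `ScienceParsePlus`, `Adhoc`, `ScienceParse`, `Crawler`, and `MAG`.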
+
+If an abstract is exclusively from one of these sources, we remove it from the corpus.
+
+
+## Dataset Statistics
+
+| Dataset | Split | # Documents | # Words        |
+|:-------:|:-----:|------------:|---------------:|
+| s2ag    | train | 30,569,017  |  5,920,099,207 |
+| s2ag    | valid |    109,709  |     24,029,459 |
+| s2orc   | train |  8,242,162  | 36,088,195,908 |
+| s2orc   | valid |     51,323  |    255,139,074 |
diff --git a/pretrain_data/s2/v3-fos/process_corpus/s2ag.sql b/pretrain_data/s2/v3-fos/process_corpus/s2ag.sql
new file mode 100644
index 000000000..d269608df
--- /dev/null
+++ b/pretrain_data/s2/v3-fos/process_corpus/s2ag.sql
@@ -0,0 +1,134 @@
+UNLOAD (
+    WITH filtered_corpus AS (
+        SELECT
+            id,
+            source,
+            added,
+            created,
+            metadata,
+            cast(id AS INT) as corpusid,  -- numeric id, used as the join key against espresso.pq_paper
+            (metadata.title || CHR(10) || CHR(10) || metadata.abstract) AS text,
+            IF(  -- train: year before 2022, or 2022 with "created" before 2022-12-01; valid: everything else
+                metadata.year < 2022
+                OR (
+                    metadata.year = 2022 AND
+                    date(from_iso8601_timestamp(created)) < date('2022-12-01')
+                ),
+                'train',
+                'valid'
+            ) AS split
+        FROM (
+            SELECT
+                *,
+                ARRAY_MAX(  -- length of the longest run of whitespace-separated single letters (0 if none)
+                    TRANSFORM (
+                        regexp_extract_all(metadata.abstract, '\b([A-Za-z]\s)([a-z]\s)*[A-Za-z]\b'),
+                        x -> length(x)
+                    ) || 0
+                ) AS max_single_letter_sequence,
+                -- Keep only the sources that are NOT one of the listed (lower-quality) names.
+                -- The group makes ^ and $ anchor every alternative, not just the first and last.
+                FILTER(
+                    metadata.sources,
+                    x -> NOT REGEXP_LIKE(x,
+                        '^(Unpaywall|MergedPDFExtraction|ScienceParseMerged|Anansi|ScienceParsePlus|Adhoc|ScienceParse|Crawler|MAG)$')
+                ) AS filtered_sources
+            FROM "temp_lucas"."llm_s2ag_v0"
+            WHERE
+                (metadata.title_language = 'en' OR metadata.title_perplexity > -20)
+                AND metadata.abstract_language = 'en'
+                AND metadata.abstract_perplexity > -20
+                AND metadata.title_count >= 3
+                AND metadata.abstract_count >= 50
+                AND metadata.abstract_count <= 1000
+                AND metadata.year >= 1970
+                AND (  -- first top_frequencies token must be a 2+ letter word, or 'a' followed by such a word
+                    REGEXP_LIKE(
+                        metadata.top_frequencies[1].token,
+                        '^[A-Za-z][A-Za-z]+$'
+                    )
+                    OR (
+                        metadata.top_frequencies[1].token = 'a'
+                        AND REGEXP_LIKE(
+                            metadata.top_frequencies[2].token,
+                            '^[A-Za-z][A-Za-z]+$'
+                        )
+                    )
+                )
+        )
+        WHERE (
+            (   -- at least one source outside the list: tolerate spaced-letter runs shorter than 4 chars
+                CARDINALITY(filtered_sources) > 0 AND
+                max_single_letter_sequence < 4
+            ) OR (   -- only listed sources: keep the abstract only if it has no spaced-letter run at all
+                CARDINALITY(filtered_sources) = 0 AND
+                max_single_letter_sequence = 0
+            )
+        )
+    ),
+    filtered_corpus_with_fos AS (  -- filtered_corpus rows with fields of study from espresso.pq_paper attached
+        SELECT
+            cr.id,
+            cr.source,
+            cr.added,
+            cr.created,
+            cr.text,
+            cr.split,
+            CAST(  -- rebuild metadata as a named ROW with the two field-of-study arrays appended
+                ROW(  -- values, in the same order as the field names of the target ROW type below
+                    cr.metadata.year,
+                    cr.metadata.title,
+                    cr.metadata.abstract,
+                    cr.metadata.sha1,
+                    cr.metadata.sources,
+                    cr.metadata.title_language,
+                    cr.metadata.abstract_language,
+                    cr.metadata.title_perplexity,
+                    cr.metadata.abstract_perplexity,
+                    cr.metadata.title_count,
+                    cr.metadata.abstract_count,
+                    cr.metadata.top_frequencies,
+                    COALESCE(pq.s2FieldsOfStudy, ARRAY[]),  -- NULL becomes an empty array
+                    COALESCE(pq.fieldsOfStudy, ARRAY[])
+                )
+                AS
+                ROW(
+                    year BIGINT,
+                    title VARCHAR,
+                    abstract VARCHAR,
+                    sha1 VARCHAR,
+                    sources ARRAY<VARCHAR>,
+                    title_language VARCHAR,
+                    abstract_language VARCHAR,
+                    title_perplexity DOUBLE,
+                    abstract_perplexity DOUBLE,
+                    title_count BIGINT,
+                    abstract_count BIGINT,
+                    top_frequencies ARRAY<ROW(token VARCHAR, count BIGINT)>,
+                    s2FieldsOfStudy ARRAY<VARCHAR>,
+                    extFieldsOfStudy ARRAY<VARCHAR>  -- receives pq.fieldsOfStudy (fields are matched by position)
+                )
+            ) AS metadata
+        from espresso.pq_paper  as pq
+        INNER JOIN filtered_corpus as cr  -- inner join: corpus rows without a matching pq_paper row are dropped
+            ON pq.corpusid = cr.corpusid
+    )
+    SELECT  -- one output row per id: ARRAY_AGG(...)[1] keeps a single value of each column per group
+        id,
+        ARRAY_AGG(source)[1] AS source,
+        'v3-fos' AS version,  -- constant version tag written to every row
+        ARRAY_AGG(added)[1] AS added,
+        ARRAY_AGG(created)[1] AS created,
+        ARRAY_AGG(text)[1] AS text,
+        ARRAY_AGG(metadata)[1] AS metadata,
+        ARRAY_AGG(split)[1] AS split,
+        CAST(id AS INT) % 10 AS part_id  -- id modulo 10; with split, the output partition key (see partitioned_by)
+    FROM filtered_corpus_with_fos
+    GROUP BY id
+)
+TO 's3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2ag'
+WITH (
+    format='JSON',
+    compression='GZIP',
+    partitioned_by = ARRAY['split', 'part_id']
+)
diff --git a/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql b/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql
new file mode 100644
index 000000000..bda0e817d
--- /dev/null
+++ b/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql
@@ -0,0 +1,142 @@
+UNLOAD (
+    WITH s2orc_stats AS (
+        SELECT
+            id,
+            source,
+            added,
+            created,
+            metadata,
+            FILTER(  -- paragraphs whose perplexity is at least -20
+                metadata.paragraphs,
+                x -> x.perplexity >= -20
+            ) as valid_paragraphs,
+            (  -- first top_frequencies token is a word, and its share of all tokens is below the threshold
+                REGEXP_LIKE(
+                    metadata.top_frequencies[1].token,
+                    '^[A-Za-z][a-z]+$'
+                ) AND (
+                    (
+                        metadata.count > 500 AND
+                        (   -- cast before dividing: integer / integer truncates, turning the share into 0
+                            CAST(metadata.top_frequencies[1].count AS DOUBLE) / metadata.count
+                        ) <= 0.075
+                    ) OR (
+                        metadata.count <= 500 AND
+                        (
+                            CAST(metadata.top_frequencies[1].count AS DOUBLE) / metadata.count
+                        ) <= 0.3
+                    )
+                )
+            ) AS valid_top_word,
+            ARRAY_SORT(  -- language with the most paragraphs: count per language, sort descending, take the first
+                TRANSFORM(
+                    MAP_ENTRIES(
+                        TRANSFORM_VALUES(
+                            -- group the entries into a map: language -> [1, 1, ...]
+                            MULTIMAP_FROM_ENTRIES(
+                                -- one (language, 1) entry per paragraph
+                                TRANSFORM(
+                                    -- extract rows to count
+                                    metadata.paragraphs,
+                                    x -> ROW(x.language, 1)
+                                )
+                            ),
+                            -- sum the ones to get the paragraph count of each language
+                            (k, v) -> REDUCE(v, 0, (s, x) -> s + x, s -> s)
+                        )
+                    ),
+                    x -> CAST(x AS ROW(lang varchar, cnt int))
+                ),
+                (x, y) -> IF(x.cnt < y.cnt, 1, IF(x.cnt = y.cnt, 0, -1))
+            )[1].lang AS language
+        FROM "temp_lucas"."llm_s2orc_v0"
+    ),
+    filtered_corpus AS (  -- rows of s2orc_stats that pass the filters below, plus corpusid, text and split
+        SELECT
+            id,
+            source,
+            added,
+            created,
+            metadata,
+            cast(id AS INT) as corpusid,  -- numeric id, used as the join key against espresso.pq_paper
+            (  -- title, blank line, abstract, blank line, then the valid paragraphs one per line
+                metadata.title || CHR(10) || CHR(10) ||
+                metadata.abstract || CHR(10) || CHR(10) ||
+                ARRAY_JOIN(TRANSFORM(valid_paragraphs, x -> x.text), CHR(10))
+            ) as text,
+            IF(  -- train: year before 2022, or 2022 with "created" before 2022-12-01; valid: everything else
+                metadata.year < 2022
+                OR (
+                    metadata.year = 2022 AND
+                    date(from_iso8601_timestamp(created)) < date('2022-12-01')
+                ),
+                'train',
+                'valid'
+            ) AS split
+        FROM s2orc_stats
+        WHERE
+            language = 'en'
+            AND metadata.count < 50000
+            AND metadata.count > 500  -- NOTE(review): makes the "count <= 500" branch of valid_top_word unreachable here
+            AND valid_top_word
+            AND cardinality(valid_paragraphs) >= 5
+            AND metadata.title IS NOT NULL
+            AND metadata.abstract is not NULL
+            AND metadata.year >= 1970
+    ),
+    filtered_corpus_with_fos AS (
+        SELECT
+            cr.id,
+            cr.source,
+            cr.added,
+            cr.created,
+            cr.text,
+            cr.split,
+            CAST(  -- rebuild metadata as a named ROW with the two field-of-study arrays appended
+                ROW(  -- values, in the same order as the field names of the target ROW type below
+                    cr.metadata.year,
+                    cr.metadata.title,
+                    cr.metadata.abstract,
+                    cr.metadata.sha1,
+                    cr.metadata.paragraphs,
+                    cr.metadata.count,
+                    cr.metadata.top_frequencies,
+                    COALESCE(pq.s2FieldsOfStudy, ARRAY[]),
+                    COALESCE(pq.fieldsOfStudy, ARRAY[])
+                )
+                AS
+                ROW(
+                    year BIGINT,
+                    title VARCHAR,
+                    abstract VARCHAR,
+                    sha1 VARCHAR,
+                    paragraphs ARRAY<ROW(language VARCHAR, perplexity DOUBLE, text VARCHAR)>,
+                    count BIGINT,
+                    top_frequencies ARRAY<ROW(token VARCHAR, count BIGINT)>,
+                    s2FieldsOfStudy ARRAY<VARCHAR>,
+                    extFieldsOfStudy ARRAY<VARCHAR>
+                )
+            ) AS metadata
+        from espresso.pq_paper  as pq
+        INNER JOIN filtered_corpus as cr
+            ON pq.corpusid = cr.corpusid
+    )
+    SELECT  -- one output row per id: ARRAY_AGG(...)[1] keeps a single value of each column per group
+        id,
+        ARRAY_AGG(source)[1] AS source,
+        'v3-fos' AS version,  -- constant version tag written to every row
+        ARRAY_AGG(added)[1] AS added,
+        ARRAY_AGG(created)[1] AS created,
+        ARRAY_AGG(text)[1] AS text,
+        ARRAY_AGG(metadata)[1] AS metadata,
+        ARRAY_AGG(split)[1] AS split,
+        CAST(id AS INT) % 10 AS part_id
+    FROM filtered_corpus
+    GROUP BY id
+)
+TO 's3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc'
+WITH (
+    format='JSON',
+    compression='GZIP',
+    partitioned_by = ARRAY['split', 'part_id']
+)

From 17cc37ab3d0c99c6eedf64715bf46d74b968094b Mon Sep 17 00:00:00 2001
From: Luca Soldaini <lucas@allenai.org>
Date: Wed, 5 Jul 2023 12:00:31 -0700
Subject: [PATCH 08/11] fixed sql query

---
 pretrain_data/s2/v3-fos/process_corpus/s2orc.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql b/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql
index bda0e817d..b59ae9af1 100644
--- a/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql
+++ b/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql
@@ -131,7 +131,7 @@ UNLOAD (
         ARRAY_AGG(metadata)[1] AS metadata,
         ARRAY_AGG(split)[1] AS split,
         CAST(id AS INT) % 10 AS part_id
-    FROM filtered_corpus
+    FROM filtered_corpus_with_fos
     GROUP BY id
 )
 TO 's3://ai2-llm/pretraining-data/sources/s2/v3-fos/documents/dataset=s2orc'

From fec0bf0139ce97ab1e308640d2c2372f010f9e7e Mon Sep 17 00:00:00 2001
From: Luca Soldaini <lucas@allenai.org>
Date: Fri, 7 Jul 2023 18:46:31 -0700
Subject: [PATCH 09/11] downloading

---
 pretrain_data/stackexchange/README.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 pretrain_data/stackexchange/README.md

diff --git a/pretrain_data/stackexchange/README.md b/pretrain_data/stackexchange/README.md
new file mode 100644
index 000000000..e69de29bb

From 1f9c31c15afd831ef9fd9abe1838487e1adcd3ed Mon Sep 17 00:00:00 2001
From: Luca Soldaini <lucas@allenai.org>
Date: Fri, 7 Jul 2023 18:46:44 -0700
Subject: [PATCH 10/11] downloading

---
 .../s2/v3-fos/process_corpus/s2orc.sql        |  19 +-
 pretrain_data/stackexchange/download.sh       |  46 +++
 pretrain_data/stackexchange/names.txt         | 367 ++++++++++++++++++
 3 files changed, 427 insertions(+), 5 deletions(-)
 create mode 100644 pretrain_data/stackexchange/download.sh
 create mode 100644 pretrain_data/stackexchange/names.txt

diff --git a/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql b/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql
index b59ae9af1..00d306355 100644
--- a/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql
+++ b/pretrain_data/s2/v3-fos/process_corpus/s2orc.sql
@@ -84,6 +84,15 @@ UNLOAD (
             AND metadata.abstract is not NULL
             AND metadata.year >= 1970
     ),
+    filtered_espresso AS (
+        SELECT
+            pq.corpusid,
+            COALESCE(pq.s2FieldsOfStudy, ARRAY[]) as s2FieldsOfStudy,
+            COALESCE(pq.fieldsOfStudy, ARRAY[]) as fieldsOfStudy
+        from espresso.pq_paper as pq
+        INNER JOIN filtered_corpus as cr
+            ON pq.corpusid = cr.corpusid
+    ),
     filtered_corpus_with_fos AS (
         SELECT
             cr.id,
@@ -98,11 +107,11 @@ UNLOAD (
                     cr.metadata.title,
                     cr.metadata.abstract,
                     cr.metadata.sha1,
-                    cr.metadata.paragraphs,
+                    -- cr.metadata.paragraphs,
                     cr.metadata.count,
                     cr.metadata.top_frequencies,
-                    COALESCE(pq.s2FieldsOfStudy, ARRAY[]),
-                    COALESCE(pq.fieldsOfStudy, ARRAY[])
+                    pq.s2FieldsOfStudy,
+                    pq.fieldsOfStudy
                 )
                 AS
                 ROW(
@@ -110,14 +119,14 @@ UNLOAD (
                     title VARCHAR,
                     abstract VARCHAR,
                     sha1 VARCHAR,
-                    paragraphs ARRAY<ROW(language VARCHAR, perplexity DOUBLE, text VARCHAR)>,
+                    -- paragraphs ARRAY<ROW(language VARCHAR, perplexity DOUBLE, text VARCHAR)>,
                     count BIGINT,
                     top_frequencies ARRAY<ROW(token VARCHAR, count BIGINT)>,
                     s2FieldsOfStudy ARRAY<VARCHAR>,
                     extFieldsOfStudy ARRAY<VARCHAR>
                 )
             ) AS metadata
-        from espresso.pq_paper  as pq
+        from filtered_espresso as pq
         INNER JOIN filtered_corpus as cr
             ON pq.corpusid = cr.corpusid
     )
diff --git a/pretrain_data/stackexchange/download.sh b/pretrain_data/stackexchange/download.sh
new file mode 100644
index 000000000..fe25ad5a8
--- /dev/null
+++ b/pretrain_data/stackexchange/download.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+
+# Author:   Luca Soldaini
+# Email:    luca@soldaini.net
+
+# Resolve the directory this script lives in, following symlinks.
+SOURCE="${BASH_SOURCE[0]}"
+while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
+  SCRIPT_DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )"
+  SOURCE="$(readlink "$SOURCE")"
+  # if $SOURCE was a relative symlink, we need to resolve it
+  # relative to the path where the symlink file was located
+  [[ $SOURCE != /* ]] && SOURCE="$SCRIPT_DIR/$SOURCE"
+done
+SCRIPT_DIR="$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )"
+
+names_file="${SCRIPT_DIR}/names.txt"  # one archive file name per line, read by the xargs call below
+num_processes=8  # number of process_file invocations xargs runs in parallel
+
+process_file() {
+    # Mirror one archive.org Stack Exchange dump (file name in $1) to S3.
+    local name="$1"
+    local s3_prefix="s3://ai2-llm/pretraining-data/sources/stackexchange/raw/"
+    local url_prefix="https://archive.org/download/stackexchange/"
+    # Check if file exists in s3 bucket
+    if aws s3 ls "${s3_prefix}${name}" > /dev/null 2>&1; then
+        echo "File ${name} exists in s3 bucket."
+    else
+        echo "File ${name} does not exist in s3 bucket. Downloading and uploading..."
+        # Rely on wget's exit status as well as the file's existence: a failed
+        # transfer can leave a partial file behind that must not be uploaded.
+        if wget -P "/tmp" "${url_prefix}${name}" && [ -f "/tmp/${name}" ]; then
+            echo "Successfully downloaded ${name}. Uploading to s3..."
+            aws s3 cp "/tmp/${name}" "${s3_prefix}${name}"
+        else
+            echo "Failed to download ${name}."
+        fi
+        # Always clean up /tmp, including partial files left by a failed download.
+        rm -f "/tmp/${name}"
+    fi
+}
+
+export -f process_file
+
+# Run process_file for every line of names.txt in parallel; each name is passed as "$1", not spliced into the command string.
+xargs -I {} -P "$num_processes" bash -c 'process_file "$1"' _ {} < "$names_file"
diff --git a/pretrain_data/stackexchange/names.txt b/pretrain_data/stackexchange/names.txt
new file mode 100644
index 000000000..82e96b900
--- /dev/null
+++ b/pretrain_data/stackexchange/names.txt
@@ -0,0 +1,367 @@
+3dprinting.meta.stackexchange.com.7z
+3dprinting.stackexchange.com.7z
+academia.meta.stackexchange.com.7z
+academia.stackexchange.com.7z
+ai.meta.stackexchange.com.7z
+ai.stackexchange.com.7z
+android.meta.stackexchange.com.7z
+android.stackexchange.com.7z
+anime.meta.stackexchange.com.7z
+anime.stackexchange.com.7z
+apple.meta.stackexchange.com.7z
+apple.stackexchange.com.7z
+arduino.meta.stackexchange.com.7z
+arduino.stackexchange.com.7z
+askubuntu.com.7z
+astronomy.meta.stackexchange.com.7z
+astronomy.stackexchange.com.7z
+aviation.meta.stackexchange.com.7z
+aviation.stackexchange.com.7z
+avp.meta.stackexchange.com.7z
+avp.stackexchange.com.7z
+beer.meta.stackexchange.com.7z
+beer.stackexchange.com.7z
+bicycles.meta.stackexchange.com.7z
+bicycles.stackexchange.com.7z
+bioacoustics.meta.stackexchange.com.7z
+bioacoustics.stackexchange.com.7z
+bioinformatics.meta.stackexchange.com.7z
+bioinformatics.stackexchange.com.7z
+biology.meta.stackexchange.com.7z
+biology.stackexchange.com.7z
+bitcoin.meta.stackexchange.com.7z
+bitcoin.stackexchange.com.7z
+blender.meta.stackexchange.com.7z
+blender.stackexchange.com.7z
+boardgames.meta.stackexchange.com.7z
+boardgames.stackexchange.com.7z
+bricks.meta.stackexchange.com.7z
+bricks.stackexchange.com.7z
+buddhism.meta.stackexchange.com.7z
+buddhism.stackexchange.com.7z
+cardano.meta.stackexchange.com.7z
+cardano.stackexchange.com.7z
+chemistry.meta.stackexchange.com.7z
+chemistry.stackexchange.com.7z
+chess.meta.stackexchange.com.7z
+chess.stackexchange.com.7z
+chinese.meta.stackexchange.com.7z
+chinese.stackexchange.com.7z
+christianity.meta.stackexchange.com.7z
+christianity.stackexchange.com.7z
+civicrm.meta.stackexchange.com.7z
+civicrm.stackexchange.com.7z
+codegolf.meta.stackexchange.com.7z
+codegolf.stackexchange.com.7z
+codereview.meta.stackexchange.com.7z
+codereview.stackexchange.com.7z
+coffee.meta.stackexchange.com.7z
+coffee.stackexchange.com.7z
+cogsci.meta.stackexchange.com.7z
+cogsci.stackexchange.com.7z
+computergraphics.meta.stackexchange.com.7z
+computergraphics.stackexchange.com.7z
+conlang.meta.stackexchange.com.7z
+conlang.stackexchange.com.7z
+cooking.meta.stackexchange.com.7z
+cooking.stackexchange.com.7z
+craftcms.meta.stackexchange.com.7z
+craftcms.stackexchange.com.7z
+crafts.meta.stackexchange.com.7z
+crafts.stackexchange.com.7z
+crypto.meta.stackexchange.com.7z
+crypto.stackexchange.com.7z
+cs.meta.stackexchange.com.7z
+cs.stackexchange.com.7z
+cseducators.meta.stackexchange.com.7z
+cseducators.stackexchange.com.7z
+cstheory.meta.stackexchange.com.7z
+cstheory.stackexchange.com.7z
+datascience.meta.stackexchange.com.7z
+datascience.stackexchange.com.7z
+dba.meta.stackexchange.com.7z
+dba.stackexchange.com.7z
+devops.meta.stackexchange.com.7z
+devops.stackexchange.com.7z
+diy.meta.stackexchange.com.7z
+diy.stackexchange.com.7z
+drones.meta.stackexchange.com.7z
+drones.stackexchange.com.7z
+drupal.meta.stackexchange.com.7z
+drupal.stackexchange.com.7z
+dsp.meta.stackexchange.com.7z
+dsp.stackexchange.com.7z
+earthscience.meta.stackexchange.com.7z
+earthscience.stackexchange.com.7z
+ebooks.meta.stackexchange.com.7z
+ebooks.stackexchange.com.7z
+economics.meta.stackexchange.com.7z
+economics.stackexchange.com.7z
+electronics.meta.stackexchange.com.7z
+electronics.stackexchange.com.7z
+elementaryos.meta.stackexchange.com.7z
+elementaryos.stackexchange.com.7z
+ell.meta.stackexchange.com.7z
+ell.stackexchange.com.7z
+emacs.meta.stackexchange.com.7z
+emacs.stackexchange.com.7z
+engineering.meta.stackexchange.com.7z
+engineering.stackexchange.com.7z
+english.meta.stackexchange.com.7z
+english.stackexchange.com.7z
+eosio.meta.stackexchange.com.7z
+eosio.stackexchange.com.7z
+es.meta.stackoverflow.com.7z
+es.stackoverflow.com.7z
+esperanto.meta.stackexchange.com.7z
+esperanto.stackexchange.com.7z
+ethereum.meta.stackexchange.com.7z
+ethereum.stackexchange.com.7z
+expatriates.meta.stackexchange.com.7z
+expatriates.stackexchange.com.7z
+expressionengine.meta.stackexchange.com.7z
+expressionengine.stackexchange.com.7z
+fitness.meta.stackexchange.com.7z
+fitness.stackexchange.com.7z
+freelancing.meta.stackexchange.com.7z
+freelancing.stackexchange.com.7z
+french.meta.stackexchange.com.7z
+french.stackexchange.com.7z
+gamedev.meta.stackexchange.com.7z
+gamedev.stackexchange.com.7z
+gaming.meta.stackexchange.com.7z
+gaming.stackexchange.com.7z
+gardening.meta.stackexchange.com.7z
+gardening.stackexchange.com.7z
+genealogy.meta.stackexchange.com.7z
+genealogy.stackexchange.com.7z
+german.meta.stackexchange.com.7z
+german.stackexchange.com.7z
+gis.meta.stackexchange.com.7z
+gis.stackexchange.com.7z
+graphicdesign.meta.stackexchange.com.7z
+graphicdesign.stackexchange.com.7z
+ham.meta.stackexchange.com.7z
+ham.stackexchange.com.7z
+hardwarerecs.meta.stackexchange.com.7z
+hardwarerecs.stackexchange.com.7z
+health.meta.stackexchange.com.7z
+health.stackexchange.com.7z
+hermeneutics.meta.stackexchange.com.7z
+hermeneutics.stackexchange.com.7z
+hinduism.meta.stackexchange.com.7z
+hinduism.stackexchange.com.7z
+history.meta.stackexchange.com.7z
+history.stackexchange.com.7z
+homebrew.meta.stackexchange.com.7z
+homebrew.stackexchange.com.7z
+hsm.meta.stackexchange.com.7z
+hsm.stackexchange.com.7z
+interpersonal.meta.stackexchange.com.7z
+interpersonal.stackexchange.com.7z
+iot.meta.stackexchange.com.7z
+iot.stackexchange.com.7z
+iota.meta.stackexchange.com.7z
+iota.stackexchange.com.7z
+islam.meta.stackexchange.com.7z
+islam.stackexchange.com.7z
+italian.meta.stackexchange.com.7z
+italian.stackexchange.com.7z
+ja.meta.stackoverflow.com.7z
+ja.stackoverflow.com.7z
+japanese.meta.stackexchange.com.7z
+japanese.stackexchange.com.7z
+joomla.meta.stackexchange.com.7z
+joomla.stackexchange.com.7z
+judaism.meta.stackexchange.com.7z
+judaism.stackexchange.com.7z
+korean.meta.stackexchange.com.7z
+korean.stackexchange.com.7z
+languagelearning.meta.stackexchange.com.7z
+languagelearning.stackexchange.com.7z
+latin.meta.stackexchange.com.7z
+latin.stackexchange.com.7z
+law.meta.stackexchange.com.7z
+law.stackexchange.com.7z
+lifehacks.meta.stackexchange.com.7z
+lifehacks.stackexchange.com.7z
+linguistics.meta.stackexchange.com.7z
+linguistics.stackexchange.com.7z
+literature.meta.stackexchange.com.7z
+literature.stackexchange.com.7z
+magento.meta.stackexchange.com.7z
+magento.stackexchange.com.7z
+martialarts.meta.stackexchange.com.7z
+martialarts.stackexchange.com.7z
+materials.meta.stackexchange.com.7z
+materials.stackexchange.com.7z
+math.meta.stackexchange.com.7z
+math.stackexchange.com.7z
+matheducators.meta.stackexchange.com.7z
+matheducators.stackexchange.com.7z
+mathematica.meta.stackexchange.com.7z
+mathematica.stackexchange.com.7z
+mathoverflow.net.7z
+mechanics.meta.stackexchange.com.7z
+mechanics.stackexchange.com.7z
+meta.askubuntu.com.7z
+meta.mathoverflow.net.7z
+meta.serverfault.com.7z
+meta.stackexchange.com.7z
+meta.stackoverflow.com.7z
+meta.superuser.com.7z
+moderators.meta.stackexchange.com.7z
+moderators.stackexchange.com.7z
+monero.meta.stackexchange.com.7z
+monero.stackexchange.com.7z
+money.meta.stackexchange.com.7z
+money.stackexchange.com.7z
+movies.meta.stackexchange.com.7z
+movies.stackexchange.com.7z
+music.meta.stackexchange.com.7z
+music.stackexchange.com.7z
+musicfans.meta.stackexchange.com.7z
+musicfans.stackexchange.com.7z
+mythology.meta.stackexchange.com.7z
+mythology.stackexchange.com.7z
+networkengineering.meta.stackexchange.com.7z
+networkengineering.stackexchange.com.7z
+opendata.meta.stackexchange.com.7z
+opendata.stackexchange.com.7z
+opensource.meta.stackexchange.com.7z
+opensource.stackexchange.com.7z
+or.meta.stackexchange.com.7z
+or.stackexchange.com.7z
+outdoors.meta.stackexchange.com.7z
+outdoors.stackexchange.com.7z
+parenting.meta.stackexchange.com.7z
+parenting.stackexchange.com.7z
+patents.meta.stackexchange.com.7z
+patents.stackexchange.com.7z
+pets.meta.stackexchange.com.7z
+pets.stackexchange.com.7z
+philosophy.meta.stackexchange.com.7z
+philosophy.stackexchange.com.7z
+photo.meta.stackexchange.com.7z
+photo.stackexchange.com.7z
+physics.meta.stackexchange.com.7z
+physics.stackexchange.com.7z
+pm.meta.stackexchange.com.7z
+pm.stackexchange.com.7z
+poker.meta.stackexchange.com.7z
+poker.stackexchange.com.7z
+politics.meta.stackexchange.com.7z
+politics.stackexchange.com.7z
+portuguese.meta.stackexchange.com.7z
+portuguese.stackexchange.com.7z
+proofassistants.meta.stackexchange.com.7z
+proofassistants.stackexchange.com.7z
+pt.meta.stackoverflow.com.7z
+pt.stackoverflow.com.7z
+puzzling.meta.stackexchange.com.7z
+puzzling.stackexchange.com.7z
+quant.meta.stackexchange.com.7z
+quant.stackexchange.com.7z
+quantumcomputing.meta.stackexchange.com.7z
+quantumcomputing.stackexchange.com.7z
+raspberrypi.meta.stackexchange.com.7z
+raspberrypi.stackexchange.com.7z
+retrocomputing.meta.stackexchange.com.7z
+retrocomputing.stackexchange.com.7z
+reverseengineering.meta.stackexchange.com.7z
+reverseengineering.stackexchange.com.7z
+robotics.meta.stackexchange.com.7z
+robotics.stackexchange.com.7z
+rpg.meta.stackexchange.com.7z
+rpg.stackexchange.com.7z
+ru.meta.stackoverflow.com.7z
+ru.stackoverflow.com.7z
+rus.meta.stackexchange.com.7z
+rus.stackexchange.com.7z
+russian.meta.stackexchange.com.7z
+russian.stackexchange.com.7z
+salesforce.meta.stackexchange.com.7z
+salesforce.stackexchange.com.7z
+scicomp.meta.stackexchange.com.7z
+scicomp.stackexchange.com.7z
+scifi.meta.stackexchange.com.7z
+scifi.stackexchange.com.7z
+security.meta.stackexchange.com.7z
+security.stackexchange.com.7z
+serverfault.com.7z
+sharepoint.meta.stackexchange.com.7z
+sharepoint.stackexchange.com.7z
+sitecore.meta.stackexchange.com.7z
+sitecore.stackexchange.com.7z
+skeptics.meta.stackexchange.com.7z
+skeptics.stackexchange.com.7z
+softwareengineering.meta.stackexchange.com.7z
+softwareengineering.stackexchange.com.7z
+softwarerecs.meta.stackexchange.com.7z
+softwarerecs.stackexchange.com.7z
+solana.meta.stackexchange.com.7z
+solana.stackexchange.com.7z
+sound.meta.stackexchange.com.7z
+sound.stackexchange.com.7z
+space.meta.stackexchange.com.7z
+space.stackexchange.com.7z
+spanish.meta.stackexchange.com.7z
+spanish.stackexchange.com.7z
+sports.meta.stackexchange.com.7z
+sports.stackexchange.com.7z
+sqa.meta.stackexchange.com.7z
+sqa.stackexchange.com.7z
+stackapps.com.7z
+stackoverflow.com-Badges.7z
+stackoverflow.com-Comments.7z
+stackoverflow.com-PostHistory.7z
+stackoverflow.com-PostLinks.7z
+stackoverflow.com-Posts.7z
+stackoverflow.com-Tags.7z
+stackoverflow.com-Users.7z
+stackoverflow.com-Votes.7z
+stats.meta.stackexchange.com.7z
+stats.stackexchange.com.7z
+stellar.meta.stackexchange.com.7z
+stellar.stackexchange.com.7z
+substrate.meta.stackexchange.com.7z
+substrate.stackexchange.com.7z
+superuser.com.7z
+sustainability.meta.stackexchange.com.7z
+sustainability.stackexchange.com.7z
+tex.meta.stackexchange.com.7z
+tex.stackexchange.com.7z
+tezos.meta.stackexchange.com.7z
+tezos.stackexchange.com.7z
+tor.meta.stackexchange.com.7z
+tor.stackexchange.com.7z
+travel.meta.stackexchange.com.7z
+travel.stackexchange.com.7z
+tridion.meta.stackexchange.com.7z
+tridion.stackexchange.com.7z
+ukrainian.meta.stackexchange.com.7z
+ukrainian.stackexchange.com.7z
+unix.meta.stackexchange.com.7z
+unix.stackexchange.com.7z
+ux.meta.stackexchange.com.7z
+ux.stackexchange.com.7z
+vegetarianism.meta.stackexchange.com.7z
+vegetarianism.stackexchange.com.7z
+vi.meta.stackexchange.com.7z
+vi.stackexchange.com.7z
+webapps.meta.stackexchange.com.7z
+webapps.stackexchange.com.7z
+webmasters.meta.stackexchange.com.7z
+webmasters.stackexchange.com.7z
+windowsphone.meta.stackexchange.com.7z
+windowsphone.stackexchange.com.7z
+woodworking.meta.stackexchange.com.7z
+woodworking.stackexchange.com.7z
+wordpress.meta.stackexchange.com.7z
+wordpress.stackexchange.com.7z
+workplace.meta.stackexchange.com.7z
+workplace.stackexchange.com.7z
+worldbuilding.meta.stackexchange.com.7z
+worldbuilding.stackexchange.com.7z
+writers.meta.stackexchange.com.7z
+writers.stackexchange.com.7z

From 70620ab362be1afbd2fe6e70ed079c0b44ad3bb2 Mon Sep 17 00:00:00 2001
From: Luca Soldaini <lucas@allenai.org>
Date: Sat, 8 Jul 2023 10:17:26 -0700
Subject: [PATCH 11/11] style

---
 scripts/prepare_memmap_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/prepare_memmap_dataset.py b/scripts/prepare_memmap_dataset.py
index 5b7bb5fbf..7a802ff4c 100644
--- a/scripts/prepare_memmap_dataset.py
+++ b/scripts/prepare_memmap_dataset.py
@@ -361,7 +361,7 @@ def make_source_and_target(
     "--cache-dir",
     type=str,
     default=None,
-    help="Cache directory for the tokenizer; use system default if not specified"
+    help="Cache directory for the tokenizer; use system default if not specified",
 )
 @click.option(
     "--max-tokens",