Remove max_words filtering from data importers (#901)
* Add a converter for Chinese

* Convert imported datasets to simplified

* Add augmentation modifier for cjk

* Update tests

* Move constants to the beginning of the file

* Remove max_words filtering from data importers

* Nits

* Relock packages

* Use arguments instead of env vars

* Add cjk modifier to the docs

* Update config generator

* Parametrize test

* Fix formatting

* Add zh test cases

* Use dataset statistics to output conversion stats json

* Fix cjk test

* Fix args

* Accumulate lines in HPLT based on characters

* Use named arguments

* Return text comparison
eu9ene authored Nov 6, 2024
1 parent a615bf5 · commit ff17e30
Showing 8 changed files with 65 additions and 74 deletions.

pipeline/clean/merge-corpus.py (5 changes: 0 additions & 5 deletions)

@@ -30,9 +30,6 @@
 
 logger = get_logger(__file__)
 
-# TODO(CJK) - Issue #424
-MAX_WORDS_IN_SENTENCE = 100
-
 
 class FilteringStatistics(Statistics):
     """
@@ -92,7 +89,6 @@ def run(
         line_stream=self.yield_lines_string(stack),
         seed=38540735095,
         max_lines=max_lines,
-        max_words_in_sentence=MAX_WORDS_IN_SENTENCE,
         total_byte_size=total_corpus_bytes,
     ):
         src_line, trg_line = line.split("\t")
@@ -199,7 +195,6 @@ def join_src_trg():
         line_stream=join_src_trg(),
         seed=9834523434,
         max_lines=sample_size,
-        max_words_in_sentence=MAX_WORDS_IN_SENTENCE,
         total_byte_size=total_byte_size,
     ):
         sample_outfile.write(line)

pipeline/clean/merge-mono.py (12 changes: 6 additions & 6 deletions)

@@ -23,9 +23,6 @@
 
 logger = get_logger(__file__)
 
-# TODO(CJK) - Issue #424
-MAX_WORDS_IN_SENTENCE = 100
-
 
 @dataclass
 class FilteringStatistics(Statistics):
@@ -133,7 +130,6 @@ def deduplicate_lines(lines: Generator[str, None, None]) -> Generator[str, None, None]:
         ),
         seed=347489345,
         max_lines=max_lines,
-        max_words_in_sentence=MAX_WORDS_IN_SENTENCE,
         total_byte_size=byte_size_estimate,
     )
 
@@ -160,7 +156,6 @@ def deduplicate_lines(lines: Generator[str, None, None]) -> Generator[str, None, None]:
         line_stream=final_lines,
         seed=9834523434,
         max_lines=sample_size,
-        max_words_in_sentence=MAX_WORDS_IN_SENTENCE,
         total_byte_size=os.path.getsize(output_path),
     ):
         outfile.write(line)
@@ -250,7 +245,12 @@ def main() -> None:
     stats = FilteringStatistics(output_path)
 
     filter_and_write_monolingual_data(
-        mono_dataset_paths, output_path, line_hashes, max_sentences, args.sample_size, stats
+        mono_datasets=mono_dataset_paths,
+        output_path=output_path,
+        parallel_hashes=line_hashes,
+        max_lines=max_sentences,
+        sample_size=args.sample_size,
+        stats=stats,
     )
 
     logger.info("Done: Merging monolingual datasets")

pipeline/common/datasets.py (6 changes: 0 additions & 6 deletions)

@@ -94,7 +94,6 @@ def shuffle_with_max_lines(
     line_stream: Iterator[str],
     seed: str,
     max_lines: int,
-    max_words_in_sentence,
     total_byte_size: Optional[int] = None,
     estimate_total_byte_size: Optional[Callable[[float], int]] = None,
 ) -> list[str]:
@@ -132,11 +131,6 @@
         # Encoding returns the underlying byte representation which is then measured.
         total_bytes = total_bytes + len(line.encode("utf-8"))
 
-        if len(line.split()) > max_words_in_sentence:
-            # TODO(CJK) - Issue #424
-            # This sentence is too long.
-            continue
-
         lines.append(line)
 
         if len(lines) == max_lines:
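
With the word cap removed, shuffle_with_max_lines is driven only by max_lines and the byte-size estimate. A minimal usage sketch, assuming the repository's pipeline.common package is importable; the sample lines and seed below are made up for illustration:

from pipeline.common.datasets import shuffle_with_max_lines

sample_lines = ["hello world\n", "hola mundo\n", "bonjour le monde\n"]

shuffled = shuffle_with_max_lines(
    line_stream=iter(sample_lines),
    seed="demo-seed",
    max_lines=2,
    total_byte_size=sum(len(line.encode("utf-8")) for line in sample_lines),
)
# shuffled holds at most max_lines entries; the function no longer drops long
# sentences itself, so any length handling now happens in the importers.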

pipeline/data/download-mono.py (10 changes: 8 additions & 2 deletions)

@@ -62,6 +62,13 @@ def main(args_list: Optional[list[str]] = None) -> None:
         help="The minimum fluency score to filter datasets that include this metric",
         default=0.8,
     )
+    parser.add_argument(
+        "--hlpt_max_characters",
+        type=int,
+        help="The maximum number of characters to merge lines in a document before writing. "
+        "0 - preserve original lines of HPLT dataset",
+        default=0,
+    )
     parser.add_argument(
         "--artifacts", type=Path, help="The location where the dataset will be saved"
     )
@@ -85,8 +92,8 @@ def main(args_list: Optional[list[str]] = None) -> None:
         download_hplt(
             language=args.language,
             hlpt_min_fluency=args.hlpt_min_fluency,
+            max_characters=args.hlpt_max_characters,
             max_lines=args.max_sentences,
-            max_words_in_sentence=MAX_WORDS_IN_SENTENCE,
             file_destination=file_destination,
         )
 
@@ -116,7 +123,6 @@ def main(args_list: Optional[list[str]] = None) -> None:
             line_stream=lines,
             seed=dataset.name,
             max_lines=args.max_sentences,
-            max_words_in_sentence=MAX_WORDS_IN_SENTENCE,
             total_byte_size=get_download_size(url),
         ):
             outfile.write(line)
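
For reference, a sketch of the new flag in isolation; the parser below is a stripped-down, hypothetical stand-in for the real one in download-mono.py and only carries the two HPLT-related options shown above:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--hlpt_min_fluency", type=float, default=0.8)
parser.add_argument(
    "--hlpt_max_characters",
    type=int,
    # 0 preserves the original HPLT lines; a positive value merges consecutive
    # lines up to that many characters before writing.
    default=0,
)

args = parser.parse_args(["--hlpt_max_characters", "600"])
assert args.hlpt_max_characters == 600
# These values feed the hlpt_min_fluency= and max_characters= arguments of the
# download_hplt(...) call shown in the diff above.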

pipeline/data/importers/mono/hplt.py (22 changes: 11 additions & 11 deletions)

@@ -68,7 +68,7 @@ def __init__(self, dataset_path: Path) -> None:
             "Of the collected lines, this counts how many were duplicates and discarded.",
         )
         self.final_lines = CountingStep(
-            "How many lines were actually written. Smaller lines will be combined together.",
+            "How many lines were actually written.",
         )
 
     def count_shards_visited(self, *_args):
@@ -95,6 +95,7 @@ def load_shuffled_shard_urls(language: str) -> list[str]:
     https://data.hplt-project.org/one/monotext/cleaned/en/en_110.jsonl.zst
     """
 
+    # TODO: migrate to HPLT 2.0: https://hplt-project.org/datasets/v2.0, https://github.com/mozilla/firefox-translations-training/issues/884
     url = f"https://data.hplt-project.org/one/monotext/cleaned/{language}_map.txt"
     logger.info(f"Downloading shard list: {url}")
 
@@ -113,8 +114,8 @@ def load_shuffled_shard_urls(language: str) -> list[str]:
 def download_hplt(
     language: str,
     hlpt_min_fluency: float,
+    max_characters: int,
     max_lines: int,
-    max_words_in_sentence,
     file_destination: Path,
 ):
     """
@@ -128,8 +129,8 @@ def download_hplt(
     Parameters:
     - language: The BCP 47 language code to filter the documents.
     - hlpt_min_fluency: The minimum score a sentence must have to be included in the final dataset.
+    - max_characters: The maximum number of characters to merge sentences in the document before writing. 0 - preserve the lines as in the dataset
     - max_lines: The maximum number of lines to include in the final dataset.
-    - max_words_in_sentence: The maximum number of words allowed in each sentence.
     - file_destination: The destination path where the final dataset will be written.
     """
 
@@ -153,7 +154,7 @@
 
     strings_seen = WeakStringSet()
     accumulated_text: str = ""
-    cumulative_word_count = 0
+    cumulative_char_count = 0
     visited_lines = 0
 
     def maybe_write_accumulated_text():
@@ -163,8 +164,8 @@ def maybe_write_accumulated_text():
         written out when either the text gets too long, or the next line is discarded.
         """
         nonlocal accumulated_text
-        nonlocal cumulative_word_count
-        cumulative_word_count = 0
+        nonlocal cumulative_char_count
+        cumulative_char_count = 0
         if accumulated_text:
             if accumulated_text in strings_seen:
                 stats.duplicate_lines.value += 1
@@ -187,10 +188,9 @@
 
             # Check for the fluency scores.
             if lang_item == language and score >= hlpt_min_fluency:
-                # TODO(CJK) - Issue #424
-                word_count = len(line.split())
+                char_count = len(line)
 
-                if word_count > max_words_in_sentence:
+                if char_count > max_characters:
                     # This sentence is too long.
                     maybe_write_accumulated_text()
                 else:
@@ -199,11 +199,11 @@
                     # Determine if this sentence should be added to the previous one or
                     # written out as a new line. Only concurrent sentences that meet
                     # the fluency requirement will be combined together.
-                    if cumulative_word_count + word_count > max_words_in_sentence:
+                    if cumulative_char_count + char_count > max_characters:
                         # This line would be too long, write it out.
                         maybe_write_accumulated_text()
 
-                    cumulative_word_count += word_count
+                    cumulative_char_count += char_count
                     # Collect this line to write.
                     if accumulated_text:
                         accumulated_text = f"{accumulated_text} {line}"
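
The importer now merges consecutive lines by character count instead of dropping sentences by word count. A simplified, hypothetical sketch of that accumulation behavior, leaving out the fluency scoring, deduplication, and statistics that the real download_hplt performs:

from typing import Iterator


def accumulate_by_characters(lines: Iterator[str], max_characters: int) -> Iterator[str]:
    """Merge consecutive lines until adding the next one would exceed max_characters."""
    accumulated = ""
    for line in lines:
        line = line.strip()
        if len(line) > max_characters:
            # A single over-long line flushes the buffer and is itself skipped,
            # mirroring how the importer writes out accumulated text when a line
            # cannot be merged.
            if accumulated:
                yield accumulated
            accumulated = ""
            continue
        if accumulated and len(accumulated) + len(line) > max_characters:
            yield accumulated
            accumulated = ""
        accumulated = f"{accumulated} {line}" if accumulated else line
    if accumulated:
        yield accumulated


merged = list(accumulate_by_characters(iter(["short one", "short two", "x" * 500]), 40))
# merged == ["short one short two"]; the 500-character line is dropped.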

tests/test_common_datasets.py (1 change: 0 additions & 1 deletion)

@@ -98,7 +98,6 @@ def test_shuffle_with_max_lines(params):
         line_stream,
         seed="test",
         max_lines=MAX_LINES,
-        max_words_in_sentence=100,
         total_byte_size=get_total_byte_size(line_stream),
     )
 