Remove max_words filtering from data importers (#901)
* Add a converter for Chinese

* Convert imported datasets to simplified

* Add augmentation modifier for cjk

* Update tests

* Move constants to the beginning of the file

* Remove max_words filtering from data importers

* Nits

* Relock packages

* Use arguments instead of env vars

* Add cjk modifier to the docs

* Update config generator

* Parametrize test

* Fix formatting

* Add zh test cases

* Use dataset statistics to output conversion stats json

* Fix cjk test

* Fix args

* Accumulate lines in HPLT based on characters

* Use named arguments

* Return text comparison
eu9ene authored Nov 6, 2024
1 parent a615bf5 · commit ff17e30
Showing 8 changed files with 65 additions and 74 deletions.

pipeline/clean/merge-corpus.py (5 changes: 0 additions & 5 deletions)

@@ -30,9 +30,6 @@
 
 logger = get_logger(__file__)
 
-# TODO(CJK) - Issue #424
-MAX_WORDS_IN_SENTENCE = 100
-
 
 class FilteringStatistics(Statistics):
     """
@@ -92,7 +89,6 @@ def run(
         line_stream=self.yield_lines_string(stack),
         seed=38540735095,
         max_lines=max_lines,
-        max_words_in_sentence=MAX_WORDS_IN_SENTENCE,
         total_byte_size=total_corpus_bytes,
     ):
         src_line, trg_line = line.split("\t")
@@ -199,7 +195,6 @@ def join_src_trg():
         line_stream=join_src_trg(),
         seed=9834523434,
         max_lines=sample_size,
-        max_words_in_sentence=MAX_WORDS_IN_SENTENCE,
         total_byte_size=total_byte_size,
     ):
         sample_outfile.write(line)

pipeline/clean/merge-mono.py (12 changes: 6 additions & 6 deletions)

@@ -23,9 +23,6 @@
 
 logger = get_logger(__file__)
 
-# TODO(CJK) - Issue #424
-MAX_WORDS_IN_SENTENCE = 100
-
 
 @dataclass
 class FilteringStatistics(Statistics):
@@ -133,7 +130,6 @@ def deduplicate_lines(lines: Generator[str, None, None]) -> Generator[str, None, None]:
         ),
         seed=347489345,
         max_lines=max_lines,
-        max_words_in_sentence=MAX_WORDS_IN_SENTENCE,
         total_byte_size=byte_size_estimate,
     )
 
@@ -160,7 +156,6 @@ def deduplicate_lines(lines: Generator[str, None, None]) -> Generator[str, None, None]:
         line_stream=final_lines,
         seed=9834523434,
         max_lines=sample_size,
-        max_words_in_sentence=MAX_WORDS_IN_SENTENCE,
         total_byte_size=os.path.getsize(output_path),
     ):
         outfile.write(line)
@@ -250,7 +245,12 @@ def main() -> None:
     stats = FilteringStatistics(output_path)
 
     filter_and_write_monolingual_data(
-        mono_dataset_paths, output_path, line_hashes, max_sentences, args.sample_size, stats
+        mono_datasets=mono_dataset_paths,
+        output_path=output_path,
+        parallel_hashes=line_hashes,
+        max_lines=max_sentences,
+        sample_size=args.sample_size,
+        stats=stats,
     )
 
     logger.info("Done: Merging monolingual datasets")

pipeline/common/datasets.py (6 changes: 0 additions & 6 deletions)

@@ -94,7 +94,6 @@ def shuffle_with_max_lines(
     line_stream: Iterator[str],
     seed: str,
     max_lines: int,
-    max_words_in_sentence,
     total_byte_size: Optional[int] = None,
     estimate_total_byte_size: Optional[Callable[[float], int]] = None,
 ) -> list[str]:
@@ -132,11 +131,6 @@
         # Encoding returns the underlying byte representation which is then measured.
         total_bytes = total_bytes + len(line.encode("utf-8"))
 
-        if len(line.split()) > max_words_in_sentence:
-            # TODO(CJK) - Issue #424
-            # This sentence is too long.
-            continue
-
         lines.append(line)
 
         if len(lines) == max_lines:
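
With the word cap removed, shuffle_with_max_lines is driven only by max_lines and the byte-size estimate. A minimal usage sketch, assuming the repository's pipeline.common package is importable; the sample lines and seed below are made up for illustration:

from pipeline.common.datasets import shuffle_with_max_lines

sample_lines = ["hello world\n", "hola mundo\n", "bonjour le monde\n"]

shuffled = shuffle_with_max_lines(
    line_stream=iter(sample_lines),
    seed="demo-seed",
    max_lines=2,
    total_byte_size=sum(len(line.encode("utf-8")) for line in sample_lines),
)
# shuffled holds at most max_lines entries; the function no longer drops long
# sentences itself, so any length handling now happens in the importers.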

pipeline/data/download-mono.py (10 changes: 8 additions & 2 deletions)

@@ -62,6 +62,13 @@ def main(args_list: Optional[list[str]] = None) -> None:
         help="The minimum fluency score to filter datasets that include this metric",
         default=0.8,
     )
+    parser.add_argument(
+        "--hlpt_max_characters",
+        type=int,
+        help="The maximum number of characters to merge lines in a document before writing. "
+        "0 - preserve original lines of HPLT dataset",
+        default=0,
+    )
     parser.add_argument(
         "--artifacts", type=Path, help="The location where the dataset will be saved"
     )
@@ -85,8 +92,8 @@ def main(args_list: Optional[list[str]] = None) -> None:
         download_hplt(
             language=args.language,
             hlpt_min_fluency=args.hlpt_min_fluency,
+            max_characters=args.hlpt_max_characters,
             max_lines=args.max_sentences,
-            max_words_in_sentence=MAX_WORDS_IN_SENTENCE,
             file_destination=file_destination,
         )
 
@@ -116,7 +123,6 @@ def main(args_list: Optional[list[str]] = None) -> None:
             line_stream=lines,
             seed=dataset.name,
             max_lines=args.max_sentences,
-            max_words_in_sentence=MAX_WORDS_IN_SENTENCE,
             total_byte_size=get_download_size(url),
         ):
             outfile.write(line)
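
For reference, a sketch of the new flag in isolation; the parser below is a stripped-down, hypothetical stand-in for the real one in download-mono.py and only carries the two HPLT-related options shown above:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--hlpt_min_fluency", type=float, default=0.8)
parser.add_argument(
    "--hlpt_max_characters",
    type=int,
    # 0 preserves the original HPLT lines; a positive value merges consecutive
    # lines up to that many characters before writing.
    default=0,
)

args = parser.parse_args(["--hlpt_max_characters", "600"])
assert args.hlpt_max_characters == 600
# These values feed the hlpt_min_fluency= and max_characters= arguments of the
# download_hplt(...) call shown in the diff above.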

pipeline/data/importers/mono/hplt.py (22 changes: 11 additions & 11 deletions)

@@ -68,7 +68,7 @@ def __init__(self, dataset_path: Path) -> None:
             "Of the collected lines, this counts how many were duplicates and discarded.",
         )
         self.final_lines = CountingStep(
-            "How many lines were actually written. Smaller lines will be combined together.",
+            "How many lines were actually written.",
         )
 
     def count_shards_visited(self, *_args):
@@ -95,6 +95,7 @@ def load_shuffled_shard_urls(language: str) -> list[str]:
     https://data.hplt-project.org/one/monotext/cleaned/en/en_110.jsonl.zst
     """
 
+    # TODO: migrate to HPLT 2.0: https://hplt-project.org/datasets/v2.0, https://github.com/mozilla/firefox-translations-training/issues/884
     url = f"https://data.hplt-project.org/one/monotext/cleaned/{language}_map.txt"
     logger.info(f"Downloading shard list: {url}")
 
@@ -113,8 +114,8 @@ def load_shuffled_shard_urls(language: str) -> list[str]:
 def download_hplt(
     language: str,
     hlpt_min_fluency: float,
+    max_characters: int,
     max_lines: int,
-    max_words_in_sentence,
     file_destination: Path,
 ):
     """
@@ -128,8 +129,8 @@ def download_hplt(
     Parameters:
     - language: The BCP 47 language code to filter the documents.
     - hlpt_min_fluency: The minimum score a sentence must have to be included in the final dataset.
+    - max_characters: The maximum number of characters to merge sentences in the document before writing. 0 - preserve the lines as in the dataset
     - max_lines: The maximum number of lines to include in the final dataset.
-    - max_words_in_sentence: The maximum number of words allowed in each sentence.
     - file_destination: The destination path where the final dataset will be written.
     """
 
@@ -153,7 +154,7 @@
 
     strings_seen = WeakStringSet()
     accumulated_text: str = ""
-    cumulative_word_count = 0
+    cumulative_char_count = 0
     visited_lines = 0
 
     def maybe_write_accumulated_text():
@@ -163,8 +164,8 @@ def maybe_write_accumulated_text():
         written out when either the text gets too long, or the next line is discarded.
         """
         nonlocal accumulated_text
-        nonlocal cumulative_word_count
-        cumulative_word_count = 0
+        nonlocal cumulative_char_count
+        cumulative_char_count = 0
         if accumulated_text:
             if accumulated_text in strings_seen:
                 stats.duplicate_lines.value += 1
@@ -187,10 +188,9 @@
 
             # Check for the fluency scores.
             if lang_item == language and score >= hlpt_min_fluency:
-                # TODO(CJK) - Issue #424
-                word_count = len(line.split())
+                char_count = len(line)
 
-                if word_count > max_words_in_sentence:
+                if char_count > max_characters:
                     # This sentence is too long.
                     maybe_write_accumulated_text()
                 else:
@@ -199,11 +199,11 @@
                     # Determine if this sentence should be added to the previous one or
                     # written out as a new line. Only concurrent sentences that meet
                     # the fluency requirement will be combined together.
-                    if cumulative_word_count + word_count > max_words_in_sentence:
+                    if cumulative_char_count + char_count > max_characters:
                         # This line would be too long, write it out.
                         maybe_write_accumulated_text()
 
-                    cumulative_word_count += word_count
+                    cumulative_char_count += char_count
                     # Collect this line to write.
                     if accumulated_text:
                         accumulated_text = f"{accumulated_text} {line}"
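
The importer now merges consecutive lines by character count instead of dropping sentences by word count. A simplified, hypothetical sketch of that accumulation behavior, leaving out the fluency scoring, deduplication, and statistics that the real download_hplt performs:

from typing import Iterator


def accumulate_by_characters(lines: Iterator[str], max_characters: int) -> Iterator[str]:
    """Merge consecutive lines until adding the next one would exceed max_characters."""
    accumulated = ""
    for line in lines:
        line = line.strip()
        if len(line) > max_characters:
            # A single over-long line flushes the buffer and is itself skipped,
            # mirroring how the importer writes out accumulated text when a line
            # cannot be merged.
            if accumulated:
                yield accumulated
            accumulated = ""
            continue
        if accumulated and len(accumulated) + len(line) > max_characters:
            yield accumulated
            accumulated = ""
        accumulated = f"{accumulated} {line}" if accumulated else line
    if accumulated:
        yield accumulated


merged = list(accumulate_by_characters(iter(["short one", "short two", "x" * 500]), 40))
# merged == ["short one short two"]; the 500-character line is dropped.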

tests/test_common_datasets.py (1 change: 0 additions & 1 deletion)

@@ -98,7 +98,6 @@ def test_shuffle_with_max_lines(params):
         line_stream,
         seed="test",
         max_lines=MAX_LINES,
-        max_words_in_sentence=100,
         total_byte_size=get_total_byte_size(line_stream),
     )
 