Skip to content

Commit

Permalink
v0.4.2: jparacrawl updated for wmt24. fix occasional empty lines in echo (#161)
Browse files Browse the repository at this point in the history

Co-authored-by: Thamme Gowda <[email protected]>
  • Loading branch information
thammegowda and Thamme Gowda authored May 25, 2024
1 parent 9579e11 commit 671d33d
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 11 deletions.
2 changes: 1 addition & 1 deletion mtdata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Created: 4/4/20


__version__ = '0.4.1'
__version__ = '0.4.2'
__description__ = 'mtdata is a tool to download datasets for machine translation'
__author__ = 'Thamme Gowda'

Expand Down
22 changes: 14 additions & 8 deletions mtdata/index/paracrawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,16 +82,22 @@ def load_all(index: Index):
url=f'{URL_PREFIX}/bonus/en-uk-v1.txt.gz',
cite=cite, ext='tsv.gz'))

# Japanese-English paracrawl (5.1) used by WMT20 and WMT21
# Japanese-English paracrawl (5.1) used by WMT20 ...
for version, cols in [('2', (2, 3)), ('3', (3, 4))]:
ent = Entry(did=DatasetId(group='KECL', name=f'paracrawl', version=version, langs=('eng', 'jpn')),
in_paths=['en-ja/en-ja.bicleaner05.txt'], in_ext='tsv', cols=cols, cite='',
url=f'http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/{version}.0/bitext/en-ja.tar.gz')
index.add_entry(ent)

# JParaCrawl Chinese-Japanese, only version 2 is available
if version == '2':
ent = Entry(did=DatasetId(group='KECL', name=f'paracrawl', version=version, langs=('zho', 'jpn')),
in_paths=['zh-ja/zh-ja.bicleaner05.txt'], in_ext='tsv', cols=cols, cite='',
url=f'http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/{version}.0/bitext/zh-ja.tar.gz')
index.add_entry(ent)

# JParaCrawl Chinese-Japanese, v2: cols=2,3
ent = Entry(did=DatasetId(group='KECL', name=f'paracrawl', version='2', langs=('zho', 'jpn')),
in_paths=['zh-ja/zh-ja.bicleaner05.txt'], filename='jparacrawl-2.0-zh-ja.tar.gz',
in_ext='tsv', cols=(2, 3), cite=('morishita-etal-2022-jparacrawl',),
url=f'http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/zh/2.0/bitext/zh-ja.tar.gz')
index.add_entry(ent)
# v2wmt24: columns=(3, 4)
ent = Entry(did=DatasetId(group='KECL', name=f'paracrawl', version=f'2wmt24', langs=('zho', 'jpn')),
in_paths=['zh-ja/zh-ja.crowdsourcing_b05l07.txt'], filename='jparacrawl-2.0-zh-ja.tar.gz',
in_ext='tsv', cols=(3, 4), cite=('nagata2024japanesechinese',),
url=f'http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/zh/2.0/bitext/zh-ja.tar.gz')
index.add_entry(ent)
5 changes: 4 additions & 1 deletion mtdata/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ def echo_data(did:DatasetId, delim='\t'):
path = cache.get_entry(entry)
parser = Parser(path, ext=entry.in_ext or None, ent=entry)
count = 0
for rec in parser.read_segs():
all_segs = parser.read_segs()
for rec in all_segs:
if isinstance(rec, (list, tuple)):
rec = (col.replace(delim, ' ').replace('\n', ' ') for col in rec)
rec = delim.join(rec)
Expand Down Expand Up @@ -323,6 +324,8 @@ def main():
elif args.task == 'get':
get_data(**vars(args))
elif args.task == 'echo':
# disable the progress bar for echo; it sometimes inserts newlines into the output
pbar_man.enabled = False
echo_data(did=args.dataset_id)
elif args.task == 'list-recipe':
list_recipes(id_only=args.id, format=args.format)
Expand Down
28 changes: 27 additions & 1 deletion mtdata/resource/refs.bib
Original file line number Diff line number Diff line change
Expand Up @@ -693,4 +693,30 @@ @inproceedings{goldhahn-etal-2012-building
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2012/pdf/327_Paper.pdf",
pages = "759--765",
}
}

%%% Japanese paracrawl
@inproceedings{morishita-etal-2022-jparacrawl,
title = "{JP}ara{C}rawl v3.0: A Large-scale {E}nglish-{J}apanese Parallel Corpus",
author = "Morishita, Makoto and
Chousa, Katsuki and
Suzuki, Jun and
Nagata, Masaaki",
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.lrec-1.721",
pages = "6704--6710",
}


@misc{nagata2024japanesechinese,
title={A Japanese-Chinese Parallel Corpus Using Crowdsourcing for Web Mining},
author={Masaaki Nagata and Makoto Morishita and Katsuki Chousa and Norihito Yasuda},
year={2024},
eprint={2405.09017},
archivePrefix={arXiv},
primaryClass={cs.CL},
}

0 comments on commit 671d33d

Please sign in to comment.