v0.4.2: jparacrawl updated for wmt24. fix occasional empty lines in echo (#161)

Co-authored-by: Thamme Gowda <[email protected]>
thammegowda · May 25, 2024 · 671d33d · 671d33d
1 parent 9579e11
commit 671d33d
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 11 deletions.
diff --git a/mtdata/__init__.py b/mtdata/__init__.py
@@ -4,7 +4,7 @@
 # Created: 4/4/20
 
 
-__version__ = '0.4.1'
+__version__ = '0.4.2'
 __description__ = 'mtdata is a tool to download datasets for machine translation'
 __author__ = 'Thamme Gowda'
 

diff --git a/mtdata/index/paracrawl.py b/mtdata/index/paracrawl.py
@@ -82,16 +82,22 @@ def load_all(index: Index):
         url=f'{URL_PREFIX}/bonus/en-uk-v1.txt.gz',
         cite=cite, ext='tsv.gz'))
 
-    # Japanese-English paracrawl (5.1) used by WMT20 and WMT21
+    # Japanese-English paracrawl (5.1) used by WMT20 ...
     for version, cols in [('2', (2, 3)), ('3', (3, 4))]:
         ent = Entry(did=DatasetId(group='KECL', name=f'paracrawl', version=version, langs=('eng', 'jpn')),
                     in_paths=['en-ja/en-ja.bicleaner05.txt'], in_ext='tsv', cols=cols, cite='',
                     url=f'http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/{version}.0/bitext/en-ja.tar.gz')
         index.add_entry(ent)
-
-        # JParaCrawl Chinese-Japanese, only version 2 is available
-        if version == '2':
-            ent = Entry(did=DatasetId(group='KECL', name=f'paracrawl', version=version, langs=('zho', 'jpn')),
-                            in_paths=['zh-ja/zh-ja.bicleaner05.txt'], in_ext='tsv', cols=cols, cite='',
-                            url=f'http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/{version}.0/bitext/zh-ja.tar.gz')
-            index.add_entry(ent)
+
+    # JParaCrawl Chinese-Japanese, v2: cols=2,3
+    ent = Entry(did=DatasetId(group='KECL', name=f'paracrawl', version='2', langs=('zho', 'jpn')),
+                    in_paths=['zh-ja/zh-ja.bicleaner05.txt'], filename='jparacrawl-2.0-zh-ja.tar.gz',
+                    in_ext='tsv', cols=(2, 3), cite=('morishita-etal-2022-jparacrawl',),
+                    url=f'http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/zh/2.0/bitext/zh-ja.tar.gz')
+    index.add_entry(ent)
+    # v2wmt24: columns=(3, 4)
+    ent = Entry(did=DatasetId(group='KECL', name=f'paracrawl', version=f'2wmt24', langs=('zho', 'jpn')),
+                    in_paths=['zh-ja/zh-ja.crowdsourcing_b05l07.txt'], filename='jparacrawl-2.0-zh-ja.tar.gz',
+                    in_ext='tsv', cols=(3, 4), cite=('nagata2024japanesechinese',),
+                    url=f'http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/zh/2.0/bitext/zh-ja.tar.gz')
+    index.add_entry(ent)
diff --git a/mtdata/main.py b/mtdata/main.py
@@ -71,7 +71,8 @@ def echo_data(did:DatasetId, delim='\t'):
     path = cache.get_entry(entry)
     parser = Parser(path, ext=entry.in_ext or None, ent=entry)
     count = 0
-    for rec in parser.read_segs():
+    all_segs = parser.read_segs()
+    for rec in all_segs:
         if isinstance(rec, (list, tuple)):
             rec = (col.replace(delim, ' ').replace('\n', ' ') for col in rec)
             rec = delim.join(rec)
@@ -323,6 +324,8 @@ def main():
     elif args.task == 'get':
         get_data(**vars(args))
     elif args.task == 'echo':
+        # disable the progress bar for echo; it sometimes inserts newlines into the output
+        pbar_man.enabled = False
         echo_data(did=args.dataset_id)
     elif args.task == 'list-recipe':
         list_recipes(id_only=args.id, format=args.format)

diff --git a/mtdata/resource/refs.bib b/mtdata/resource/refs.bib
@@ -693,4 +693,30 @@ @inproceedings{goldhahn-etal-2012-building
     publisher = "European Language Resources Association (ELRA)",
     url = "http://www.lrec-conf.org/proceedings/lrec2012/pdf/327_Paper.pdf",
     pages = "759--765",
-}
+}
+
+%%% Japanese paracrawl
+@inproceedings{morishita-etal-2022-jparacrawl,
+    title = "{JP}ara{C}rawl v3.0: A Large-scale {E}nglish-{J}apanese Parallel Corpus",
+    author = "Morishita, Makoto  and
+      Chousa, Katsuki  and
+      Suzuki, Jun  and
+      Nagata, Masaaki",
+    booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
+    month = jun,
+    year = "2022",
+    address = "Marseille, France",
+    publisher = "European Language Resources Association",
+    url = "https://aclanthology.org/2022.lrec-1.721",
+    pages = "6704--6710",
+}
+
+
+@misc{nagata2024japanesechinese,
+    title={A Japanese-Chinese Parallel Corpus Using Crowdsourcing for Web Mining},
+    author={Masaaki Nagata and Makoto Morishita and Katsuki Chousa and Norihito Yasuda},
+    year={2024},
+    eprint={2405.09017},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL},
+}