Commit: Add a file
singletongue committed Aug 28, 2019
1 parent dd5b2d3 commit 7ac6305
Showing 1 changed file with 120 additions and 0 deletions.
make_corpus.py

import re
import bz2
import json
import unicodedata
import argparse

import MeCab
from logzero import logger
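
# Example invocation (file paths are illustrative):
#   python make_corpus.py --input_file jawiki_extracted.json.bz2 \
#       --output_file corpus.txt --mecab_dict_path /path/to/mecab/dic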


class MeCabSentenceSplitter(object):
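    """Split Japanese text into sentences with MeCab.

    A sentence boundary is placed right after any token whose feature string
    starts with '記号,句点' (the IPAdic-style tag for a sentence-final period).
    """
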
def __init__(self, mecab_dict_path=None):
if mecab_dict_path is not None:
self.mecab = MeCab.Tagger('-d {}'.format(mecab_dict_path))
else:
self.mecab = MeCab.Tagger()

def __call__(self, text):
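        # Walk MeCab's token output while tracking each token's character offset
        # in the original text; close a sentence at every sentence-final period
        # token, and flush any remaining text when EOS is reached.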
sentences = []
start = 0
end = 0
for line in self.mecab.parse(text).split('\n'):
if line == 'EOS':
if text[start:]:
sentences.append(text[start:])

break

token, token_info = line.split('\t')
end = text.index(token, end) + len(token)
if token_info.startswith('記号,句点,'):
sentences.append(text[start:end])
start = end

return sentences


def preprocess_text(text):
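    # Collapse repeated 読点 (、), remove commas stranded next to parentheses and
    # empty parentheses (typically left behind by link removal), and squeeze whitespace.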
text = re.sub(r'、+', '、', text)
text = text.replace('(、', '(')
text = text.replace('、)', ')')
text = text.replace('()', '')
text = re.sub(r'\s+', ' ', text)
return text.strip()


def filter_text(text, min_length, max_length):
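    # Reject sentences that still contain table-like markup ('| |') or whose
    # length falls outside the [min_length, max_length] range.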
if re.search(r'\| *\|+', text):
return False
if len(text) < min_length or len(text) > max_length:
return False

return True


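# Matches <a href="...">anchor</a> tags in the extracted text; group 2 is the anchor text.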
regex_link = re.compile(r'\<a href="(.*?)"\>(.*?)\</a\>')


def main(args):
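    # Read the bz2-compressed JSON Lines input (one article per line with a 'text'
    # field, e.g. as produced by an extractor such as WikiExtractor), split each
    # article into sentences, and write one sentence per line, separating documents
    # with a blank line.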
sent_splitter = MeCabSentenceSplitter(args.mecab_dict_path)

num_processed_docs = 0
with bz2.open(args.input_file, 'rt') as input_file, \
open(args.output_file, 'w') as output_file:
for line in input_file:
page_item = json.loads(line)
text = page_item['text']

# replace links
text = regex_link.sub(r'\2', text)

            # normalize text (NFKC), applied only when --do_normalize_text is given
            if args.do_normalize_text:
                text = unicodedata.normalize('NFKC', text)

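            # split into paragraphs, skipping the first block (typically the article title)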
paragraphs = re.split(r'\n\n+', text)[1:]
sentences = [preprocess_text(s) for p in paragraphs
for s in sent_splitter(p)]
# ignore too short/long sentences
sentences = [s for s in sentences
if filter_text(s, args.min_length, args.max_length)]
if sentences:
# write document to a file
for s in sentences:
                    assert '\n' not in s, s
assert s, s
output_file.write(s + '\n')

output_file.write('\n')

num_processed_docs += 1
if args.debug and num_processed_docs == 1000:
logger.info('processed: {}'.format(num_processed_docs))
break

# logging
if num_processed_docs % 10000 == 0:
logger.info('processed: {}'.format(num_processed_docs))

if num_processed_docs % 10000 != 0:
logger.info('processed: {}'.format(num_processed_docs))


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--input_file', type=str, required=True,
help='preprocessed Wikipedia articles file (.bz2)')
parser.add_argument('--output_file', type=str, required=True,
help='output corpus file')
    parser.add_argument('--min_length', type=int, default=16,
                        help='only extract sentences with at least N characters [16]')
    parser.add_argument('--max_length', type=int, default=1024,
                        help='only extract sentences with at most N characters [1024]')
parser.add_argument('--mecab_dict_path', type=str,
help='path to MeCab dictionary')
    parser.add_argument('--do_normalize_text', action='store_true',
                        help='apply Unicode (NFKC) normalization to the input text')
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()

main(args)
