forked from pln-fing-udelar/covid19-qa
-
Notifications
You must be signed in to change notification settings - Fork 0
/
corpus_creator.py
executable file
·40 lines (29 loc) · 1.33 KB
/
corpus_creator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import os
import re
import xml.etree.ElementTree as ET
# Folder, relative to the current working directory, that holds the input JSON
# dump and also receives the generated XML files.
PATH_ARTICLES_FOLDER = "data/la_diaria_v1"
def generate_corpus_from_json(file_name: str) -> None:
    """Create one XML corpus file per article found in a JSON news dump.

    The dump is read from ``PATH_ARTICLES_FOLDER``. It must be a JSON object
    with an ``"articles"`` list whose items carry ``"html"`` and ``"slug"``
    keys. Articles are numbered from 1 in file order and each one is handed
    to ``create_xml_file``.

    Args:
        file_name: Name of the JSON file inside ``PATH_ARTICLES_FOLDER``.
    """
    json_path = os.path.join(PATH_ARTICLES_FOLDER, file_name)
    # JSON is UTF-8 by specification; do not rely on the platform's locale
    # encoding, which can garble accented characters or fail to decode.
    with open(json_path, encoding="utf-8") as file:
        articles = json.load(file)["articles"]

    for i, article in enumerate(articles, start=1):
        create_xml_file(i, article["html"], article["slug"])
def create_xml_file(id_: int, html: str, slug: str) -> None:
    """Write one article as ``t<NNNN>.xml`` inside ``PATH_ARTICLES_FOLDER``.

    The file contains a single ``<article>`` element with the attributes
    ``id``, ``date``, ``url`` and ``src``; its text is ``html`` with the
    markup tags removed.

    Args:
        id_: Sequence number of the article; zero-padded to four digits and
            prefixed with ``t`` to form the XML id and the file name.
        html: Article HTML. It must contain a ``<time ... datetime="...">``
            tag whose value starts with three dash-separated numbers
            (year-month-day).
        slug: Article slug, appended to the la diaria URL.

    Raises:
        ValueError: If no ``<time datetime="...">`` date is found in ``html``.
    """
    # Get the article date from the first <time datetime="..."> tag.
    date_match = re.search(r'<time[^>]*datetime="(\d+-\d+-\d+).', html)
    if date_match is None:
        # Fail with a message that identifies the article instead of an
        # opaque AttributeError on None.
        raise ValueError(
            "no <time datetime=...> date found in article {!r}".format(slug)
        )
    year, month, day = date_match.group(1).split("-")

    xml_id = "t" + str(id_).zfill(4)

    article = ET.Element('article')
    article.set("id", xml_id)
    article.set("date", year + "-" + month + "-" + day)
    article.set("url", "https://ladiaria.com.uy/articulo/" + year + "/" + month + "/" + slug)
    article.set("src", "ladiaria")
    # Clean html tags.
    # NOTE(review): "." does not match newlines here, so a tag that is split
    # across several lines is left in the text -- confirm whether that matters
    # for the input data.
    article.text = re.sub(r"<.*?>", "", html)

    # The output has no XML declaration, so readers will assume UTF-8; write
    # it as UTF-8 explicitly instead of using the platform's locale encoding.
    with open(os.path.join(PATH_ARTICLES_FOLDER, xml_id + ".xml"), "w", encoding="utf-8") as file:
        # Joining the serializer's fragments with newlines (rather than
        # concatenating them) puts line breaks between the pieces of the
        # element; the fragment boundaries are decided by ElementTree.
        file.write('\n'.join(ET.tostringlist(article, encoding='unicode')))
if __name__ == "__main__":
    # When run as a script, build the corpus from the la_diaria_v1.json dump
    # located in PATH_ARTICLES_FOLDER.
    generate_corpus_from_json("la_diaria_v1.json")