Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(vax,chn): Much more robust regex and handling of Chinese numerals #2596

Merged
merged 5 commits into from
Apr 15, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 31 additions & 25 deletions scripts/src/cowidev/vax/incremental/china.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,15 @@ class China(CountryVaxBase):
source_url_complete: str = "http://www.nhc.gov.cn/xcs/s2906/new_list.shtml"
regex: dict = {
"date": r"截至(20\d{2})年(\d{1,2})月(\d{1,2})日",
"total_vaccinations": "([\d\.]+\s*万)剂次",
"total_vaccinations": r"([\d\.]+\s*万)剂次",
}
regex_complete = {
"title": r"国务院联防联控机制(20\d\d)年(\d{1,2})月(\d{1,2})日新闻发布会文字实录",
"doses": r"截至(\d{1,2})月(\d{1,2})日.*计报告接种新冠疫苗([\d\.]+)亿([\d\.]+)万剂次",
"people": r"接种总人数达([\d\.]+)亿([\d\.]+)万",
"fully": r"已完成全程接种([\d\.]+)亿([\d\.]+)万人,覆盖人数占全国总人口的",
"boosters": r"完成加强免疫接种([\d\.]+)亿([\d\.]+)万人,其中序贯加强免疫接种",
"title": r"国务院联防联控机制(20\d{2})年(\d{1,2})月(\d{1,2})日新闻发布会文字实录",
"summary": r"截至(\d{1,2})月(\d{1,2})日.*疫苗([\d\.亿零]+万)剂次.*全程接种的?人数(?:为|.{0,9}达到)([\d\.亿零]+万)人",
"vaccinated": r"(?:接种|疫苗)的?总人数达到?([\d\.亿零]+万)",
"boosters": r"加强免疫(?:已经)?接种的?是?([\d\.亿零]+万)人",
}
num_links_complete = 3
num_links_complete = 6
timeout = 30

def read(self, last_update: str):
Expand Down Expand Up @@ -59,35 +58,41 @@ def read_complete(self):
time.sleep(5)
links = self._get_links_complete(driver)
for link in links[: self.num_links_complete]:
records.append(self._parse_data_complete(driver, link))
record = self._parse_data_complete(driver, link)
if record:
records.append(record)
return pd.DataFrame(records)

def _get_links_complete(self, driver):
elems = driver.find_elements_by_css_selector("li>a")
return [elem.get_property("href") for elem in elems if re.search(self.regex_complete["title"], elem.text)]

def _parse_data_complete(self, driver, url):
def _estimate_metric(position_1, position_2):
return int(float(position_1) * 1e8 + float(position_2) * 1e4)
def _clean_count(num_as_str):
num = float(re.search(r"([\d\.]+)万", num_as_str).group(1)) * 1e4
num_100m = re.search(r"([\d\.]+)亿零?", num_as_str)
if num_100m:
num += float(num_100m.group(1)) * 1e8
return int(num)

driver.get(url)
elem = driver.find_element_by_id("xw_box")
# Apply regex
month, day, total_vaccinations_1, total_vaccinations_2 = re.search(
self.regex_complete["doses"], elem.text
).groups()
people_vaccinated_1, people_vaccinated_2 = re.search(self.regex_complete["people"], elem.text).groups()
people_fully_vaccinated_1, people_fully_vaccinated_2 = re.search(
self.regex_complete["fully"], elem.text
).groups()
total_boosters_1, total_boosters_2 = re.search(self.regex_complete["boosters"], elem.text).groups()
year = re.search(self.regex_complete["title"], driver.title).group(1)
summary = re.search(self.regex_complete["summary"], elem.text)
if summary:
month, day, total_vaccinations, people_fully_vaccinated = summary.groups()
else:
return
vaccinated = re.search(self.regex_complete["vaccinated"], elem.text)
boosters = re.search(self.regex_complete["boosters"], elem.text)
# Get metrics
metrics = {
"total_vaccinations": _estimate_metric(total_vaccinations_1, total_vaccinations_2),
"people_vaccinated": _estimate_metric(people_vaccinated_1, people_vaccinated_2),
"people_fully_vaccinated": _estimate_metric(people_fully_vaccinated_1, people_fully_vaccinated_2),
"total_boosters": _estimate_metric(total_boosters_1, total_boosters_2),
"date": clean_date(f"2022-{month}-{day}", "%Y-%m-%d"),
"total_vaccinations": _clean_count(total_vaccinations),
"people_vaccinated": _clean_count(vaccinated.group(1)) if vaccinated else None,
"people_fully_vaccinated": _clean_count(people_fully_vaccinated),
"total_boosters": _clean_count(boosters.group(1)) if boosters else None,
"date": clean_date(f"{year}-{month}-{day}", "%Y-%m-%d"),
"source_url": url,
}
return metrics
Expand Down Expand Up @@ -115,11 +120,12 @@ def export(self):
# Merge
if df.empty:
df = df_complete
else:
elif not df_complete.empty:
msk = ~df.date.isin(df_complete.date)
df = pd.concat([df_complete, df.loc[msk]])
# Export
self.export_datafile(df, attach=True)
if not df.empty:
self.export_datafile(df, attach=True)


def main():
Expand Down