forked from ipcjs/fcitx5-pinyin-zhwiki
-
Notifications
You must be signed in to change notification settings - Fork 0
/
zhwiki-web-slang.py
executable file
·50 lines (40 loc) · 1.34 KB
/
zhwiki-web-slang.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import urllib.parse
import urllib.request
import collections
# MediaWiki "parse" API endpoint returning a page's wikitext as JSON; the
# URL-quoted page title is appended to this prefix.
_ZHWIKI_SOURCE_URL = "https://zh.wikipedia.org/w/api.php?action=parse&format=json&prop=wikitext&uselang=zh&formatversion=2&page="
# Title of the zh.wikipedia page that is scraped for terms.
_PAGE = "中国大陆网络用语列表"
# Wikimedia's User-Agent policy asks API clients to identify themselves;
# requests carrying only a library-default User-Agent may be rejected.
_USER_AGENT = "zhwiki-web-slang/1.0 (fcitx5-pinyin-zhwiki dictionary builder)"
# Upper bound (seconds) for the HTTP request so the script cannot hang forever.
_TIMEOUT_SECONDS = 60

_request = urllib.request.Request(
    _ZHWIKI_SOURCE_URL + urllib.parse.quote(_PAGE),
    headers={"User-Agent": _USER_AGENT},
)
# The context manager closes the HTTP response even if reading fails.
with urllib.request.urlopen(_request, timeout=_TIMEOUT_SECONDS) as _response:
    page = _response.read()

# json.loads accepts the raw bytes; the wikitext is then split line by line
# further down in this script.
wikitext = json.loads(page)["parse"]["wikitext"]
# Used as an insertion-ordered set: keys are the collected words, values are
# always None.
words = collections.OrderedDict()
def add_word(word):
    """Record one candidate term in the module-level ``words`` mapping.

    The term is stripped of surrounding whitespace, entries beginning with
    "形容" ("describes ...") are ignored, the characters "、", "[", "]" and
    "…" are removed, and the result is stored as a key of ``words`` with the
    value ``None``.  Terms that end up empty are ignored so the output never
    contains blank entries.

    Args:
        word: Raw text of a single term.
    """
    # Strip first so that leading whitespace cannot defeat the prefix filter.
    word = word.strip()
    # NOTE(review): "形容" presumably marks explanatory text rather than a
    # term of its own -- confirm against the wiki page layout.
    if word.startswith("形容"):
        return
    for garbage in ("、", "[", "]", "…"):
        word = word.replace(garbage, "")
    word = word.strip()
    if not word:
        # Nothing left after cleaning (e.g. "" or "[]"): do not store "".
        return
    words[word] = None
# Delimiters that separate several terms written together.  They are tried in
# this order; both the ASCII and the full-width comma are accepted.
_WORD_SEPARATORS = ("、", "/", "|", ",", "，", "。")


def add_words(word):
    """Split ``word`` on the known separators and register every term.

    The text is split on the first separator (in ``_WORD_SEPARATORS`` order)
    that occurs in it, and each fragment is processed recursively so that
    mixed separators are resolved too.  Text containing no separator is
    passed to ``add_word``.  Empty or whitespace-only text is ignored, so
    leading, trailing or doubled separators do not produce empty terms.

    Args:
        word: Raw text that may contain one or several terms.
    """
    word = word.strip()
    if not word:
        return
    for word_separator in _WORD_SEPARATORS:
        if word_separator in word:
            for fragment in word.split(word_separator):
                # Recursively resolve any remaining separators.
                add_words(fragment)
            return
    add_word(word)
# Separators between a list item's term and the text that follows it; both
# the ASCII and the full-width colon are recognised.
_LIST_SEPARATORS = (":", "：")
# Table markup lines that carry no cell content: row separator, end of table
# and caption.
_TABLE_MARKUP_PREFIXES = ("|-", "|}", "|+")

for line in wikitext.split("\n"):
    if line.startswith("*"):
        # Lists: the term is everything before the earliest colon; list items
        # without any colon are ignored.
        cut_points = [line.find(separator) for separator in _LIST_SEPARATORS]
        cut_points = [position for position in cut_points if position != -1]
        if cut_points:
            add_words(line[:min(cut_points)].strip("*").strip())
    elif line.startswith("|") and not line.startswith(_TABLE_MARKUP_PREFIXES):
        # Tables: the term is the text between the leading "|" and the next
        # "|" (or the end of the line).
        add_words(line.split("|")[1])

# Emit the collected terms one per line, in first-seen order.
for word in words:
    print(word)