-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathCrawler.py
74 lines (62 loc) · 2.36 KB
/
Crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import csv
import pandas as pd
def search_link(kanji):
    """Build the ja.dict.naver.com search URL for ``kanji``.

    The query text is appended verbatim (no percent-encoding) after the
    ``#/search?query=`` fragment.
    """
    base_url = 'https://ja.dict.naver.com/#/search?query='
    return base_url + kanji
def selenium_get_html(kanji):
    """Load the search page for ``kanji`` in headless Chrome and return its HTML.

    The page is opened with a headless Chrome driver, then the function
    waits up to 10 seconds for an element with class ``word`` to appear.
    If the wait times out, the page source as it stands at that moment is
    returned anyway (best effort). Any other error (driver failure,
    ``KeyboardInterrupt``, ...) propagates to the caller.

    Args:
        kanji: Query text passed to ``search_link``.

    Returns:
        The driver's ``page_source`` string.
    """
    # Imported locally so this function does not depend on edits to the
    # file's top-level import block.
    from selenium.common.exceptions import TimeoutException

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    with webdriver.Chrome(options=chrome_options) as driver:
        driver.get(search_link(kanji))
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'word'))
            )
        except TimeoutException:
            # Element never showed up; fall through and return what loaded.
            # A `return` inside `finally` would also have swallowed every
            # other exception, so only the timeout is suppressed here.
            pass
        # Read the source while the driver is still open (inside `with`).
        return driver.page_source
def crawl(kanji):
    """Fetch the dictionary page for ``kanji`` and extract three text fields.

    The rendered HTML is parsed with BeautifulSoup and three CSS selectors
    under ``#searchPage_entry`` are evaluated:

    * ``um``   - span in the 1st list item
    * ``hun``  - span in the 2nd list item
    * ``mean`` - 2nd span in the 5th list item, falling back to the 4th
      list item when the 5th yields nothing

    NOTE(review): the keys presumably correspond to the reading, gloss and
    meaning shown on the page - confirm against the live markup.

    Returns:
        A dict with keys ``'um'``, ``'hun'`` and ``'mean'``; a field is the
        empty string when its selector matched nothing.
    """
    soup = BeautifulSoup(selenium_get_html(kanji), 'html.parser')

    um_tags = soup.select('#searchPage_entry > div > div:nth-child(1) > ul > li:nth-child(1) > div > span')
    hun_tags = soup.select('#searchPage_entry > div > div:nth-child(1) > ul > li:nth-child(2) > div > span')
    # An empty result list is falsy, so `or` only runs the fallback
    # selector when the first one matched nothing.
    mean_tags = (
        soup.select('#searchPage_entry > div > div:nth-child(1) > ul > li:nth-child(5) > div > span:nth-child(2)')
        or soup.select('#searchPage_entry > div > div:nth-child(1) > ul > li:nth-child(4) > div > span:nth-child(2)')
    )
    print(um_tags, hun_tags, mean_tags)

    kanji_data = {
        'um': um_tags[0].text if um_tags else '',
        'hun': hun_tags[0].text if hun_tags else '',
        # [1:-1] drops the first and last character of the text
        # (presumably surrounding brackets - confirm against the page).
        'mean': mean_tags[0].text[1:-1] if mean_tags else '',
    }
    print(kanji_data)
    return kanji_data
def read_csv(file, encoding='utf-8-sig'):
    """Read a CSV file that has a header row and return its records.

    Previously the function built a ``csv.DictReader`` and discarded it,
    so callers always received ``None``. It now materialises the rows
    before the file is closed.

    Args:
        file: Path of the CSV file to read.
        encoding: Text encoding used to open the file. The default
            ``'utf-8-sig'`` reads UTF-8 with or without a leading BOM.

    Returns:
        A list with one dict per data row, keyed by the header names.
        An empty or header-only file yields an empty list.
    """
    # newline='' is what the csv module requires so that newlines embedded
    # in quoted fields are parsed correctly.
    with open(file, encoding=encoding, newline='') as csv_file:
        return list(csv.DictReader(csv_file))
if __name__ == '__main__':
    # Script entry point: read the source deck CSV, crawl every row that
    # does not yet have an 'um' value, and write the result to anki.csv.
    print('start running')
    df = pd.read_csv('Data/Heisigs RTK 6th Edition.csv')
    try:
        # Iterate over index labels so that the row lookup (.loc) and the
        # cell writes (.at) address rows the same way.
        for idx in df.index:
            row = df.loc[idx]
            # Skip rows that already carry a non-null 'um' value.
            if 'um' in row and not pd.isnull(row['um']):
                continue
            kanji = row['kanji']
            kanji_data = crawl(kanji)
            df.at[idx, 'um'] = kanji_data['um']
            df.at[idx, 'hun'] = kanji_data['hun']
            df.at[idx, 'mean'] = kanji_data['mean']
            print(df.loc[idx])
    finally:
        # Always persist what has been collected so far, so a crash or
        # Ctrl-C part-way through does not throw away the crawled rows.
        df.to_csv('anki.csv', index=False, header=True, encoding='utf-8-sig')