-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPatentSpider.py
175 lines (160 loc) · 7.24 KB
/
PatentSpider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# -*- coding: utf-8 -*-
from telnetlib import EC
import selenium
from selenium import webdriver
from bs4 import BeautifulSoup
# from selenium.common.exceptions import NoSuchElementException
import time
import xlwt
# import xdrlib, sys
import xlrd
# from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
# from selenium.webdriver.support import wait
from xlutils.copy import copy
# Output file for the scraped rows. Change this on every run so the data from
# the previous run is not overwritten.
path = 'C:\\data6.xls'

# Seed the output file with an empty workbook that holds one sheet named
# 'Sheet1'; the crawl loop reopens this file and appends rows to it.
blank_book = xlwt.Workbook(encoding='utf-8')
blank_book.add_sheet('Sheet1')
blank_book.save(path)
# Log in to the patent search site and open the advanced-search page.
# The login form carries an arithmetic CAPTCHA that this script does not
# solve; for now the operator reads it in the browser window and types the
# answer at the console prompt.


def _fill_field(field_id, text):
    """Click the input with id `field_id`, clear it, and type `text` into it.

    The element is looked up again before each action, exactly as the
    original inline statements did.
    """
    driver.find_element_by_id(field_id).click()
    driver.find_element_by_id(field_id).clear()
    driver.find_element_by_id(field_id).send_keys(text)


# Raw string literal: the path contains backslashes followed by letters
# ("\A", "\D", "\g") that are not valid escape sequences. Python keeps such
# sequences verbatim but warns about them; the raw form yields the same text
# without relying on that behavior.
driver = webdriver.Firefox(executable_path=r'C:\Users\Administrator\Desktop\geckodriver.exe')
driver.get("http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/tableSearch-showTableSearchIndex.shtml")
time.sleep(5)
driver.find_element_by_link_text(u"请登录").click()

# NOTE(review): the account name and password are hard-coded here.
_fill_field("j_username", "11111111")
_fill_field("j_password_show", "1111111111")

# Focus the CAPTCHA box first, then block on the console until the operator
# has typed the answer shown in the browser.
driver.find_element_by_id("j_validation_code").click()
validation = input("手动识别验证吧")
driver.find_element_by_id("j_validation_code").clear()
driver.find_element_by_id("j_validation_code").send_keys(validation)
driver.find_element_by_link_text(u"登录").click()
time.sleep(5)

# Open the advanced-search form.
driver.find_element_by_link_text(u"高级检索").click()
time.sleep(5)
# Result page to start scraping from. `page` was added after a crawl broke off
# at page 870: to resume an interrupted run, set it to the page where the
# crawl stopped, change `path`, and start the script again.
# Restart points used on earlier runs: 917, 1832.
page = 2213

# Locators used below. IVDB020 is the applicant (patentee) search field.
applicant_box_id = "tableSearchItemIdIVDB020"
page_box_id = "txt"
# First link after the "clear search expression" control -- presumably the
# search button; TODO confirm against the page.
after_clear_link_xpath = u"(.//*[normalize-space(text()) and normalize-space(.)='清空检索式'])[1]/following::a[1]"
scroll_down_js = "var q=document.documentElement.scrollTop=10000"

# Enter the applicant name, generate the search expression, and follow the
# link located by the XPath above.
driver.find_element_by_id(applicant_box_id).click()
driver.find_element_by_id(applicant_box_id).clear()
driver.find_element_by_id(applicant_box_id).send_keys(u"电子科技大学")
driver.find_element_by_link_text(u"生成检索式").click()
driver.find_element_by_xpath(after_clear_link_xpath).click()
time.sleep(20)

# Empty the page-number box, scroll the document down, then type the start
# page and confirm the jump.
driver.find_element_by_id(page_box_id).click()
driver.find_element_by_id(page_box_id).clear()
driver.execute_script(scroll_down_js)
time.sleep(3)
driver.find_element_by_id(page_box_id).send_keys(page)
driver.find_element_by_link_text(u"确定").click()
time.sleep(15)
# Main crawl loop: append the results shown on the current page to the .xls
# file at `path`, then move to the next result page, for at most
# MAX_PAGE_VISITS page visits.
MAX_PAGE_VISITS = 70000
ROWS_PER_PAGE = 12       # up to 12 results are read from each page
MAX_TITLE_ATTEMPTS = 15  # page re-reads before giving up on a missing title


def _restart_search_at(page_number):
    """Refresh the browser, repeat the applicant search, and jump to a page.

    Called when the "next page" link cannot be clicked. Performs the same
    advanced-search steps as the script's initial search and then enters
    `page_number` in the page box.
    """
    driver.refresh()
    time.sleep(15)
    driver.find_element_by_link_text(u"高级检索").click()
    time.sleep(5)
    driver.find_element_by_id("tableSearchItemIdIVDB020").click()
    driver.find_element_by_id("tableSearchItemIdIVDB020").clear()
    driver.find_element_by_id("tableSearchItemIdIVDB020").send_keys(u"电子科技大学")
    driver.find_element_by_link_text(u"生成检索式").click()
    driver.find_element_by_xpath(
        u"(.//*[normalize-space(text()) and normalize-space(.)='清空检索式'])[1]/following::a[1]").click()
    driver.maximize_window()
    time.sleep(20)
    driver.execute_script("var q=document.documentElement.scrollTop=10000")
    time.sleep(3)
    driver.find_element_by_id("txt").click()
    driver.find_element_by_id("txt").clear()
    driver.find_element_by_id("txt").send_keys(str(page_number))
    driver.find_element_by_link_text(u"确定").click()
    time.sleep(10)


for _ in range(MAX_PAGE_VISITS):
    driver.maximize_window()

    # Reopen the output file and convert it to a writable copy so this page's
    # rows are appended below the rows that are already stored.
    workbook = xlrd.open_workbook(path)
    sheets = workbook.sheet_names()
    worksheet = workbook.sheet_by_name(sheets[0])
    rows_old = worksheet.nrows  # number of rows already in the sheet
    new_workbook = copy(workbook)
    new_worksheet = new_workbook.get_sheet(0)

    for i in range(ROWS_PER_PAGE):
        # Re-read the page until the i-th patent title is present. The
        # original retry counter was reset on every pass and decrementing the
        # loop variable had no effect, so the intended "give up after 15
        # tries" never happened; this inner loop implements it.
        item_b = None
        for _attempt in range(MAX_TITLE_ATTEMPTS):
            # Name the parser explicitly so the result does not depend on
            # which optional parser libraries happen to be installed.
            soup = BeautifulSoup(driver.page_source, "html.parser")
            items_b = soup.find_all('b', attrs={'style': "color: #4B4B4B"})
            if i < len(items_b):
                item_b = items_b[i]
                break
            time.sleep(1)
            print("items_b may be empty")
        if item_b is None:
            break  # the title never appeared; stop reading this page

        # Publication number: carried in the 'pn' attribute of these links.
        items_number = soup.find_all('a', attrs={'class': "btn btn-operation", 'role': "lawState"})
        # Abstract text.
        items_abview = soup.select('.abview-content')
        # Applicant (patentee) links.
        proposer = soup.find_all('a', attrs={'href': "javascript:;",
                                             'onclick': "drillSearch('IVDB020','申请(专利权)人','','false',this);return false;"})

        # Fifth column, labelled "IPC" in the original comments. The XPath
        # takes the first <p> after the (i+1)-th '公开(公告)日 :' label --
        # TODO confirm which field that paragraph actually holds.
        time.sleep(1)
        try:
            e = driver.find_element_by_xpath(u"(.//*[normalize-space(text()) and normalize-space(.)='公开(公告)日 :'])[%s]/following::p[1]" % (i + 1)).text
        except Exception:  # not a bare except, so Ctrl-C still stops the script
            time.sleep(5)
            continue

        # Append below the existing data: result i goes to row i + rows_old.
        row = i + rows_old
        new_worksheet.write(row, 0, item_b.text)                  # title
        new_worksheet.write(row, 1, items_number[i].get('pn'))    # publication number
        try:
            new_worksheet.write(row, 2, items_abview[i].text)     # abstract
        except Exception:
            continue
        new_worksheet.write(row, 3, proposer[i].text)             # applicant
        try:
            new_worksheet.write(row, 4, e)
        except Exception:
            continue

    new_workbook.save(path)

    # Go to the next result page. The click is issued through JavaScript; an
    # earlier, commented-out attempt waited for a 'blockUI blockOverlay' div
    # to disappear and then clicked the link natively.
    time.sleep(1)
    try:
        element: WebElement = driver.find_element_by_link_text(u"下一页")
        driver.execute_script('arguments[0].click();', element)
    except Exception:
        driver.maximize_window()
        print(str(page)+'page')
        # Resume from the page after the one just scraped. An earlier restart
        # from the same page produced a duplicated page, so recovery starts
        # at page + 1.
        _restart_search_at(page + 1)
        # Keep the counter in step with the browser; without this the next
        # recovery would jump one page too early and scrape a page twice.
        page = page + 1
        continue
    page = page + 1
    time.sleep(7)