-
Notifications
You must be signed in to change notification settings - Fork 9
/
1688爬虫主文件.py
384 lines (337 loc) · 16.3 KB
/
1688爬虫主文件.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
# -*- coding: utf-8 -*-
"""
@Time : 2022/3/21 18:28
@Auth : wes
@File :1688_goods_spider.py
@IDE :PyCharm
@email:
"""
import time
import json
import requests
import re
from utility import generate_headers
import urllib.parse
import csv
from jsonsearch import JsonSearch
from seleniumwire import webdriver
from tkinter import messagebox
from browsermobproxy import Server
from requests.cookies import RequestsCookieJar
from selenium.webdriver import ActionChains
def save_cookies():
    """Write the current browser session's cookies to ``cookies.txt`` as JSON.

    Reads the cookies from the module-level ``driver`` and overwrites any
    existing ``cookies.txt`` in the current working directory.
    """
    session_cookies = driver.get_cookies()
    with open("cookies.txt", "w") as cookie_file:
        cookie_file.write(json.dumps(session_cookies))
def create_driver(
        show: bool = True
):
    """
    Create a Chrome driver configured to look less like an automated browser
    and to record network events in the performance log.

    :param show: when False, Chrome is started with ``--headless``
    :return: the configured Chrome driver
    """
    chrome_options = webdriver.ChromeOptions()

    # Hide the usual automation hints. 'excludeSwitches' only needs to be
    # registered once (the original set the same option twice).
    # Reference: https://blog.csdn.net/dslkfajoaijfdoj/article/details/109146051
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])

    # Block notification prompts and the "save password" bubble.
    chrome_options.add_experimental_option('prefs', {
        'profile.default_content_setting_values': {
            'notifications': 2,
        },
        'profile.password_manager_enabled': False,
        'credentials_enable_service': False,
    })
    # chrome_options.binary_location = r"C:\Program Files\Google\Chrome\Application\chrome.exe"

    if not show:
        chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    # NOTE(review): W3C mode is switched off here and in the capabilities
    # below, presumably so the legacy 'loggingPrefs' capability is honoured;
    # confirm against the Selenium/chromedriver versions in use.
    chrome_options.add_experimental_option('w3c', False)

    # Request browser, driver and performance logs with network events
    # enabled; the performance log is read back via driver.get_log('performance').
    caps = {
        'browserName': 'chrome',
        'loggingPrefs': {
            'browser': 'ALL',
            'driver': 'ALL',
            'performance': 'ALL',
        },
        'goog:chromeOptions': {
            'perfLoggingPrefs': {
                'enableNetwork': True,
            },
            'w3c': False,
        },
    }
    driver = webdriver.Chrome(desired_capabilities=caps, options=chrome_options)

    # Injected into every new document: make navigator.webdriver read as undefined.
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            Object.defineProperty(navigator, 'webdriver', {
              get: () => undefined
            })
          """
    })
    return driver
def find_element_exists(xpath_, flag=0, erro_warning=""):
    """
    Locate an element by XPath through the module-level ``driver``.

    :param xpath_: XPath of the element to locate
    :param flag: ``0`` (default) retries once per second until the lookup
        succeeds; any value that does not compare equal to ``0`` makes a
        single attempt only
    :param erro_warning: text printed after each failed attempt in retry mode
    :return: the located element; ``False`` when a single attempt fails
    """
    if flag != 0:
        # Existence check only: one attempt, failure reported as False.
        try:
            return driver.find_element_by_xpath(xpath_)
        except Exception:
            return False

    # Retry mode: keep polling, reporting the error and the caller's hint.
    while True:
        try:
            return driver.find_element_by_xpath(xpath_)
        except Exception as lookup_error:
            print(lookup_error)
            print(erro_warning)
            time.sleep(1)
def _extract_inline_offer_json(page_source):
    """Cut the offer JSON that is embedded in the search page's HTML out of it."""
    prefix = '{"code":200,"data":{"asyncConfig":{"async":false,"asyncCount":20,"asyncUrl"'
    marker = 'window.data.offerresultData = successDataCheck(' + prefix
    content = prefix + page_source.split(marker)[-1]
    # The payload ends with `"ok","time":<number>}` followed by `);`; use the
    # first such number to find where the JSON stops.
    check_num = re.findall('"ok","time":(.*?)}', content)[0]
    tail = '"ok","time":' + str(check_num)
    return content.split(tail + '});')[0] + tail + '}'


def _collect_async_offer_bodies():
    """Return the distinct lazy-loaded offer responses found in the performance log."""
    bodies = []
    # Kept across ALL log entries so a response that shows up under several
    # log events is only returned once.
    seen_start_indexes = []
    for entry in driver.get_log('performance'):
        message = JsonSearch(object=json.loads(entry['message']), mode='j')
        try:
            request_id = message.search_all_value(key='requestId')[-1]
            # Entries without a URL are not network requests of interest.
            entry_url = message.search_all_value(key='url')[-1]
        except Exception:
            continue
        if "asyncCount=20&pageSize=60" not in entry_url:
            continue
        try:
            # Ask Chrome for the response body that belongs to this request id.
            response = driver.execute(
                "executeCdpCommand",
                {'cmd': 'Network.getResponseBody', 'params': {'requestId': request_id}},
            )['value']
            raw_body = str(response['body'])
            # NOTE(security): eval() runs whatever text the remote site sent.
            # If these bodies are strict JSON, json.loads is the safe parser;
            # confirm and switch. The JavaScript literals are bound explicitly
            # so `null` no longer raises NameError.
            parsed = JsonSearch(
                object=eval(raw_body, {"true": True, "false": False, "null": None}),
                mode='j',
            )
            start_index = parsed.search_all_value(key="startIndex")[0]
        except Exception as error:
            print(error)
            continue
        if start_index in seen_start_indexes:
            continue
        seen_start_indexes.append(start_index)
        bodies.append(raw_body)
        print(start_index)
    return bodies


def get_goods_json(url):
    """
    Load a search result page and return its three raw offer payloads.

    The page is opened with the module-level ``driver`` and scrolled to the
    bottom twice so that the lazy-loaded requests fire. Per the original
    author's notes, the 60 offers of a page arrive as one payload embedded in
    the HTML plus two follow-up requests.

    Any failure (for example a captcha page) prompts the operator to
    intervene and then reloads the same ``url``.

    :param url: search result page URL
    :return: ``[inline_payload, first_async_body, second_async_body]``, each
        a raw string
    """
    scroll_js = "var q=document.documentElement.scrollTop=10000"
    while True:
        try:
            driver.get(url)
            # Block until the search box exists. The hint must be passed by
            # keyword: positionally it would land in `flag` and turn the
            # wait into a single attempt.
            find_element_exists(
                '//*[@id="alisearch-input"]',
                erro_warning="请手动跳过验证码进行下一步操作",
            )
            # Scroll like a user would so the lazy-loaded requests are sent.
            time.sleep(1)
            driver.execute_script(scroll_js)
            time.sleep(2)
            driver.execute_script(scroll_js)
            # Give the requests time to finish before reading the log.
            time.sleep(5)
            content = _extract_inline_offer_json(driver.page_source)
            async_bodies = _collect_async_offer_bodies()
            # An IndexError here means fewer than two distinct responses were
            # captured; it is handled like any other failure below.
            return [content, async_bodies[0], async_bodies[1]]
        except Exception:
            # Typically a captcha: let the operator clear it, then reload.
            # (Exception, not a bare except, so Ctrl+C still stops the script.)
            input("请手动跳过验证码之后按回车键进行下一步操作")
def login(choice):
    """
    Open the login page in the module-level ``driver``, sign in, and save the
    session cookies via ``save_cookies()``.

    :param choice: ``"1"`` (surrounding whitespace ignored) prompts for a
        user name and password and submits the login form; any other value
        waits for the operator to finish a QR-code login and press Enter
    """
    driver.get('https://login.taobao.com/?https://login.taobao.com/?redirect_url=https%3A%2F%2Flogin.1688.com%2Fmember%2Fjump.htm%3Ftarget%3Dhttps%253A%252F%252Flogin.1688.com%252Fmember%252FmarketSigninJump.htm%253FDone%253D%25252F%25252Fdetail.1688.com%25252Foffer%25252F564418418427.html&style=tao_custom&from=1688web')
    if choice.strip() == "1":
        form_xpath = '/html/body/div/div[2]/div[3]/div/div/div/div[2]/div/form'
        # Account name: type what the operator entered (the original sent
        # the literal text "user").
        user = input("请输入用户名")
        find_element_exists(form_xpath + '/div[1]/div[2]/input').send_keys(user)
        time.sleep(1)
        # Password
        password = input("请输入密码")
        find_element_exists(form_xpath + '/div[2]/div[2]/input').send_keys(password)
        # Submit the form
        find_element_exists(form_xpath + '/div[4]/button').click()
    else:
        input("请在扫码后按回车键继续")
    # Automated handling of the slider captcha was abandoned by the original
    # author (their note: it cannot be beaten; rotating IPs is the only lead).
    # Cookies are not used elsewhere yet; they are stored for later use.
    save_cookies()
def get_cookies():
    """Load ``cookies.txt`` and return its cookies as a ``{name: value}`` dict.

    The file is expected to hold a JSON list of objects that each carry a
    ``name`` and a ``value`` key; when a name repeats, the last entry wins.
    """
    with open("cookies.txt", "r") as cookie_file:
        stored = json.load(cookie_file)
    return {entry['name']: entry['value'] for entry in stored}
def _dig(container, *path, default="无"):
    """Follow ``path`` through nested containers; return ``default`` if a step is missing."""
    current = container
    for step in path:
        try:
            current = current[step]
        except (KeyError, IndexError, TypeError):
            return default
    return current


# JavaScript literal names, bound at module level before any scraping starts
# so that the eval()-based payload parsing in this file can resolve them.
true = True
false = False
null = None

choice = input("请输入登录方式:\n\t1.账号密码自动登录\n\t2.自助扫码登录\n\n")
driver = create_driver()
login(choice)

keyword = "自行车配件"
# The keyword is percent-encoded from its GB2312 bytes for the search URL.
uncode_keyword = urllib.parse.quote(keyword.encode('gb2312'))
head = ['搜索关键词','id', '价格', '标题', '加红标题', '成交额', '复购率', '搜索页面链接','商品链接', '企业名称', 'consultantscores', 'returnscore', 'quality_experiences', 'disputescores', 'expressscores', '综合服务星级']

for sort_type in ["综合搜索", "成交量搜索"]:
    # Link written to the "search page" column of every row.
    # NOTE(review): it always points at beginPage=1, also for page 2 results;
    # confirm that this is intended.
    if sort_type == "综合搜索":
        dad_url = f"https://s.1688.com/selloffer/offer_search.htm?spm=a26352.13672862.filtbar.9.27ed5860tLwp6H&keywords={uncode_keyword}&beginPage=1&sortType=normal&descendOrder=true&uniqfield=userid&n=y#sm-filtbar"
    else:
        dad_url = f"https://s.1688.com/selloffer/offer_search.htm?spm=a26352.13672862.filtbar.9.27ed5860tLwp6H&keywords={uncode_keyword}&beginPage=1&sortType=va_rmdarkgmv30&descendOrder=true&uniqfield=userid&n=y#sm-filtbar"

    # One CSV per sort order; the context manager closes it even on errors.
    with open(f"{keyword}_{sort_type}.csv", "w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(head)

        for page_num in [1, 2]:
            if sort_type == "综合搜索":
                url = f"https://s.1688.com/selloffer/offer_search.htm?keywords={uncode_keyword}&beginPage={page_num}#sm-filtbar"
            else:
                # Results ordered by transaction volume.
                url = f"https://s.1688.com/selloffer/offer_search.htm?&keywords={uncode_keyword}&beginPage={page_num}&sortType=va_rmdarkgmv30&descendOrder=true&uniqfield=userid&n=y#sm-filtbar"
            print(f"page: {page_num}")

            # Three raw payload strings: the one embedded in the page plus
            # the two lazy-loaded ones.
            payloads = get_goods_json(url)
            batch_index = 0
            while batch_index < len(payloads):
                # NOTE(security): eval() executes text received from the
                # remote site. If the payloads are strict JSON, json.loads
                # is the safe parser; confirm and switch.
                jsondata = JsonSearch(object=eval(payloads[batch_index]), mode='j')
                try:
                    offers = jsondata.search_all_value(key='offerList')[-1]
                except Exception:
                    # No offer list, typically a captcha. After the operator
                    # clears it, re-fetch the page and retry the SAME batch
                    # with the fresh data.
                    driver.refresh()
                    input("请手动通过验证码成功后按回车")
                    payloads = get_goods_json(url)
                    continue
                batch_index += 1

                # Per-field value lists, indexed in parallel with the offers.
                # NOTE(review): this assumes every key occurs exactly once
                # per offer; otherwise the columns drift apart.
                companies = jsondata.search_all_value(key='hoverName')
                prices = jsondata.search_all_value(key='price')
                titles = jsondata.search_all_value(key='simpleSubject')
                red_titles_origins = jsondata.search_all_value(key='subject')
                consultantscores = jsondata.search_all_value(key='consultationScore')
                expressscores = jsondata.search_all_value(key='logisticsScore')
                quality_experiences = jsondata.search_all_value(key='goodsScore')
                disputescores = jsondata.search_all_value(key='disputeScore')
                totalscores = jsondata.search_all_value(key='compositeNewScore')

                # Text inside the red <font> tags of each raw title, joined
                # with spaces; a single space when there is none.
                red_titles = []
                for raw_title in red_titles_origins:
                    reds = re.findall("<font color=red>(.*?)</font>", raw_title)
                    red_titles.append(" ".join(reds) if reds else ' ')

                for k in range(len(titles)):
                    # Fields that may be missing (e.g. on ad entries) fall
                    # back to "无".
                    rePurchaseRate = _dig(offers, k, 'company', "shopRepurchaseRate")
                    regCapitalUnit = _dig(offers, k, "tradeQuantity", "gmvValueFuzzify", "integer")
                    offer_id = _dig(offers, k, "id")
                    returnscore = _dig(offers, k, "tradeService", "returnScore")

                    if offer_id != "无":
                        self_url = f"https://detail.1688.com/offer/{offer_id}.html"
                    else:
                        self_url = "无"
                    # Placeholder values are normalised to "无".
                    if str(rePurchaseRate) == "0.0" or rePurchaseRate == "":
                        rePurchaseRate = "无"
                    if str(returnscore) == "-1.0":
                        returnscore = "无"
                    if regCapitalUnit == "":
                        regCapitalUnit = "无"

                    row = [keyword, offer_id, prices[k], titles[k], red_titles[k], regCapitalUnit, rePurchaseRate, dad_url, self_url, companies[k], consultantscores[k], returnscore, quality_experiences[k], disputescores[k], expressscores[k], totalscores[k]]
                    print(row)

                    # Retry the write while the file is unavailable; only
                    # I/O errors are retried so other failures surface.
                    while True:
                        try:
                            csv_writer.writerow(row)
                            break
                        except OSError:
                            print("程序运行期间不可打开特定写入表格,请及时关闭等待程序运行.")
                            time.sleep(1)