Skip to content

Commit

Permalink
使用xpath语法获取百度网站上的'百度一下'四个字
Browse files Browse the repository at this point in the history
  • Loading branch information
JLUVicent committed Sep 15, 2021
1 parent 7b6a012 commit b4b99f7
Showing 1 changed file with 50 additions and 0 deletions.
50 changes: 50 additions & 0 deletions 18_解析_获取百度网站的百度一下.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@

# (1) Fetch the page source
# (2) Parse it: etree.HTML parses the document returned in the server response
# (3) Print the result

from lxml import etree
import urllib.request

# Page to fetch; the script extracts the `value` attribute of its
# <input id="su"> element.
url = 'https://www.baidu.com'

# Present the request as coming from a desktop Chrome browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}

# Build a customised request object that carries the headers above.
request = urllib.request.Request(url=url, headers=headers)

# Alternative way to send the request: route it through a proxy pool.
# import random
# proxies_pool = [
#     {},  # each entry is one proxy mapping
#     {},
# ]
# proxies = random.choice(proxies_pool)
# handler = urllib.request.ProxyHandler(proxies=proxies)
# opener = urllib.request.build_opener(handler)
# response = opener.open(request)

# Send the request. The `with` block closes the connection even if reading or
# decoding fails, and the timeout keeps the script from hanging forever on an
# unresponsive server.
with urllib.request.urlopen(request, timeout=10) as response:
    # Read the body and decode it to text.
    content = response.read().decode('utf-8')
# print(content)

# Parse the HTML text returned by the server into an element tree.
tree = etree.HTML(content)

# xpath() returns a list. The expression can be obtained with a browser
# XPath plugin, e.g. //*[@id="su"]. Index into the list to get a single
# value instead of the list itself.
matches = tree.xpath('//input[@id="su"]/@value')
if not matches:
    # Same exception type a bare `[0]` would raise, but with a message that
    # says what was actually missing.
    raise IndexError('no <input id="su"> element with a value attribute found in ' + url)
result = matches[0]
print(result)
print(type(result))

0 comments on commit b4b99f7

Please sign in to comment.