-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
221 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Fetch the source of the Baidu home page using urllib.
import urllib.request

# (1) Define the URL, i.e. the address to visit.
url = 'http://www.baidu.com'

# (2) Send the request to the server the way a browser would.
# `response` is the server's reply; the `with` block closes the underlying
# connection as soon as the body has been read, so it is not leaked.
with urllib.request.urlopen(url) as response:
    # (3) Get the page source out of the response.
    # read() returns the body as raw bytes; decode('<encoding>') converts
    # those bytes into a str.
    content = response.read().decode('utf-8')

# (4) Print the data.
print(content)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
# Tour of the object returned by urlopen(): one type and six methods.
# Uncomment one example at a time to try it.
import urllib.request

url = 'http://www.baidu.com'

# Send the request to the server the way a browser would.
response = urllib.request.urlopen(url)

# The type: `response` is an HTTPResponse.
# print(type(response))
# <class 'http.client.HTTPResponse'>

# read() with no argument returns the whole body as bytes
# (it does not read one byte at a time).
# content = response.read()
# print(content)

# read(5) returns at most 5 bytes.
# content = response.read(5)
# print(content)

# readline() reads a single line.
# content = response.readline()
# print(content)

# readlines() reads all lines, one by one, and returns them as a list.
# content = response.readlines()
# print(content)

# getcode() returns the HTTP status code: 200 means the request went
# through; 404 or anything else means something is wrong.
# print(response.getcode())

# geturl() returns the URL that was visited.
# print(response.geturl())

# getheaders() returns the response headers as (name, value) pairs.
# print(response.getheaders())

# Summary:
# one type    -> HTTPResponse
# six methods -> read readline readlines getcode geturl getheaders

# Release the connection once the examples above are done with it.
response.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Downloading resources (a page, an image, a video) with urlretrieve.
import urllib.request

# Download a web page.
# url_page = 'http://www.baidu.com'

# urlretrieve takes `url` (where to download from) and `filename` (where to
# save the result). The arguments can be given as variables or as literal
# values, positionally or by keyword.
# urllib.request.urlretrieve(url_page, 'baidu.html')

# Download an image.
# url_img = 'https://gimg2.baidu.com/image_search/src=http%3A%2F%2Fhbimg.b0.upaiyun.com%2F7309047bf2b60563f54929c6eb5e457d03d180f12c605-Sj1plD_fw658&refer=http%3A%2F%2Fhbimg.b0.upaiyun.com&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=jpeg?sec=1634050507&t=375789bbaf42af82d1021b7e00f58a41'

# urllib.request.urlretrieve(url_img, 'taylor.jpg')

# Download a video.
# Adjacent string literals are joined by the parser, so this is the same
# single URL, just wrapped for readability.
url_video = (
    'https://vd3.bdstatic.com/mda-mia2yrww4rrhfp93/fhd/cae_h264_nowatermark/'
    '1631326151977910571/mda-mia2yrww4rrhfp93.mp4'
    '?v_from_s=hkapp-haokan-nanjing'
    '&auth_key=1631461397-0-0-dd9ae5510d8ceb9f534534bb62f3a2dc'
    '&bcevod_channel=searchbox_feed&pd=1&pt=3&abtest=3000187_1'
)

# Save the video next to the script.
urllib.request.urlretrieve(url=url_video, filename='quanhongchan.mp4')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# GET a page with a customized Request object carrying a User-Agent header.
import urllib.request

url = 'https://www.baidu.com'

# Anatomy of a URL
# https://www.baidu.com/s?tn=59044660_hao_pg&ie=utf-8&wd=周杰伦
# http/https   www.baidu.com   80/443   s      wd=周杰伦   #
# protocol     host            port     path   query       anchor
#
# Ports of some common services:
# http     80
# https    443
# mysql    3306
# oracle   1521
# redis    6379
# mongodb  27017

# Where the header value comes from: in the browser choose Inspect ->
# Network, refresh, click the request for the domain, and scroll to the
# bottom of its request headers.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}

# urlopen() has no parameter for a headers dict, so the headers cannot be
# handed to it directly; they go on a customized Request object instead.
# Note: `data` sits between `url` and `headers` in Request's parameter
# order, so the arguments must be passed by keyword.
request = urllib.request.Request(url=url, headers=headers)

# The `with` block closes the connection once the body has been read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')

print(content)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# https://www.baidu.com/s?&wd=%E5%91%A8%E6%9D%B0%E4%BC%A6
# The wd value above is the percent-encoded (UTF-8) form of 周杰伦.

# Goal: fetch the page source of https://www.baidu.com/s?&wd=周杰伦

import urllib.request
import urllib.parse

url = 'https://www.baidu.com/s?&wd='

# Customizing the Request object gets past the first anti-scraping
# measure: the User-Agent check.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}

# Non-ASCII characters cannot be sent in a URL as-is; urllib.parse.quote
# percent-encodes the UTF-8 bytes of 周杰伦.
name = urllib.parse.quote('周杰伦')

url = url + name

# Customize the Request object; arguments are passed by keyword.
request = urllib.request.Request(url=url, headers=headers)

# Send the request to the server the way a browser would. The `with`
# block closes the connection once the body has been read.
with urllib.request.urlopen(request) as response:
    # Get the content of the response.
    content = response.read().decode('utf-8')

# Print the data.
print(content)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
# When to use urlencode: when there are several query parameters.

# https://www.baidu.com/s?tn=59044660_hao_pg&ie=utf-8&wd=%E5%91%A8%E6%9D%B0%E4%BC%A6

# Minimal example:
# import urllib.parse
# data = {
#     'wd': '周杰伦',
#     'sex': '男',
#     'location':'中国台湾省'
# }

# a = urllib.parse.urlencode(data)
# print(a)

# Goal: fetch the page source of
# https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7&location=%E4%B8%AD%E5%9B%BD%E5%8F%B0%E6%B9%BE

import urllib.request
import urllib.parse

base_url = 'https://www.baidu.com/s?'

data = {
    'wd': '周杰伦',
    'sex': '男',
    'location': '中国台湾省'
}

# urlencode turns the dict into a percent-encoded "k1=v1&k2=v2" string.
new_data = urllib.parse.urlencode(data)

# Full path of the requested resource.
url = base_url + new_data

# Guard against anti-scraping checks.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}

# Customize the Request object.
request = urllib.request.Request(url=url, headers=headers)

# Send the request to the server the way a browser would. The `with`
# block closes the connection once the body has been read.
with urllib.request.urlopen(request) as response:
    # Get the page source.
    content = response.read().decode('utf-8')

print(content)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
# POST request example.
import json
import urllib.request
import urllib.parse

url = 'https://fanyi.baidu.com/sug'

# Request headers to get past anti-scraping checks.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}

# Form data to send.
data = {
    'kw': 'spider'
}

# POST parameters must be encoded: urlencode builds the form string and
# encode('utf-8') turns it into the bytes that Request expects.
data = urllib.parse.urlencode(data).encode('utf-8')
# print(data)

# POST parameters are not appended to the URL; they are passed through the
# `data` argument of the customized Request object.
request = urllib.request.Request(url=url, data=data, headers=headers)

# Send the request to the server the way a browser would. The `with`
# block closes the connection once the body has been read.
with urllib.request.urlopen(request) as response:
    # Get the response data.
    content = response.read().decode('utf-8')

# JSON string -> Python object.
obj = json.loads(content)
print(obj)

# Summary:
# - POST parameters must be urlencoded.
# - After urlencoding, encode() must be called to get bytes.
# - The parameters go into the Request constructor, not the URL.