Skip to content

Commit

Permalink
按照序号顺序修改了文件名
Browse files Browse the repository at this point in the history
  • Loading branch information
JLUVicent committed Sep 13, 2021
1 parent b29b19c commit 86fc4d9
Show file tree
Hide file tree
Showing 7 changed files with 221 additions and 0 deletions.
18 changes: 18 additions & 0 deletions 1_urllib的基本使用.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Fetch the Baidu homepage source code with urllib.

# (1) Define the URL of the page to visit.
import urllib.request
url = 'http://www.baidu.com'

# (2) Simulate a browser sending the request to the server.
# Using the response as a context manager guarantees the HTTP
# connection is closed afterwards (the original left it open).
with urllib.request.urlopen(url) as response:
    # (3) read() returns the body as raw bytes (binary data);
    # decode('utf-8') converts those bytes into a str.
    content = response.read().decode('utf-8')


# (4) Print the page source.
print(content)
39 changes: 39 additions & 0 deletions 2_urllib_1个类型和6个方法.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import urllib.request

url = 'http://www.baidu.com'

# Simulate a browser sending a request to the server.
response = urllib.request.urlopen(url)

# --- One type ---
# response is an http.client.HTTPResponse instance:
# print(type(response))
# <class 'http.client.HTTPResponse'>

# --- Six methods ---
# read() with no argument returns the whole body as bytes:
# content = response.read()
# print(content)

# read(n) returns at most n bytes:
# content = response.read(5)
# print(content)

# readline() reads a single line:
# content = response.readline()
# print(content)

# readlines() reads all remaining lines into a list:
# content = response.readlines()
# print(content)

# getcode() returns the HTTP status code: 200 means the request
# succeeded; 404 or other codes indicate a problem:
# print(response.getcode())

# geturl() returns the URL that was actually fetched:
# print(response.geturl())

# getheaders() returns the response headers as (name, value) pairs:
# print(response.getheaders())

# Close the response so the underlying socket is released
# (the original script never closed it).
response.close()

# One type:    HTTPResponse
# Six methods: read readline readlines getcode geturl getheaders
18 changes: 18 additions & 0 deletions 3_urllib_下载.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import urllib.request

# urllib.request.urlretrieve(url, filename) downloads the resource at
# `url` straight into a local file called `filename`.  Either literal
# values or variables may be passed, as long as usage is consistent.
# NOTE(review): urlretrieve belongs to urllib's legacy interface and
# may be deprecated in a future Python release.

# Download a web page:
# page_url = 'http://www.baidu.com'
# urllib.request.urlretrieve(page_url, 'baidu.html')

# Download an image (long signed CDN URL):
# img_url = 'https://gimg2.baidu.com/image_search/src=http%3A%2F%2Fhbimg.b0.upaiyun.com%2F7309047bf2b60563f54929c6eb5e457d03d180f12c605-Sj1plD_fw658&refer=http%3A%2F%2Fhbimg.b0.upaiyun.com&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=jpeg?sec=1634050507&t=375789bbaf42af82d1021b7e00f58a41'
# urllib.request.urlretrieve(img_url, 'taylor.jpg')

# Download a video.  NOTE(review): this signed URL carries an auth_key
# with a timestamp and is presumably time-limited — it may have expired.
video_url = 'https://vd3.bdstatic.com/mda-mia2yrww4rrhfp93/fhd/cae_h264_nowatermark/1631326151977910571/mda-mia2yrww4rrhfp93.mp4?v_from_s=hkapp-haokan-nanjing&auth_key=1631461397-0-0-dd9ae5510d8ceb9f534534bb62f3a2dc&bcevod_channel=searchbox_feed&pd=1&pt=3&abtest=3000187_1'

urllib.request.urlretrieve(video_url, 'quanhongchan.mp4')
28 changes: 28 additions & 0 deletions 4_urllib_请求对象的定制.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import urllib.request

url = 'https://www.baidu.com'

# Anatomy of a URL, e.g.
# https://www.baidu.com/s?tn=59044660_hao_pg&ie=utf-8&wd=周杰伦
#   protocol: http/https   host: www.baidu.com   port: 80/443
#   path: s   query parameters: wd=周杰伦   fragment: #anchor
# Common default ports: http 80, https 443, mysql 3306, oracle 1521,
# redis 6379, mongodb 27017.

# User-Agent copied from the browser (DevTools -> Network -> request
# headers, bottom of the panel); without it Baidu serves a stripped
# anti-bot page instead of the real homepage.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}

# urlopen() cannot accept a headers dict directly, so build a Request
# object instead.  Request's positional order is (url, data, headers),
# hence the keyword arguments to skip `data`.
request = urllib.request.Request(url=url, headers=headers)

# Send the request; the context manager closes the connection
# (the original never closed it).
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')

print(content)
32 changes: 32 additions & 0 deletions 5_urllib_get请求的quote方法.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Goal: fetch the page source of https://www.baidu.com/s?&wd=周杰伦
# which the browser shows percent-encoded (Unicode -> UTF-8 bytes) as
# https://www.baidu.com/s?&wd=%E5%91%A8%E6%9D%B0%E4%BC%A6

import urllib.request
import urllib.parse

url = 'https://www.baidu.com/s?&wd='

# A realistic User-Agent is the first defence against UA-based
# anti-scraping checks.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}

# urllib.parse.quote() percent-encodes the non-ASCII keyword
# ('周杰伦' = Jay Chou) so it is a valid URL component.
name = urllib.parse.quote('周杰伦')

url = url + name

# Build the request with the custom headers (urlopen alone cannot
# take a headers dict).
request = urllib.request.Request(url=url, headers=headers)

# Send the request; the context manager closes the connection
# (the original never closed it).
with urllib.request.urlopen(request) as response:
    # Decode the page source from UTF-8 bytes to str.
    content = response.read().decode('utf-8')

# Print the page source.
print(content)
47 changes: 47 additions & 0 deletions 6_urllib_get请求的urlencode方法.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# urlencode is the tool for query strings with MULTIPLE parameters.
#
# Standalone example of what it produces:
# import urllib.parse
# data = {
#     'wd': '周杰伦',
#     'sex': '男',
#     'location': '中国台湾省'
# }
# a = urllib.parse.urlencode(data)
# print(a)
#
# Goal: fetch the page source of
# https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7&location=%E4%B8%AD%E5%9B%BD%E5%8F%B0%E6%B9%BE

import urllib.request
import urllib.parse

base_url = 'https://www.baidu.com/s?'

# Query parameters; urlencode() percent-encodes each key and value
# and joins the pairs with '&'.
data = {
    'wd': '周杰伦',
    'sex': '男',
    'location': '中国台湾省'
}

new_data = urllib.parse.urlencode(data)

# Full resource path.
url = base_url + new_data

# User-Agent header defeats the simplest anti-scraping check.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}

# Build the request with the custom headers.
request = urllib.request.Request(url=url, headers=headers)

# Send the request; the context manager closes the connection
# (the original never closed it).
with urllib.request.urlopen(request) as response:
    # Decode the page source from UTF-8 bytes to str.
    content = response.read().decode('utf-8')

print(content)
39 changes: 39 additions & 0 deletions 7_urllib_post请求百度翻译.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# POST request to Baidu Translate's suggestion endpoint.
import json
import urllib.request
import urllib.parse

url = 'https://fanyi.baidu.com/sug'

# User-Agent header to get past the basic anti-scraping check.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
}

# Form data: ask for suggestions for the keyword 'spider'.
data = {
    'kw': 'spider'
}

# POST parameters must be urlencoded AND then encoded to bytes —
# Request requires `data` to be bytes, not str.
data = urllib.parse.urlencode(data).encode('utf-8')
# print(data)

# Unlike GET, POST parameters are NOT appended to the URL; they are
# passed through the Request's `data` argument.
request = urllib.request.Request(url=url, data=data, headers=headers)

# Send the request; the context manager closes the connection
# (the original leaked it).
with urllib.request.urlopen(request) as response:
    # Decode the response body from UTF-8 bytes to str.
    content = response.read().decode('utf-8')

# The body is a JSON string; parse it into a Python object.
obj = json.loads(content)
print(obj)

# Recap for POST requests:
# - parameters must be urlencoded, then .encode()d to bytes
# - they go into the Request object's `data` argument, not the URL

0 comments on commit 86fc4d9

Please sign in to comment.