按照序号顺序修改了文件名

JLUVicent · Sep 13, 2021 · 86fc4d9 · 86fc4d9
1 parent b29b19c
commit 86fc4d9
Show file tree

Hide file tree

Showing 7 changed files with 221 additions and 0 deletions.
diff --git a/1_urllib的基本使用.py b/1_urllib的基本使用.py
@@ -0,0 +1,18 @@
+# Fetch the source of the Baidu home page with urllib.
+
+# (1) Define the URL, i.e. the address to request.
+import urllib.request
+url = 'http://www.baidu.com'
+
+# (2) Send the request to the server; `response` is the server's reply.
+response = urllib.request.urlopen(url)
+
+# (3) Extract the page source from the response.
+# read() returns the body as raw bytes,
+# so the bytes have to be converted into a string:
+# bytes -> str is decoding, i.e. decode('<name of the encoding>').
+content = response.read().decode('utf-8')
+
+
+# (4) Print the decoded page source.
+print(content)
diff --git a/2_urllib_1个类型和6个方法.py b/2_urllib_1个类型和6个方法.py
@@ -0,0 +1,39 @@
+import urllib.request
+
+url = 'http://www.baidu.com'
+
+# Send the request to the server.
+response = urllib.request.urlopen(url)
+
+# One type and six methods.
+# `response` is an HTTPResponse instance:
+# print(type(response))
+# <class 'http.client.HTTPResponse'>
+
+# read() with no argument reads the whole response body and returns it as bytes.
+# content = response.read()
+# print(content)
+
+# read(n) reads at most n bytes (here 5).
+# content = response.read(5)
+# print(content)
+
+# readline() reads a single line.
+# content = response.readline()
+# print(content)
+
+# readlines() reads all remaining lines and returns them as a list.
+# content = response.readlines()
+# print(content)
+
+# getcode() returns the HTTP status code: 200 means success; 404 or another code signals a problem.
+# print(response.getcode())
+
+# geturl() returns the URL that was requested.
+# print(response.geturl())
+
+# getheaders() returns the response headers (not the status).
+# print(response.getheaders())
+
+# One type:    HTTPResponse
+# Six methods: read  readline  readlines  getcode  geturl  getheaders
diff --git a/3_urllib_下载.py b/3_urllib_下载.py
@@ -0,0 +1,18 @@
+import urllib.request
+
+# Download a web page.
+# url_page = 'http://www.baidu.com'
+
+# urlretrieve(url, filename): `url` is the address to download, `filename` the local file to write.
+# The arguments can be passed by keyword (url=..., filename=...) or positionally, as below.
+# The two styles may be mixed, but positional arguments must come before keyword arguments.
+# urllib.request.urlretrieve(url_page, 'baidu.html')
+
+# Download an image.
+# url_img = 'https://gimg2.baidu.com/image_search/src=http%3A%2F%2Fhbimg.b0.upaiyun.com%2F7309047bf2b60563f54929c6eb5e457d03d180f12c605-Sj1plD_fw658&refer=http%3A%2F%2Fhbimg.b0.upaiyun.com&app=2002&size=f9999,10000&q=a80&n=0&g=0n&fmt=jpeg?sec=1634050507&t=375789bbaf42af82d1021b7e00f58a41'
+
+# urllib.request.urlretrieve(url_img, 'taylor.jpg')
+# Download a video. NOTE(review): the URL carries an auth_key parameter, so it presumably expires — confirm.
+url_video = 'https://vd3.bdstatic.com/mda-mia2yrww4rrhfp93/fhd/cae_h264_nowatermark/1631326151977910571/mda-mia2yrww4rrhfp93.mp4?v_from_s=hkapp-haokan-nanjing&auth_key=1631461397-0-0-dd9ae5510d8ceb9f534534bb62f3a2dc&bcevod_channel=searchbox_feed&pd=1&pt=3&abtest=3000187_1'
+
+urllib.request.urlretrieve(url_video, 'quanhongchan.mp4')
diff --git a/4_urllib_请求对象的定制.py b/4_urllib_请求对象的定制.py
@@ -0,0 +1,28 @@
+import urllib.request
+
+url = 'https://www.baidu.com'
+
+# Anatomy of a URL, using this example:
+# https://www.baidu.com/s?tn=59044660_hao_pg&ie=utf-8&wd=周杰伦
+# http/https     www.baidu.com   80/443      s         wd=周杰伦   #
+# scheme         host            port        path      query       fragment (anchor)
+# Default ports, for reference: http 80
+# https 443
+# mysql 3306
+# oracle 1521
+# redis 6379
+# mongodb 27017
+# The header value can be copied from the browser: Inspect -> Network -> reload -> pick the request for the domain -> scroll to the bottom.
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
+}
+
+# urlopen() has no headers parameter, so the dict cannot be handed to it directly;
+# instead, build a customised Request object that carries the headers.
+# Note: Request's parameters are (url, data, headers, ...), so headers must be passed by keyword or it would land in `data`.
+request = urllib.request.Request(url=url, headers=headers)
+response = urllib.request.urlopen(request)
+
+content = response.read().decode('utf-8')
+
+print(content)
diff --git a/5_urllib_get请求的quote方法.py b/5_urllib_get请求的quote方法.py
@@ -0,0 +1,32 @@
+# https://www.baidu.com/s?&wd=%E5%91%A8%E6%9D%B0%E4%BC%A6
+# Non-ASCII characters in a URL are sent percent-encoded, as in the line above.
+
+# Goal: fetch the page source of https://www.baidu.com/s?&wd=周杰伦
+
+import urllib.request
+import urllib.parse
+
+url = 'https://www.baidu.com/s?&wd='
+
+# A customised Request with a browser User-Agent gets past the most basic anti-scraping check (UA inspection).
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
+}
+
+# Percent-encode the search term 周杰伦 (quote() encodes it as UTF-8 by default, giving the %E5%91%A8... form shown at the top).
+# This relies on urllib.parse.
+name = urllib.parse.quote('周杰伦')
+
+url = url+name
+
+# Build the customised Request; arguments are passed by keyword.
+request = urllib.request.Request(url=url, headers=headers)
+
+# Send the request to the server.
+response = urllib.request.urlopen(request)
+
+# Read the response body and decode it to a string.
+content = response.read().decode('utf-8')
+
+# Print the page source.
+print(content)
diff --git a/6_urllib_get请求的urlencode方法.py b/6_urllib_get请求的urlencode方法.py
@@ -0,0 +1,47 @@
+# urlencode() use case: building a query string from several parameters at once.
+
+# https://www.baidu.com/s?tn=59044660_hao_pg&ie=utf-8&wd=%E5%91%A8%E6%9D%B0%E4%BC%A6
+
+# import urllib.parse
+# data = {
+#     'wd': '周杰伦',
+#     'sex': '男',
+#     'location':'中国台湾省'
+# }
+
+# a = urllib.parse.urlencode(data)
+# print(a)
+
+# Goal: fetch the page source of https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7&location=%E4%B8%AD%E5%9B%BD%E5%8F%B0%E6%B9%BE
+
+import urllib.request
+import urllib.parse
+
+base_url = 'https://www.baidu.com/s?'
+
+data = {
+    'wd': '周杰伦',
+    'sex': '男',
+    'location': '中国台湾省'
+}
+
+new_data = urllib.parse.urlencode(data)
+
+# Full request URL: base URL followed by the encoded query string.
+url = base_url+new_data
+
+# Browser User-Agent, so the request is not rejected by UA-based anti-scraping checks.
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
+}
+
+# Build the customised Request object.
+request = urllib.request.Request(url=url, headers=headers)
+
+# Send the request to the server.
+response = urllib.request.urlopen(request)
+
+# Read the page source and decode it to a string.
+content = response.read().decode('utf-8')
+
+print(content)
diff --git a/7_urllib_post请求百度翻译.py b/7_urllib_post请求百度翻译.py
@@ -0,0 +1,39 @@
+# POST request example.
+import json
+import urllib.request
+import urllib.parse
+
+url = 'https://fanyi.baidu.com/sug'
+
+# Browser User-Agent header, to get past UA-based anti-scraping checks.
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
+}
+
+# Form data to send.
+data = {
+    'kw': 'spider'
+}
+
+# POST parameters must be encoded: urlencode() builds the form string, encode('utf-8') turns it into bytes.
+data = urllib.parse.urlencode(data).encode('utf-8')
+# print(data)
+
+# POST parameters are not appended to the URL; they are passed as `data` when building the Request.
+# `data` must already be encoded to bytes at this point.
+request = urllib.request.Request(url=url, data=data, headers=headers)
+
+# Send the request to the server.
+response = urllib.request.urlopen(request)
+
+# Read the response body and decode it to a string.
+content = response.read().decode('utf-8')
+
+# JSON string -> Python object.
+
+obj = json.loads(content)
+print(obj)
+
+# Summary: POST parameters must be URL-encoded,
+# the encoded string must then be converted to bytes with encode(),
+# and the bytes are passed via the `data` argument of Request.