爬虫 | 梦创空间

爬虫

1.1 爬虫的定义

爬虫是指使用代码模拟用户批量发送网络请求，批量获取数据的行为。

1.2 urllib的基本使用

# 使用urllib来获取百度首页的源码
import urllib.request
# 1.定义一个url
url = "http://www.baidu.com"
# 模拟浏览器向服务器发送请求,response响应
response = urllib.request.urlopen(url)
# 获取响应的源码，content内容 read()返回二进制的字节数据，要将二进制数据转化成字符，这个过程叫解码
content = response.read().decode('utf-8')
#打印数据
print(content)

1.3 urllib 一个类型6个方法

coding: 'utf8'
import urllib.request

url = 'http://www.baidu.com'
# 模拟浏览器向服务器发送请求

response = urllib.request.urlopen(url)
# 一个类型和6个方法 response是HTTPResponse的类型
print(type(response))
# 按照字节形式去读
# content=response.read()

# 如果read()方法里面有数字，代表返回多少个字节
# content=response.read(8)
# print(content)

# content = response.readline()  # 按行读取,只能读一行，速度快
# print(content)

# content = response.readlines()  #按行读取
#print(content)

# 返回状态码 200
print(response.getcode())

# 返回url地址
print(response.geturl())

# 请求头信息
print(response.getheaders())

# 一个类型 HTTPresponse
# 6个方法 read()  reline() relines() getcode() geturl() getheaders()

1.4 请求对象定制

import urllib.request
url = 'https://www.baidu.com'

# url 组成部分
# https://www.baidu.com/s?wd=周杰伦
# http https   www.baidu.com  http:80  https:443   s         wd=周杰伦
# 协议          域名                 端口号             路径      参数       锚点
# 端口号  http:80  https:443
#查看更多内容
#简介：useragent一般指用户代理。 用户代理（User Agent，简称 UA），是一个特殊字符串头，
# 使得服务器能够识别客户使用的操作系统及版本、CPU 类型、浏览器及版本、浏览器渲染引擎、浏览器语言、浏览器插件等。

headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0'
}

# 因为URLopen不能存放字典 所以headers不能传递进去，需要请求对象定制,可以存放request对象
# 请求对象的定制
# 注意 因为参数顺序问题 不能直接学URL 和headers 需要用变量来接受收
request=urllib.request.Request(url=url,headers=headers)

response=urllib.request.urlopen(request)
content=response.read().decode('utf-8')
print(content)

扩展

1.5 urllib_get请求方式 quote方法

import urllib.request
import urllib.parse
# https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6
url='https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6'
# 请求对象的定制，是为了解决反爬虫的第一种手段
headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0'
}
# 将周杰伦变成unicode的格式
# 我们需要依赖与urllib.parse
name=urllib.parse.quote('周杰伦')
url = url + name
requst=urllib.request.Request(url=url,headers=headers)

# 模拟浏览器向服务器发送请求
response=urllib.request.urlopen(requst)
# 获取响应的内容
content=response.read().decode('utf-8')
# 打印输出
print(content)

1.6urllib_get请求方式 urlencode方法

# urlencode的应用场景
import urllib.request
import urllib.parse
base_url = 'https://www.baidu.com/s?'
data = {
    'wd': '周杰伦',
    'sex': '男'
}
new_data = urllib.parse.urlencode(data)
# 请求资源路径
data = base_url + new_data
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0'
}
# 请求对象定制
request = urllib.request.Request(url=data, headers=headers)
# 向浏览器发送请求
response = urllib.request.urlopen(request)
# 获取响应内容
content = response.read().decode('utf-8')
print(content)

1.7post请求方式百度方式

import urllib.request
import urllib.parse
# post请求
url = 'https://fanyi.baidu.com/sug'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0'
}
data = {
    'kw': 'spider'
}
# post请求的方式必须就行编码
data = urllib.parse.urlencode(data).encode('utf-8')
# post的请求方式是不会拼接到URL后面的
# 请求对象定制
request = urllib.request.Request(url=url, data=data, headers=headers)
# 向浏览器发送请求
response = urllib.request.urlopen(request)
# 获取响应数据
content = response.read().decode('utf-8')
#字符串变成json对象
import json
obj=json.loads(content)
print(obj)
# post请求方式必须编码 data=urllib.parse.urlencode(data).encode('utf-8')
# 编码之后必须调用encode方法encode('utf-8')
# 请求参数是放在请求定制中的，request=urllib.request.Request(url=url,data=data,headers=headers)

1.9 ajax get请求豆瓣电影的第一页数据

# get 请求 获取豆瓣电影第一页的数据 并且保存起来
import urllib.request
import urllib.parse
url='https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0'
}
# 请求对象的定制
request = urllib.request.Request(url, headers=headers)
#向浏览器请求数据
response = urllib.request.urlopen(request)
#获取响应数据
content = response.read().decode('utf-8')

# print(content)
# 将数据下载到本地,open方法默认使用gbk编码，文件中有中文使用decoding更改编码为utf-8
fp = open('douban.json','w',encoding='utf-8')
fp.write(content)


#with open('douban1.json','w',encoding='utf-8') as fp:
#    fp.write(content)

2.0 urllib异常处理

import urllib.error
import urllib.request

url = 'https://blog.csdn.net/nav/lang111'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0'
}
try:
    request = urllib.request.Request(url=url,headers=headers)
    response=urllib.request.urlopen(request)
    content=response.read().decode('utf-8')
    print(content)
except urllib.error.HTTPError:
    print("系统在升级")

2.1 handler处理器

import urllib.request

url = 'http://www.baidu.com'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0'
}
reqeust = urllib.request.Request(url=url,headers=headers)
# handler  build_opener   open
# 获取handler对象
handler = urllib.request.HTTPHandler()
# 获取opener对象
opener = urllib.request.build_opener(handler)
#调用open方法
response = opener.open(reqeust)
content=response.read().decode('utf-8')
print(content)

2.2 代理

import  urllib.request

url='http://www.baidu.com/'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0'
}
#请求对象定制
request= urllib.request.Request(url=url,headers=headers)
proxies={
    'http':'183.64.239.19:8060'
}
# handler  build_opener    open
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response =opener.open(request)

#向浏览器发送请求
#response = urllib.request.urlopen(request)
#获取响应数据
context=response.read().decode('utf-8')
print(context)

2.3 代理池

import urllib.request
import random
proxies_pool =[
    {'http':'183.64.239.19:8060'},
    {'http':'222.66.202.6:80'},
    {'http':'222.66.202.6:80'}
]
posixs= random.choice(proxies_pool)
url='http://www.baidu.com/'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0'
}
request= urllib.request.Request(url=url,headers=headers)
handler= urllib.request.ProxyHandler(proxies=posixs)
opener=urllib.request.build_opener(handler)
response=opener.open(request)
context=response.read().decode('utf-8')
print(context)

爬虫

1.1 爬虫的定义

1.2 urllib的基本使用

1.3 urllib 一个类型6个方法

1.4 请求对象定制

1.5 urllib_get请求方式 quote方法

1.6urllib_get请求方式 urlencode方法

1.7post请求方式百度方式

1.9 ajax get请求 豆瓣电影的第一页数据

2.0 urllib异常处理

2.1 handler处理器

2.2 代理

2.3 代理池

1.9 ajax get请求豆瓣电影的第一页数据