Web Scraping Modules

Simple Examples

  • The requests module
import requests

url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1602605452394&di=ec07c6bdbd32e07fb5b1d1f90028e35f&imgtype=0&src=http%3A%2F%2Fa3.att.hudong.com%2F14%2F75%2F01300000164186121366756803686.jpg'

req = requests.get(url)

# Write the binary image data to a local file
with open('code.png', 'wb') as fn:
    fn.write(req.content)
  • urllib.request (urlretrieve)
from urllib import request

url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1602605452394&di=ec07c6bdbd32e07fb5b1d1f90028e35f&imgtype=0&src=http%3A%2F%2Fa3.att.hudong.com%2F14%2F75%2F01300000164186121366756803686.jpg'

# Download the image straight to a local file
request.urlretrieve(url, 'code2.png')

The urllib.request module

  • Version
    • Python 2's urllib and urllib2 were merged into urllib.request in Python 3
  • Common methods
    • urllib.request.urlopen('URL') sends a request to the site
    • response.read() returns the body as raw bytes
    • response.read().decode('utf-8') returns the body as a string
    • urllib.request.Request(url, headers=...) builds a request object; plain urlopen cannot set a custom User-Agent
  • Scraping the Baidu homepage source
import urllib.request

url = 'https://www.baidu.com/'

res = urllib.request.urlopen(url)

# read() returns the body as raw bytes
print(res.read())

# Output:
b'<html>\r\n<head>\r\n\t<script>\r\n\t\tlocation.replace(location.href.replace("https://","http://"));\r\n\t</script>\r\n</head>\r\n<body>\r\n\t<noscript><meta http-equiv="refresh" content="0;url=http://www.baidu.com/"></noscript>\r\n</body>\r\n</html>'

  • After decoding
import urllib.request

url = 'https://www.baidu.com/'

res = urllib.request.urlopen(url)

# Decode the bytes into a UTF-8 string
html = res.read().decode('utf-8')
print(html)

<html>
<head>
<script>
location.replace(location.href.replace("https://","http://"));
</script>
</head>
<body>
<noscript><meta http-equiv="refresh" content="0;url=http://www.baidu.com/"></noscript>
</body>
</html>

The returned source is almost empty: Baidu identifies the default urllib User-Agent as a crawler and serves a redirect stub instead. Sending a browser-like request header solves this.

  • A urllib request with custom headers must be built with Request
import urllib.request

url = 'https://www.baidu.com/'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}
# Build the request object with custom headers
req = urllib.request.Request(url, headers=headers)
# Get the response object
res = urllib.request.urlopen(req)
# Read and decode the body
html = res.read().decode('utf-8')
print(html)
print(res.getcode())  # status code: 200
print(res.geturl())   # the requested URL: https://www.baidu.com/

# The page source itself is omitted here

The urllib.parse module

  • urllib.parse encodes and decodes URL parameters (a quote/unquote sketch follows the example below)
import urllib.parse

url = 'https://tieba.baidu.com/f?ie=utf-8&kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B'  # the 海贼王 (One Piece) forum

te = {'kw': '海贼王'}
result = urllib.parse.urlencode(te)
print(result)

# kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B : identical to the kw parameter in the URL above
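urllib.parse also works in the decoding direction. A minimal sketch of quote and unquote, the single-value counterparts of urlencode (the sample string is just illustrative):

import urllib.parse

encoded = urllib.parse.quote('海贼王')   # percent-encode a single value
print(encoded)                           # %E6%B5%B7%E8%B4%BC%E7%8E%8B
print(urllib.parse.unquote(encoded))     # decode it back to 海贼王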
  • Searching Baidu via urlencode
import urllib.request
import urllib.parse

baseurl = 'https://www.baidu.com/s?'
key = input('Enter a search term: ')
wd = {'wd': key}  # Baidu's search parameter is wd
k = urllib.parse.urlencode(wd)
# Join the base URL and the query string
url = baseurl + k
# print(url)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
    'Cookie': 'BIDUPSID=CB45392F574DD4111E414FAAA03C8005; PSTM=1601735228; BAIDUID=CB45392F574DD411C9795EF65AFD7189:FG=1; BD_UPN=12314753; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=7506_32617_1449_32793_7543_31253_32705_32230_7517_7605_32115_32718_32761_26350; H_PS_645EC=8271oXSRAgOc%2F9442KfEEl5n8qjo%2B5pXa1%2FQcI18t7x0HwcrZZ7WawjN6Jg; COOKIE_SESSION=732_0_5_2_2_1_0_0_5_1_1_0_4562_0_0_0_1602512063_0_1602595372%7C6%230_0_1602597291%7C1%7C1; delPer=0; BD_CK_SAM=1; PSINO=3; BDSVRTM=1022'
}
# Build the request object
req = urllib.request.Request(url, headers=headers)
# Get the response object
res = urllib.request.urlopen(req)
# Read the body
html = res.read().decode('utf-8')
# Write it to a file
with open('file.html', 'w', encoding='utf-8') as f:
    f.write(html)
  • Searching Baidu via quote
import urllib.request
import urllib.parse
baseurl = 'https://www.baidu.com/s?wd='  # unlike the dict-based urlencode, quote needs wd= in the base URL and only encodes a single string
key = input('Enter a search term: ')
k = urllib.parse.quote(key)
url = baseurl + k
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
    'Cookie': 'BIDUPSID=CB45392F574DD4111E414FAAA03C8005; PSTM=1601735228; BAIDUID=CB45392F574DD411C9795EF65AFD7189:FG=1; BD_UPN=12314753; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=7506_32617_1449_32793_7543_31253_32705_32230_7517_7605_32115_32718_32761_26350; H_PS_645EC=8271oXSRAgOc%2F9442KfEEl5n8qjo%2B5pXa1%2FQcI18t7x0HwcrZZ7WawjN6Jg; COOKIE_SESSION=732_0_5_2_2_1_0_0_5_1_1_0_4562_0_0_0_1602512063_0_1602595372%7C6%230_0_1602597291%7C1%7C1; delPer=0; BD_CK_SAM=1; PSINO=3; BDSVRTM=1022'
}
# Build the request object
req = urllib.request.Request(url, headers=headers)
# Get the response object
res = urllib.request.urlopen(req)
# Read the body
html = res.read().decode('utf-8')
# Write it to a file
with open('file2.html', 'w', encoding='utf-8') as f:
    f.write(html)

Scraping Tieba Pages

  • urllib
import urllib.request
import urllib.parse
# Main flow
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}
# Forum to scrape
name = input('Enter a forum name: ')
# Page range to scrape
begin = int(input('Enter the first page: '))
end = int(input('Enter the last page: '))

kw = {'kw': name}
kw = urllib.parse.urlencode(kw)
# Build each URL and fetch the response
for i in range(begin, end + 1):
    pn = (i - 1) * 50  # each page is offset by 50 posts
    # e.g. https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&ie=utf-8&pn=0
    baseurl = 'https://tieba.baidu.com/f?'
    url = baseurl + kw + '&pn=' + str(pn)  # note the '=' after pn
    # print(url)
    # Send the request
    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    # Write the page to a file
    file = 'page_' + str(i) + '.html'
    with open(file, 'w', encoding='utf-8') as f:
        f.write(html)
  • Refactored into functions
import urllib.request
import urllib.parse

# Read one page
def readpage(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
    }
    # Send the request
    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    return html

# Write one page to a file
def writepage(file, html):
    with open(file, 'w', encoding='utf-8') as f:
        f.write(html)

# Main function
def main():
    name = input('Enter a forum name: ')
    begin = int(input('Enter the first page: '))
    end = int(input('Enter the last page: '))

    kw = {'kw': name}
    kw = urllib.parse.urlencode(kw)

    for i in range(begin, end + 1):
        pn = (i - 1) * 50
        # e.g. https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&ie=utf-8&pn=0
        baseurl = 'https://tieba.baidu.com/f?'

        url = baseurl + kw + '&pn=' + str(pn)

        # Call the helpers
        html = readpage(url)
        file = 'page_' + str(i) + '.html'
        writepage(file, html)


if __name__ == '__main__':
    main()
  • An object-oriented version using __init__
import urllib.request
import urllib.parse

class BaiduSpider:
    # Put the constant, reusable pieces in __init__
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
        }
        self.baseurl = 'https://tieba.baidu.com/f?'

    def readPage(self, url):
        req = urllib.request.Request(url, headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode('utf-8')
        return html

    def writePage(self, file, html):
        with open(file, 'w', encoding='utf-8') as f:
            f.write(html)
        print('File written')

    def main(self):
        name = input('Enter a forum name: ')
        begin = int(input('Enter the first page: '))
        end = int(input('Enter the last page: '))

        kw = {'kw': name}
        kw = urllib.parse.urlencode(kw)

        for i in range(begin, end + 1):
            pn = (i - 1) * 50
            # e.g. https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&ie=utf-8&pn=0
            url = self.baseurl + kw + '&pn=' + str(pn)  # baseurl already lives in __init__

            html = self.readPage(url)
            file = 'page_' + str(i) + '.html'
            self.writePage(file, html)

if __name__ == '__main__':
    # Instantiate the class, then call its main method
    spider = BaiduSpider()
    spider.main()

Scraping Youdao Translate

  • The request must be sent with POST
import urllib.request
import urllib.parse
import json  # the json module converts JSON text into a dict

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
key = input('Enter the text to translate: ')

# The form data captured from the browser's network panel
data = {
    'i': key,
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '16025076981759',
    'sign': 'b4f4f6b09fa355131e184201607498b1',
    'lts': '1602507698175',
    'bv': '4abf2733c66fbf953861095a23a839a8',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_REALTlME'
}
# POST data must be bytes; urlencode().encode() already produces them
data = urllib.parse.urlencode(data).encode('utf-8')

req = urllib.request.Request(url, data=data, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode('utf-8')
# print(html, type(html))
r_dict = json.loads(html)  # JSON -> dict
print(r_dict, type(r_dict))
print(r_dict['translateResult'][0][0]['tgt'])

# Enter the text to translate: 你好
# {'type': 'ZH_CN2EN', 'errorCode': 0, 'elapsedTime': 0, 'translateResult': [[{'src': '你好', 'tgt': 'hello'}]]} <class 'dict'>
# hello

The requests module

import requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}
# Send the request
response = requests.get('https://qq.yh31.com/zjbq/2920180.html', headers=headers)

# Inspect the response object
# print(response.text)     # str; may come out garbled until the right encoding is set
# print(response.content)  # raw bytes
# Option 1 (recommended): decode the bytes yourself
print(response.content.decode('utf-8'))
# print(type(response.content))
# Option 2: set the encoding first, then use .text
# response.encoding = 'utf-8'
# print(response.text)
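A third option, if the page's charset is unknown, is to let requests guess it from the body itself; a small sketch using the library's apparent_encoding attribute:

import requests

response = requests.get('https://qq.yh31.com/zjbq/2920180.html')
# apparent_encoding is requests' best guess at the charset, detected from the content
response.encoding = response.apparent_encoding
print(response.text)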
  • Youdao Translate with requests
import requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}

url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
key = input('Enter the text to translate: ')

# The form data captured from the browser's network panel
data = {
    'i': key,
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '16025076981759',
    'sign': 'b4f4f6b09fa355131e184201607498b1',
    'lts': '1602507698175',
    'bv': '4abf2733c66fbf953861095a23a839a8',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_REALTlME'
}
# requests handles the form encoding itself
res = requests.post(url, data=data, headers=headers)
res.encoding = 'utf-8'
html = res.text
print(html)
  • Using a proxy
import requests
# Configure a proxy: scheme -> proxy address
proxy = {
    'http': 'http://175.42.158.172:9999'  # 175.42.158.172 is the IP, 9999 the port
}
url = 'http://httpbin.org/ip'
res = requests.get(url, proxies=proxy)  # the keyword argument is proxies, not proxy
print(res.text)

# This is a free public proxy, so the request may well fail; a hardened sketch follows
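Since free proxies are unreliable, it helps to add a timeout and catch connection errors rather than let the script hang or crash. A minimal sketch, with the same placeholder proxy address:

import requests

proxy = {'http': 'http://175.42.158.172:9999'}  # placeholder proxy address
url = 'http://httpbin.org/ip'

try:
    # Fail fast instead of hanging on a dead proxy
    res = requests.get(url, proxies=proxy, timeout=5)
    print(res.text)
except requests.exceptions.RequestException as e:
    # Covers timeouts, refused connections, bad proxies, etc.
    print('Request failed:', e)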
  • Handling untrusted SSL certificates
import requests

url = 'https://inv-veri.chinatax.gov.cn/'

res = requests.get(url, verify=False)  # verify=False skips certificate verification; otherwise an SSLError is raised
print(res.text)
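With verify=False, requests still emits an InsecureRequestWarning on every call; urllib3 (installed as a requests dependency) can silence it if the noise matters. A small sketch:

import urllib3
import requests

# Suppress the InsecureRequestWarning that verify=False triggers
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

url = 'https://inv-veri.chinatax.gov.cn/'
res = requests.get(url, verify=False)
print(res.status_code)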
  • Using cookies

    • Identify a user through information recorded on the client
    • Part of many anti-crawling schemes
    • Enable simulated logins (a sketch follows this list)
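A minimal sketch of reusing a browser cookie to reach a logged-in page; the URL and cookie value are placeholders to fill in from a real logged-in session:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
    # Placeholder: paste the Cookie header copied from a logged-in browser session
    'Cookie': 'sessionid=xxxx; token=yyyy'
}
res = requests.get('https://example.com/profile', headers=headers)  # placeholder URL
print(res.status_code)  # 200 if the cookie still identifies the logged-in user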
  • Why the ability to read source code matters

    • It supplies ideas and approaches
    • It shows how well-written code is organized
    • Employers expect it
  • session

    • Identifies a user through information recorded on the server
    • In requests, a Session object keeps one conversation (and its cookies) alive across calls
# Expected response: {result_message: "验证码校验成功", result_code: "4"}  (captcha check passed)
import requests

req = requests.Session()  # one Session object carries cookies across both requests

def login():
    # Fetch the captcha image
    pic_response = req.get('https://kyfw.12306.cn/passport/captcha/captcha-image?login_site=E&module=login&rand=sjrand')
    codeImage = pic_response.content
    with open('code2.png', 'wb') as fn:
        fn.write(codeImage)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
    }
    codeStr = input('Enter the captcha coordinates: ')

    data = {
        'answer': codeStr,
        'rand': 'sjrand',
        'login_site': 'E'
    }

    # The check passes only because the Session reuses the cookies set by the image request
    response = req.post('https://kyfw.12306.cn/passport/captcha/captcha-check', data=data, headers=headers)

    print(response.text)

login()