Scrapy模拟登陆
方法
正常模拟登陆(scrapy模拟登陆方式)
- 直接携带cookie
模拟登陆人人网
import scrapy


class RenSpider(scrapy.Spider):
    """Log in to renren.com by carrying the cookies of an already
    logged-in browser session on the first request."""

    name = 'ren'
    # fixed: was 'renren,com' — a comma typo that would never match the domain
    allowed_domains = ['renren.com']
    start_urls = ['http://www.renren.com/975472415/profile']

    def start_requests(self):
        # Cookie header string copied from a logged-in browser session.
        cookies = 'xxxx=xxxx; xxxx=xxxx; xxxxx=xxxxx; '
        # scrapy.Request only accepts cookies as a dict, so parse the
        # "k=v; k=v" string.  The trailing '; ' leaves an empty element
        # after split(); skip it (`if i`) or split('=')[1] raises IndexError.
        cookie = {
            i.split('=')[0]: i.split('=')[1]
            for i in cookies.split('; ') if i
        }
        yield scrapy.Request(
            url=self.start_urls[0],
            callback=self.parse,
            cookies=cookie,
        )

    def parse(self, response):
        # Dump the page so we can open it and verify the login worked.
        with open('open.html', 'w', encoding='utf-8') as f:
            f.write(response.body.decode())

找到发送post请求的URL,带上信息,发送请求
- 模拟登陆github
import scrapy


class GithubSpider(scrapy.Spider):
    """Log in to GitHub by scraping the hidden fields off the login page
    and POSTing them together with the credentials to /session."""

    name = 'github'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']  # the login page

    def parse(self, response):
        # Hidden form fields GitHub requires alongside the credentials.
        commit = 'Sign in'
        authenticity_token = response.xpath(
            "//input[@name='authenticity_token']/@value").extract_first()
        login = 'sansanbudejiuya'  # replace with a real username
        password = 'sansanbudejiuya'  # replace with a real password
        timestamp = response.xpath(
            "//input[@name='timestamp']/@value").extract_first()
        timestamp_secret = response.xpath(
            "//input[@name='timestamp_secret']/@value").extract_first()
        # Assemble the form payload to submit.
        data = {
            'commit': commit,
            'authenticity_token': authenticity_token,
            # 'ga_id': ga_id,
            'login': login,
            'password': password,
            'webauthn-support': 'supported',
            'webauthn-iuvpaa-support': 'unsupported',
            'timestamp': timestamp,
            'timestamp_secret': timestamp_secret,
        }
        yield scrapy.FormRequest(
            # URL that actually receives the login POST
            url='https://github.com/session',
            # the form payload
            formdata=data,
            # inspect the response to confirm the login worked
            callback=self.after_login,
        )

    def after_login(self, response):
        with open('github.html', 'w', encoding='utf-8') as f:
            f.write(response.body.decode())

selenium模拟登陆
- 找到对应的input标签 输入文字登陆
快速登陆条件
- 数据都在页面上
- scrapy.FormRequest.from_response()方法来实现
- formdata输入login以及password
import scrapy


class GithubSpider(scrapy.Spider):
    """Log in to GitHub with FormRequest.from_response, which reads the
    login form (action URL and hidden fields) straight from the page, so
    only the credentials need to be supplied."""

    name = 'github2'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/login']

    def parse(self, response):
        yield scrapy.FormRequest.from_response(
            # the response that contains the login form
            response=response,
            # only the credentials; hidden fields are filled in automatically
            formdata={
                'login': 'sansan',        # replace with a real username
                'password': 'budejiuya',  # replace with a real password
            },
            callback=self.after_login,
        )

    def after_login(self, response):
        with open('github2.html', 'w', encoding='utf-8') as f:
            f.write(response.body.decode())