爬取我爱我家武汉房租

代码如下：
import re
import requests
from bs4 import BeautifulSoup
from lxml import html
import time
from fake_useragent import UserAgent
import copy

# One fake_useragent generator for the whole script, constructed with
# use_cache_server=False.
# NOTE(review): newer fake_useragent releases may no longer accept the
# `use_cache_server` keyword — confirm against the installed version.
ua = UserAgent(use_cache_server=False)

# `ua.random` is evaluated once here, so every request that passes `header`
# sends the same User-Agent string for the lifetime of the process.
header = {'User-Agent': ua.random}

# lxml's etree module, reached through the already-imported `lxml.html`.
etree = html.etree

def get_random_proxy():
    """
    Ask the local proxy-pool service for one proxy.

    :return: body of the pool's ``/random`` endpoint with leading and
        trailing whitespace removed
    """
    pool_response = requests.get('http://127.0.0.1:5555/random')
    proxy = pool_response.text
    return proxy.strip()

def get_city(url):
    """
    Crawl every district linked from the rental index page.

    Downloads ``url``, reads the ``title`` and ``href`` of each link under
    the ``new_di_tab sTab`` list and calls ``get_addr_page`` for every
    entry except the first and the last one.

    :param url: address of the rental index page
    :return: None
    """
    # NOTE(review): requests picks a proxy by URL scheme, so with only an
    # 'http' entry the https:// pages fetched here presumably bypass the
    # proxy — confirm whether an 'https' entry is wanted.
    proxies = {'http': 'http://' + get_random_proxy()}
    response = requests.get(url, headers=header, proxies=proxies)
    # Named `tree` so the module-level `html` import is not shadowed.
    tree = etree.HTML(response.text)
    titles = tree.xpath('//ul[@class="new_di_tab sTab"]/a/@title')
    links = tree.xpath('//ul[@class="new_di_tab sTab"]/a/@href')
    print(titles, links)

    # The first and last tab entries are skipped (presumably they are not
    # individual districts — confirm against the live page).
    for i, (title, link) in enumerate(zip(titles[1:-1], links[1:]), start=1):
        print('当前处理:', i)
        addr = title.replace('武汉', '').replace('租房', '')
        addr_link = 'https://wh.5i5j.com' + link
        print('当前addr_link', addr_link)
        try:
            get_addr_page(addr, addr_link)
        except Exception as exc:
            # Best effort: keep crawling the remaining districts, but say
            # which one failed and why instead of dropping it silently.
            print('failed to process', addr_link, repr(exc))


def get_addr_page(addr, addr_link):
    """
    Count the result pages of one district and scrape each of them.

    Starting at ``addr_link``, follows the first ``href`` found after
    ``pageSty`` in the page source until that link is
    ``javascript:void(0);``, is missing, or points at a page already
    visited. Then calls ``get_house_data`` for pages ``n1`` .. ``n<pages>``.

    :param addr: district name, passed through to ``get_house_data``
    :param addr_link: URL of the district's first result page
    :return: None
    """
    num = 1
    url = addr_link
    print('url:', url)
    visited = {url}
    while True:
        time.sleep(10)
        proxies = {'http': 'http://' + get_random_proxy()}
        response = requests.get(url, headers=header, proxies=proxies)
        matches = re.findall('<div.*?pageSty.*?href="(.*?)" class=', response.text, re.S)
        # A page without a pager yields no match; treat that as "no next
        # page" rather than failing on matches[0].
        nextpage_link_message = matches[0] if matches else ''
        print(nextpage_link_message)
        if not nextpage_link_message or nextpage_link_message == 'javascript:void(0);':
            break
        url = 'https://wh.5i5j.com' + nextpage_link_message.strip("\"")
        if url in visited:
            # The pager led back to a page already counted; stop instead
            # of looping forever.
            break
        visited.add(url)
        print('keyurl', url)
        num += 1
        print(num)

    pages = num
    print(pages)
    # Districts with a single result page are scraped too (page n1).
    for page in range(1, pages + 1):
        get_house_data(addr, addr_link + 'n' + str(page))

def get_house_data(addr, url, max_retries=5):
    """
    Scrape one result page and append its listings to the output file.

    The page is fetched and parsed up to ``max_retries`` times. Rows are
    written only after the whole page has parsed, so a failed attempt
    never leaves partial rows behind to be duplicated by the retry.

    :param addr: district name written as the first column of each row
    :param url: URL of the result page
    :param max_retries: attempts made before the page is given up on
    :return: None
    """
    print('当前传入分析:', url)
    for _attempt in range(max_retries):
        time.sleep(10)
        try:
            proxies = {'http': 'http://' + get_random_proxy()}
            response = requests.get(url, headers=header, proxies=proxies)
            rows = _parse_house_rows(addr, etree.HTML(response.text))
        except Exception as exc:
            # Boundary handler: report the failure, wait, then try again.
            print("have a problem", repr(exc))
            time.sleep(20)
            continue
        break
    else:
        print('giving up on', url)
        return None

    if rows:
        with open('我爱我家武汉租房.txt', 'a', encoding='utf-8') as f:
            f.writelines(rows)
        print('写入完成')
    return None


def _parse_house_rows(addr, tree):
    """Return one comma-separated output line per usable listing in ``tree``."""
    titles = tree.xpath("//h3[@class='listTit']/a/text()")
    room_infos = tree.xpath("//div[@class='listX']/p[1]/text()")
    # Filter the separator entries out in one pass; removing them while
    # iterating the same list skips the element after each removal.
    position1 = [
        text for text in tree.xpath("//div[@class='listX']/p[2]/text()")
        if text != ' · '
    ]
    print(position1)
    estates = tree.xpath("//div[@class='listX']/p[2]/a[1]/text()")
    prices = tree.xpath("//div[@class='jia']/p[1]/strong/text()")

    rows = []
    for i, raw_title in enumerate(titles):
        try:
            # An ASCII comma in the title would add a column to the
            # comma-separated line; swap it for a full-width comma.
            title = raw_title.replace(',', '，')
            rooms_data = room_infos[i]
            room_type = rooms_data[:10]
            room_count = room_type[0:1]
            if room_count == '多':
                # No numeric room count, so no per-room price to compute.
                continue
            square = re.findall(r"\d+\.?\d*", rooms_data[15:25])[0]
            position = position1[i].split(' ', 1)[0]
            positions = estates[i]
            price = prices[i]
            one_room_price = str(round(float(price) / float(room_count)))
        except (IndexError, ValueError, ZeroDivisionError) as exc:
            # One malformed listing should not cost the whole page.
            print('skip listing', i, repr(exc))
            continue
        rows.append(','.join([
            addr, title, room_type, room_count, square,
            position, positions, price, one_room_price,
        ]) + '\n')
    return rows


def main():
    """Announce the start and crawl the Wuhan rental index page."""
    print("start the work")
    get_city('https://wh.5i5j.com/zufang/')


if __name__ == '__main__':
    # Wait two seconds before kicking off the crawl.
    time.sleep(2)
    main()

数据分析平均租房价格

  • 转excel格式
代码如下：
import xlwt
import codecs

# Convert the scraped comma-separated text file into an .xls workbook:
# one text line per sheet row, one comma-separated field per cell.
input_txt = '我爱我家武汉租房.txt'
output_excel = '我爱我家武汉租房.xls'
sheetName = 'Sheet1'
start_row = 0
start_col = 0

wb = xlwt.Workbook(encoding='utf-8')
ws = wb.add_sheet(sheetName)

# `with` guarantees the input file is closed once all rows are copied.
with open(input_txt, encoding='utf-8') as f:
    for row_excel, line in enumerate(f, start=start_row):
        fields = line.strip('\n').split(',')
        print(fields)
        for col_excel, value in enumerate(fields, start=start_col):
            # Every field is written as the string read from the file.
            ws.write(row_excel, col_excel, value)

# Save once after the sheet is complete instead of on every iteration.
wb.save(output_excel)
  • 数据分析单间平均价格
代码如下：
import pandas as pd
from pyecharts import Bar, Line, Overlap

# Load the exported workbook; the sheet has no header row, so the column
# names are supplied here. `with` closes the handle after reading.
with open('我爱我家武汉租房.xls', 'rb') as f:
    df = pd.read_excel(f, header=None, names=['area', 'title', 'room_type', 'room_quantity', 'square', 'xiaoqu', 'loupan', 'price', 'one_room_price'])

# The workbook cells were written from split text lines, so the numeric
# columns come back as strings; convert them before aggregating.
# Unparseable values become NaN and are ignored by mean/count.
numeric_columns = ['square', 'price', 'one_room_price']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Mean per-room price and listing count per district, busiest first.
area_message = df.groupby(['area'])
area_com = area_message['one_room_price'].agg(['mean', 'count'])
area_com.reset_index(inplace=True)
area_message_last = area_com.sort_values('count', ascending=False)
attr = area_message_last['area']
v1 = area_message_last['count']
v2 = area_message_last['mean']

# Line (mean price) laid over bar (listing count), the line on a second y-axis.
line = Line("武汉主城区单间均价")
line.add("主城区", attr, v2, is_stack=True,xaxis_rotate=30, yaxis_min=0, mark_point=["max", "min"], xaxis_interval=0, line_color='lightblue', line_width=4, mark_point_textcolor='black', mark_point_color='lightblue', is_splitline_show=False)
bar = Bar("武汉主城区出租房数量&单间均价")
bar.add("主城区", attr, v1, is_stack=True, xaxis_rotate=30, yaxis_min=0, xaxis_interval=0, is_splitline_show=False)
overlap = Overlap()
overlap.add(bar)
overlap.add(line, yaxis_index=1, is_add_yaxis=True)
overlap.render('武汉主城区出租房数量&单间均价.html')

  • 武汉小区单间均价
代码如下：
# Mean per-room price and listing count for the 20 estates with the most
# listings. The price column is converted to numbers on a copy first: the
# workbook cells were written as text, and unparseable values become NaN
# so mean/count ignore them. `df` itself is left untouched.
xiaoqu_message = df.assign(
    one_room_price=pd.to_numeric(df['one_room_price'], errors='coerce')
).groupby(['xiaoqu'])
xiaoqu_com = xiaoqu_message['one_room_price'].agg(['mean', 'count'])
xiaoqu_com.reset_index(inplace=True)
xiaoqu_message_last = xiaoqu_com.sort_values('count', ascending=False).head(20)
attr = xiaoqu_message_last['xiaoqu']
v1 = xiaoqu_message_last['count']
v2 = xiaoqu_message_last['mean']

# Line (mean price) laid over bar (listing count), the line on a second y-axis.
line = Line("武汉小区单间均价")
line.add("小区", attr, v2, is_stack=True, xaxis_rotate=30, yaxis_min=0, mark_point=["max", "min"], xaxis_interval=0, line_color='lightblue', line_width=4, mark_point_textcolor='black', mark_point_color='lightblue', is_splitline_show=False)
bar = Bar("武汉小区出租房数量&单间均价")
bar.add("小区", attr, v1, is_stack=True, xaxis_rotate=30, yaxis_min=0, xaxis_interval=0, is_splitline_show=False)
overlap = Overlap()
overlap.add(bar)
overlap.add(line, yaxis_index=1, is_add_yaxis=True)
overlap.render('武汉小区出租房数量&单间均价.html')

image-20210520075404587

  • 武汉出租房单价区间及数量
代码如下：
# Histogram of per-room prices. The column is converted to numbers first
# because the workbook cells were written as text; unparseable values
# become NaN and fall outside every bin.
price_info = pd.to_numeric(df['one_room_price'], errors='coerce')

# One label per interval (len(level) == len(bins) - 1). The final edge is
# infinity so that '10000以上' really covers prices above 10000, and the
# 9000-10000 interval gets its own label.
bins = [0, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 5000, 6000, 7000, 8000, 9000, 10000, float('inf')]
level = ['0-1000', '1000-1500', '1500-2000', '2000-2500', '2500-3000', '3000-3500', '3500-4000', '4000-5000', '5000-6000', '6000-7000', '7000-8000', '8000-9000', '9000-10000', '10000以上']
price_stage = pd.cut(price_info, bins=bins, labels=level).value_counts().sort_index()
attr = price_stage.index
v1 = price_stage.values
bar = Bar("武汉出租房单间价格区间及数量")
bar.add("",attr,v1,is_stack=True, xaxis_rotate=30, yaxis_min=0, xaxis_interval=0, is_splitline_show=False)
bar.render("武汉出租房单间价格区间及数量.html")

  • 武汉租房面积分布
代码如下：
# Pie is not among the names imported from pyecharts earlier in the
# script, so bring it into scope here.
from pyecharts import Pie

# Distribution of listing floor areas. The column is converted to numbers
# first because the workbook cells were written as text; unparseable
# values become NaN and fall outside every bin.
square_info = pd.to_numeric(df['square'], errors='coerce')

# The final edge is infinity so the '200+' label covers every area above
# 200 rather than stopping at 300.
bins = [0, 30, 60, 90, 120, 150, 200, float('inf')]
level = ['0-30', '30-60', '60-90', '90-120', '120-150', '150-200', '200+']
square_stage = pd.cut(square_info, bins=bins, labels=level).value_counts().sort_index()
attr = square_stage.index
v1 = square_stage.values
pie = Pie("武汉出租房房屋面积分布", title_pos='center')
pie.add("", attr, v1, radius=[40, 75], label_text_color=None, is_label_show=True, legend_orient="vertical", legend_pos="left",)
pie.render('武汉出租房房屋面积分布.html')