import re
import time

import requests
from fake_useragent import UserAgent
from lxml import etree

# fake_useragent >= 1.x no longer accepts use_cache_server; a plain
# UserAgent() works across versions.
ua = UserAgent()

# One random desktop User-Agent, picked once at start-up.
header = {
    'User-Agent': ua.random,
}
def get_random_proxy():
    """Get a random proxy from the proxy pool.

    :return: proxy string, e.g. "host:port"
    """
    proxypool_url = 'http://127.0.0.1:5555/random'
    return requests.get(proxypool_url).text.strip()
def get_city(url):
    """Collect the district tabs on the rental index page and crawl each one."""
    proxies = {'http': 'http://' + get_random_proxy()}
    response = requests.get(url, headers=header, proxies=proxies)
    tree = etree.HTML(response.text)
    # District names and links sit in the same tab list; xpath() already
    # returns fresh lists, so the copy.copy() calls in the original were redundant.
    names = tree.xpath('//ul[@class="new_di_tab sTab"]/a/@title')
    links = tree.xpath('//ul[@class="new_di_tab sTab"]/a/@href')
    print(names, links)
    # Skip the first and last tabs, matching the original loop bounds.
    for i in range(1, len(names) - 1):
        print('Processing tab:', i)
        addr = names[i].replace('武汉', '').replace('租房', '')
        addr_link = 'https://wh.5i5j.com' + links[i]
        print('Current addr_link:', addr_link)
        try:
            get_addr_page(addr, addr_link)
        except Exception as e:
            print('Skipping', addr_link, 'after error:', e)
            continue
def get_addr_page(addr, addr_link):
    """Count the district's result pages by following the "next" link, then scrape each page."""
    num = 1
    url = addr_link
    print('url:', url)
    while True:
        time.sleep(10)  # throttle between page fetches
        proxies = {'http': 'http://' + get_random_proxy()}
        response = requests.get(url, headers=header, proxies=proxies)
        matches = re.findall('<div.*?pageSty.*?href="(.*?)" class=', response.text, re.S)
        if not matches:  # no pager on the page at all
            break
        nextpage_link = matches[0]
        print(nextpage_link)
        if nextpage_link == 'javascript:void(0);':
            # The "next" button is disabled on the last page.
            break
        url = 'https://wh.5i5j.com' + nextpage_link.strip('"')
        print('Next page url:', url)
        num += 1
        print(num)
    pages = num
    print(pages)
    # Scrape every page. The original skipped districts with a single page
    # (`if pages == 1: pass`), which silently dropped their listings.
    for page in range(1, pages + 1):
        url = addr_link + 'n' + str(page)  # site pagination pattern: .../n<page>
        get_house_data(addr, url)
def get_house_data(addr, url):
    """Parse one result page and append each listing to the output file."""
    print('Now parsing:', url)
    time.sleep(10)  # throttle
    try:
        proxies = {'http': 'http://' + get_random_proxy()}
        response = requests.get(url, headers=header, proxies=proxies)
        tree = etree.HTML(response.text)
        titles = tree.xpath("//h3[@class='listTit']/a/text()")
        rooms = tree.xpath("//div[@class='listX']/p[1]/text()")
        # Filter out the ' · ' separator nodes with a comprehension; the
        # original removed items while iterating, which skips elements.
        position1 = [p for p in tree.xpath("//div[@class='listX']/p[2]/text()") if p != ' · ']
        print(position1)
        for i in range(len(titles)):
            # Replace full-width commas so they don't break the comma-separated file.
            title = titles[i].replace(',', ',')
            rooms_data = rooms[i]
            room_type = rooms_data[:10]  # e.g. '3室2厅'; slice offsets are site-specific and brittle
            square = re.findall(r"\d+\.?\d*", rooms_data[15:25])[0]  # floor area
            position = position1[i].split(' ', 1)[0]
            positions = tree.xpath("//div[@class='listX']/p[2]/a[1]/text()")[i]
            price = tree.xpath("//div[@class='jia']/p[1]/strong/text()")[i]
            if room_type[0:1] != '多':  # skip listings without a fixed room count
                one_room_price = str(round(float(price) / float(room_type[0:1])))
                # Output file name means "5i5j Wuhan rentals".
                with open('我爱我家武汉租房.txt', 'a', encoding='utf-8') as f:
                    f.write(','.join([addr, title, room_type, room_type[0:1], square,
                                      position, positions, price, one_room_price]) + '\n')
                print('Row written')
    except Exception as e:
        print('Problem on', url, ':', e)
        time.sleep(20)
        return get_house_data(addr, url)  # retry; note this recurses until the page succeeds
def main():
    print('start the work')
    url = 'https://wh.5i5j.com/zufang/'
    get_city(url)
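# Optional sanity check -- a minimal sketch, not part of the original script.
# The crawler assumes a proxy-pool service is already listening on
# 127.0.0.1:5555 and that /random returns a bare "host:port" string; calling
# this (hypothetical) helper before main() surfaces a dead pool up front
# instead of mid-crawl.
def check_proxy_pool():
    try:
        proxy = requests.get('http://127.0.0.1:5555/random', timeout=5).text.strip()
        print('Proxy pool OK, sample proxy:', proxy)
        return True
    except requests.RequestException as e:
        print('Proxy pool unreachable:', e)
        return False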
if __name__ == '__main__':
    time.sleep(2)
    main()