# Requirement: scrape cities nationwide (provincial capitals and municipalities)
# together with their temperatures, and save the result to a CSV file.

import requests
from bs4 import BeautifulSoup
import csv
# CSV header / dict keys used by writeData below.
titles = ('city','temp')
# Define a function to parse one region page of the weather site.
def pares_page(url):
    """Fetch one weather.com.cn textFC region page and parse it.

    Args:
        url: a region page such as http://www.weather.com.cn/textFC/hb.shtml

    Returns:
        A list of {'city': ..., 'temp': ...} dicts, one per city row.
        Returns [] if the expected page layout is not found.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    text = response.content.decode('utf-8')
    # html5lib tolerates the malformed markup on these pages (pip install html5lib).
    soup = BeautifulSoup(text, 'html5lib')
    # All forecast data lives inside div.conMidtab; each province is one <table>.
    conMidtab = soup.find('div', class_='conMidtab')
    if conMidtab is None:
        # Page layout changed or the response was empty — nothing to parse.
        return []

    lst = []
    for table in conMidtab.find_all('table'):
        # The first two <tr> of every province table are header rows — skip them.
        for index, tr in enumerate(table.find_all('tr')[2:]):
            tds = tr.find_all('td')
            if not tds:
                continue  # defensive: ignore malformed/empty rows
            # In the first data row of a province, td[0] is the province name,
            # so the city name sits in td[1]; in later rows it is td[0].
            city_td = tds[1] if index == 0 else tds[0]
            city = list(city_td.stripped_strings)[0]
            # The second-to-last cell holds the temperature.
            temp = list(tds[-2].stripped_strings)[0]
            lst.append({'city': city, 'temp': temp})
            print('city:', city, 'temp:', temp)
    return lst

def writeData(lst, path='citytemp.csv', fieldnames=('city', 'temp')):
    """Write the scraped rows to a CSV file with a header line.

    Args:
        lst: list of dicts whose keys match ``fieldnames``.
        path: output file path (default 'citytemp.csv', as before).
        fieldnames: CSV column order / dict keys (default ('city', 'temp')).
    """
    # newline='' stops the csv module from emitting blank lines on Windows.
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames)
        writer.writeheader()
        writer.writerows(lst)


def main():
    """Scrape every configured region page and save all rows to citytemp.csv."""
    # One page per region: North China (hb), Northeast (db), HK/Macao/Taiwan (gat).
    urls = [
        'http://www.weather.com.cn/textFC/hb.shtml',
        'http://www.weather.com.cn/textFC/db.shtml',
        'http://www.weather.com.cn/textFC/gat.shtml',
    ]
    lst = []
    for url in urls:
        lst += pares_page(url)
    writeData(lst)


if __name__ == '__main__':
    main()