毕设中python爬取中国天气网各城市天气实现

目标网站:http://www.weather.com.cn/

目的:将目标网站中中国大陆各个城市天气爬取并保存至 mongoDB 数据库。

分析

  • 中国天气网将全国分为华北、东北、华东、华中、华南、西北、西南和港澳台八个大区。
    网址分别如下:
1
2
3
4
5
6
7
8
http://www.weather.com.cn/textFC/hb.shtml
http://www.weather.com.cn/textFC/db.shtml
http://www.weather.com.cn/textFC/hd.shtml
http://www.weather.com.cn/textFC/hz.shtml
http://www.weather.com.cn/textFC/hn.shtml
http://www.weather.com.cn/textFC/xb.shtml
http://www.weather.com.cn/textFC/xn.shtml
http://www.weather.com.cn/textFC/gat.shtml

每个大区的地址是有规律的,并且很少,所以没有通过爬虫获取(分页获取),直接放入数组中,用函数获取即可。

代码实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import re
import time
from lxml import etree
import pymongo
import requests
from bs4 import BeautifulSoup
# import schedule  # the schedule module runs tasks on a timer, e.g. at a fixed time every day
# NOTE(review): `re` and `etree` are imported but unused in this file.

# MongoDB connection string; NOTE(review): credentials are hard-coded — move to env/config.
uri = "mongodb://user:pass@127.0.0.1:27017/demo"
# Global accumulator for scraped weather documents (appended to by beautiful()).
arr = []
# Crawl date stamp (YYYY-MM-DD), stored in every document's 'time' field.
now = time.strftime("%Y-%m-%d", time.localtime())
print('time.localtime=>', now)

# Connects at import time; documents go into the demo.weather_cn_data collection.
client = pymongo.MongoClient(uri)
db = client.demo
collection = db.weather_cn_data
# Browser-like User-Agent avoids trivial bot blocking on weather.com.cn.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
    'Cookie': ''
}


def detail(detail_url, option):
    """Fetch one region forecast page and hand the response to the table parser.

    Args:
        detail_url: URL of a region page, e.g. http://www.weather.com.cn/textFC/hb.shtml
        option: passed through from start(); not used here.
    """
    # A timeout keeps the crawler from hanging forever on a stalled connection
    # (requests has no default timeout).
    res = requests.get(detail_url, headers=headers, timeout=10)
    paqu_shengfen_table(res)


def paqu_shengfen_table(res):
    """Parse every province <table> on a region page and delegate row extraction.

    The page shows one `.conMidtab2 > table` per province; the number of
    provinces is taken from the `.lQCity` sidebar city list (these two counts
    are assumed equal on this site — TODO confirm for all regions).

    Args:
        res: requests.Response for a region page.
    """
    soup = BeautifulSoup(res.content, 'html.parser')
    # len(...) instead of the non-idiomatic .__len__(); the full-page debug
    # print of `soup` has been removed.
    province_count = len(soup.select('.lQCity > ul > li'))
    # Hoisted out of the loop: the original re-ran this select per iteration.
    tables = soup.select('.conMidtab2 > table')
    for i in range(province_count):
        tr = tables[i].find_all('tr')
        # Row index 2 is the first data row; its first cell text starts with
        # the province name.
        shengfen = (tr[2].get_text().split())[0]
        beautiful(res, tr, shengfen)

# 分析每个省份table


def _row_to_doc(td, offset, shengfen):
    """Build one weather document from a table row's <td> cells.

    `offset` is 1 for the province's first data row — its leading cell holds
    the province name (presumably a rowspan cell; TODO confirm against the
    live page) — and 0 for every following row.
    """
    # Key 'nighttime__wind' (double underscore) is kept as-is: it is the
    # stored field name and consumers may depend on it.
    return {
        'time': now,
        'province': shengfen,
        'city': td[offset + 0].get_text().strip(),
        'daytime_weather_conditions': td[offset + 1].get_text().strip(),
        'daytime_wind': td[offset + 2].get_text().strip(),
        'maximum_temperature': td[offset + 3].get_text().strip(),
        'nighttime_weather_conditions': td[offset + 4].get_text().strip(),
        'nighttime__wind': td[offset + 5].get_text().strip(),
        'minimum_temperature': td[offset + 6].get_text().strip()
    }


def beautiful(res, tr, shengfen):
    """Extract every city row of one province table into the global `arr`.

    The original duplicated ~20 lines between the first data row (which has an
    extra leading province cell, so city data starts at td[1]) and all later
    rows (city data starts at td[0]); both paths now share _row_to_doc().
    The per-row `print(res)` debug output has been removed.

    Args:
        res: requests.Response (unused here; kept for interface compatibility).
        tr: list of <tr> tags of one province table; rows 0-1 are headers.
        shengfen: province name for this table.
    """
    for row_index, row in enumerate(tr[2:]):
        td = row.find_all('td')
        offset = 1 if row_index == 0 else 0
        arr.append(_row_to_doc(td, offset, shengfen))


def get_datail_url(url, option):
    # Thin pass-through to detail(); the "datail" typo is part of the public
    # name and is preserved for callers.
    detail(url, option)


def start(option):
    """Crawl the seven mainland region pages of weather.com.cn.

    Args:
        option: opaque flag forwarded to get_datail_url().
    """
    urls = [
        'http://www.weather.com.cn/textFC/hb.shtml',
        'http://www.weather.com.cn/textFC/db.shtml',
        'http://www.weather.com.cn/textFC/hd.shtml',
        'http://www.weather.com.cn/textFC/hz.shtml',
        'http://www.weather.com.cn/textFC/hn.shtml',
        'http://www.weather.com.cn/textFC/xb.shtml',
        'http://www.weather.com.cn/textFC/xn.shtml',
        'http://www.weather.com.cn/textFC/gat.shtml'
    ]
    # The HK/Macao/Taiwan page ('gat', last entry) uses a different table
    # layout, so it is deliberately not crawled this time.
    for url in urls[:-1]:
        get_datail_url(url, option)


def job():
    """Run one full crawl and persist the accumulated documents to MongoDB."""
    start(1)
    print('arr', arr)
    # insert_many([]) raises pymongo.errors.InvalidOperation, e.g. when the
    # site was unreachable — guard so a failed crawl doesn't also crash here.
    if arr:
        print('db result', collection.insert_many(arr))
        # Clear the accumulator so a scheduled re-run (see __main__) does not
        # re-insert the same documents.
        arr.clear()
    else:
        print('db result', 'nothing to insert')

if __name__ == '__main__':
    # schedule.every().day.at("9:00").do(job)  # run every morning at 9:00
    # while True:
    #     schedule.run_pending()  # run any due scheduled tasks
    #     time.sleep(1000)
    job()

参考