Two Little Crawlers

Published 2018-05-13 · Python · web scraping · Request library

Feng put me in touch with Hong, a senior in the School of Liberal Arts, who needed help scraping travel-site data for her graduation thesis: first a trip-transition matrix, and second some word-frequency statistics. The target sites were Mafengwo and Ctrip. Their anti-scraping measures are pretty weak, so the Request library is enough. Two days of overtime later, both the matrix and the word counts were done. She had joked earlier that she couldn't graduate without them, so she should be able to graduate now :smile:

The code comes in two parts: the first script crawls the travel-note links and saves them to a file, and the second crawls the travel-note text and runs the text analysis.
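The scripts below roll their own polite-fetch helper on top of urllib.request. Just as a point of comparison, here is a minimal sketch of the same idea with the requests package; fetch_html, HEADERS and MAX_RETRIES are made-up names, and the retry loop is my addition rather than something the scripts below actually do.

    # Sketch only: polite fetching with the requests package instead of urllib.
    import random
    import time

    import requests

    HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}  # hypothetical
    MAX_RETRIES = 3  # hypothetical

    def fetch_html(url):
        # wait 1-2 seconds so the request rate stays low
        time.sleep(random.randint(1, 2))
        for attempt in range(MAX_RETRIES):
            try:
                resp = requests.get(url, headers=HEADERS, timeout=15)
                resp.raise_for_status()
                return resp.text
            except requests.RequestException:
                # back off a little before retrying
                time.sleep(2 * (attempt + 1))
        return ""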

# First Ctrip script: collect the travel-note links
# Stage_1
# @author:1-riverfish

# coding: utf-8
import os, io, sys, re, time, base64, json
from urllib.request import urlopen
from urllib.request import Request
from urllib.parse import quote
from bs4 import BeautifulSoup
import random
import lxml

import socket
socket.setdefaulttimeout(15)

def getHtml(url):
    my_headers = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"]
    req = Request(url)
    req.add_header("User-Agent", my_headers[0])

    # wait a random 1-2 seconds between requests to stay off the anti-scraping radar
    time.sleep(random.randint(1, 2))
    html = urlopen(req)
    return html

# search results for 千岛湖 (Qiandao Lake); the page number is appended at the end
url_header = "http://you.ctrip.com/searchsite/travels/?query=%E5%8D%83%E5%B2%9B%E6%B9%96&isAnswered=&isRecommended=&publishDate=&PageNo="

# 293 result pages in total
count = 0

file = open('xc_data.txt', 'w')
hrefList = []
for i in range(1, 294):
    # build the page URL; the notes span roughly 2013-2017
    url = url_header + str(i)
    print(url)

    html = getHtml(url)
    bsObj = BeautifulSoup(html, "lxml")

    # pick out the travel-note links with a regular expression
    aList = bsObj.findAll("a", {"href": re.compile("/travels/[a-z]+")})
    for a in aList:
        href = a.get("href")
        hrefList.append(href)

# write all collected links to the file, then close it
file.write(str(hrefList))
file.close()
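Because the links are saved with file.write(str(hrefList)), xc_data.txt ends up holding the Python repr of the list on one long line, which is why the second script has to split on commas and strip quotes by hand. Purely as a sketch, dumping and reloading the list as JSON would avoid that manual cleanup; the file name xc_data.json is my own, and this assumes the captured hrefs are the relative /travels/... paths.

    # Sketch: save and restore the link list with json instead of str(list).
    import json

    # writing (would replace file.write(str(hrefList)) in the first script)
    with open('xc_data.json', 'w', encoding='utf-8') as f:
        json.dump(hrefList, f)

    # reading (in the second script) - no quote stripping needed
    with open('xc_data.json', 'r', encoding='utf-8') as f:
        linkList = ["http://you.ctrip.com" + href for href in json.load(f)]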
# Second Ctrip script: crawl the travel-note text and build the transition matrix
# Stage_1
# @author:1-riverfish

# read the saved links back from the text file
with open('xc_data.txt', 'r') as f:
    # the whole file is a single line holding the list repr
    data = f.readlines()

# split the list repr on commas to get the individual entries -> urlList
for line in data:
    urlList = line.split(',')

# every link appears twice, so keep every other entry to remove duplicates
urlList = urlList[::2]
linkList = []
for url in urlList:
    # strip the surrounding quotes and rebuild a clean absolute URL
    url = url.strip(" ")
    url = url.strip('\'')
    url = url.strip("http://you.ctrip.com")
    url = "http://you.ctrip.com/tr" + url
    linkList.append(url)
print(len(linkList))

# coding: utf-8
import os, io, sys, re, time, base64, json
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
import random
import lxml
import numpy as np

import socket
socket.setdefaulttimeout(15)

def getHtml(url):
    my_headers = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"]
    req = Request(url)
    req.add_header("User-Agent", my_headers[0])

    # shorter random wait (0.1-0.2 s) between requests
    time.sleep(random.randint(1, 2) / 10)
    html = urlopen(req)
    return html

# 29 x 29 transition matrix, initialised to zeros
matrix = np.zeros([29, 29])
# Could all of Ctrip be crawled in one pass? Probably not.
# The crawled notes are not saved locally because there are too many of them.
# count_1 tracks how many notes have been processed
count_1 = 0
for url in linkList:
    print(url)
    html = getHtml(url)
    print(html.getcode(), count_1)
    bsObj = BeautifulSoup(html, "lxml")
    count_1 += 1

    # collect the travel-note text; grabbing every <p> is only roughly accurate
    content = ""
    for p in bsObj.findAll("p"):
        part = p.get_text()
        content += part
    # this gives a more or less complete text to parse for the visiting order

    ## the choice of scenic-spot names matters a lot here
    visitList = ["中心湖区","东南湖区","秀水广场","千岛湖广场","水之灵","明珠观光","天屿","梦姑塘","森林氧吧","牧心谷","林海归真","热气球","钓鱼岛","石林","芹川","龙川湾","狮城","九咆界","啤酒风情小镇","大峡谷","下姜","汾口","九龙溪漂流","白云溪漂流","王子谷漂流","龙门峡谷漂流","九潭峡谷漂流","环湖骑行","环岛骑行"]

    # remove every existing digit first so it cannot be mistaken for a spot index
    content = re.sub("[0-9]", "", content, 0, 0)

    # then replace each spot name with its index to recover the visiting order
    count = 0
    for visit in visitList:
        content = re.sub(visit, str(count), content, 0, 0)
        count += 1

    # extract the index sequence from the rewritten text
    # (the regex approach is a bit crude, but the extracted order was spot-checked and looks fine)
    sequenceList = re.findall("\d+", content, flags=0)
    print(sequenceList)

    # every pair of adjacent spots increments one cell of the matrix
    size = len(sequenceList)
    for num in range(0, size - 1):
        num1 = int(sequenceList[num])
        num2 = int(sequenceList[num + 1])
        if num1 != num2:
            matrix[num1 % 29][num2 % 29] += 1
        # equal neighbours are ignored

# after all notes are crawled, print the matrix and save it as CSV
print(matrix)

np.savetxt('matrix.csv', matrix, delimiter = ',')
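To make the matrix step concrete: if a travel note mentions the spots in the order 秀水广场 → 中心湖区 → 中心湖区 → 石林, the index sequence comes out as ["2", "0", "0", "13"], and only the cells (2, 0) and (0, 13) are incremented, because repeated neighbours are skipped. A tiny self-contained check of that counting loop:

    import numpy as np

    # toy version of the adjacency-counting step above
    matrix = np.zeros([29, 29])
    sequenceList = ["2", "0", "0", "13"]

    for num in range(0, len(sequenceList) - 1):
        num1 = int(sequenceList[num])
        num2 = int(sequenceList[num + 1])
        if num1 != num2:
            matrix[num1 % 29][num2 % 29] += 1

    print(matrix[2][0], matrix[0][13])  # both print 1.0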

The word-frequency code:

# Word-frequency script: count how often each scenic spot is mentioned

# read the saved links back from the text file
with open('xc_data.txt', 'r') as f:
    # the whole file is a single line holding the list repr
    data = f.readlines()

# split the list repr on commas to get the individual entries -> urlList
for line in data:
    urlList = line.split(',')

# every link appears twice, so keep every other entry to remove duplicates
urlList = urlList[::2]
linkList = []
for url in urlList:
    # strip the surrounding quotes and rebuild a clean absolute URL
    url = url.strip(" ")
    url = url.strip('\'')
    url = url.strip("http://you.ctrip.com")
    url = "http://you.ctrip.com/tr" + url
    linkList.append(url)
print(len(linkList))

# coding: utf-8
import os, io, sys, re, time, base64, json
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
import random
import lxml
import numpy as np
import csv

import socket
socket.setdefaulttimeout(15)

def getHtml(url):
    my_headers = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"]
    req = Request(url)
    req.add_header("User-Agent", my_headers[0])

    # shorter random wait (0.1-0.2 s) between requests
    time.sleep(random.randint(1, 2) / 10)
    html = urlopen(req)
    return html

# count how often each scenic spot is mentioned
# utf-8-sig so the CSV opens correctly in Excel
csvFile1 = open('wordFre4.csv', 'w', newline='', encoding='utf-8-sig')
writer2 = csv.writer(csvFile1)

dic = {"中心湖区":0,"东南湖区":0,"秀水广场":0,"千岛湖广场":0,"水之灵":0,"明珠观光":0,"天屿":0,"梦姑塘":0,"森林氧吧":0,"牧心谷":0,"林海归真":0,"热气球":0,"钓鱼岛":0,"石林":0,"芹川":0,"龙川湾":0,"狮城":0,"九咆界":0,"啤酒风情小镇":0,"大峡谷":0,"下姜":0,"汾口":0,"九龙溪漂流":0,"白云溪漂流":0,"王子谷漂流":0,"龙门峡谷漂流":0,"九潭峡谷漂流":0,"环湖骑行":0,"环岛骑行":0}
count_1 = 0
# only process links from index 2432 onward
for url in linkList[2432:]:
    print(url)
    html = getHtml(url)
    bsObj = BeautifulSoup(html, "lxml")
    count_1 += 1

    # collect the travel-note text; grabbing every <p> is only roughly accurate
    content = ""
    for p in bsObj.findAll("p"):
        part = p.get_text()
        content += part
    # this gives a more or less complete text

    ## the choice of scenic-spot names matters a lot here
    visitList = ["中心湖区","东南湖区","秀水广场","千岛湖广场","水之灵","明珠观光","天屿","梦姑塘","森林氧吧","牧心谷","林海归真","热气球","钓鱼岛","石林","芹川","龙川湾","狮城","九咆界","啤酒风情小镇","大峡谷","下姜","汾口","九龙溪漂流","白云溪漂流","王子谷漂流","龙门峡谷漂流","九潭峡谷漂流","环湖骑行","环岛骑行"]

    # count the occurrences of each spot name in this note and accumulate in the dict
    for visit in visitList:
        dic[visit] = dic[visit] + content.count(visit)
        print(dic[visit])

# write one (spot, count) row per entry and close the file
for key in dic:
    writer2.writerow([key, dic[key]])
csvFile1.close()
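A quick way to see which spots dominate is to read wordFre4.csv back and sort by count. This is only a sketch of a possible next step, not part of what was handed over for the thesis:

    import csv

    # load the (spot, count) rows written above and sort by frequency
    rows = []
    with open('wordFre4.csv', encoding='utf-8-sig') as f:
        for row in csv.reader(f):
            if len(row) == 2:
                rows.append((row[0], int(row[1])))

    for name, count in sorted(rows, key=lambda r: r[1], reverse=True):
        print(name, count)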

The second thing I finished this week was a mathematical modelling contest with two classmates. Coincidentally, the problem we picked was also tourism-related (route planning), and even more coincidentally I was once again the one scraping the data (please have mercy on me, Ctrip). A lot of it felt familiar, but the data-parsing part of this code is genuinely tedious because there are so many fields to extract. Without further ado, here is the code, along with the Baidu Cloud link to the scraped files.

# Mafengwo script 1: collect the POI links for Nanjing
# Stage1
# @author:1-riverfish

# coding: utf-8
import os, io, sys, re, time, base64, json
from urllib.request import urlopen
from urllib.request import Request
from urllib.parse import quote
from bs4 import BeautifulSoup
import random
import lxml

import socket
socket.setdefaulttimeout(15)

def getHtml(url):
    my_headers = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"]
    req = Request(url)
    req.add_header("User-Agent", my_headers[0])

    # wait a random 1-2 seconds between requests
    time.sleep(random.randint(1, 2))
    html = urlopen(req)
    return html

# search results for 南京 (Nanjing); the page number goes between header and tail
url_header = "http://www.mafengwo.cn/search/s.php?q=%E5%8D%97%E4%BA%AC&p="
url_tail = "&t=poi&kt=1"

file = open('module_data.txt', 'w')
hrefList = []
for i in range(1, 51):
    url = url_header + str(i) + url_tail
    print(url)

    html = getHtml(url)
    bsObj = BeautifulSoup(html, "lxml")

    # pick out the POI links with a regular expression
    aList = bsObj.findAll("a", {"href": re.compile("http://www.mafengwo.cn/poi/[0-9]+")})
    for a in aList:
        href = a.get("href")
        hrefList.append(href)

# the same POI link repeats on the result page, so keep every fourth entry to deduplicate
hrefList = hrefList[::4]
for href in hrefList:
    print(href)

print(len(hrefList))
# write all collected links to the file, then close it
file.write(str(hrefList))
file.close()
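The hrefList[::4] slice seems to rely on each POI link showing up exactly four times in a row on the results page. If that ever changes, an order-preserving deduplication is a bit more robust; just a sketch, with made-up example links:

    # Sketch: order-preserving deduplication instead of hrefList[::4].
    # dict.fromkeys keeps the first occurrence of each link and preserves order.
    links = [
        "http://www.mafengwo.cn/poi/1.html",
        "http://www.mafengwo.cn/poi/1.html",
        "http://www.mafengwo.cn/poi/2.html",
        "http://www.mafengwo.cn/poi/1.html",
    ]
    deduped = list(dict.fromkeys(links))
    print(deduped)  # ['http://www.mafengwo.cn/poi/1.html', 'http://www.mafengwo.cn/poi/2.html']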

The second script does the actual parsing. It is a bit messy and has not been optimized.

# Mafengwo script 2: parse each POI page and write one row per POI to TF_750.csv
# Stage2
# @author:1-riverfish

# coding: utf-8

# read the saved links back from the text file
with open('module_data.txt', 'r') as f:
    # the whole file is a single line holding the list repr
    data = f.readlines()

# split the list repr on commas to get the individual entries -> urlList
for line in data:
    urlList = line.split(',')

linkList = []
for url in urlList:
    # strip the surrounding quotes to get a clean URL
    url = url.strip(" ")
    url = url.strip('\'')
    linkList.append(url)

# coding: utf-8
import os, io, sys, re, time, base64, json
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
import random
import lxml
import numpy as np
import csv

import socket
socket.setdefaulttimeout(15)

def getHtml(url):
    my_headers = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"]
    req = Request(url)
    req.add_header("User-Agent", my_headers[0])

    # shorter random wait (0.1-0.2 s) between requests
    time.sleep(random.randint(1, 2) / 10.0)
    html = urlopen(req)
    return html

# 750 records, crawled in seven batches

count_1 = 1
# running POI number
num = 1

# open the csv file; mind the encoding
csvFile = open('TF_750.csv', 'w', newline='', encoding='utf-8-sig')  # newline='' avoids blank rows between lines
writer = csv.writer(csvFile)

for url in linkList[:]:
    # list collecting this POI's fields
    tour_list = []
    poi_id_list = re.findall("\d+", url, flags=0)
    poi_id = poi_id_list[0]
    html = getHtml(url)
    print(html.getcode(), count_1)
    count_1 += 1
    ### fields per row: num title price numcomment text time_refer1 time_refer2 traffic opentime
    bsObj = BeautifulSoup(html, "lxml")

    # parse the page and gather one row of data
    # POI name
    try:
        title = bsObj.find("h1").get_text()
    except:
        title = "null"
    print(num, title)
    tour_list.append(num)
    tour_list.append(title)
    num += 1

    # indoor/outdoor could be judged from the title (not done here)

    # ticket price: average all "N元" figures found in the 门票 block
    try:
        price_dt = bsObj.find("dt", text="门票")
        price_dd = price_dt.next_sibling.next_sibling
        price = (price_dd.div).get_text()
        price_list = re.findall("\d+元", price, flags=0)
        total = 0
        for p in price_list:
            p = float(p.strip("元"))
            total += p
        if len(price_list) != 0:
            price = total / len(price_list)
        else:
            price = 0
    except:
        price = 0
    print("门票价格", str(int(price)))
    tour_list.append(int(price))

    # number of reviews
    # (the review breakdown below needs to end up in vector form)
    num_comment_a = bsObj.find("a", {"title": "蜂蜂点评"})
    try:
        num_comment = (num_comment_a.span).get_text()
        num_comment = num_comment.lstrip("(")
        num_comment = num_comment.rstrip(")条")
    except:
        num_comment = 0
    print("点评数量", num_comment)
    tour_list.append(num_comment)

    # the review breakdown is loaded dynamically: request the json endpoint and parse the returned html
    json_url_head = 'http://pagelet.mafengwo.cn/poi/pagelet/poiCommentListApi?&params={"poi_id":"'
    json_url_tail = '"}'
    json_url = json_url_head + poi_id + json_url_tail
    try:
        json_content = urlopen(json_url).read().decode("utf-8")
        json_content = json.loads(json_content)
        # the json payload carries an html fragment with the counts
        review = json_content["data"]['html']
    except:
        review = ""

    # fixed order: with-photo, good, medium, bad review counts
    text = []
    text_num = []
    text = re.findall('\d+条', review, flags=0)
    for titem in text:
        titem = titem.strip("条")
        text_num.append(int(titem))
    tour_list.append(text_num)
    print("评论列表 有图,好,中,差", text_num)

    # suggested visit duration; default priority level is 3
    time_priority = 3
    try:
        time_list = bsObj.find("li", {"class": "item-time"}).get_text()
        time_refer1 = re.findall("\d+小时", time_list, flags=0)
    except:
        time_refer1 = []
    try:
        time_refer2 = re.findall("\d+天", time_list, flags=0)
    except:
        time_refer2 = []
    # durations measured in days mean a very long visit, so go straight to level 4
    if len(time_refer2) != 0:
        time_priority = 4
    if len(time_refer1) != 0:
        time_ref = int((re.findall("\d+", time_refer1[0], flags=0))[0])
        if time_ref <= 2:
            time_priority = 1
        else:
            if time_ref <= 4:
                time_priority = 2
            else:
                if time_ref <= 6:
                    time_priority = 3
                else:
                    time_priority = 4
    print("参观时间优先级", time_priority)
    tour_list.append(time_priority)

    # transport: metro and bus information from the 交通 block
    traffic_p = bsObj.find("dt", text="交通")
    try:
        traffic = traffic_p.next_sibling.next_sibling
        traffic = traffic.get_text()
    except:
        traffic = ""
    route_list = re.findall('\d+路', traffic, flags=0)
    zhida = re.findall("直达", traffic, flags=0)
    route_num = len(route_list)

    # nearby sights and transport: request the location json; dis_list holds the metro-station distances (km)
    jsonurl_head = 'http://pagelet.mafengwo.cn/poi/pagelet/poiLocationApi?&params={"poi_id":"'
    jsonurl_tail = '"}'
    jsonurl = jsonurl_head + poi_id + jsonurl_tail

    jsoncontent = urlopen(jsonurl).read().decode("utf-8")
    jsoncontent = json.loads(jsoncontent)
    location = jsoncontent["data"]['html']
    # cut the fragment first, then match the distances
    index = location.find("位置-附近交通")
    if len(zhida) != 0:
        dis_list = [0.0]
    else:
        dis_list = []
    if index != -1:
        location = location[index:]
        dis_mlist = re.findall("\d+[米]", location, flags=0)
        dis_glist = re.findall("\d.\d+[公]", location, flags=0)

        # distances given in metres, converted to kilometres
        for mitem in dis_mlist:
            mitem = mitem.strip("米")
            dis_list.append(float(mitem) / 1000)
        # distances already given in kilometres
        for gitem in dis_glist:
            gitem = gitem.strip("公")
            dis_list.append(float(gitem))
    # distance list and bus-route count go into the row together
    jiaotong = [dis_list, route_num]
    print("地铁距离向量 公交车数量", jiaotong)
    tour_list.append(jiaotong)

    # opening hours: normalise to XX:XX-YY:YY, taking the first pair of times that matches
    opentimef = "00:00-24:00"  # default when nothing parseable is found
    opentime_p = bsObj.find("dt", text="开放时间")
    try:
        opentime = opentime_p.next_sibling.next_sibling
        opentime = opentime.get_text()
        opentime = re.findall("\d+:\d+", opentime, flags=0)
        if len(opentime) >= 2:
            opentimef = opentime[0] + "-" + opentime[1]
    except:
        opentimef = "00:00-24:00"
    print("开放时间", opentimef)
    tour_list.append(opentimef)

    # write one row per POI to the csv file
    writer.writerow(tour_list)

# close the csv file
csvFile.close()
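Each row of TF_750.csv therefore holds, in order: the POI number, its name, the averaged ticket price, the review count, the [with-photo, good, medium, bad] review vector, the visit-time priority (1-4), the [metro-distance list, bus-route count] pair, and the opening hours. A minimal sketch for reading it back; the column labels are my own names for those fields, not anything in the script above, and the list-valued fields come back as plain strings:

    import csv

    columns = ["num", "title", "price", "comments", "review_vec",
               "time_priority", "transport", "open_hours"]

    with open('TF_750.csv', encoding='utf-8-sig') as f:
        for row in csv.reader(f):
            record = dict(zip(columns, row))
            print(record["num"], record["title"], record["open_hours"])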

Data link (Baidu Cloud)

Password: t3x7

Oh, and today is apparently still Mother's Day... I forgot to call my mom...
