Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit d058083

Browse files
committed
add-实例二贴吧爬虫
1 parent caa19ca commit d058083

File tree

3 files changed

+199
-0
lines changed

3 files changed

+199
-0
lines changed

‎code_demo/Tieba.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#!/usr/bin/python3
2+
# -*- coding: utf-8 -*-
3+
import requests
4+
5+
6+
class TiebaSpider:
    """Crawler that downloads Baidu Tieba forum listing pages for a keyword.

    Each listing page is fetched over HTTP and saved verbatim to a local
    file named ``tieba-<index>.html`` (1-based page index).
    """

    def __init__(self, kw, max_pn):
        """
        :param kw: forum keyword (value of the ``kw`` query parameter)
        :param max_pn: exclusive upper bound for the ``pn`` (page offset)
                       parameter; Tieba advances 50 posts per page
        """
        self.max_pn = max_pn
        self.kw = kw
        self.base_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
        # Browser-like User-Agent so the server does not reject the request
        # as coming from a script.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
        }

    def get_url_list(self):
        """Build the list of listing-page URLs (pn = 0, 50, 100, ...).

        :return: list of fully formatted URLs
        """
        return [self.base_url.format(self.kw, pn) for pn in range(0, self.max_pn, 50)]

    def get_content(self, url):
        """Send a GET request and return the raw response body.

        :param url: page URL to fetch
        :return: response body as bytes
        """
        response = requests.get(
            url=url,
            headers=self.headers
        )
        return response.content

    def get_items(self, content, index):
        """Persist one page of raw response content to disk.

        :param content: raw page bytes
        :param index: 1-based page number used in the output file name
        :return: None — no structured items are extracted yet
        """
        with open('tieba-{}.html'.format(index), 'wb') as f:
            f.write(content)
        return None

    def save_items(self, items):
        """Save extracted items (placeholder — extraction not implemented).

        :param items: value returned by :meth:`get_items`
        """
        pass

    def run(self):
        """Crawl every listing page: build URLs, fetch each one, store it."""
        # 1. Build the url list.
        url_list = self.get_url_list()

        # enumerate() replaces the original ``url_list.index(url) + 1``,
        # which was an O(n) scan per iteration (O(n^2) overall) and would
        # return the wrong index if a URL ever appeared twice.
        for index, url in enumerate(url_list, start=1):
            # 2. Send the request and get the response.
            content = self.get_content(url)
            # 3. Extract data from the response (currently: save raw HTML).
            items = self.get_items(content, index)
            # 4. Save the extracted items.
            self.save_items(items)
79+
80+
81+
if __name__ == '__main__':
    # Crawl the first three listing pages (pn = 0, 50, 100) of the forum.
    TiebaSpider("英雄联盟", 150).run()

‎images/百度贴吧分析.jpg

145 KB
Loading[フレーム]

‎百度贴吧爬虫.md

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
# 百度贴吧爬虫
2+
## 分析
3+
### 分析流程图
4+
5+
> 分析 ``url`` 的时候我们一般都是从第二页开始分析,可以看出 ``url`` 的变化
6+
7+
![](./images/百度贴吧分析.jpg)
8+
9+
### 分析结果
10+
#### 结果概要
11+
12+
| 请求目标 | 分析结果 |
13+
|-----------------------------------|-------------------|
14+
| 请求方式分析 | GET |
15+
| 请求参数分析 | pn每页50发生变化,其他参数固定不变 |
16+
| 请求头分析 | 只需要添加User-Agent |
17+
| 请求url分析 | https://tieba.baidu.com/f?kw=英雄联盟&ie=utf-8&pn=50 |
18+
19+
### 代码实现流程
20+
1. 实现面向对象构建爬虫对象
21+
2. 爬虫流程四步骤
22+
1. 获取 url 列表
23+
2. 发送请求获取响应
24+
3. 从响应中提取数据
25+
4. 保存数据
26+
27+
28+
## 代码实现
29+
30+
```python
31+
#!/usr/bin/python3
32+
# -*- coding: utf-8 -*-
33+
import requests
34+
35+
class TiebaSpider():
36+
37+
def __init__(self,kw,max_pn):
38+
self.max_pn = max_pn
39+
self.kw = kw
40+
self.base_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
41+
self.headers = {
42+
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
43+
}
44+
pass
45+
46+
def get_url_list(self):
47+
'''
48+
获取 url 列表
49+
:return:
50+
'''
51+
# 写法一
52+
'''
53+
url_list = []
54+
55+
for pn in range(0,self.max_pn,50):
56+
url = self.base_url.format(self.kw,pn)
57+
url_list.append(url)
58+
59+
return url_list
60+
'''
61+
# 写法二
62+
return [self.base_url.format(self.kw,pn) for pn in range(0,self.max_pn,50)]
63+
64+
def get_content(self,url):
65+
'''
66+
发送请求获取响应内容
67+
:param url:
68+
:return:
69+
'''
70+
response = requests.get(
71+
url=url,
72+
headers = self.headers
73+
)
74+
75+
return response.content
76+
77+
def get_items(self,content,idx):
78+
'''
79+
从响应内容中提取数据
80+
:param content:
81+
:return:
82+
'''
83+
with open('08-{}.html'.format(idx),'wb') as f:
84+
f.write(content)
85+
return None
86+
87+
def save_items(self,items):
88+
'''
89+
保存数据
90+
:param items:
91+
:return:
92+
'''
93+
pass
94+
95+
96+
def run(self):
97+
98+
# 1. 获取 url 列表
99+
url_list = self.get_url_list()
100+
101+
for url in url_list:
102+
# 2. 发送请求获取响应
103+
content = self.get_content(url)
104+
105+
# 3. 从响应中提取数据
106+
items = self.get_items(content,url_list.index(url) + 1)
107+
108+
# 4. 保存数据
109+
self.save_items(items)
110+
111+
pass
112+
113+
if __name__ == '__main__':
114+
spider = TiebaSpider("英雄联盟",150)
115+
spider.run()
116+
```

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /