Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit da69b37

Browse files
committed
用 Python 来了解一下《安家》
1 parent 13df7b1 commit da69b37

File tree

10 files changed

+1305
-0
lines changed

10 files changed

+1305
-0
lines changed

‎anjia/__init__.py

Whitespace-only changes.

‎anjia/actor.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import pandas as pd, jieba, matplotlib.pyplot as plt
2+
3+
# Load the comment table from data.csv.
csv_data = pd.read_csv('data.csv')

# Character names whose mentions are counted.
names = ['姑姑', '房似锦', '王子', '闪闪', '老油条', '楼山关', '鱼化龙']

# Register every name with jieba's dictionary.
for name in names:
    jieba.add_word(name)

# Stringify each comment once, then total the substring occurrences per name.
comment_texts = [str(row) for row in csv_data['comments']]
roles = {
    name: sum(text.count(name) for text in comment_texts)
    for name in names
}
mentions = list(roles.values())

# Canvas size.
plt.figure(figsize=(8, 5))

# One bar per character, each with its own colour.
bar_colors = ['g', 'r', 'dodgerblue', 'c', 'm', 'y', 'aquamarine']
plt.bar(names, mentions, width=0.5, label='提及次数', color=bar_colors)

# Write the count on top of each bar.
for role_name, total in roles.items():
    plt.text(role_name, total, total, ha='center', va='bottom', fontsize=13, color='black')

# Title, axis cosmetics, legend, then display.
plt.title('角色被提及次数柱状图')
plt.xticks(rotation=270)
plt.tick_params(labelsize=10)
plt.ylim(0, 30)
plt.legend(loc='upper right')
plt.show()

‎anjia/anjia.png

195 KB
Loading[フレーム]

‎anjia/bg.jpg

30.8 KB
Loading[フレーム]

‎anjia/cloud.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
from wordcloud import WordCloud
2+
import numpy as np, jieba
3+
from PIL import Image
4+
5+
def jieba_():
    """Tokenize the saved comments and publish them as one comma-joined string.

    Reads ``comment.csv`` as raw bytes, segments the content with
    ``jieba.cut``, drops the filler words listed below and stores the
    remaining tokens, joined by commas, in the module-level global
    ``word_cloud``.
    """
    global word_cloud
    # Read the comment file inside a context manager so the handle is closed
    # as soon as the bytes are in memory.
    with open('comment.csv', 'rb') as comment_file:
        content = comment_file.read()
    # Filler words to exclude from the cloud; a set gives fast membership
    # tests and collapses the duplicate entry the original list contained.
    remove_words = {
        '以及', '不会', '一些', '那个', '只有',
        '不过', '东西', '这个', '所有', '这么',
        '但是', '全片', '一点', '一部', '一个',
        '什么', '虽然', '一切', '样子', '一样',
        '只能', '不是', '一种', '为了',
    }
    # Segment with jieba and keep every token that is not a filler word.
    words = [word for word in jieba.cut(content) if word not in remove_words]
    # Separate the tokens with commas.
    word_cloud = ','.join(words)
23+
24+
def cloud():
    """Render, display and save a word cloud from the global ``word_cloud``.

    The image ``bg.jpg`` is used as the mask. The generated picture is shown
    with the default image viewer and written to ``anjia.png``.

    Raises:
        NameError: if the global ``word_cloud`` has not been set before this
            function is called.
    """
    global word_cloud
    # Load the mask image into an array inside a context manager so the
    # underlying file is closed once the pixel data has been copied.
    with Image.open('bg.jpg') as mask_image:
        cloud_mask = np.array(mask_image)
    # Word cloud settings.
    wc = WordCloud(
        # Background colour.
        background_color='white',
        # Mask image.
        mask=cloud_mask,
        # Maximum number of words to draw.
        max_words=100,
        # Font file used so that Chinese text can be rendered.
        font_path='./fonts/simhei.ttf',
        # Largest font size.
        max_font_size=80
    )
    # Build the cloud from the comma-joined token string.
    x = wc.generate(word_cloud)
    # Convert the cloud to an image.
    image = x.to_image()
    # Show the image.
    image.show()
    # Save the image to disk.
    wc.to_file('anjia.png')
49+
50+
# Tokenize comment.csv; this sets the global word_cloud string read by cloud().
jieba_()
51+
# Build the word cloud from the global word_cloud string, show it and save it.
cloud()

‎anjia/comment.csv

Lines changed: 554 additions & 0 deletions
Large diffs are not rendered by default.

‎anjia/comment.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import pandas as pd, matplotlib.pyplot as plt
2+
3+
# Load data.csv and wrap the result in a DataFrame.
df = pd.DataFrame(pd.read_csv('data.csv'))

# Number of comments for each distinct value of the 'time' column.
per_day = df.groupby(['time']).size()
index = per_day.index.tolist()
values = per_day.values.tolist()

# Canvas size.
plt.figure(figsize=(10, 6))

# Line of comment counts over time.
plt.plot(index, values, label='评论数')

# Write the count above each data point.
for day, total in zip(index, values):
    plt.text(day, total, total, ha='center', va='bottom', fontsize=13, color='black')

# Title, axis cosmetics, legend, then display.
plt.title('评论数随时间变化折线图')
plt.xticks(rotation=330)
plt.tick_params(labelsize=10)
plt.ylim(0, 200)
plt.legend(loc='upper right')
plt.show()
25+
26+

‎anjia/data.csv

Lines changed: 555 additions & 0 deletions
Large diffs are not rendered by default.

‎anjia/spd.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
import requests, time, random, pandas as pd
2+
from lxml import etree
3+
4+
def spider():
    """Log in to Douban, scrape the show's short comments and save them as CSV.

    Posts the login form, then requests the comment listing with ``start``
    going from 0 to 480 in steps of 20. For every comment block found it
    collects the user name, star digit, date and comment text. A field whose
    XPath query matches nothing is stored as ``None`` instead of aborting the
    whole run. The results are written to ``data.csv`` (all columns) and
    ``comment.csv`` (comment text only).
    """
    url = 'https://accounts.douban.com/j/mobile/login/basic'
    headers = {"User-Agent": 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'}
    # Comment listing URL; ``start`` is a format placeholder so pages can be
    # walked 20 comments at a time.
    url_comment = 'https://movie.douban.com/subject/30482003/comments?start=%d&limit=20&sort=new_score&status=P'
    # Login form payload (fill in your own account and password).
    login_form = {
        'ck': '',
        'name': '自己的用户',
        'password': '自己的密码',
        'remember': 'false',
        'ticket': ''
    }

    def first(nodes):
        """Return the first XPath hit, or None when nothing matched."""
        return nodes[0] if nodes else None

    # Four parallel lists: user name, star, date, comment text.
    users = []
    stars = []
    times = []
    content = []
    # The context manager closes the session's connections when scraping ends.
    with requests.session() as session:
        session.post(url=url, headers=headers, data=login_form)
        # Fetch 500 comments, 20 per page.
        for i in range(0, 500, 20):
            # Fetch the HTML of one page.
            response = session.get(url_comment % i, headers=headers)
            # Status code 200 means success.
            print('第', i, '页', '状态码:', response.status_code)
            # Pause 0-1 seconds between requests to avoid being blocked.
            time.sleep(random.random())
            # Parse the HTML.
            selector = etree.HTML(response.text)
            if selector is None:
                # Nothing parseable came back for this page; move on.
                continue
            # Every comment block on the page.
            comments = selector.xpath('//div[@class="comment"]')
            for comment in comments:
                # User name.
                user = first(comment.xpath('.//h3/span[2]/a/text()'))
                # Star rating: one character taken from the span's class value.
                star_class = first(comment.xpath('.//h3/span[2]/span[2]/@class'))
                star = star_class[7:8] if star_class is not None else None
                # Date: first 10 characters of the title; it can be absent.
                date_title = first(comment.xpath('.//h3/span[2]/span[3]/@title'))
                date_time = date_title[:10] if date_title is not None else None
                # Comment text.
                raw_text = first(comment.xpath('.//p/span/text()'))
                comment_text = raw_text.strip() if raw_text is not None else None
                # Append everything so the four lists stay aligned.
                users.append(user)
                stars.append(star)
                times.append(date_time)
                content.append(comment_text)
    # Bundle the columns into a dict.
    comment_dic = {'user': users, 'star': stars, 'time': times, 'comments': content}
    # Convert to a DataFrame.
    comment_df = pd.DataFrame(comment_dic)
    # Save the full table.
    comment_df.to_csv('data.csv')
    # Save the comment text separately as well.
    comment_df['comments'].to_csv('comment.csv', index=False)
64+
65+
# Run the scraper; this executes at module level, writing data.csv and comment.csv.
spider()

‎anjia/star.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import pandas as pd, numpy as np, matplotlib.pyplot as plt
2+
3+
# Load data.csv.
csv_data = pd.read_csv('data.csv')

# Row counts per date and per star value (df_star is not used further below).
df_time = csv_data.groupby(['time']).size()
df_star = csv_data.groupby(['star']).size()

# Distinct dates, in the order the grouping returns them.
index = df_time.index.tolist()

# Map each date to the mean of its star values, rounded to two decimals.
dic = {}
for day in index:
    day_stars = csv_data.loc[csv_data['time'] == str(day), 'star']
    as_ints = [int(star) for star in day_stars.values.tolist()]
    dic[day] = round(np.mean(as_ints), 2)

# Canvas size.
plt.figure(figsize=(9, 6))

# Line of average stars over time.
averages = [dic[day] for day in index]
plt.plot(index, averages, label='星级', color='red', marker='o')

# Title, axis cosmetics, legend, then display.
plt.title('星级随时间变化折线图')
plt.xticks(rotation=330)
plt.tick_params(labelsize=10)
plt.ylim(0, 5)
plt.legend(loc='upper right')
plt.show()
30+

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /