Commit da69b37

committed

用 Python 来了解一下《安家》

1 parent 13df7b1 commit da69b37Copy full SHA for da69b37

File tree

10 files changed

+1305

-0

lines changed

anjia

10 files changed

+1305

-0

lines changed

`‎anjia/init.py`

Whitespace-only changes.

`‎anjia/actor.py`

Lines changed: 24 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,24 @@`
	`1`	`+import pandas as pd, jieba, matplotlib.pyplot as plt`
	`2`	`+`
	`3`	`+csv_data = pd.read_csv('data.csv')`
	`4`	`+roles = {'姑姑':0, '房似锦':0, '王子':0, '闪闪':0, '老油条':0, '楼山关':0, '鱼化龙':0}`
	`5`	`+names = list(roles.keys())`
	`6`	`+for name in names:`
	`7`	`+ jieba.add_word(name)`
	`8`	`+for row in csv_data['comments']:`
	`9`	`+ row = str(row)`
	`10`	`+ for name in names:`
	`11`	`+ count = row.count(name)`
	`12`	`+ roles[name] += count`
	`13`	`+plt.figure(figsize=(8, 5))`
	`14`	`+# 数据`
	`15`	`+plt.bar(list(roles.keys()), list(roles.values()), width=0.5, label='提及次数', color=['g', 'r', 'dodgerblue', 'c', 'm', 'y', 'aquamarine'])`
	`16`	`+# 设置数字标签`
	`17`	`+for a, b in zip(list(roles.keys()), list(roles.values())):`
	`18`	`+ plt.text(a, b, b, ha='center', va='bottom', fontsize=13, color='black')`
	`19`	`+plt.title('角色被提及次数柱状图')`
	`20`	`+plt.xticks(rotation=270)`
	`21`	`+plt.tick_params(labelsize=10)`
	`22`	`+plt.ylim(0, 30)`
	`23`	`+plt.legend(loc='upper right')`
	`24`	`+plt.show()`

`‎anjia/anjia.png`

195 KB

Loading[フレーム]

`‎anjia/bg.jpg`

30.8 KB

Loading[フレーム]

`‎anjia/cloud.py`

Lines changed: 51 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,51 @@`
	`1`	`+from wordcloud import WordCloud`
	`2`	`+import numpy as np, jieba`
	`3`	`+from PIL import Image`
	`4`	`+`
	`5`	`+def jieba_():`
	`6`	`+ # 打开评论数据文件`
	`7`	`+ content = open('comment.csv', 'rb').read()`
	`8`	`+ # jieba 分词`
	`9`	`+ word_list = jieba.cut(content)`
	`10`	`+ words = []`
	`11`	`+ # 过滤掉的词`
	`12`	`+ remove_words = ['以及', '不会', '一些', '那个', '只有',`
	`13`	`+ '不过', '东西', '这个', '所有', '这么',`
	`14`	`+ '但是', '全片', '一点', '一部', '一个',`
	`15`	`+ '什么', '虽然', '一切', '样子', '一样',`
	`16`	`+ '只能', '不是', '一种', '这个', '为了']`
	`17`	`+ for word in word_list:`
	`18`	`+ if word not in remove_words:`
	`19`	`+ words.append(word)`
	`20`	`+ global word_cloud`
	`21`	`+ # 用逗号隔开词语`
	`22`	`+ word_cloud = ','.join(words)`
	`23`	`+`
	`24`	`+def cloud():`
	`25`	`+ # 打开词云背景图`
	`26`	`+ cloud_mask = np.array(Image.open('bg.jpg'))`
	`27`	`+ # 定义词云的一些属性`
	`28`	`+ wc = WordCloud(`
	`29`	`+ # 背景图分割颜色为白色`
	`30`	`+ background_color='white',`
	`31`	`+ # 背景图样`
	`32`	`+ mask=cloud_mask,`
	`33`	`+ # 显示最大词数`
	`34`	`+ max_words=100,`
	`35`	`+ # 显示中文`
	`36`	`+ font_path='./fonts/simhei.ttf',`
	`37`	`+ # 最大尺寸`
	`38`	`+ max_font_size=80`
	`39`	`+ )`
	`40`	`+ global word_cloud`
	`41`	`+ # 词云函数`
	`42`	`+ x = wc.generate(word_cloud)`
	`43`	`+ # 生成词云图片`
	`44`	`+ image = x.to_image()`
	`45`	`+ # 展示词云图片`
	`46`	`+ image.show()`
	`47`	`+ # 保存词云图片`
	`48`	`+ wc.to_file('anjia.png')`
	`49`	`+`
# Tokenize the comments first (this sets the global `word_cloud`), then render the cloud from it.
jieba_()
cloud()

`‎anjia/comment.csv`

Lines changed: 554 additions & 0 deletions

Large diffs are not rendered by default.

`‎anjia/comment.py`

Lines changed: 26 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,26 @@`
	`1`	`+import pandas as pd, matplotlib.pyplot as plt`
	`2`	`+`
	`3`	`+csv_data = pd.read_csv('data.csv')`
	`4`	`+df = pd.DataFrame(csv_data)`
	`5`	`+df_gp = df.groupby(['time']).size()`
	`6`	`+values = df_gp.values.tolist()`
	`7`	`+index = df_gp.index.tolist()`
	`8`	`+# 设置画布大小`
	`9`	`+plt.figure(figsize=(10, 6))`
	`10`	`+# 数据`
	`11`	`+# plt.plot(index, values, label='weight changes', linewidth=3, color='r', marker='o',`
	`12`	`+# markerfacecolor='blue', markersize=20)`
	`13`	`+plt.plot(index, values, label='评论数')`
	`14`	`+# 设置数字标签`
	`15`	`+for a, b in zip(index, values):`
	`16`	`+ plt.text(a, b, b, ha='center', va='bottom', fontsize=13, color='black')`
	`17`	`+plt.title('评论数随时间变化折线图')`
	`18`	`+# plt.xlabel('日期')`
	`19`	`+# plt.ylabel('评论数')`
	`20`	`+plt.xticks(rotation=330)`
	`21`	`+plt.tick_params(labelsize=10)`
	`22`	`+plt.ylim(0, 200)`
	`23`	`+plt.legend(loc='upper right')`
	`24`	`+plt.show()`
	`25`	`+`
	`26`	`+`

`‎anjia/data.csv`

Lines changed: 555 additions & 0 deletions

Large diffs are not rendered by default.

`‎anjia/spd.py`

Lines changed: 65 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,65 @@`
	`1`	`+import requests, time, random, pandas as pd`
	`2`	`+from lxml import etree`
	`3`	`+`
	`4`	`+def spider():`
	`5`	`+ url = 'https://accounts.douban.com/j/mobile/login/basic'`
	`6`	`+ headers = {"User-Agent": 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'}`
	`7`	`+ # 安家评论网址,为了动态翻页,start 后加了格式化数字,短评页面有 20 条数据,每页增加 20 条`
	`8`	`+ url_comment = 'https://movie.douban.com/subject/30482003/comments?start=%d&limit=20&sort=new_score&status=P'`
	`9`	`+ data = {`
	`10`	`+ 'ck': '',`
	`11`	`+ 'name': '自己的用户',`
	`12`	`+ 'password': '自己的密码',`
	`13`	`+ 'remember': 'false',`
	`14`	`+ 'ticket': ''`
	`15`	`+ }`
	`16`	`+ session = requests.session()`
	`17`	`+ session.post(url=url, headers=headers, data=data)`
	`18`	`+ # 初始化 4 个 list 分别存用户名、评星、时间、评论文字`
	`19`	`+ users = []`
	`20`	`+ stars = []`
	`21`	`+ times = []`
	`22`	`+ content = []`
	`23`	`+ # 抓取 500 条,每页 20 条,这也是豆瓣给的上限`
	`24`	`+ for i in range(0, 500, 20):`
	`25`	`+ # 获取 HTML`
	`26`	`+ data = session.get(url_comment % i, headers=headers)`
	`27`	`+ # 状态码 200 表是成功`
	`28`	`+ print('第', i, '页', '状态码:',data.status_code)`
	`29`	`+ # 暂停 0-1 秒时间,防止IP被封`
	`30`	`+ time.sleep(random.random())`
	`31`	`+ # 解析 HTML`
	`32`	`+ selector = etree.HTML(data.text)`
	`33`	`+ # 用 xpath 获取单页所有评论`
	`34`	`+ comments = selector.xpath('//div[@class="comment"]')`
	`35`	`+ # 遍历所有评论,获取详细信息`
	`36`	`+ for comment in comments:`
	`37`	`+ # 获取用户名`
	`38`	`+ user = comment.xpath('.//h3/span[2]/a/text()')[0]`
	`39`	`+ # 获取评星`
	`40`	`+ star = comment.xpath('.//h3/span[2]/span[2]/@class')[0][7:8]`
	`41`	`+ # 获取时间`
	`42`	`+ date_time = comment.xpath('.//h3/span[2]/span[3]/@title')`
	`43`	`+ # 有的时间为空,需要判断下`
	`44`	`+ if len(date_time) != 0:`
	`45`	`+ date_time = date_time[0]`
	`46`	`+ date_time = date_time[:10]`
	`47`	`+ else:`
	`48`	`+ date_time = None`
	`49`	`+ # 获取评论文字`
	`50`	`+ comment_text = comment.xpath('.//p/span/text()')[0].strip()`
	`51`	`+ # 添加所有信息到列表`
	`52`	`+ users.append(user)`
	`53`	`+ stars.append(star)`
	`54`	`+ times.append(date_time)`
	`55`	`+ content.append(comment_text)`
	`56`	`+ # 用字典包装`
	`57`	`+ comment_dic = {'user': users, 'star': stars, 'time': times, 'comments': content}`
	`58`	`+ # 转换成 DataFrame 格式`
	`59`	`+ comment_df = pd.DataFrame(comment_dic)`
	`60`	`+ # 保存数据`
	`61`	`+ comment_df.to_csv('data.csv')`
	`62`	`+ # 将评论单独再保存下来`
	`63`	`+ comment_df['comments'].to_csv('comment.csv', index=False)`
	`64`	`+`
# Run the scraper as soon as this module is executed.
spider()

`‎anjia/star.py`

Lines changed: 30 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,30 @@`
	`1`	`+import pandas as pd, numpy as np, matplotlib.pyplot as plt`
	`2`	`+`
	`3`	`+csv_data = pd.read_csv('data.csv')`
	`4`	`+df_time = csv_data.groupby(['time']).size()`
	`5`	`+df_star = csv_data.groupby(['star']).size()`
	`6`	`+index = df_time.index.tolist()`
	`7`	`+value = [0] * len(index)`
	`8`	`+# 生成字典`
	`9`	`+dic = dict(zip(index, value))`
	`10`	`+# rows = df.loc[df['time'] == '2020年03月05日', 'star']`
	`11`	`+# list = list(map(int, rows.values.tolist()))`
	`12`	`+# avg = np.mean(list)`
	`13`	`+# print(list)`
	`14`	`+# print(avg)`
	`15`	`+for k, v in dic.items():`
	`16`	`+ stars = csv_data.loc[csv_data['time'] == str(k), 'star']`
	`17`	`+ # 平均值`
	`18`	`+ avg = np.mean(list(map(int, stars.values.tolist())))`
	`19`	`+ dic[k] = round(avg ,2)`
	`20`	`+# 设置画布大小`
	`21`	`+plt.figure(figsize=(9, 6))`
	`22`	`+# 数据`
	`23`	`+plt.plot(list(dic.keys()), list(dic.values()), label='星级', color='red', marker='o')`
	`24`	`+plt.title('星级随时间变化折线图')`
	`25`	`+plt.xticks(rotation=330)`
	`26`	`+plt.tick_params(labelsize=10)`
	`27`	`+plt.ylim(0, 5)`
	`28`	`+plt.legend(loc='upper right')`
	`29`	`+plt.show()`
	`30`	`+`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit da69b37

File tree

10 files changed

10 files changed

`‎anjia/init.py`

`‎anjia/actor.py`

`‎anjia/anjia.png`

`‎anjia/bg.jpg`

`‎anjia/cloud.py`

`‎anjia/comment.csv`

`‎anjia/comment.py`

`‎anjia/data.csv`

`‎anjia/spd.py`

`‎anjia/star.py`

0 commit comments