|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | +""" |
| 4 | +@author: 闲欢 |
| 5 | +""" |
| 6 | +import json |
| 7 | +import pandas as pd |
| 8 | +from pyecharts.charts import Bar, Pie |
| 9 | +from pyecharts import options as opts |
| 10 | +import jieba |
| 11 | +from PIL import Image |
| 12 | +from wordcloud import WordCloud |
| 13 | +from matplotlib import pyplot as plt |
| 14 | +import numpy as np |
| 15 | +from os import path |
| 16 | + |
# Letter sizes, ordered so that longer labels are tested first: 'XXL' must be
# tried before 'XL', and 'XL'/'XS' before the bare 'L'/'S' they contain,
# because matching below is a plain substring test.
size = ['XXL', 'XL', 'XS', 'S', 'M', 'L']

# Substrings removed from colour labels (in this order) so that variants of the
# same colour are counted together.
_COLOR_NOISE = ("开扣", "套头", "文胸", "套装", "(薄杯)", "(厚杯)")

color = []     # cleaned colour label of each matching SKU attribute
size1 = []     # letter sizes, each one an entry of `size`
size2 = []     # every other size label, with the '尺码:' prefix removed
comments = []  # the 'content' field of every record

# comments.txt is read as one JSON object per line.
# NOTE(review): assumes every record has a 'content' string and a 'skuInfo'
# iterable of strings such as '颜色:...' / '尺码:...' -- confirm against the scraper.
with open("comments.txt", "r", encoding="utf-8") as f:
    for line in f:
        # A blank line (for example an extra trailing newline) is not valid
        # JSON; skip it instead of letting json.loads abort the whole run.
        if not line.strip():
            continue

        data_obj = json.loads(line)
        comments.append(data_obj['content'])
        skuinfo = data_obj['skuInfo']
        for sku in skuinfo:
            if '颜色' in sku and '内裤' not in sku:
                # Drop the field prefix, trim, then strip the noise words.
                label = sku.replace("颜色:", "").strip()
                for noise in _COLOR_NOISE:
                    label = label.replace(noise, "")
                color.append(label)
            elif '尺码' in sku:
                # First letter size contained in the attribute, if any.
                letter = next((s for s in size if s in sku), None)
                if letter is not None:
                    size1.append(letter)
                elif '适合' not in sku:
                    # Not a letter size: keep the raw label, except for the
                    # "适合..." (suitable for ...) style entries, which are
                    # discarded.
                    size2.append(sku.replace('尺码:', ""))
# Colour distribution: count each colour label and render a bar chart.
df = pd.DataFrame(color, columns=['color'])
analyse_color = df['color'].value_counts()

color_labels = analyse_color.index.tolist()
color_counts = analyse_color.tolist()

# Axis labels are rotated by -90 degrees so long colour names do not overlap.
bar = (
    Bar()
    .add_xaxis(color_labels)
    .add_yaxis("", color_counts)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="颜色分布"),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=-90)),
    )
)
bar.render('color.html')
| 59 | + |
| 60 | + |
# Size distribution (letter sizes collected in `size1`), rendered as a bar chart.
df2 = pd.DataFrame(size1, columns=['size'])
analyse_size = df2['size'].value_counts()

letter_labels = analyse_size.index.tolist()
letter_counts = analyse_size.tolist()

bar = (
    Bar()
    .add_xaxis(letter_labels)
    .add_yaxis("", letter_counts)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="尺寸分布"),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=0)),
    )
)
bar.render('size1.html')
| 74 | + |
# Size distribution for the labels collected in `size2` (the ones that did not
# match a letter size), rendered as a second bar chart.
df2 = pd.DataFrame(size2, columns=['size'])
analyse_size = df2['size'].value_counts()

other_labels = analyse_size.index.tolist()
other_counts = analyse_size.tolist()

bar = (
    Bar()
    .add_xaxis(other_labels)
    .add_yaxis("", other_counts)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="尺寸分布"),
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=0)),
    )
)
bar.render('size2.html')
| 87 | + |
| 88 | + |
| 89 | + |
# Review visualisation: merge every review into one corpus for the word cloud.
text = " ".join(comments)


def gen_wc_split_text(text='There is no txt', max_words=None, background_color=None,
                      font_path='/System/Library/Fonts/PingFang.ttc',
                      output_path='', output_name='',
                      mask_path=None, mask_name=None,
                      width=400, height=200, max_font_size=100, axis='off'):
    """Segment *text* with jieba, build a word cloud, save it and display it.

    Args:
        text: Raw text to visualise.
        max_words: Passed to ``WordCloud(max_words=...)``.
        background_color: Passed to ``WordCloud(background_color=...)``.
        font_path: Font file used to draw the words. The default is a macOS
            system font; pass a font that exists on the current machine and
            contains CJK glyphs when running elsewhere.
        output_path: Directory the image is written to.
        output_name: File name of the image. When empty, nothing is saved.
        mask_path: Directory of the mask image, or the full path of the mask
            image when ``mask_name`` is ``None``. ``None`` disables the mask.
        mask_name: File name of the mask image inside ``mask_path``.
        width: Passed to ``WordCloud(width=...)``.
        height: Passed to ``WordCloud(height=...)``.
        max_font_size: Passed to ``WordCloud(max_font_size=...)``.
        axis: Argument forwarded to ``plt.axis`` (``'off'`` hides the axes).

    Returns:
        None. The result is the saved image file and the displayed figure.
    """
    # Put a space between jieba's segments so the word cloud receives
    # separate words rather than one unbroken run of characters.
    split_text = ' '.join(jieba.cut(text, cut_all=False))

    # Optional mask image.
    mask = None
    if mask_path is not None:
        # Without a mask_name, treat mask_path as the complete file path
        # instead of failing inside path.join(mask_path, None).
        mask_file = mask_path if mask_name is None else path.join(mask_path, mask_name)
        mask = np.array(Image.open(mask_file))

    wordcloud = WordCloud(
        background_color=background_color,
        mask=mask,
        max_words=max_words,
        max_font_size=max_font_size,
        width=width,
        height=height,
        # Without a font that covers Chinese, the words may render as garbage.
        font_path=font_path,
    )
    myword = wordcloud.generate(split_text)

    # Save first: plt.show() may block until the window is closed (or fail on
    # a machine without a display), and the image should be written regardless.
    if output_name:
        wordcloud.to_file(path.join(output_path, output_name))

    # Display the word cloud.
    plt.imshow(myword)
    plt.axis(axis)
    plt.show()


gen_wc_split_text(text, output_name='comments_wc.png', output_path='./')
0 commit comments