Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 60c2db2

Browse files
committed
python showdata
1 parent 9e639b4 commit 60c2db2

25 files changed

+99952
-0
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
# @Time : 2020年8月27日 14:06
4+
# @Author : way
5+
# @Site :
6+
# @Describe: 通过 ip 获取所在省份
7+
8+
import sys
9+
import json
10+
import requests
11+
import os
12+
13+
# Baidu API key ("ak"); apply for your own at
# http://lbsyun.baidu.com/index.php?title=webapi/ip-api
ak = "<换成你的ak>"


def _load_ip_cache(path):
    """Read a tab-separated ``ip<TAB>province`` file into a dict.

    Returns an empty dict when *path* does not exist. Lines that do not
    consist of exactly two tab-separated fields, or whose IP field is blank
    (blank lines, truncated writes), are skipped instead of aborting the
    import of this module. A line with an empty province is kept, because
    the lookup function appends such lines when the API reports no province.
    """
    cache = {}
    if not os.path.exists(path):
        return cache
    # NOTE(review): the file is opened with the platform default encoding,
    # the same way it is appended to elsewhere in this module; keep both
    # sides in sync if an explicit encoding is ever introduced.
    with open(path, "r") as f:
        for line in f:
            # Only strip the line terminator here: stripping all whitespace
            # would eat the tab that precedes an empty province.
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) != 2:
                continue
            ip, province = fields[0].strip(), fields[1].strip()
            if not ip:
                continue
            cache[ip] = province
    return cache


# In-memory ip -> province cache, pre-populated from the on-disk cache file.
ipCache = _load_ip_cache("ip_cache.txt")
23+
24+
def ip2province(ip, timeout=10):
    """Return the province for *ip*, consulting the cache before the Baidu API.

    Args:
        ip: IP address string to look up.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The cached province when *ip* is already in ``ipCache``. Otherwise
        the second ``|``-separated field of the ``address`` value in the API
        response, which is also stored in ``ipCache`` and appended to
        ``ip_cache.txt``. Returns "未知" when the request fails or the
        response does not have the expected shape; failures are not cached.
    """
    province = ipCache.get(ip)
    if province is not None:
        return province

    try:
        # Let requests build and encode the query string, and never wait
        # indefinitely on a stalled connection.
        response = requests.get(
            "https://api.map.baidu.com/location/ip",
            params={"ak": ak, "ip": ip, "coor": "bd09ll"},
            timeout=timeout,
        )
        province = response.json()['address'].split('|')[1]
    except Exception:
        # Deliberate best-effort boundary: any network, decoding or
        # response-shape problem maps to the "unknown" marker.
        return "未知"

    ipCache[ip] = province
    try:
        # Persist the new entry so later runs can skip the network call.
        with open("ip_cache.txt", "a") as f:
            f.write(ip + "\t" + province + "\n")
    except OSError as err:
        # A failed write must not discard a successful lookup; otherwise the
        # first call would report "unknown" while later calls (served from
        # ipCache) would report the real province.
        print("[ip2province] could not append to ip_cache.txt:", err,
              file=sys.stderr)
    return province
39+
40+
if __name__ == '__main__':
    # Filter mode: read tab-separated records from stdin, treat the first
    # column as an IP address and emit "<province>\t<ip>" for each record.
    for raw_line in sys.stdin:
        fields = raw_line.replace('\n', '').split('\t')
        # NOTE(review): this echoes the parsed columns to stdout as well,
        # interleaved with the real output; looks like leftover debugging.
        print(fields)
        address = fields[0]
        record = [ip2province(address), address]
        sys.stdout.write('\t'.join(record) + '\n')

‎taiyangxue/showdata/analyse/main.py‎

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import re
2+
import os
3+
import datetime
4+
from baidu_api import ip2province
5+
import pandas as pd
6+
import openpyxl
7+
from openpyxl import load_workbook
8+
9+
# Access-log line pattern using named groups, one fragment per field.
# Adjacent raw-string literals are concatenated into a single pattern.
obj = re.compile(
    r'(?P<ip>.*?)- - '           # client address (may keep a trailing space)
    r'\[(?P<time>.*?)\] '        # bracketed timestamp
    r'"(?P<request>.*?)" '       # quoted request line
    r'(?P<status>.*?) '          # status code
    r'(?P<bytes>.*?) '           # byte count
    r'"(?P<referer>.*?)" '       # quoted referer
    r'"(?P<ua>.*?)"'             # quoted user agent
)
11+
def load_log(path):
    """Parse every line of the log file at *path*.

    Each stripped line is handed to ``parse``. Lines for which it returns a
    truthy value are collected as parsed records; all other lines are
    collected verbatim as dirty data. A progress message is printed every
    1000 lines.

    Returns:
        A ``(records, dirty_lines)`` tuple of two lists.
    """
    records = []
    dirty_lines = []
    with open(path, mode="r", encoding="utf-8") as log_file:
        for line_no, raw in enumerate(log_file, start=1):
            text = raw.strip()
            parsed = parse(text)
            if not parsed:
                dirty_lines.append(text)  # unparseable / rejected line
            else:
                records.append(parsed)  # successfully parsed record
            if line_no % 1000 == 0:
                print(line_no, "行")
    return records, dirty_lines
27+
28+
def parse(line):
    """Parse a single nginx access-log line into a dict of analysis fields.

    Returns:
        dict: keys ``ip``, ``province``, ``status``, ``time`` (naive
            ``datetime`` holding the wall-clock time as logged), ``hour``,
            ``request`` (URL without its query string), ``ua`` (device
            label) and ``referer``.
        False: when the line has no client IP (``-`` or empty).
        None: when the line does not match the log pattern or its timestamp
            or request field cannot be parsed; a diagnostic is printed.
    """
    result = obj.match(line)
    if result is None:
        # Handle the no-match case explicitly instead of relying on an
        # AttributeError from calling .group() on None.
        print("[parse]", line, "-->", "line does not match the access-log pattern")
        return None

    # A record without a client IP is useless for the analysis: drop it.
    ip = result.group("ip")
    if ip.strip() in ("-", ""):
        return False
    ip = ip.split(",")[0].strip()  # several comma-separated IPs: keep the first

    try:
        # e.g. "21/Dec/2019:21:45:31 +0800". Drop whatever UTC offset follows
        # the timestamp rather than only the literal " +0800", so logs written
        # in other timezones are not rejected.
        raw_time = result.group("time")
        t = datetime.datetime.strptime(raw_time.split()[0], "%d/%b/%Y:%H:%M:%S")

        # The request looks like "<method> <url> <protocol>"; keep the URL
        # and cut off the query string that follows "?".
        url = result.group("request").split()[1].split("?")[0]
    except (ValueError, IndexError) as e:
        print("[parse]", line, "-->", e)
        return None

    # Classify the client by the first marker found in the user agent;
    # the order of the markers is significant.
    ua = result.group("ua")
    ua_markers = (
        ("Windows NT", "windows"),
        ("iPad", "ipad"),
        ("Android", "android"),
        ("Macintosh", "mac"),
        ("iPhone", "iphone"),
    )
    device = next((label for marker, label in ua_markers if marker in ua),
                  "其他设备")

    # The province lookup may hit the network, so it is only performed once
    # the rest of the line is known to be usable.
    return {
        'ip': ip,
        'province': ip2province(ip),
        'status': result.group("status"),
        'time': t,
        'hour': t.hour,
        'request': url,
        'ua': device,
        'referer': result.group("referer"),
    }
81+
82+
def analyse(lst, datafile):
    """Aggregate parsed log records and store the counts in an Excel file.

    Args:
        lst: List of record dicts; the ``province``, ``hour`` and ``ua``
            keys are used here.
        datafile: Path of the workbook that receives one sheet per statistic.
    """
    df = pd.DataFrame(lst)

    def count_by(column):
        # Produce a two-column frame (<column>, "count"). Naming the index
        # and the counts explicitly gives the same layout on every pandas
        # version; the older reset_index()/rename() idiom ends up with two
        # "count" columns on pandas >= 2.0, which breaks sorting by column.
        return df[column].value_counts().rename_axis(column).reset_index(name="count")

    # Requests per province
    province_count_df = count_by('province')

    # Requests per hour of day, in chronological order
    hour_count_df = count_by('hour').sort_values(by='hour')

    # Requests per client type
    ua_count_df = count_by('ua')

    # Persist each statistic to its own sheet
    to_excel(province_count_df, datafile, sheet_name='省份')
    to_excel(hour_count_df, datafile, sheet_name='按时')
    to_excel(ua_count_df, datafile, sheet_name='客户端')
98+
99+
def to_excel(dataframe, filepath, sheet_name):
    """Write *dataframe* to sheet *sheet_name* of the workbook at *filepath*.

    The frame is written without index and without header row. When the
    workbook already exists the sheet is added to it; otherwise a new
    workbook is created.

    NOTE(review): what happens when *sheet_name* already exists in the
    workbook depends on the installed pandas version (newer releases expose
    ``if_sheet_exists`` on ``ExcelWriter`` and raise by default) -- confirm
    the desired behaviour for re-runs.
    """
    if os.path.exists(filepath):
        # Open the existing workbook in append mode instead of assigning to
        # ``writer.book``, which newer pandas releases no longer allow, and
        # use a context manager so the file is saved and closed even when
        # writing fails.
        with pd.ExcelWriter(filepath, engine='openpyxl', mode='a') as writer:
            dataframe.to_excel(writer, sheet_name=sheet_name,
                               index=False, header=False)
    else:
        dataframe.to_excel(filepath, sheet_name=sheet_name,
                           index=False, header=False)
108+
109+
if __name__ == '__main__':
    # Script entry point: parse the access log in the working directory and
    # write the aggregated statistics to data.xlsx. The rejected lines are
    # returned by load_log but not used any further here.
    parsed = load_log("nginx_access.log")
    lst, error_lst = parsed
    analyse(lst, "data.xlsx")

‎taiyangxue/showdata/ip_cache.txt‎

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
124.64.19.27 北京
2+
124.64.18.118 北京
3+
114.246.34.133 北京
4+
123.119.247.136 北京
5+
221.219.132.144 北京
6+
116.2.39.176 辽宁
7+
36.104.125.224 吉林
8+
124.64.16.10 北京
9+
124.64.19.17 北京
10+
124.64.19.198 北京
11+
61.181.218.54 天津
12+
124.64.17.231 北京
13+
218.69.54.122 天津
14+
124.64.17.228 北京
15+
221.192.179.8 河北
16+
211.94.246.253 天津
17+
211.94.239.98 天津
18+
114.242.249.80 北京
19+
114.242.250.129 北京
20+
101.91.60.81 江苏
21+
221.192.179.30 河北
22+
43.249.136.26 天津
23+
124.64.16.188 北京
24+
114.242.248.31 北京
25+
124.64.18.198 北京
26+
218.68.91.101 辽宁
27+
61.148.245.90 北京
28+
117.10.206.88 天津
29+
203.208.60.64 福建
30+
203.208.60.3 福建
31+
203.208.60.60 福建
32+
124.64.19.57 北京
33+
123.151.77.91 河北
34+
124.64.18.144 北京
35+
123.151.76.158 天津
36+
61.148.243.176 北京
37+
124.64.17.67 北京
38+
223.104.3.204 北京
39+
124.64.16.93 北京
40+
211.94.208.121 天津
41+
211.94.246.15 天津
42+
124.64.16.138 北京
43+
61.148.243.38 北京
44+
120.244.52.54 北京
45+
124.64.16.136 北京
46+
124.64.18.217 北京
47+
120.245.4.41 北京
48+
211.94.195.158 天津
49+
124.64.16.135 北京
50+
123.151.76.248 天津
51+
61.148.244.12 北京
52+
124.64.16.253 北京
53+
218.68.91.112 辽宁
54+
124.64.18.76 北京
55+
211.94.238.171 天津
56+
125.39.46.56 辽宁
57+
111.196.106.59 北京
58+
114.241.45.159 北京
59+
211.94.225.8 天津
60+
123.150.174.182 天津
61+
61.181.236.214 天津
62+
124.64.16.98 北京
63+
61.148.243.25 北京
64+
124.64.17.42 北京
65+
123.151.77.81 河北
66+
211.94.230.118 天津
67+
103.3.96.2 天津
68+
114.242.250.155 北京
69+
114.240.67.63 北京
70+
124.64.16.221 北京
71+
211.94.252.135 天津
72+
124.64.16.62 北京
73+
218.69.61.133 天津
74+
124.64.19.236 北京
75+
114.242.248.188 北京
76+
117.10.207.38 天津
77+
124.64.16.19 北京
78+
211.94.251.187 天津
79+
139.214.251.167 吉林
80+
123.151.77.70 河北
81+
111.30.142.186 河北
82+
111.30.142.227 河北
83+
223.104.236.216 辽宁
84+
111.30.142.78 河北
85+
223.104.3.186 北京
86+
180.97.118.219 江苏
87+
124.64.18.192 北京
88+
122.115.226.173 北京
89+
220.194.106.92 天津
90+
220.194.106.94 北京
91+
202.99.89.162 天津
92+
124.64.19.220 北京
93+
124.64.18.4 北京
94+
61.148.245.99 北京
95+
223.104.3.198 北京
96+
211.94.239.184 天津
97+
125.39.132.94 北京
98+
211.94.225.128 天津
99+
61.151.207.158 上海
100+
117.136.38.145 北京
101+
223.104.227.204 天津
102+
103.3.96.18 天津
103+
113.96.232.118 重庆
104+
223.104.3.11 北京
105+
211.94.239.180 天津
106+
139.214.246.116 吉林
107+
211.94.237.208 天津
108+
101.89.239.230 上海
109+
202.99.112.190 天津
110+
124.64.17.172 北京
111+
124.64.19.5 北京
112+
61.148.243.117 北京
113+
221.192.179.120 河北
114+
122.97.175.103 江苏
115+
202.99.113.50 天津
116+
218.67.234.74 天津
117+
58.218.133.250 江苏
118+
122.97.175.148 江苏
119+
112.224.67.28 山东
120+
61.151.207.252 江苏
121+
61.181.218.93 天津
122+
117.10.206.177 天津
123+
223.104.236.240 辽宁
124+
123.151.77.123 河北
125+
220.194.107.221 北京
126+
110.251.15.159 河北
127+
175.24.45.114 上海
128+
124.64.19.86 北京
129+
220.194.107.222 北京
130+
221.192.178.44 河北
131+
114.242.248.155 北京
132+
223.104.175.237 辽宁
133+
223.104.175.86 辽宁
134+
211.94.240.149 天津
135+
223.104.176.106 辽宁
136+
221.192.179.167 河北
137+
124.64.17.217 北京
138+
222.186.136.164 江苏
139+
124.64.18.245 北京
140+
211.94.208.8 天津
141+
223.104.176.23 辽宁
142+
122.97.175.145 江苏
143+
211.94.253.31 天津
144+
36.104.39.237 吉林
145+
221.192.179.96 河北
146+
218.69.52.34 天津
147+
211.94.254.22 天津
148+
223.104.103.17 河北
149+
61.148.243.204 北京
150+
124.64.17.54 北京
151+
139.214.251.83 吉林
152+
139.214.244.217 吉林
153+
124.64.19.162 北京
154+
117.136.54.52 天津
155+
220.181.108.101 广东
156+
220.181.108.171 广东
157+
111.206.221.22 北京
158+
111.206.221.45 北京
159+
111.206.221.11 北京
160+
111.206.198.26 北京
161+
111.206.221.43 北京
162+
111.206.198.101 北京
163+
111.206.221.108 北京
164+
117.10.206.50 天津
165+
103.3.96.166 天津
166+
61.181.219.241 天津
167+
124.64.19.43 北京
168+
103.3.97.8 天津
169+
61.148.243.124 北京
170+
124.64.19.186 北京
171+
221.192.179.34 河北
172+
36.104.122.38 吉林
173+
221.192.180.153 河北
174+
211.94.245.65 天津
175+
221.192.178.240 河北
176+
220.181.108.177 广东
177+
220.181.108.161 广东
178+
111.206.198.50 北京
179+
111.206.198.119 北京
180+
111.206.198.70 北京
181+
111.206.198.6 北京
182+
111.206.221.102 北京
183+
111.206.221.44 北京
184+
111.206.198.41 北京
185+
36.98.226.192 河北
186+
124.64.19.93 北京

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /