Commit a90ece2

author: situxy
committed: add files
1 parent 7b28bbb commit a90ece2

8 files changed: +1524 -0 lines changed

‎analysis.py

Lines changed: 200 additions & 0 deletions
@@ -0,0 +1,200 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 29 20:30:12 2018

@author: situ
"""

import numpy as np
import pandas as pd
import os
import re

#os.chdir("E:/graduate/class/EDA/final")
os.chdir("/Users/situ/Documents/EDA/final")
data = pd.read_csv("data_with_skill.csv", encoding="gbk")
data.head()
data.info()

data.drop(["jobname", "jobgood", "url", "city"], axis=1, inplace=True)
# Numeric features ----------------------
# Working days per week (e.g. '3天/周' -> 3)
data.jobway.unique()
mapping = {}
for i in range(2, 7):
    mapping[str(i) + '天/周'] = i
print(mapping)
data['day_per_week'] = data['jobway'].map(mapping)
data['day_per_week'].head()


# Company size, bucketed into small / medium / large
data["size"].unique()
data["comp_size"] = ""
data.loc[data['size'] == '少于15人', "comp_size"] = '小型企业'
data.loc[data['size'] == '15-50人', "comp_size"] = '小型企业'
data.loc[data['size'] == '50-150人', "comp_size"] = '中型企业'
data.loc[data['size'] == '150-500人', "comp_size"] = '中型企业'
data.loc[data['size'] == '500-2000人', "comp_size"] = '大型企业'
data.loc[data['size'] == '2000人以上', "comp_size"] = '大型企业'

# Internship duration in months (e.g. '实习3个月' -> 3)
data.month.unique()
mapping = {}
for i in range(1, 22):
    mapping["实习" + str(i) + '个月'] = i
print(mapping)
data['time_span'] = data['month'].map(mapping)
data['time_span'] = data['time_span'].apply(lambda f: int(f))

# Daily wage: average of the "low-high/天" salary range
def get_mean_salary(s):
    return np.mean([int(i) for i in s[:(len(s) - 2)].split("-")])
data['average_wage'] = data['salary'].apply(lambda s: get_mean_salary(s))
data['average_wage'].head()

data.drop(['jobway', 'size', 'month', 'salary'], axis=1, inplace=True)

# Text/categorical features --------------------------------
# City
# Beijing, Shanghai, Hangzhou, Shenzhen, Guangzhou

def get_less_dummies(data, feature, useful_classes, prefix):
    """Dummy-encode only the listed classes; rows matching none are re-checked by substring search."""
    useful_classes_prefix = [prefix + "_" + token for token in useful_classes]
    dum = pd.get_dummies(data[feature], prefix=prefix).reindex(columns=useful_classes_prefix)
    if sum(np.sum(dum.isnull())) > 0:
        dum = dum.fillna(0)
    search_index = np.where(np.sum(dum, axis=1) == 0)[0]
    for j in range(len(useful_classes)):
        token = useful_classes[j]
        for i in search_index:
            if len(re.findall(token, data.loc[i, feature])) > 0:
                dum.loc[i, useful_classes_prefix[j]] = 1
#    print(dum.head())

    data = pd.concat([data, dum], axis=1)
    return data

feature = "address"
useful_classes = ["北京","上海","杭州","深圳","广州","成都","武汉"]
data = get_less_dummies(data, feature, useful_classes, prefix="city")

# Industry
# Internet, computing, finance, e-commerce and enterprise services



feature = "industry"
useful_classes = ["互联网","计算机","金融","电子商务","企业服务","广告","文化传媒","电子","通信"]
data = get_less_dummies(data, feature, useful_classes, "industry")

data.head()


data.drop(['address', 'industry'], axis=1, inplace=True)


# Major (field of study) requirements
def get_imp_info(data, feature, useful_classes, prefix):
    """Extract keyword indicators directly from the raw text."""
    useful_classes_prefix = [prefix + "_" + token for token in useful_classes]
    dum = pd.DataFrame(np.zeros((len(data), len(useful_classes))), columns=useful_classes_prefix)
    dum = dum.fillna(0)
    for j in range(len(useful_classes)):
        token = useful_classes[j]
#        print(token)
        for i in range(len(data)):
#            print(i)
            if len(re.findall(token, data.loc[i, feature].lower())) > 0:
                dum.loc[i, useful_classes_prefix[j]] = 1
    print(dum.head())

#    data = pd.concat([data,dum],axis = 1)
    return dum


feature = "contents"
useful_classes = ["统计","计算机","数学"]
dum = get_imp_info(data, feature, useful_classes, "subject")
data = pd.concat([data, dum], axis=1)
data.head()

# Skill requirements
def get_imp_info2(data, feature, useful_classes, prefix):
    """Extract keyword indicators from the whitespace-tokenized text."""
    useful_classes_prefix = [prefix + "_" + token for token in useful_classes]
    dum = pd.DataFrame(np.zeros((len(data), len(useful_classes))), columns=useful_classes_prefix)
    dum = dum.fillna(0)
    for j in range(len(useful_classes)):
        token = useful_classes[j]
#        print(token)
        for i in range(len(data)):
            word_list = data.loc[i, feature].split()
            if token in word_list:
                print(data.loc[i, feature])
                dum.loc[i, useful_classes_prefix[j]] = 1
    print(dum.head())

#    data = pd.concat([data,dum],axis = 1)
    return dum


feature = "contents"
#useful_classes = ["python","r语言","spss","excel","ppt","word","sql","sas","vba","office","msoffice",
#                  "hadoop","spark","hive","scala","hbase","java","matlab","linux","shell","c#"]
#                  "机器学习","数据挖掘","数学建模","自然语言处理","自然语言","文本挖掘",
useful_classes = ['excel', 'sql', 'python', 'sas', 'spss', 'hadoop', 'spark', 'hive', 'shell', 'java']
dum = get_imp_info(data, feature, useful_classes, "skill")
np.sum(dum)
# Top 10 required skills: excel sql python sas spss | hadoop spark hive shell java
data = pd.concat([data, dum], axis=1)
data.head()

# Skills vs. average salary
def mean_salary(useful_classes, data, salary, prefix):
    """Mean salary and posting count for each skill dummy column."""
    feature_list = [prefix + "_" + skill for skill in useful_classes]
    p = len(feature_list)
    df = pd.DataFrame(np.zeros((p, 3)), columns=["skill", "mean_salary", "count"])
    df["skill"] = useful_classes
    for i in range(p):
        df.loc[df["skill"] == useful_classes[i], "mean_salary"] = np.mean(data[salary][data[feature_list[i]] == 1])
        df.loc[df["skill"] == useful_classes[i], "count"] = len(data[salary][data[feature_list[i]] == 1])
    return df

useful_classes = ['excel', 'sql', 'python', 'sas', 'spss', 'hadoop', 'spark', 'hive', 'shell', 'java']
salary = "average_wage"
prefix = "skill"
df = mean_salary(useful_classes, data, salary, prefix)

import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')
plt.figure(figsize=(8, 5))
sns.stripplot(x="skill", y="mean_salary", data=df, size=10)
plt.xlabel("skill_software")
plt.ylabel("mean_salary")
plt.savefig("skill_salary.jpg")

# Company
data["compname"].value_counts()


data.drop(['compname'], axis=1, inplace=True)
#data = pd.get_dummies(data)

#data.to_csv("data_analysis.csv",index = False,encoding = "gbk")


from sklearn.linear_model import LinearRegression
X = data.drop(["average_wage", 'contents', 'kmeans', 'gmm', 'nmf', "skill_text", "index"], axis=1)
Y = data["average_wage"]
X = pd.get_dummies(X)
regr = LinearRegression().fit(X, Y)
# print R-squared
print(regr.score(X, Y))
regr.coef_




# The job-perks field could be used for a word cloud

‎crawl_shixiseng.py

Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 17 20:21:59 2018

@author: situ
"""

import requests, re, time
import os
import pandas as pd
import numpy as np
from urllib.parse import urlencode
from lxml import etree


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

# Map the site's custom-font glyph codes back to ASCII digits (anti-scraping obfuscation)
replace_dict = {
    "&#xf09f": "0",
    "&#xeff8": "1",
    "&#xecfa": "2",
    "&#xf748": "3",
    "&#xf298": "4",
    "&#xed58": "5",
    "&#xee56": "6",
    "&#xe253": "7",
    "&#xe504": "8",
    "&#xecfd": "9"}

def get_links(start_url, n, replace_dict):
    """Crawl n listing pages and return one DataFrame of job postings."""
    all_pd = pd.DataFrame()
    for i in list(range(1, n + 1)):
        print("------------ crawling listings page %d ------------" % i)
        url = start_url + "&p=%s" % str(i)
        try:
            wb_data = requests.get(url, headers=headers)
            wb_data.encoding = wb_data.apparent_encoding
            links = re.findall('class="name-box clearfix".*?href="(.*?)"', wb_data.text, re.S)
            for link in links:
                print(link)
                try:
                    one_pd = get_infos('https://www.shixiseng.com' + link, replace_dict)
                except:
                    one_pd = pd.DataFrame({"url": link, "jobname": "", "salary": "", "address": "",
                                           "education": "", "jobway": "", "month": "",
                                           "jobgood": "", "contents": "", "compname": "",
                                           "city": "", "size": "", "industry": ""}, index=[0])
                    print("can't crawl " + link)
                all_pd = pd.concat([all_pd, one_pd], ignore_index=True)
        except:
            print("can't reach page %d" % i)
            pass

    return all_pd

def get_infos(url, replace_dict):
    """Parse a single job-detail page into a one-row DataFrame."""
    one_dict = {}
    wb_data = requests.get(url, headers=headers)
    print(wb_data.status_code)
    wb_data.encoding = wb_data.apparent_encoding
    jobname = re.findall('<div class="new_job_name" title="(.*?)">', wb_data.text, re.S)
    salarys = re.findall('class="job_money cutom_font">(.*?)</span>', wb_data.text, re.S)
    addresses = re.findall('class="job_position">(.*?)</span>', wb_data.text, re.S)
    educations = re.findall('class="job_academic">(.*?)</span>', wb_data.text, re.S)
    jobways = re.findall('class="job_week cutom_font">(.*?)</span>', wb_data.text, re.S)
    months = re.findall('class="job_time cutom_font">(.*?)</span>', wb_data.text, re.S)
    jobgoods = re.findall('class="job_good".*?>(.*?)</div>', wb_data.text, re.S)
    contents = re.findall(r'div class="job_til">([\s\S]*?)<div class="job_til">', wb_data.text, re.S)[0].replace(' ', '').replace('\n', '').replace('&nbsp;', '')
    contents = re.sub(r'<[\s\S]*?>', "", str(contents))
    compname = re.findall('class="job_com_name">(.*?)</div>', wb_data.text, re.S)
    compintro = re.findall('<div class="job_detail job_detail_msg"><span>([\s\S]*?)</span></div>', wb_data.text, re.S)
    city, size, industry = re.sub(r'<[\s\S]*?>', " ", str(compintro[0])).split()
    for salary, address, education, jobway, month, jobgood in zip(salarys, addresses, educations, jobways, months, jobgoods):
        # Decode the obfuscated digits in the salary / weekday / month strings
        for key, value in replace_dict.items():
            salary = salary.replace(key, value)
            jobway = jobway.replace(key, value)
            month = month.replace(key, value)
        one_dict = {"url": url, "jobname": jobname, "salary": salary, "address": address,
                    "education": education, "jobway": jobway, "month": month,
                    "jobgood": jobgood, "contents": contents, "compname": compname,
                    "city": city, "size": size, "industry": industry}
#        list_i=[url,salary,address,education,jobway,month,jobgood,contents,compname,city,size,industry]
        print(jobname)
    one_pd = pd.DataFrame(one_dict)
    return one_pd


if __name__ == '__main__':
    os.chdir("E:/graduate/class/EDA/final")
    print('Please enter the keyword you want to crawl:')
    compRawStr = input('Keyword: \n')  # read from keyboard; separate multiple keywords with spaces
    print('Crawling internship postings related to "' + compRawStr.capitalize() + '"!')
    d = {'k': compRawStr.encode('utf-8')}
    word = urlencode(d)

    start_url = "https://www.shixiseng.com/interns/st-intern_c-None_?%s" % word
    result = requests.get(start_url, headers=headers)
#    result.status_code
    result.encoding = 'utf-8'
    selector = etree.HTML(result.text)
    last_page_link = selector.xpath('//*[@id="pagebar"]/ul/li[10]/a/@href')
    n = int(last_page_link[0].split("p=")[1])
    print("Will crawl %d pages of job listings" % n)
    time_start = time.time()
    df = get_links(start_url, n, replace_dict)
    df.to_csv(compRawStr + "_共" + str(n) + "页.csv", index=False, encoding="gb18030")
    time_end = time.time()
    print("Successfully crawled %d postings about [%s]" % (len(df), compRawStr))
    print('totally cost %f seconds' % (time_end - time_start))

‎salary_and_skill.png

72.5 KB

‎tagxedo.png

1.21 MB

0 commit comments
