Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit bd3a3c8

Browse files
Create job_crawl.py
1 parent 5d7b225 commit bd3a3c8

File tree

1 file changed

+70
-0
lines changed

1 file changed

+70
-0
lines changed

‎jobSkill/job_crawl.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# -*- coding: utf-8 -*-
2+
import requests
3+
from bs4 import BeautifulSoup
4+
import time
5+
import random
6+
7+
# Output file accumulating one job-posting detail URL per line (opened in append mode).
urlFileName = 'shurls.txt'
# Output file accumulating one '&&&'-delimited job record per line (opened in append mode).
contentFileName = 'context.txt'
9+
10+
11+
def getUrls2Txt(page_num):
    """Crawl 51job search-result pages 1..page_num and collect posting URLs.

    For each result page, extracts every job-detail link (<p class="t1"><a href>)
    and appends it, one per line, to ``urlFileName``. Sleeps a random 5-30 s
    between pages to avoid hammering the server.

    :param page_num: number of result pages to crawl (pages 1..page_num inclusive).
    """
    # NOTE(review): the original reused `p` both for the page bound and for the
    # <p> tags in the inner loop, shadowing the former — renamed for clarity.
    for i in range(1, page_num + 1):
        urls = []
        # 抓取魔都的 — search restricted to Shanghai (area code 020000), keyword Python.
        url = ('https://search.51job.com/list/020000,000000,0000,00,2,99,Python,2,'
               + str(i)
               + '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
                 '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=')

        html = requests.get(url)
        soup = BeautifulSoup(html.content, "html.parser")
        for tag in soup.find_all('p', class_='t1'):
            a = tag.find('a')
            # Guard: a <p class="t1"> without an <a href> would have made the
            # original raise (None['href']); skip such entries instead.
            if a is not None and a.has_attr('href'):
                urls.append(str(a['href']))
        with open(urlFileName, 'a', encoding='utf-8') as f:
            for u in urls:
                f.write(u + '\n')
        s = random.randint(5, 30)
        print(str(i) + 'page done,' + str(s) + 's later')
        time.sleep(s)
31+
32+
def getContent(url, headers):
    """Fetch one 51job posting page and extract title, salary, company, description.

    :param url: job-detail page URL (as written by getUrls2Txt).
    :param headers: dict of HTTP headers (at least a User-Agent) for requests.get.
    :return: a single '&&&'-delimited record string, or '' when the request
             fails or any expected tag is missing (best-effort: errors are
             reported, never raised, so one bad page cannot abort the crawl).
    """
    record = ''
    try:
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.content, "html.parser")
        positionTitle = str(soup.find('h1')['title'])
        # The second <strong> on the page holds the salary range — TODO confirm
        # this index is still valid if 51job changes its page layout.
        salary = soup.find_all('strong')[1].get_text()
        companyName = (soup.find('p', class_='cname').get_text().strip()
                       .replace('\n', '').replace('查看所有职位', ''))
        positionInfo = (soup.find('div', class_='bmsg job_msg inbox').get_text()
                        .strip().replace('\n', '').replace('分享', '')
                        .replace('举报', '').replace(' ', '').replace('\r', ''))
        # NOTE(review): the doubled '&&&' (an empty field slot) is preserved
        # as-is — downstream parsing of context.txt may rely on exactly five
        # '&&&'-separated fields.
        record = positionTitle + '&&&' + salary + '&&&' + companyName + '&&&' + '&&&' + positionInfo
    except Exception as e:
        # Was print('错误了'), which discarded the exception entirely; include
        # the URL and the error so failed pages can be diagnosed. The broad
        # catch is deliberate: any missing tag/attr skips the record.
        print('错误了', url, e)
    return record
46+
47+
48+
def main():
    """Run the full crawl: collect posting URLs, then fetch each posting.

    Phase 1 appends detail URLs to ``urlFileName``; phase 2 reads them back
    and appends one '&&&'-delimited record per posting to ``contentFileName``,
    pausing 1 s between detail fetches.
    """
    page_num = 93  # result-page count observed for this query at crawl time — TODO confirm
    getUrls2Txt(page_num)
    user_Agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/80.0.3987.132 Safari/537.36')
    headers = {'User-Agent': user_Agent}
    with open(urlFileName, 'r', encoding='utf-8') as f:
        urls = f.readlines()
    i = 0
    # Open the output once instead of re-opening it in append mode on every
    # iteration (the original's per-URL open/close). flush() after each record
    # keeps the original's crash durability.
    with open(contentFileName, 'a', encoding='utf-8') as out:
        for url in urls:
            url = url.strip()
            if url != '':
                record = getContent(url, headers)
                out.write(record + '\n')
                out.flush()
                i += 1
                print(str(i) + '详情抓取完成')
                time.sleep(1)

    print('完成了')
67+
68+
69+
# Script entry point: run the full crawl only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /