Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit bd3a3c8

Browse files
Create job_crawl.py
1 parent 5d7b225 commit bd3a3c8

File tree

1 file changed

+70
-0
lines changed

1 file changed

+70
-0
lines changed

‎jobSkill/job_crawl.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# -*- coding: utf-8 -*-
2+
import requests
3+
from bs4 import BeautifulSoup
4+
import time
5+
import random
6+
7+
# Output file accumulating one job-posting detail URL per line (opened in append mode).
urlFileName = 'shurls.txt'
# Output file accumulating one '&&&'-delimited job record per line (opened in append mode).
contentFileName = 'context.txt'
9+
10+
11+
def getUrls2Txt(page_num):
    """Crawl 51job search-result pages 1..page_num and collect posting URLs.

    For each result page, extracts every job-detail link (<p class="t1"><a href>)
    and appends it, one per line, to ``urlFileName``. Sleeps a random 5-30 s
    between pages to avoid hammering the server.

    :param page_num: number of result pages to crawl (pages 1..page_num inclusive).
    """
    # NOTE(review): the original reused `p` both for the page bound and for the
    # <p> tags in the inner loop, shadowing the former — renamed for clarity.
    for i in range(1, page_num + 1):
        urls = []
        # 抓取魔都的 — search restricted to Shanghai (area code 020000), keyword Python.
        url = ('https://search.51job.com/list/020000,000000,0000,00,2,99,Python,2,'
               + str(i)
               + '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
                 '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=')

        html = requests.get(url)
        soup = BeautifulSoup(html.content, "html.parser")
        for tag in soup.find_all('p', class_='t1'):
            a = tag.find('a')
            # Guard: a <p class="t1"> without an <a href> would have made the
            # original raise (None['href']); skip such entries instead.
            if a is not None and a.has_attr('href'):
                urls.append(str(a['href']))
        with open(urlFileName, 'a', encoding='utf-8') as f:
            for u in urls:
                f.write(u + '\n')
        s = random.randint(5, 30)
        print(str(i) + 'page done,' + str(s) + 's later')
        time.sleep(s)
31+
32+
def getContent(url, headers):
    """Fetch one 51job posting page and extract title, salary, company, description.

    :param url: job-detail page URL (as written by getUrls2Txt).
    :param headers: dict of HTTP headers (at least a User-Agent) for requests.get.
    :return: a single '&&&'-delimited record string, or '' when the request
             fails or any expected tag is missing (best-effort: errors are
             reported, never raised, so one bad page cannot abort the crawl).
    """
    record = ''
    try:
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.content, "html.parser")
        positionTitle = str(soup.find('h1')['title'])
        # The second <strong> on the page holds the salary range — TODO confirm
        # this index is still valid if 51job changes its page layout.
        salary = soup.find_all('strong')[1].get_text()
        companyName = (soup.find('p', class_='cname').get_text().strip()
                       .replace('\n', '').replace('查看所有职位', ''))
        positionInfo = (soup.find('div', class_='bmsg job_msg inbox').get_text()
                        .strip().replace('\n', '').replace('分享', '')
                        .replace('举报', '').replace(' ', '').replace('\r', ''))
        # NOTE(review): the doubled '&&&' (an empty field slot) is preserved
        # as-is — downstream parsing of context.txt may rely on exactly five
        # '&&&'-separated fields.
        record = positionTitle + '&&&' + salary + '&&&' + companyName + '&&&' + '&&&' + positionInfo
    except Exception as e:
        # Was print('错误了'), which discarded the exception entirely; include
        # the URL and the error so failed pages can be diagnosed. The broad
        # catch is deliberate: any missing tag/attr skips the record.
        print('错误了', url, e)
    return record
46+
47+
48+
def main():
    """Run the full crawl: collect posting URLs, then fetch each posting.

    Phase 1 appends detail URLs to ``urlFileName``; phase 2 reads them back
    and appends one '&&&'-delimited record per posting to ``contentFileName``,
    pausing 1 s between detail fetches.
    """
    page_num = 93  # result-page count observed for this query at crawl time — TODO confirm
    getUrls2Txt(page_num)
    user_Agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/80.0.3987.132 Safari/537.36')
    headers = {'User-Agent': user_Agent}
    with open(urlFileName, 'r', encoding='utf-8') as f:
        urls = f.readlines()
    i = 0
    # Open the output once instead of re-opening it in append mode on every
    # iteration (the original's per-URL open/close). flush() after each record
    # keeps the original's crash durability.
    with open(contentFileName, 'a', encoding='utf-8') as out:
        for url in urls:
            url = url.strip()
            if url != '':
                record = getContent(url, headers)
                out.write(record + '\n')
                out.flush()
                i += 1
                print(str(i) + '详情抓取完成')
                time.sleep(1)

    print('完成了')
67+
68+
69+
# Script entry point: run the full crawl only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /