import time
import requests
import csv
from bs4 import BeautifulSoup


def write_csv(loc, info):
    """
    Writes the collected job openings to a .csv file.
    """
    headers = ['Title', 'Company Name', 'Location', 'Date', 'Summary', 'Url']

    # Write a header row followed by one row per opening;
    # newline='' prevents blank rows between records on Windows
    with open(loc + '_openings.csv', 'w', newline='', encoding='utf-8') as csv_f:
        csv_p = csv.writer(csv_f, delimiter=',')
        csv_p.writerow(headers)
        csv_p.writerows(info)

    print(f'\n{loc}_openings.csv has been saved to your directory!\n')
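
# A minimal usage sketch for write_csv (illustrative values only; each row
# must follow the header order defined above):
#
#   write_csv('Pune', [
#       ['Data Analyst', 'Acme Corp', 'Pune', '3 days ago',
#        'Analyse sales data and build dashboards.',
#        'https://in.indeed.com/viewjob?jk=example'],
#   ])
#
# This would produce Pune_openings.csv with a header row and one record.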


def job_scraper():
    """
    Scrapes the requested number of job openings posted for a given job title
    and location, and stores the associated information in a .csv file.
    """
    title = input("\nEnter job title: ").replace(" ", "+")
    loc = input("Enter job location: ").replace(" ", "+")
    num = int(input("Enter the number of job openings to obtain: "))

    # Indeed's search page takes the query and location as the q and l parameters
    url = f'https://in.indeed.com/jobs?q={title}&l={loc}'
    req_page = requests.get(url)

    job_array = []

    if req_page.status_code == 200:
        soup = BeautifulSoup(req_page.text, "html.parser")
        job_table = soup.find("td", id="resultsCol")
        count = 0

        flag = True
        while flag:
            for job_card in job_table.find_all("div", class_="jobsearch-SerpJobCard"):
                # Getting the job title
                title_elem = job_card.find('a', class_='jobtitle turnstileLink')
                job_title = title_elem.text.strip()

                # Getting the company name
                company_details = job_card.find('div', class_='sjcl')
                company_name = company_details.find('span', class_='company')
                company_name = company_name.text.strip()

                # Getting the company location; fall back to the search
                # location when the card does not list one
                company_loc = company_details.find('span', class_='location')
                if company_loc is not None:
                    company_loc = company_loc.text.strip()
                else:
                    company_loc = loc

                # Getting the URL of the post
                link = 'https://in.indeed.com' + job_card.find('a')['href']

                # Getting the date of the post
                date_elem = job_card.find('span', class_='date')
                date = date_elem.text.strip()

                # Getting the job summary (empty if the card has none)
                summary_elem = job_card.find('div', class_='summary')
                summary = summary_elem.text.strip() if summary_elem else ''

                count += 1

                job_array.append([job_title, company_name, company_loc, date, summary, link])
                if count == num:
                    flag = False
                    break

            # Stop before fetching another page once enough openings are collected
            if not flag:
                break

            # To go to the next page, look for the pagination link labelled 'Next'
            pagination = soup.find("ul", class_="pagination-list")
            next_anchor = None
            for anchor in pagination.find_all('a'):
                if anchor.attrs.get('aria-label') == 'Next':
                    next_anchor = anchor
                    break

            if next_anchor is not None:
                next_page_link = 'https://in.indeed.com' + next_anchor.attrs['href']

                # Pause between requests to avoid overloading the server
                time.sleep(2)

                req_page = requests.get(next_page_link)
                soup = BeautifulSoup(req_page.text, "html.parser")
                job_table = soup.find("td", id="resultsCol")

            else:
                flag = False

        write_csv(loc, job_array)

    else:
        print('There seems to be a problem fetching the results. '
              'Check your inputs and connection, and try again.')


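# An illustrative run (prompt values are examples; the location is plus-joined
# before being used in the filename, so this run would write
# new+delhi_openings.csv):
#
#   Enter job title: data analyst
#   Enter job location: new delhi
#   Enter the number of job openings to obtain: 25
#
#   new+delhi_openings.csv has been saved to your directory!
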
if __name__ == '__main__':
    job_scraper()