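# Scrape internship listings from Internshala for a chosen category and
# append them to internshala.csv. Requires: requests, beautifulsoup4, lxml.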
from bs4 import BeautifulSoup
import requests
import csv


pages = int(input('How many pages do you want to scrape? : '))

# Category name -> Internshala listing URL. The numbered menu shown to the
# user is derived from this dict's insertion order, so a separate
# number-to-name mapping is unnecessary (and we avoid shadowing the
# built-in name `dict`).
categories = {
    'Computer Science': 'https://internshala.com/internships/computer%20science-internship',
    'Marketing': 'https://internshala.com/internships/marketing-internship',
    'Finance Internship': 'https://internshala.com/internships/finance-internship',
    'Mechanical Internship': 'https://internshala.com/internships/mechanical-internship',
    'HR Internship': 'https://internshala.com/internships/hr-internship',
    'Digital Marketing Internship': 'https://internshala.com/internships/digital%20marketing-internship',
    'Electronics Internship': 'https://internshala.com/internships/electronics-internship',
    'Content Writing Internship': 'https://internshala.com/internships/content%20writing-internship',
    'Civil Internship': 'https://internshala.com/internships/civil-internship',
}
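# Show a numbered menu and resolve the user's choice to a category URL.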
for number, name in enumerate(categories, start=1):
    print(number, name)
ch = int(input('Enter the category, e.g. 1 for Computer Science : '))
url = list(categories.values())[ch - 1]
print('--------URL : ' + url)
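# Open in append mode so repeated runs accumulate rows; note that the header
# row is re-written on every run. newline='' stops the csv module from
# emitting blank lines between rows on Windows.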
with open('internshala.csv', mode='a', newline='') as f:
    writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['Company', 'Profile', 'Location/s', 'From', 'Upto', 'Duration', 'Stipend', 'Link'])
    for i in range(1, pages + 1):
        print('Page', i)
        resp = requests.get(url + '/page-' + str(i))
        resp.raise_for_status()  # fail fast if the request was rejected
        soup = BeautifulSoup(resp.content, 'lxml')
        # These class/id names match Internshala's markup at the time of
        # writing and will break if the site's layout changes.
        companies = soup.find_all('div', {'class': 'heading_6 company_name'})
        profiles = soup.find_all('div', {'class': 'heading_4_5 profile'})
        locations = soup.find_all('div', {'id': 'location_names'})
        details = soup.find_all('div', {'class': 'internship_other_details_container'})
        links = soup.find_all('a', {'class': 'view_detail_button'})
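        # The five result lists are parallel: index x describes one listing.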
        for x in range(len(companies)):
            company = companies[x].text.strip()
            profile = profiles[x].text.strip()
            location = locations[x].text.strip()
            link = 'https://internshala.com' + links[x]['href']  # href is site-relative
            # Split the details block into its non-empty lines; the index
            # positions used below depend on Internshala's current layout.
            extracted = [part.strip() for part in details[x].text.split('\n') if part.strip()]
            info = [company, profile, location]
            info.append(extracted[1].replace('immediatelyImmediately', 'Immediately'))  # From
            info.append(extracted[7])  # Upto
            info.append(extracted[3])  # Duration
            info.append(extracted[5])  # Stipend
            info.append(link)
            writer.writerow(info)
input('Done!\nAll the best ;-)')