I'm new to programming and I chose Python; I'm learning on my own. Currently I'm preparing code for a portfolio on GitHub.
I would be grateful for any code review, especially on the subject of OOP: should I try to create classes? Does that make sense with code like this? Is the code structure Pythonic? I would be very grateful for any hints.
The goal is to find out which technologies are most often requested in job listings that require Python. One table contains the mandatory technologies, the other the nice-to-have ones, each sorted in descending order. I chose the site https://nofluffjobs.com/ to find out which technologies are most in demand.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from collections import Counter


# In[2]:


def get_page (page):
    #making the soup
    r= requests.get('https://nofluffjobs.com/pl/Python?page='+(str(page)))
    c= r.content
    soup= BeautifulSoup (c, 'html.parser')
    return soup

#
def display_technologies(requirements,tech):
    #making dataframe with the technologies
    tech_dict = {i: x for i, x in enumerate(Counter(requirements).items())}
    df = pd.DataFrame.from_dict(tech_dict, orient='index', columns=[tech, 'number'])
    pd.set_option('display.max_rows', None)
    df.sort_values(by=['number'], inplace=True, ascending= False)
    print (df)

def make_dict(list_req):
    #making dictionary to display list of technologies
    return [item.strip() for sublist in list_req for item in sublist]

page= 1
list_req_must=[]
list_req_nice=[]

while True:
    #searching for subpages with offers
    soup= get_page(page)
    if (soup.find('a', {'class': 'jobs-link'}))== None:
        break
    job_sites= soup.find_all('a',{'target':'_self'}, href=True)
    for h in job_sites:
        link= h.get('href')
        if '/job/' in (link):
            #searching for title and requirements
            try:
                if requests.Response() == 200:
                    break
                r= requests.get('https://nofluffjobs.com/'+(str(link.lstrip('/pl/'))), timeout=1000)
                c= r.content
                soup_job= BeautifulSoup (c, 'html.parser')
                title= soup_job.find('h1').text
                requirements_must= list(l.text for l in list(soup_job.find_all('ul', {'class': 'mb-0 ng-star-inserted'})))
                rm = list((requirements_must)[0].split(" "))
                list_req_must.append(rm)
                try:
                    requirements_nice_to_have= (str(l.text).strip() for l in (soup_job.find('section', {'id': 'posting-nice-to-have'})))
                    rn= (list(requirements_nice_to_have)[1].split(" "))
                    list_req_nice.append(rn)
                except TypeError as e:
                    pass
            except Exception as inst:
                print (inst)
    page +=1

display_technologies(make_dict(list_req_must), 'tech- must')
display_technologies(make_dict(list_req_nice), 'tech- nice_to_have')
1 Answer
I do not think that Pandas is well-applied here. I suggest that you use a built-in such as pprint instead.
None of your comments make the code any cleaner than it would be with no comments at all, so you can delete them.
Add PEP 484 type hints.
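For instance, the get_page function from the question could be annotated like this (a sketch; the body is otherwise unchanged):

import requests
from bs4 import BeautifulSoup

def get_page(page: int) -> BeautifulSoup:
    # The annotations document that callers pass a page number
    # and get back parsed HTML rather than raw bytes.
    r = requests.get(f'https://nofluffjobs.com/pl/Python?page={page}')
    return BeautifulSoup(r.content, 'html.parser')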
Do not write that much code in the global namespace. From page = 1 onward, move that into multiple functions. Iterators can help you divide this work.
Loop like a native: do not manually increment page; instead, use itertools.count.
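A minimal sketch, assuming the annotated get_page above and a pagination-link class that you would need to confirm against the live DOM:

import itertools

for page in itertools.count(1):  # yields 1, 2, 3, ... with no manual page += 1
    soup = get_page(page)
    if soup.find('a', {'class': 'page-link'}) is None:  # assumed selector
        break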
The check if (soup.find('a', {'class': 'jobs-link'}))== None: is not necessary. What is necessary is to check for the presence of a pagination link.
soup.find_all('a',{'target':'_self'}, href=True) is not a very narrow selector. Improve your choice of selectors by inspecting the DOM. Also, add soup strainers to narrow the parsing job of BeautifulSoup.
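For example, a sketch with an illustrative strainer; the real tag and attributes should come from inspecting the DOM:

import requests
from bs4 import BeautifulSoup, SoupStrainer

# parse_only makes BeautifulSoup build a tree from matching elements only,
# which is faster and yields a narrower result set.
only_anchors = SoupStrainer('a', href=True)
r = requests.get('https://nofluffjobs.com/pl/Python?page=1')
soup = BeautifulSoup(r.content, 'html.parser', parse_only=only_anchors)
job_links = [a['href'] for a in soup.find_all('a') if '/job/' in a['href']]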
if requests.Response() == 200: does not do what you think it does. Instead, it creates a brand-new, empty response and compares it against 200, which can never be true. You're better off calling raise_for_status.
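Concretely, a sketch:

import requests

r = requests.get('https://nofluffjobs.com/pl/Python', timeout=30)
r.raise_for_status()  # raises requests.HTTPError on any 4xx/5xx status
# execution only reaches this point on a successful response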
Do not split(). You should be selecting each individual span instead.
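A sketch of that approach; the section id comes from the question, while the assumption that each technology sits in its own span needs checking against the page:

from bs4 import BeautifulSoup

def extract_requirements(soup_job: BeautifulSoup, section_id: str) -> list[str]:
    # One entry per span keeps multi-word names such as 'Big Data' intact,
    # which split(' ') on the section's whole text would tear apart.
    section = soup_job.find('section', {'id': section_id})
    if section is None:
        return []
    return [span.text.strip() for span in section.find_all('span')]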
Catching TypeError suggests a programming error; you should remove this try.
Use a requests.Session to better characterise what you're doing, with cross-request headers, cookies, connections, etc.
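A minimal sketch:

import requests

with requests.Session() as session:
    # Headers set here apply to every request made through the session,
    # and the underlying TCP connection is reused between requests.
    session.headers.update({'Accept': 'text/html'})
    r = session.get('https://nofluffjobs.com/pl/Python', timeout=30)
    r.raise_for_status()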
Add a __main__ guard.
Suggested
import itertools
from pprint import pprint
from typing import Iterator

from bs4 import BeautifulSoup
from bs4.element import SoupStrainer
from collections import Counter
from requests import Session

# Strainers restrict how much of each page BeautifulSoup parses.
SEARCH_STRAINER = SoupStrainer(
    name='div',
    class_='list-container ng-star-inserted',
)
JOB_STRAINER = SoupStrainer(
    name='div',
    id='posting-requirements',
)


def get_search_page(session: Session, country: str, language: str, page: int) -> BeautifulSoup:
    with session.get(
        url=f'https://nofluffjobs.com/{country}/{language}',
        params={'page': page},
        headers={'Accept': 'text/html'},
        timeout=1000,
    ) as resp:
        resp.raise_for_status()
        return BeautifulSoup(markup=resp.text, features='html.parser', parse_only=SEARCH_STRAINER)


def get_job_page(session: Session, link: str) -> BeautifulSoup:
    with session.get(
        url=f'https://nofluffjobs.com/{link}',
        headers={'Accept': 'text/html'},
        timeout=1000,
    ) as resp:
        resp.raise_for_status()
        return BeautifulSoup(markup=resp.text, features='html.parser', parse_only=JOB_STRAINER)


def get_links(
    session: Session, country: str, language: str,
) -> Iterator[str]:
    # Yield job links from successive search pages until no "Next" link remains.
    for page in itertools.count(1):
        soup = get_search_page(session, country, language, page)
        job_sites = soup.find_all(name='a', class_='posting-list-item')
        for anchor in job_sites:
            yield anchor.get('href').removeprefix(f'/{country}/')
        if soup.find(name='a', class_='page_link', label='Next') is None:
            break


def get_musts(soup: BeautifulSoup) -> Iterator[str]:
    musts = soup.find(name='section', branch='musts')
    if musts is not None:
        for span in musts.find_all(name='span'):
            yield span.text.strip()


def get_nices(soup: BeautifulSoup) -> Iterator[str]:
    nices = soup.find(name='section', branch='nices')
    if nices is not None:
        for span in nices.find_all(name='span'):
            yield span.text.strip()


def scrape(
    session: Session, country: str = 'pl', language: str = 'Python',
) -> tuple[Counter[str], Counter[str]]:
    req_must = Counter()
    req_nice = Counter()
    for link in get_links(session, country, language):
        print(f'Scraping {link}...')
        soup = get_job_page(session, link)
        req_must.update(get_musts(soup))
        req_nice.update(get_nices(soup))
    return req_must, req_nice


def display_technologies(req: Counter[str], title: str) -> None:
    print(title)
    pprint(req)
    print()


def main() -> None:
    with Session() as session:
        req_must, req_nice = scrape(session)
    display_technologies(req_must, 'tech- must')
    display_technologies(req_nice, 'tech- nice_to_have')


if __name__ == '__main__':
    main()
Output
Scraping job/mid-senior-data-engineer-remote-devsdata-llc-ycq636sg...
Scraping job/junior-data-engineer-nix-tech-kft-budapest-7dbftgsu...
Scraping job/python-developer-rtb-house-remote-xm36kkv1...
Scraping job/senior-data-engineer-sigma-it-poland-remote-ai1un7ji...
Scraping job/data-engineer-addepto-remote-oo36be4q...
Scraping job/python-developer-freysoft-remote-zzqgols5...
Scraping job/site-reliability-engineer-python-or-c-rits-professional-services-remote-ys1kq0b5...
Scraping job/machine-learning-engineer-infracert-tsi-warszawa-4egsv3nx...
Scraping job/akademia-it-data-scientist-python-xtb-remote-7bj8ysfw...
Scraping job/python-developer-bazy-danych-team-connect-remote-zdhnplmv...
Scraping job/remote-senior-backend-python-developer-devopsbay-lpj6imsi...
Scraping job/junior-python-developer-green-minds-remote-h9e26nbg...
Scraping job/technical-data-steward-link-group-warszawa-38bkgutb...
Scraping job/remote-fullstack-developer-tappr-rayrqvhd...
Scraping job/junior-python-developer-optimo-development-lodz-hxa7lmsn...
Scraping job/python-developer-with-relocation-zoostation-thehague-jp1cuhz5...
Scraping job/remote-data-engineer-with-python-inuits-c1y8w8za...
Scraping job/python-team-lead-tagging-tools-lead-form-remote-tqmix3yd...
Scraping job/remote-machine-learning-python-developer-clurgo-dvlbyqx1...
Scraping job/data-scientist-python-developer-avenga-remote-tn4qfdr8...
tech- must
Counter({'Python': 21,
         'SQL': 10,
         'English': 9,
         'English (B2)': 7,
         'Polish': 7,
         'Git': 6,
         'Docker': 5,
         'PostgreSQL': 4,
         'AWS': 4,
         'Kubernetes': 4,
         'MongoDB': 3,
         'Django': 3,
         'MySQL': 2,
         'Oracle': 2,
         'ETL': 2,
         'Linux': 2,
         'Big Data': 2,
         'Azure': 2,
         'REST API': 2,
         'pandas': 2,
         'PyTorch': 2,
         'TensorFlow': 2,
         'Flask': 2,
         'JavaScript': 2,
         'Design Patterns': 2,
         'NoSQL': 2,
         'RDBMS': 1,
         'Airflow': 1,
         'Version Control Systems': 1,
         'AWS Glue': 1,
         'English (B1)': 1,
         'Snowflake': 1,
         'Data Integration': 1,
         'ETL tools': 1,
         'Spark': 1,
         'Celery': 1,
         'RabbitMQ': 1,
         'HTTP': 1,
         'K8s': 1,
         'Ansible': 1,
         'Terraform': 1,
         'scikit-learn': 1,
         'NumPy': 1,
         'Keras': 1,
         'Polish (NATIVE)': 1,
         'Data science': 1,
         'Umiejętności analityczne': 1,
         'Znajomość oprogramowania bazodanowego': 1,
         'Amazon Web Services': 1,
         'HTML': 1,
         'CSS': 1,
         'Data management': 1,
         'banking': 1,
         'Excel': 1,
         'Python scripts': 1,
         'TypeScript': 1,
         'REST': 1,
         'React': 1,
         'Bootstrap': 1,
         'JSON': 1,
         'SOAP': 1,
         'CI/CD': 1,
         'Jenkins': 1,
         'FastAPI': 1,
         'Angular': 1,
         'Next.js': 1,
         'Nuxt.js': 1,
         'Scala': 1,
         'database': 1,
         'Communication skills': 1,
         'Team player': 1,
         'English (C1)': 1,
         'Polish (C1)': 1,
         'PySpark': 1})
tech- nice_to_have
Counter({'GCP': 4,
         'AWS': 3,
         'Azure': 3,
         'Kafka': 3,
         'Airflow': 2,
         'Django': 2,
         'Elasticsearch': 2,
         'Proactivity': 2,
         'Problem solving': 2,
         'MLOps': 2,
         'Splunk': 2,
         'DevOps': 2,
         'BigQuery': 1,
         'OAuth': 1,
         'OIDC': 1,
         'Azure Data Factory': 1,
         'Celery': 1,
         'Redis': 1,
         'PostgreSQL': 1,
         'Google cloud platform': 1,
         'Java': 1,
         'Scala': 1,
         'NiFi': 1,
         'Databricks': 1,
         'Polish': 1,
         'MongoDB': 1,
         'pytest': 1,
         'Flask': 1,
         'PyMongo': 1,
         'C++': 1,
         'Oracle': 1,
         'SQL Server': 1,
         'TCP/IP': 1,
         'Scrum': 1,
         'Kanban': 1,
         'CI/CD Pipelines': 1,
         'AI / ML expertise': 1,
         'ML': 1,
         'Odoo': 1,
         'English': 1,
         'CSS': 1,
         'Vue.js': 1,
         'React': 1,
         'Docker': 1,
         'Kubernetes': 1,
         'Rancher': 1,
         'Sentry': 1,
         'Grafana': 1,
         'NoSQL': 1,
         'Apache Airflow': 1,
         'Apache Spark': 1,
         'Machine Learning': 1,
         'ML Ops concepts': 1})
Comments:
… an ipython prompt and run the code through a simple linter like pycodestyle.
… the while True: loop in a named function.
… black, especially for a beginner. It's a steamroller and doesn't make you go through the work of learning each recommendation explicitly.
if requests.Response() == 200 doesn't do what you think it does. What did you intend here?