
I'm new to Python and have just started learning about scraping and the pandas library. Here is a little scraper I wrote. I'd like to know what professional code for this would look like. I have a sense my code has a lot of redundancy, but I don't know where to improve.

import pandas as pd
from bs4 import BeautifulSoup as bs
import requests

url = 'https://vancouver.craigslist.org/d/baby-kid-stuff/search/baa'
html_file = requests.get(url)
soup = bs(html_file.text, 'lxml')
#print(soup.prettify())

postings = soup.find_all('li', class_ = 'result-row')
locations = list()
prices = list()
names = list()

for posting in postings:
    location = posting.find('span', class_ = 'result-hood').text
    price = posting.find('span', class_ = 'result-price').text
    name = posting.find('h3', class_= 'result-heading').text.strip()
    locations.append(location)
    prices.append(price)
    names.append(name)

list_of_tuples = list(zip(locations, prices, names))
df = pd.DataFrame(list_of_tuples, columns=['locations', 'prices', 'names'])
print(df)
asked Aug 31, 2021 at 9:24 by mdfst13
Comment: As a warning, craigslist is incredibly aggressive about blocking scrapers. (Sep 1, 2021 at 7:25)
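If you scrape it anyway, it is worth building in retries with backoff and a delay between requests. Below is a minimal sketch using requests' bundled urllib3 Retry support; the status codes, retry counts, and the 5-second delay are illustrative guesses, not values Craigslist documents.

import time

from requests import Session
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = Session()
# Retry transient failures with exponential backoff; the status codes and
# counts here are assumptions, not anything Craigslist documents.
retries = Retry(total=3, backoff_factor=2, status_forcelist=(429, 500, 502, 503))
session.mount('https://', HTTPAdapter(max_retries=retries))

urls = [  # illustrative result pages to fetch
    'https://vancouver.craigslist.org/search/baa',
]
for page_url in urls:
    with session.get(page_url) as resp:
        resp.raise_for_status()
        print(len(resp.text))  # stand-in for the real parsing step
    time.sleep(5)              # arbitrary politeness delay between requests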

1 Answer

  • Don't hard-code your URL - slice it up into parameters accepted by a function.
  • Omit the friendly baby-kid-stuff segment. There is a corresponding product category code, baa, which is what the request actually needs and is less redundant.
  • Pass a parse_only strainer to BeautifulSoup.
  • Move your code out of the global namespace.
  • Why are you using Pandas? Currently it seems to be used only for printing, so don't use it at all, and don't flatten your data; keep it in well-typed structures.
  • Consider supporting sortation criteria, a query string, an enumeration for known product codes, a "sub-area" code (e.g. rch for Richmond), parametric category codes, and including the item URL and timestamp in your results.
  • Use a session to preserve cross-request cookies and set common headers.
  • Set semi-realistic browser-like request headers and cookies.
  • Use the response in a with block, and check for failures via raise_for_status.
  • Do not leave your price as a string; make it a Decimal.

Suggested

from dataclasses import dataclass
from datetime import datetime
from decimal import Decimal
from enum import Enum
from typing import Optional, Iterable

from bs4 import SoupStrainer, BeautifulSoup
from requests import Session


class Sort(Enum):
    TIME_DESC = 'date'
    PRICE_ASC = 'priceasc'
    PRICE_DESC = 'pricedesc'
    RELEVANT_DESC = 'rel'


class Category(Enum):
    ALL_FOR_SALE = 'sss'
    FURNITURE = 'fua'
    HOUSEHOLD = 'hsa'
    GENERAL = 'foa'
    BABY_KIDS = 'baa'
    HEALTH_BEAUTY = 'haa'
    # etc.


# Parse only the result rows; everything else on the page is skipped.
RESULT_STRAIN = SoupStrainer('li', class_='result-row')


def search(
    session: Session,
    area: str,
    sub_area: Optional[str] = None,
    category: Category = Category.ALL_FOR_SALE,
    sort: Sort = Sort.TIME_DESC,
    query: Optional[str] = None,
) -> str:
    # URLs look like https://{area}.craigslist.org/search[/{sub_area}]/{category}
    url = f'https://{area}.craigslist.org/search'
    if sub_area is not None:
        url += f'/{sub_area}'
    url += f'/{category.value}'

    params = {'sort': sort.value}
    if query is not None:
        params['query'] = query

    thumb_list = f'{Category.ALL_FOR_SALE.value}:pic'

    with session.get(
        url,
        params=params,
        cookies={  # browser-like cookies, per the advice above
            'cl_def_hp': area,
            'cl_tocmode': thumb_list,
        },
        headers={
            'Referer': f'https://{area}.craigslist.org/',
        },
    ) as resp:
        resp.raise_for_status()
        return resp.text


@dataclass(frozen=True)
class Result:
    url: str
    when: datetime
    title: str
    neighbourhood: str
    price: Decimal

    @classmethod
    def parse(cls, html: str) -> Iterable['Result']:
        doc = BeautifulSoup(html, features='lxml', parse_only=RESULT_STRAIN)
        for item in doc.find_all('li', recursive=False):
            anchor = item.select_one('a.result-title')
            yield cls(
                url=anchor['href'],
                when=datetime.strptime(item.time['datetime'], '%Y-%m-%d %H:%M'),
                title=anchor.text,
                neighbourhood=item.select_one('span.result-hood').text,
                # Strip the '$' and any thousands separators before conversion.
                price=Decimal(item.select_one('span.result-price')
                              .text.removeprefix('$')
                              .replace(',', '')),
            )

    def __str__(self):
        return self.title

    def print(self) -> None:
        print(
            f'{self.title}'
            f'\n{self.url}'
            f'\n{self.neighbourhood}'
            f'\n${self.price}'
            f'\n{self.when.strftime("%c")}'
            f'\n'
        )


def main():
    with Session() as session:
        session.headers = {
            'User-Agent':
                'Mozilla/5.0 (X11; Linux x86_64; rv:91.0) '
                'Gecko/20100101 '
                'Firefox/91.0',
            'Accept': 'text/html,application/xhtml+xml',
        }
        for item in Result.parse(search(
            session, area='vancouver', category=Category.BABY_KIDS,
        )):
            item.print()


if __name__ == '__main__':
    main()

Output

Like new pricess pinky rain boots with lining size: 2
https://vancouver.craigslist.org/bnc/bab/d/new-westminster-southwest-like-new/7363474417.html
 (New Westminster burnaby/newwest )
$20
Thu Sep 2 11:06:00 2021
# etc.
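The search function above also exposes the other knobs from the bullet list, so a sorted keyword search limited to a sub-area would look roughly like this (the rch sub-area code comes from the list above; the query term is purely illustrative):

with Session() as session:
    session.headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:91.0) '
                      'Gecko/20100101 Firefox/91.0',
        'Accept': 'text/html,application/xhtml+xml',
    }
    html = search(
        session,
        area='vancouver',
        sub_area='rch',                # Richmond, per the bullet list above
        category=Category.BABY_KIDS,
        sort=Sort.PRICE_ASC,
        query='stroller',              # illustrative search term
    )
    for item in Result.parse(html):
        item.print()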
answered Sep 2, 2021 at 18:35 by tdy