I'm new to Python and have just started to learn about scraping and the pandas library. Here is a little scraper I wrote. I'd like to know what professional code for this would look like. I have a sense my code has a lot of redundancy, but I don't know where to improve.
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
url = 'https://vancouver.craigslist.org/d/baby-kid-stuff/search/baa'
html_file = requests.get(url)
soup = bs(html_file.text, 'lxml')
#print(soup.prettify())
postings = soup.find_all('li', class_ = 'result-row')
locations = list()
prices = list()
names = list()
for posting in postings:
    location = posting.find('span', class_ = 'result-hood').text
    price = posting.find('span', class_ = 'result-price').text
    name = posting.find('h3', class_= 'result-heading').text.strip()
    locations.append(location)
    prices.append(price)
    names.append(name)
list_of_tuples = list(zip(locations,prices,names))
df = pd.DataFrame(list_of_tuples, columns= ['locations', 'prices', 'names'])
print(df)
Comment: As a warning, craigslist is incredibly aggressive about blocking scrapers. – Zachary Vance, Sep 1, 2021 at 7:25
1 Answer
- Don't hard-code your URL; slice it up into parameters accepted by a function.
- Omit the friendly `baby-kid-stuff` segment. There is a corresponding product category code, `baa`, that is more important to the request and less redundant.
- Pass a `parse_only` strainer to `BeautifulSoup` (see the minimal sketch after this list).
- Move your code out of the global namespace.
- Why are you using Pandas? Currently it seems to be there just for printing, so don't use it at all, and don't flatten your data. Keep it in well-typed structures.
- Consider supporting sortation criteria, a query string, an enumeration for known product codes, a "sub-area" code (e.g. `rch` for Richmond), parametric category codes, and including the item URL and timestamp in your results.
- Use a session to preserve cross-request cookies and set common headers.
- Set semi-realistic, browser-like request headers and cookies.
- Use the response in a `with` block, and check for failures via `raise_for_status`.
- Do not leave your price as a string; make it a `Decimal`.
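For reference, here is a minimal sketch that applies just the strainer, `with`/`raise_for_status` and `Decimal` points to your original single-URL script. The selectors come from your code; the function names are illustrative, and this is not the full rewrite shown below:

from decimal import Decimal

import requests
from bs4 import BeautifulSoup, SoupStrainer

# Only parse the result rows instead of the whole document.
RESULT_STRAIN = SoupStrainer('li', class_='result-row')

def fetch(url: str) -> str:
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    with requests.get(url) as resp:
        resp.raise_for_status()
        return resp.text

def parse_rows(html: str):
    soup = BeautifulSoup(html, 'lxml', parse_only=RESULT_STRAIN)
    for row in soup.find_all('li', recursive=False):
        yield {
            'name': row.find('h3', class_='result-heading').text.strip(),
            'location': row.find('span', class_='result-hood').text,
            # A price is money, not text: strip the '$' and store a Decimal.
            'price': Decimal(row.find('span', class_='result-price').text.lstrip('$')),
        }

The full rewrite below goes further: it parametrizes the URL, uses a session with browser-like headers, and returns typed results.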
Suggested
from dataclasses import dataclass
from datetime import datetime
from decimal import Decimal
from enum import Enum
from typing import Optional, Iterable

from bs4 import SoupStrainer, BeautifulSoup
from requests import Session


class Sort(Enum):
    TIME_DESC = 'date'
    PRICE_ASC = 'priceasc'
    PRICE_DESC = 'pricedesc'
    RELEVANT_DESC = 'rel'


class Category(Enum):
    ALL_FOR_SALE = 'sss'
    FURNITURE = 'fua'
    HOUSEHOLD = 'hsa'
    GENERAL = 'foa'
    BABY_KIDS = 'baa'
    HEALTH_BEAUTY = 'haa'
    # etc.


# Only parse the search-result rows instead of the whole document.
RESULT_STRAIN = SoupStrainer('li', class_='result-row')


def search(
    session: Session,
    area: str,
    sub_area: Optional[str] = None,
    category: Category = Category.ALL_FOR_SALE,
    sort: Sort = Sort.TIME_DESC,
    query: Optional[str] = None,
) -> str:
    url = f'https://{area}.craigslist.org/search'
    if sub_area is not None:
        url += f'/{sub_area}'
    url += f'/{category.value}'

    params = {'sort': sort.value}
    if query is not None:
        params['query'] = query

    thumb_list = f'{Category.ALL_FOR_SALE.value}:pic'

    # Cookies and a Referer make the request look more like a real browser session.
    with session.get(
        url,
        params=params,
        cookies={
            'cl_def_hp': area,
            'cl_tocmode': thumb_list,
        },
        headers={
            'Referer': f'https://{area}.craigslist.org/',
        },
    ) as resp:
        resp.raise_for_status()
        return resp.text


@dataclass(frozen=True)
class Result:
    url: str
    when: datetime
    title: str
    neighbourhood: str
    price: Decimal

    @classmethod
    def parse(cls, html: str) -> Iterable['Result']:
        doc = BeautifulSoup(html, features='lxml', parse_only=RESULT_STRAIN)
        for item in doc.find_all('li', recursive=False):
            anchor = item.select_one('a.result-title')
            yield cls(
                url=anchor['href'],
                when=datetime.strptime(item.time['datetime'], '%Y-%m-%d %H:%M'),
                title=anchor.text,
                neighbourhood=item.select_one('span.result-hood').text,
                price=Decimal(item.select_one('span.result-price').text.removeprefix('$')),
            )

    def __str__(self):
        return self.title

    def print(self) -> None:
        print(
            f'{self.title}'
            f'\n{self.url}'
            f'\n{self.neighbourhood}'
            f'\n${self.price}'
            f'\n{self.when.strftime("%c")}'
            f'\n'
        )


def main():
    with Session() as session:
        # Browser-like headers shared across every request in the session.
        session.headers = {
            'User-Agent':
                'Mozilla/5.0 (X11; Linux x86_64; rv:91.0) '
                'Gecko/20100101 '
                'Firefox/91.0',
            'Accept': 'text/html,application/xhtml+xml',
        }
        for item in Result.parse(search(
            session, area='vancouver', category=Category.BABY_KIDS,
        )):
            item.print()


if __name__ == '__main__':
    main()
Output
Like new pricess pinky rain boots with lining size: 2
https://vancouver.craigslist.org/bnc/bab/d/new-westminster-southwest-like-new/7363474417.html
(New Westminster burnaby/newwest )
$20
Thu Sep 2 11:06:00 2021
# etc.
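If you do still want a table at the end, you can build the DataFrame from the typed results rather than from parallel lists. A hypothetical usage sketch, reusing `Session`, `search`, `Result` and `Category` from the suggested code (headers shortened for brevity):

from dataclasses import asdict

import pandas as pd
from requests import Session

with Session() as session:
    session.headers = {'User-Agent': 'Mozilla/5.0', 'Accept': 'text/html'}
    results = list(Result.parse(search(
        session, area='vancouver', category=Category.BABY_KIDS,
    )))

# Each frozen dataclass converts cleanly to a dict, so no parallel lists are needed.
df = pd.DataFrame([asdict(result) for result in results])
print(df[['title', 'neighbourhood', 'price']])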