I am trying to build on @Reinderien's answer to my previous question over here to add page iteration functionality to the code:
from base64 import b64encode
from datetime import date
from typing import Iterable, ClassVar, List

from attr import dataclass
from bs4 import BeautifulSoup, SoupStrainer, Tag
from requests import Session
import re
from itertools import count
from urllib.parse import urljoin

BASE_URL = 'https://www.ctwx.tsinghua.edu.cn'


@dataclass
class Result:
    caption: str
    when: date
    path: str

    @classmethod
    def from_list_item(cls, item: Tag) -> 'Result':
        return cls(
            caption=item.a.text,
            path=item.a['href'],
            when=date.fromisoformat(item.find('span', recursive=False).text),
        )


class TsinghuaSite:
    subdoc: ClassVar[SoupStrainer] = SoupStrainer(name='ul', class_='search_list')
    pagination: ClassVar[SoupStrainer] = SoupStrainer(name='table', class_='listFrame')

    def __init__(self):
        self.session = Session()

    def __enter__(self) -> 'TsinghuaSite':
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.session.close()

    def search(self, query: str) -> Iterable[List]:
        with self.session.post(
            urljoin(BASE_URL, 'search.jsp'),
            params={'wbtreeid': 1001},
            data={
                'lucenenewssearchkey': b64encode(query.encode()),
                '_lucenesearchtype': '1',
                'searchScope': '0',
                'x': '0',
                'y': '0',
            },
        ) as resp:
            resp.raise_for_status()
            pages = BeautifulSoup(markup=resp.text, features='html.parser', parse_only=self.pagination)
            n_pages_string = list(pages.select_one('td').children)[4]
            n_pages = int(re.search(r'\d+', n_pages_string)[0])
            if n_pages > 1:
                docs = []
                for page in count(1):
                    with self.session.get(
                        resp.url,
                        params={
                            'wbtreeid': 1001,
                            'newskeycode2': b64encode(query.encode()),
                            'searchScope': '0',
                            'currentnum': page,
                        },
                    ) as resp:
                        resp.raise_for_status()
                        doc = BeautifulSoup(markup=resp.text, features='html.parser', parse_only=self.subdoc)
                    print(f"Scraping page {page}/{n_pages}.")
                    docs.append(doc)
                    if page >= n_pages:
                        yield from docs
                        break
                else:
                    doc = BeautifulSoup(markup=resp.text, features='html.parser', parse_only=self.subdoc)
                    yield doc

    def yield_results(self, query) -> Iterable[Result]:
        doc_gen = self.search(query)
        for doc in doc_gen:
            for item in doc.find('ul', recursive=False).find_all('li', recursive=False):
                yield Result.from_list_item(item)


def main():
    with TsinghuaSite() as site:
        query = '尹至'
        results = tuple(site.yield_results(query))
        # assert any(query in r.caption for r in results)
        for result in results:
            print(result)


if __name__ == '__main__':
    main()
The code seems to work correctly, but I would still like suggestions on how to improve it.
Output:
Scraping page 1/2.
Scraping page 2/2.
Result(caption='出土文献研究与保护中心2020年报', when=datetime.date(2021, 4, 9), path='info/1041/2615.htm')
Result(caption='《战国秦汉文字与文献论稿》出版', when=datetime.date(2020, 7, 17), path='info/1012/1289.htm')
Result(caption='【光明日报】清华简十年:古书重现与古史新探', when=datetime.date(2018, 12, 25), path='info/1072/1551.htm')
Result(caption='《清華簡與古史探賾》出版', when=datetime.date(2018, 8, 30), path='info/1012/1436.htm')
Result(caption='【出土文獻第九輯】鄔可晶:《尹至》"惟(肉哉)虐德暴(身童)亡典"句試解', when=datetime.date(2018, 5, 24), path='info/1073/1952.htm')
Result(caption='【出土文獻第五輯】袁金平:從《尹至》篇"播"字的討論談文義對文字考釋的重要性', when=datetime.date(2018, 4, 26), path='info/1081/2378.htm')
Result(caption='【出土文獻第五輯】袁金平:從《尹至》篇"播"字的討論談文義對文字考釋的重要性', when=datetime.date(2018, 4, 26), path='info/1081/2378.htm')
Result(caption='【出土文獻第二輯】羅 琨:讀《尹至》"自夏徂亳"', when=datetime.date(2018, 4, 12), path='info/1081/2283.htm')
Result(caption='【出土文獻第二輯】羅 琨:讀《尹至》"自夏徂亳"', when=datetime.date(2018, 4, 12), path='info/1081/2283.htm')
Result(caption='《出土文獻》(第九輯)出版', when=datetime.date(2016, 10, 26), path='info/1012/1411.htm')
Result(caption='《出土文獻研究》第十三輯出版', when=datetime.date(2015, 4, 8), path='info/1012/1396.htm')
Result(caption='清華大學藏戰國竹簡第五冊相關研究論文', when=datetime.date(2015, 4, 8), path='info/1081/2215.htm')
Result(caption='清華大學藏戰國竹簡第五冊相關研究論文', when=datetime.date(2015, 4, 8), path='info/1081/2215.htm')
Result(caption='《出土文獻》(第五輯)出版', when=datetime.date(2014, 10, 13), path='info/1012/1393.htm')
Result(caption='清华简入选《国家珍贵古籍名录》', when=datetime.date(2013, 12, 11), path='info/1072/1496.htm')
Result(caption='出土文献研究与保护中心2020年报', when=datetime.date(2021, 4, 9), path='info/1041/2615.htm')
Result(caption='《战国秦汉文字与文献论稿》出版', when=datetime.date(2020, 7, 17), path='info/1012/1289.htm')
Result(caption='【光明日报】清华简十年:古书重现与古史新探', when=datetime.date(2018, 12, 25), path='info/1072/1551.htm')
Result(caption='《清華簡與古史探賾》出版', when=datetime.date(2018, 8, 30), path='info/1012/1436.htm')
Result(caption='【出土文獻第九輯】鄔可晶:《尹至》"惟(肉哉)虐德暴(身童)亡典"句試解', when=datetime.date(2018, 5, 24), path='info/1073/1952.htm')
Result(caption='【出土文獻第五輯】袁金平:從《尹至》篇"播"字的討論談文義對文字考釋的重要性', when=datetime.date(2018, 4, 26), path='info/1081/2378.htm')
Result(caption='【出土文獻第五輯】袁金平:從《尹至》篇"播"字的討論談文義對文字考釋的重要性', when=datetime.date(2018, 4, 26), path='info/1081/2378.htm')
Result(caption='【出土文獻第二輯】羅 琨:讀《尹至》"自夏徂亳"', when=datetime.date(2018, 4, 12), path='info/1081/2283.htm')
Result(caption='【出土文獻第二輯】羅 琨:讀《尹至》"自夏徂亳"', when=datetime.date(2018, 4, 12), path='info/1081/2283.htm')
Result(caption='《出土文獻》(第九輯)出版', when=datetime.date(2016, 10, 26), path='info/1012/1411.htm')
Result(caption='《出土文獻研究》第十三輯出版', when=datetime.date(2015, 4, 8), path='info/1012/1396.htm')
Result(caption='清華大學藏戰國竹簡第五冊相關研究論文', when=datetime.date(2015, 4, 8), path='info/1081/2215.htm')
Result(caption='清華大學藏戰國竹簡第五冊相關研究論文', when=datetime.date(2015, 4, 8), path='info/1081/2215.htm')
Result(caption='《出土文獻》(第五輯)出版', when=datetime.date(2014, 10, 13), path='info/1012/1393.htm')
Result(caption='清华简入选《国家珍贵古籍名录》', when=datetime.date(2013, 12, 11), path='info/1072/1496.htm')
1 Answer
Scraping code is often ugly, and this is no exception. There's not much you can do, but it's important to work extra hard to make it readable.
Error handling is missing (failures simply raise), but that's okay if that's what you want; leaving it out keeps the code more readable.
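If you ever do want friendlier failures, a minimal sketch (the SearchError name is invented here; it is not part of your code or of requests) would be to wrap the request:

from requests import RequestException

class SearchError(Exception):
    """Hypothetical error type for a failed search; not part of the original code."""

# inside search(), around the POST:
try:
    resp = self.session.post(...)  # same URL, params and data as before
    resp.raise_for_status()
except RequestException as exc:
    # surface which query failed instead of a bare traceback
    raise SearchError(f'search for {query!r} failed') from exc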
search is too long and nested too deeply. You should limit the number of layers of indentation much more aggressively for readability. Split the middle out into a search_page helper (it will also need the query) to make the pagination logic easier to understand:
def search_page(self, url, query, page):
    with self.session.get(
        url,
        params={
            'wbtreeid': 1001,
            'newskeycode2': b64encode(query.encode()),
            'searchScope': '0',
            'currentnum': page,
        },
    ) as resp:
        resp.raise_for_status()
        return BeautifulSoup(markup=resp.text, features='html.parser', parse_only=self.subdoc)
if n_pages > 1:
    docs = []
    for page in count(1):
        doc = self.search_page(resp.url, query, page)
        print(f"Scraping page {page}/{n_pages}.")
        docs.append(doc)
        if page >= n_pages:
            yield from docs
            break
    else:
        doc = BeautifulSoup(markup=resp.text, features='html.parser', parse_only=self.subdoc)
        yield doc
You have an error here: you meant this to be an if-else, not a for-else. Because the loop always exits via break, the else attached to the for can never run. You would likely have caught the error with less nesting.
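As a reminder of the semantics (a standalone toy example, unrelated to the scraper): a for-else only runs its else when the loop finishes without hitting break, so a loop that always breaks makes the else dead code.

def find_first_even(numbers):
    for n in numbers:
        if n % 2 == 0:
            print(f'found {n}')
            break
    else:
        # runs only if the loop completed without a break
        print('no even number found')

find_first_even([1, 3, 5])   # no even number found
find_first_even([1, 2, 3])   # found 2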
There is no reason to add the indentation of
with session.get(...) as resp:
    resp.raise_for_status()
    ...
This is only needed for streaming output. Just use
resp = session.get(...)
resp.raise_for_status()
...
Don't use count; use range, which is more standard. The entire second loop can be replaced with:
if n_pages > 1:
    for page in range(1, n_pages + 1):
        print(f"Scraping page {page}/{n_pages}.")
        yield self.search_page(resp.url, query, page)
else:
    yield BeautifulSoup(markup=resp.text, features='html.parser', parse_only=self.subdoc)
The last change to make here is to simplify out one more layer (and make one fewer request) by assuming the initial response is already the first page of results:
print(f"Scraping page 1/{n_pages}.")
yield BeautifulSoup(markup=resp.text, features='html.parser', parse_only=self.subdoc)
for page in range(2, n_pages+1):
print(f"Scraping page {page}/{n_pages}.")
yield search_page(resp.url, page)
You can check in your output that you were printing the first page twice (info/1012/1289.htm appears twice, for example), so this change is better for the user too.
To clean up search further, separate page fetching from scraping more cleanly: yield resp.text rather than BeautifulSoup documents, and add a helper that turns an iterable of HTML strings into an iterable of parsed documents. This removes some repetition.
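A rough sketch of that shape (the names fetch_pages and parse_docs are invented here, and the fetching body is only indicated):

def fetch_pages(self, query):
    """Yield the raw HTML (resp.text) of each results page as a string."""
    ...  # the POST for the first page and the GETs for later pages go here

def parse_docs(self, pages):
    """Turn an iterable of HTML strings into parsed result documents."""
    for html in pages:
        yield BeautifulSoup(markup=html, features='html.parser', parse_only=self.subdoc)

def search(self, query):
    return self.parse_docs(self.fetch_pages(query))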
Finally, there is some repetition in the parameters between grabbing the main page and the search pages. Currently you use the main page URL to reduce this, which is fine, but you could also have the main page go through search_page(1) and remove the first set of logic altogether. Whether that works depends on the website, which I'm not familiar with.
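If the site does accept it, the shape might be something like the following (untested; fetch_page is a hypothetical text-returning variant of search_page, and this assumes the GET endpoint serves the same pagination table as the POST, which you would need to verify):

def search(self, query):
    first_html = self.fetch_page(query, 1)   # hypothetical helper returning resp.text for one page
    pages = BeautifulSoup(markup=first_html, features='html.parser', parse_only=self.pagination)
    n_pages = int(re.search(r'\d+', list(pages.select_one('td').children)[4])[0])

    yield BeautifulSoup(markup=first_html, features='html.parser', parse_only=self.subdoc)
    for page in range(2, n_pages + 1):
        html = self.fetch_page(query, page)
        yield BeautifulSoup(markup=html, features='html.parser', parse_only=self.subdoc)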