I am trying to build on @Reinderien's answer to my previous question over here to add page iteration functionality to the code:
from base64 import b64encode
from datetime import date
from typing import Iterable, ClassVar, List

from attr import dataclass
from bs4 import BeautifulSoup, SoupStrainer, Tag
from requests import Session
import re
from itertools import count
from urllib.parse import urljoin

BASE_URL = 'https://www.ctwx.tsinghua.edu.cn'


@dataclass
class Result:
    caption: str
    when: date
    path: str

    @classmethod
    def from_list_item(cls, item: Tag) -> 'Result':
        return cls(
            caption=item.a.text,
            path=item.a['href'],
            when=date.fromisoformat(item.find('span', recursive=False).text),
        )


class TsinghuaSite:
    subdoc: ClassVar[SoupStrainer] = SoupStrainer(name='ul', class_='search_list')
    pagination: ClassVar[SoupStrainer] = SoupStrainer(name='table', class_='listFrame')

    def __init__(self):
        self.session = Session()

    def __enter__(self) -> 'TsinghuaSite':
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.session.close()

    def search(self, query: str) -> Iterable[List]:
        with self.session.post(
            urljoin(BASE_URL, 'search.jsp'),
            params={'wbtreeid': 1001},
            data={
                'lucenenewssearchkey': b64encode(query.encode()),
                '_lucenesearchtype': '1',
                'searchScope': '0',
                'x': '0',
                'y': '0',
            },
        ) as resp:
            resp.raise_for_status()
            pages = BeautifulSoup(markup=resp.text, features='html.parser', parse_only=self.pagination)
            n_pages_string = list(pages.select_one('td').children)[4]
            n_pages = int(re.search(r'\d+', n_pages_string)[0])
            if n_pages > 1:
                docs = []
                for page in count(1):
                    with self.session.get(
                        resp.url,
                        params={
                            'wbtreeid': 1001,
                            'newskeycode2': b64encode(query.encode()),
                            'searchScope': '0',
                            'currentnum': page,
                        },
                    ) as resp:
                        resp.raise_for_status()
                        doc = BeautifulSoup(markup=resp.text, features='html.parser', parse_only=self.subdoc)
                    print(f"Scraping page {page}/{n_pages}.")
                    docs.append(doc)
                    if page >= n_pages:
                        yield from docs
                        break
                else:
                    doc = BeautifulSoup(markup=resp.text, features='html.parser', parse_only=self.subdoc)
                    yield doc

    def yield_results(self, query) -> Iterable[Result]:
        doc_gen = self.search(query)
        for doc in doc_gen:
            for item in doc.find('ul', recursive=False).find_all('li', recursive=False):
                yield Result.from_list_item(item)


def main():
    with TsinghuaSite() as site:
        query = '尹至'
        results = tuple(site.yield_results(query))
        # assert any(query in r.caption for r in results)
        for result in results:
            print(result)


if __name__ == '__main__':
    main()
The code seems to work correctly, but I would still like suggestions on how to improve it.
Output:
Scraping page 1/2.
Scraping page 2/2.
Result(caption='出土文献研究与保护中心2020年报', when=datetime.date(2021, 4, 9), path='info/1041/2615.htm')
Result(caption='《战国秦汉文字与文献论稿》出版', when=datetime.date(2020, 7, 17), path='info/1012/1289.htm')
Result(caption='【光明日报】清华简十年:古书重现与古史新探', when=datetime.date(2018, 12, 25), path='info/1072/1551.htm')
Result(caption='《清華簡與古史探賾》出版', when=datetime.date(2018, 8, 30), path='info/1012/1436.htm')
Result(caption='【出土文獻第九輯】鄔可晶:《尹至》"惟(肉哉)虐德暴(身童)亡典"句試解', when=datetime.date(2018, 5, 24), path='info/1073/1952.htm')
Result(caption='【出土文獻第五輯】袁金平:從《尹至》篇"播"字的討論談文義對文字考釋的重要性', when=datetime.date(2018, 4, 26), path='info/1081/2378.htm')
Result(caption='【出土文獻第五輯】袁金平:從《尹至》篇"播"字的討論談文義對文字考釋的重要性', when=datetime.date(2018, 4, 26), path='info/1081/2378.htm')
Result(caption='【出土文獻第二輯】羅 琨:讀《尹至》"自夏徂亳"', when=datetime.date(2018, 4, 12), path='info/1081/2283.htm')
Result(caption='【出土文獻第二輯】羅 琨:讀《尹至》"自夏徂亳"', when=datetime.date(2018, 4, 12), path='info/1081/2283.htm')
Result(caption='《出土文獻》(第九輯)出版', when=datetime.date(2016, 10, 26), path='info/1012/1411.htm')
Result(caption='《出土文獻研究》第十三輯出版', when=datetime.date(2015, 4, 8), path='info/1012/1396.htm')
Result(caption='清華大學藏戰國竹簡第五冊相關研究論文', when=datetime.date(2015, 4, 8), path='info/1081/2215.htm')
Result(caption='清華大學藏戰國竹簡第五冊相關研究論文', when=datetime.date(2015, 4, 8), path='info/1081/2215.htm')
Result(caption='《出土文獻》(第五輯)出版', when=datetime.date(2014, 10, 13), path='info/1012/1393.htm')
Result(caption='清华简入选《国家珍贵古籍名录》', when=datetime.date(2013, 12, 11), path='info/1072/1496.htm')
Result(caption='出土文献研究与保护中心2020年报', when=datetime.date(2021, 4, 9), path='info/1041/2615.htm')
Result(caption='《战国秦汉文字与文献论稿》出版', when=datetime.date(2020, 7, 17), path='info/1012/1289.htm')
Result(caption='【光明日报】清华简十年:古书重现与古史新探', when=datetime.date(2018, 12, 25), path='info/1072/1551.htm')
Result(caption='《清華簡與古史探賾》出版', when=datetime.date(2018, 8, 30), path='info/1012/1436.htm')
Result(caption='【出土文獻第九輯】鄔可晶:《尹至》"惟(肉哉)虐德暴(身童)亡典"句試解', when=datetime.date(2018, 5, 24), path='info/1073/1952.htm')
Result(caption='【出土文獻第五輯】袁金平:從《尹至》篇"播"字的討論談文義對文字考釋的重要性', when=datetime.date(2018, 4, 26), path='info/1081/2378.htm')
Result(caption='【出土文獻第五輯】袁金平:從《尹至》篇"播"字的討論談文義對文字考釋的重要性', when=datetime.date(2018, 4, 26), path='info/1081/2378.htm')
Result(caption='【出土文獻第二輯】羅 琨:讀《尹至》"自夏徂亳"', when=datetime.date(2018, 4, 12), path='info/1081/2283.htm')
Result(caption='【出土文獻第二輯】羅 琨:讀《尹至》"自夏徂亳"', when=datetime.date(2018, 4, 12), path='info/1081/2283.htm')
Result(caption='《出土文獻》(第九輯)出版', when=datetime.date(2016, 10, 26), path='info/1012/1411.htm')
Result(caption='《出土文獻研究》第十三輯出版', when=datetime.date(2015, 4, 8), path='info/1012/1396.htm')
Result(caption='清華大學藏戰國竹簡第五冊相關研究論文', when=datetime.date(2015, 4, 8), path='info/1081/2215.htm')
Result(caption='清華大學藏戰國竹簡第五冊相關研究論文', when=datetime.date(2015, 4, 8), path='info/1081/2215.htm')
Result(caption='《出土文獻》(第五輯)出版', when=datetime.date(2014, 10, 13), path='info/1012/1393.htm')
Result(caption='清华简入选《国家珍贵古籍名录》', when=datetime.date(2013, 12, 11), path='info/1072/1496.htm')
1 Answer
Scraping code is often ugly, and this is no exception. There's not much you can do, but it's important to work extra hard to make it readable.
Error handling is missing (failures simply raise), but that's okay if that's what you want; leaving it out keeps the code more readable.
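If you ever do want friendlier failures, a minimal sketch (the SearchError name is invented here; it is not part of your code or of requests) would be to wrap the request:

from requests import RequestException

class SearchError(Exception):
    """Hypothetical error type for a failed search; not part of the original code."""

# inside search(), around the POST:
try:
    resp = self.session.post(...)  # same URL, params and data as before
    resp.raise_for_status()
except RequestException as exc:
    # surface which query failed instead of a bare traceback
    raise SearchError(f'search for {query!r} failed') from exc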
search is too long and nested too deeply. You should limit the number of layers of indentation much more aggressively for readability. Split the middle out into a search_page helper (it will also need the query) to make the pagination logic easier to understand:
def search_page(self, url, query, page):
    with self.session.get(
        url,
        params={
            'wbtreeid': 1001,
            'newskeycode2': b64encode(query.encode()),
            'searchScope': '0',
            'currentnum': page,
        },
    ) as resp:
        resp.raise_for_status()
        return BeautifulSoup(markup=resp.text, features='html.parser', parse_only=self.subdoc)
if n_pages > 1:
    docs = []
    for page in count(1):
        doc = self.search_page(resp.url, query, page)
        print(f"Scraping page {page}/{n_pages}.")
        docs.append(doc)
        if page >= n_pages:
            yield from docs
            break
    else:
        doc = BeautifulSoup(markup=resp.text, features='html.parser', parse_only=self.subdoc)
        yield doc
You have an error here: you meant this to be an if-else, not a for-else. Because the loop always exits via break, the else attached to the for can never run. You would likely have caught the error with less nesting.
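As a reminder of the semantics (a standalone toy example, unrelated to the scraper): a for-else only runs its else when the loop finishes without hitting break, so a loop that always breaks makes the else dead code.

def find_first_even(numbers):
    for n in numbers:
        if n % 2 == 0:
            print(f'found {n}')
            break
    else:
        # runs only if the loop completed without a break
        print('no even number found')

find_first_even([1, 3, 5])   # no even number found
find_first_even([1, 2, 3])   # found 2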
There is no reason to add the indentation of
with session.get(...) as resp:
    resp.raise_for_status()
    ...
This is only needed for streaming output. Just use
resp = session.get(...)
resp.raise_for_status()
...
Don't use count; use range, which is more standard. The entire second loop can be replaced with:
if n_pages > 1:
    for page in range(1, n_pages + 1):
        print(f"Scraping page {page}/{n_pages}.")
        yield self.search_page(resp.url, query, page)
else:
    yield BeautifulSoup(markup=resp.text, features='html.parser', parse_only=self.subdoc)
The last change to make here is to simplify out one more layer (and make one fewer request) by assuming the initial response is already the first page of results:
print(f"Scraping page 1/{n_pages}.")
yield BeautifulSoup(markup=resp.text, features='html.parser', parse_only=self.subdoc)
for page in range(2, n_pages+1):
print(f"Scraping page {page}/{n_pages}.")
yield search_page(resp.url, page)
You can check in your output that you were printing the first page twice (info/1012/1289.htm appears twice, for example), so this change is better for the user too.
To clean up search further, separate page fetching from scraping more cleanly: yield resp.text rather than BeautifulSoup documents, and add a helper that turns an iterable of HTML strings into an iterable of parsed documents. This removes some repetition.
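A rough sketch of that shape (the names fetch_pages and parse_docs are invented here, and the fetching body is only indicated):

def fetch_pages(self, query):
    """Yield the raw HTML (resp.text) of each results page as a string."""
    ...  # the POST for the first page and the GETs for later pages go here

def parse_docs(self, pages):
    """Turn an iterable of HTML strings into parsed result documents."""
    for html in pages:
        yield BeautifulSoup(markup=html, features='html.parser', parse_only=self.subdoc)

def search(self, query):
    return self.parse_docs(self.fetch_pages(query))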
Finally, there is some repetition in the parameters between grabbing the main page and the search pages. Currently you use the main page URL to reduce this, which is fine, but you could also have the main page go through search_page(1) and remove the first set of logic altogether. Whether that works depends on the website, which I'm not familiar with.
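If the site does accept it, the shape might be something like the following (untested; fetch_page is a hypothetical text-returning variant of search_page, and this assumes the GET endpoint serves the same pagination table as the POST, which you would need to verify):

def search(self, query):
    first_html = self.fetch_page(query, 1)   # hypothetical helper returning resp.text for one page
    pages = BeautifulSoup(markup=first_html, features='html.parser', parse_only=self.pagination)
    n_pages = int(re.search(r'\d+', list(pages.select_one('td').children)[4])[0])

    yield BeautifulSoup(markup=first_html, features='html.parser', parse_only=self.subdoc)
    for page in range(2, n_pages + 1):
        html = self.fetch_page(query, page)
        yield BeautifulSoup(markup=html, features='html.parser', parse_only=self.subdoc)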