
This is a follow-up to my question over here.

I adapted @Reinderien's suggested script for a second website, below:

fudan.py

from contextlib import contextmanager
from dataclasses import dataclass
from datetime import datetime, date
from pathlib import Path
from typing import Iterable, Optional, ContextManager

# pip install proxy.py
import proxy
from proxy.http.exception import HttpRequestRejected
from proxy.http.parser import HttpParser
from proxy.http.proxy import HttpProxyBasePlugin
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver import Firefox, FirefoxProfile
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


@dataclass
class PrimaryResult:
    caption: str
    date: date
    link: str

    @classmethod
    def from_row(cls, row: WebElement) -> 'PrimaryResult':
        sno, caption, viewed_number, published_date = row.find_elements_by_xpath('td')
        caption_links = caption.find_elements_by_tag_name('a')[0]
        published_date = date.isoformat(datetime.strptime(published_date.text, '%Y/%m/%d'))
        return cls(
            caption=caption_links.text,
            date=published_date,
            link=caption_links.get_attribute('href'),
        )

    def __str__(self):
        return (
            f'題名 {self.caption}'
            f'\n發表時間 {self.date}'
            f'\n文章連結 {self.link}'
        )


class MainPage:
    def __init__(self, driver: WebDriver):
        self.driver = driver

    def submit_search(self, keyword: str) -> None:
        wait = WebDriverWait(self.driver, 100)
        search = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'text2'))
        )
        search.send_keys(keyword)
        search.submit()

    def get_element_and_stop_page(self, *locator) -> WebElement:
        ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)
        wait = WebDriverWait(self.driver, 30, ignored_exceptions=ignored_exceptions)
        elm = wait.until(EC.presence_of_element_located(locator))
        self.driver.execute_script("window.stop();")
        return elm

    def next_page(self) -> None:
        try:
            link = self.get_element_and_stop_page(By.LINK_TEXT, "下页")
        except TimeoutException:
            print("No button with 「下页」 found.")
            return
        try:
            link.click()
            print("Navigating to Next Page")
        except (TimeoutException, WebDriverException):
            print("Last page reached")


class SearchResults:
    def __init__(self, driver: WebDriver):
        self.driver = driver

    def get_structured_elements(self) -> Iterable[PrimaryResult]:
        rows = self.driver.find_elements_by_xpath(
            '//table[1]/tbody/tr[position() > 1]'
        )
        for row in rows:
            yield PrimaryResult.from_row(row)


# class ContentFilterPlugin(HttpProxyBasePlugin):
#     HOST_WHITELIST = {
#         b'ocsp.digicert.com',
#         b'ocsp.sca1b.amazontrust.com',
#         b'big5.oversea.cnki.net',
#         b'gwz.fudan.edu.cn',
#     }
#
#     def handle_client_request(self, request: HttpParser) -> Optional[HttpParser]:
#         host = request.host or request.header(b'Host')
#         if host not in self.HOST_WHITELIST:
#             raise HttpRequestRejected(403)
#         if any(
#             suffix in request.path
#             for suffix in (
#                 b'png', b'ico', b'jpg', b'gif', b'css',
#             )
#         ):
#             raise HttpRequestRejected(403)
#         return request
#
#     def before_upstream_connection(self, request):
#         return super().before_upstream_connection(request)
#
#     def handle_upstream_chunk(self, chunk):
#         return super().handle_upstream_chunk(chunk)
#
#     def on_upstream_connection_close(self):
#         pass
#
#
# @contextmanager
# def run_driver() -> ContextManager[WebDriver]:
#     prox_type = ProxyType.MANUAL['ff_value']
#     prox_host = '127.0.0.1'
#     prox_port = 8889
#     profile = FirefoxProfile()
#     profile.set_preference('network.proxy.type', prox_type)
#     profile.set_preference('network.proxy.http', prox_host)
#     profile.set_preference('network.proxy.ssl', prox_host)
#     profile.set_preference('network.proxy.http_port', prox_port)
#     profile.set_preference('network.proxy.ssl_port', prox_port)
#     profile.update_preferences()
#     plugin = f'{Path(__file__).stem}.{ContentFilterPlugin.__name__}'
#     with proxy.start((
#         '--hostname', prox_host,
#         '--port', str(prox_port),
#         '--plugins', plugin,
#     )), Firefox(profile) as driver:
#         yield driver


def fudan_search(keyword) -> None:
    with Firefox() as driver:
        driver.get('http://www.gwz.fudan.edu.cn/Web/Search?s=' + keyword + '&btnSearch=')
        # driver.get('http://www.gwz.fudan.edu.cn')
        page = MainPage(driver)
        # page.submit_search(keyword)
        primary_result_page = SearchResults(driver)
        primary_results = primary_result_page.get_structured_elements()
        for result in primary_results:
            print(result)
            print()
        page.next_page()


if __name__ == '__main__':
    fudan_search('人性論')

Output:

題名 梅廣:《大學》古本新訂
發表時間 2017年06月12日
文章連結 http://www.gwz.fudan.edu.cn/Web/Show/3063

題名 《楚地簡帛思想研究》(第四輯)出版
發表時間 2011年04月28日
文章連結 http://www.gwz.fudan.edu.cn/Web/Show/1481

題名 譚樸森先生捐贈圖書總目
發表時間 2008年06月02日
文章連結 http://www.gwz.fudan.edu.cn/Web/Show/448

題名 裘錫圭:由郭店簡〈性自命出〉的「室性者故也」說到《孟子》的「天下之言性也」章
發表時間 2008年01月27日
文章連結 http://www.gwz.fudan.edu.cn/Web/Show/326

No button with 「下页」 found.

I couldn't get the proxy to work here because the page (ironically) took ages to load when it was enabled.

In my old script below, I have a search function that loops through a list of search terms and compiles the results, page by page, into a caption_link dictionary. The URLs in that dictionary are then fed into driver.get requests to scrape the author, title, and download link of each article, if available. Because not all captions lead to academic articles (and there is no way to tell programmatically beforehand), I can only visit every link and check whether there is anything to download, via the loop_through_url function.
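To make the data flow concrete, the intermediate shapes look roughly like this (illustrative values only; the dates use the YYYY/MM/DD string format that main later parses):

# Shapes produced by captions_dict() below (illustrative values):
caption_link = {
    '梅廣:《大學》古本新訂': 'http://www.gwz.fudan.edu.cn/Web/Show/3063',
}
dates = ['2017/06/12']  # a parallel list, re-joined onto the results in main()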


fudan_old.py

import re
import time
from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import unquote
from urllib import request
from datetime import date, datetime
import json

# CONSTANTS
XPATH = {
    "captions": '//tbody/tr/td[2]/a',
    "date_published": '//tbody/tr[position() > 1]/td[4]',
    "max_page_num": "//table[2]/tbody/tr/td[1]",
    "downloads": "/html/body/div[2]/div[2]/div/div[2]/span/p/a",
}


# Initialize driver
def driver_init(keyword):
    global driver
    url = 'http://www.gwz.fudan.edu.cn/Web/Search?s=' + keyword
    options = Options()
    options.page_load_strategy = 'eager'
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(url)
    except:
        driver.refresh()
    return driver


def stop_loading_page_when_element_is_present(xpath):
    global driver
    wait = WebDriverWait(driver, 100)
    wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
    driver.execute_script("window.stop();")


def turn_page():
    global driver
    ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)
    wait = WebDriverWait(driver, 30, ignored_exceptions=ignored_exceptions)
    try:
        wait.until(EC.presence_of_element_located((By.LINK_TEXT, "下页")))
        driver.execute_script("window.stop();")
    except:
        print('No button with 「下页」 found.')
        return
    try:
        wait.until(
            EC.element_to_be_clickable((By.LINK_TEXT, "下页")))
        driver.find_element_by_link_text("下页").click()
        print("Navigating to Next Page")
    except (TimeoutException, WebDriverException):
        print("Last page reached")


def max_page_num():
    global driver
    elem = driver.find_element_by_xpath(XPATH['max_page_num'])
    text = elem.text
    max_pg = re.search("共.+?条记录, 页.+?/(.+)", text).group(1).strip()
    return int(max_pg)


def captions_dict():
    global driver
    ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)
    wait = WebDriverWait(driver, 30, ignored_exceptions=ignored_exceptions)
    captions = []
    links = []
    dates = []
    for i in range(max_page_num()):
        content = wait.until(EC.presence_of_all_elements_located((By.XPATH, XPATH['captions'])))
        time.sleep(3)
        stop_loading_page_when_element_is_present(XPATH['captions'])
        for item in content:
            captions.append(item.text)
            links.append(item.get_attribute('href'))
        date_published = driver.find_elements_by_xpath(XPATH['date_published'])
        for item in date_published:
            dates.append(item.text)
        turn_page()
    # convert to dictionary to remove duplicated captions.
    caption_link = dict(zip(captions, links))
    driver.close()
    return caption_link, dates


def get_article():
    global driver
    try:
        caption = driver.find_element_by_class_name('title')
        author, title = caption.text.split(":")
        stop_loading_page_when_element_is_present("//*[ contains (text(), '点击下载附件' ) ]")
    except:
        return
    # dl = driver.find_element_by_xpath(" //*[ contains (text(), '点击下载附件' ) ]/parent::*")
    # dl = dl.find_element_by_tag_name('a')
    dl = driver.find_element_by_xpath("//*[contains (text(), '.doc') or contains (text(), '.pdf')]/parent::*/parent::*//a")
    download_link = unquote(dl.get_attribute('href'))
    if download_link:
        print("Article found!")
    if author == "網摘":
        author = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div/div[2]/span/div[1]/p[2]/b').text
        title = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div/div[2]/span/div[1]/h2/span').text
    rslt = {"author": author, "title": title, "url": download_link}
    return rslt


def loop_through_url(dict_with_url_as_values):
    keys_lst = list(dict_with_url_as_values.keys())
    url_lst = list(dict_with_url_as_values.values())
    downloadables = {}
    for i, item in enumerate(url_lst):
        global driver
        driver = webdriver.Firefox()
        try:
            driver.get(item)
        except:
            driver.refresh()
        stop_loading_page_when_element_is_present(
            "/html/body/div[2]/div[2]/div/div[2]/span")
        print("Visiting ", keys_lst[i])
        result = get_article()
        if result:
            if len(result) > 1:
                downloadables.update({keys_lst[i]: result})
        driver.close()
    return downloadables


def search(keyword, output_format="json"):
    """Loop through list of search terms and
    compile search results together."""
    global driver
    search_results = []
    not_found = []
    if isinstance(keyword, list):
        print("Searching through a list of", len(keyword), "keywords...\n")
        # items=list[map(lambda x: title_search(x), keyword)]
        for i, item in enumerate(keyword):
            single_search_result = search(item)
            if single_search_result:
                search_results.extend(single_search_result)
                print(i + 1, item)
            else:
                not_found.append(item)
        print("\n", len(not_found), " titles cannot be found:\n")
        print(*not_found, sep='\n')
        return search_results, not_found
    else:
        driver_init(keyword)
        stop_loading_page_when_element_is_present(XPATH['captions'])
        if output_format == "json":
            single_search_result, dates = captions_dict()
        # elif output_format == "bib":  # ignore for now.
        #     single_search_result = add_to_bib()
        # elif output_format == "zot":
        #     single_search_result = add_to_zotero()
        else:
            print("Invalid output format.")
        # driver.close()
        return single_search_result, dates


def main(keyword):
    caption_link = search(keyword)
    rslt = loop_through_url(caption_link[0])
    dates = caption_link[1]
    for i, k in enumerate(list(rslt.keys())):
        rslt[k]['date'] = date.isoformat(
            datetime.strptime(dates[i], '%Y/%m/%d'))
    with open('fudan_search_result.json', 'w') as file:
        file.write(str(date.today()))
        file.write("\n")
        json.dump(rslt, file, ensure_ascii=False, indent=4)
    print('Done!')
    return caption_link, rslt


if __name__ == '__main__':
    main('人性論')

Output:

{
    "梅廣:《大學》古本新訂": {
        "author": "梅廣",
        "title": "《大學》古本新訂",
        "url": "http://www.gwz.fudan.edu.cn/lunwen/1796梅廣:《大學》古本新訂.doc",
        "date": "2017年06月12日"
    },
    "裘錫圭:由郭店簡〈性自命出〉的「室性者故也」說到《孟子》的「天下之言性也」章": {
        "author": "裘錫圭",
        "title": "由郭店簡〈性自命出〉的「室性者故也」說到《孟子》的「天下之言性也」章",
        "url": "http://www.gwz.fudan.edu.cn/Web/Show/articles/up/0059由郭店簡〈性自命出〉的「室性者故也」說到《孟子》的「天下之言性也」章.doc",
        "date": "2011年04月28日"
    }
}

Questions:

  1. How do we improve on the old script by incorporating its search-loop functionality into the new script above?
  2. A demonstration of the Requests approach, if more suited to the task, would also be welcome!
  3. One issue with the old script is that the date needs to be put back into the result dictionary in main (a sketch of the alternative I have in mind follows below).
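For Question 3, what I have in mind is roughly the following (a hypothetical sketch, not implemented in the code above): have captions_dict() map each caption to a record that already carries its date, so that main does not need to zip a parallel dates list back on by index:

# Hypothetical reshaping inside captions_dict(), so main() needs no re-join:
caption_link = {
    caption: {'url': link, 'date': published}
    for caption, link, published in zip(captions, links, dates)
}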
Comments:
  • @Reinderien, I have removed the wuhan.py update to make the question more concise. You can ignore add_to_bib and add_to_zotero for now; the functions were inherited from my old cnki script, and I have yet to update them to work for the present situation.
  • @Reinderien, apologies, the code has been fixed.

1 Answer

I'm going to give a partial answer that ignores most of what you've done, because the approach is Selenium-based when that is really not needed here. Compared to some of your other questions, this website is very simple DOM-wise, and it follows a simple, standard pagination strategy. The biggest take-away here: when you're scraping, stay as close to the metal as you reasonably can. If you can jettison the entire browser and issue direct HTTP requests, you're bound to have an easier and faster time.

I am not going to attempt to write or re-write your visit loop. I recommend that you attempt this yourself, using Requests.
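If it helps as a starting point, here is a minimal sketch of what that loop might look like with Requests, reusing the Session and the Link dataclass from the suggested code below. The attachment selector is an assumption inferred from the old script's XPath (anchors ending in .doc or .pdf) and is not verified against the site:

from typing import Optional
from urllib.parse import unquote, urljoin

def get_article(session: Session, link: Link) -> Optional[dict]:
    # Fetch one detail page; Link.url is site-relative, e.g. /Web/Show/4798.
    with session.get(urljoin('http://www.gwz.fudan.edu.cn', link.url)) as resp:
        resp.raise_for_status()
        doc = BeautifulSoup(resp.text, 'html.parser')
    # Assumption: downloadable articles expose an anchor whose href ends in
    # .doc or .pdf; not every caption leads to one.
    anchor = doc.find('a', href=lambda h: h and h.lower().endswith(('.doc', '.pdf')))
    if anchor is None:
        return None
    return {'title': link.title, 'url': unquote(anchor['href'])}

Calling this for each Link yielded by get_all_links, and skipping the None results, would replace the whole loop_through_url/Firefox round-trip.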

Suggested

from dataclasses import dataclass
from datetime import date, datetime
from itertools import count
from typing import Dict, Iterable, List, Tuple

from bs4 import BeautifulSoup
from requests import Session


@dataclass
class Link:
    title: str
    url: str
    clicks: int
    replies: int
    added: date

    @classmethod
    def from_row(cls, props: Dict[str, str], url: str) -> 'Link':
        clicks, replies = props['点击/回复'].split('/')
        # Skip number=int(props['编号']) - this only has meaning within one page
        return cls(
            title=props['资源标题'],
            url=url,
            clicks=int(clicks),
            replies=int(replies),
            added=datetime.strptime(props['添加时间'], '%Y/%m/%d').date(),
        )

    def __str__(self):
        return f'{self.added.isoformat()} {self.url} {self.title}'


def get_page(session: Session, query: str, page: int) -> Tuple[List[Link], int]:
    with session.get(
        'http://www.gwz.fudan.edu.cn/Web/Search',
        params={
            's': query,
            'page': page,
        },
    ) as resp:
        resp.raise_for_status()
        doc = BeautifulSoup(resp.text, 'html.parser')

    table = doc.select_one('#tab table')
    heads = [h.text for h in table.select('tr.cap td')]
    links = []
    for row in table.find_all('tr', class_=''):
        cells = [td.text for td in row.find_all('td')]
        links.append(Link.from_row(
            props=dict(zip(heads, cells)),
            url=row.find('a')['href'],
        ))

    page_td = doc.select_one('#tab table:nth-child(2) td')
    n_pages = int(page_td.text.rsplit('/', 1)[1])
    return links, n_pages


def get_all_links(session: Session, query: str) -> Iterable[Link]:
    for page in count(1):
        links, n_pages = get_page(session, query, page)
        print(f'{page}/{n_pages}')
        yield from links
        if page >= n_pages:
            break


def main() -> None:
    with Session() as session:
        for link in get_all_links(session, '究'):
            print(link)


if __name__ == '__main__':
    main()

Output (truncated)

1/98
2021年06月10日 /Web/Show/4798 劉釗:關於《孟子》一處詞語訓釋和理解的辨正
2021年06月09日 /Web/Show/4797 中心劉釗教授、汪少華教授、陳劍教授分別當選 中國訓詁學研究會副會長、秘書長、理事
2021年06月05日 /Web/Show/4796 復旦大學出土文獻與古文字研究中心2021屆研究生完成論文答辯
2021年06月04日 /Web/Show/4795 贾连翔先生著《出土数字卦文献辑释》出版
2021年05月31日 /Web/Show/4794 王寧:清華簡拾《四告》之四的缺簡問題
2021年05月29日 /Web/Show/4793 汪少華、顏春峰先生點校《茶香室叢鈔》出版
2021年05月28日 /Web/Show/4792 梁春勝先生著《六朝石刻叢考》出版
2021年05月27日 /Web/Show/4791 李永康:京山蘇家壟與《定公五年》"稷"地考
2021年05月26日 /Web/Show/4790 陶安先生著《嶽麓秦簡〈爲獄等狀四種〉釋文注釋》(修訂本)出版
2021年05月24日 /Web/Show/4789 方稚松先生著《殷墟甲骨文五種外記事刻辭研究》出版
2021年05月23日 /Web/Show/4788 我中心2019級博士生王茁獲國務院政府特殊津貼
2021年05月20日 /Web/Show/4787 呂全義先生著《兩周基層地域性居民組織研究》出版
2021年05月20日 /Web/Show/4786 李洪財:談談漢簡草字的考釋方法
2021年05月17日 /Web/Show/4785 名和敏光:虎溪山漢簡"X日而憂置城Y歲"考釋
2021年05月14日 /Web/Show/4784 聞人軍:齊國六種量制之演變 --兼論《隋書·律曆志》"古斛之制"
2021年05月09日 /Web/Show/4783 吳麗婉:大維多利亞美術館藏一片卜甲再考釋
2021年04月30日 /Web/Show/4782 李永康:論西周時期的"伯仲稱謂" ——兼論春秋時期的"子伯仲稱謂"
2021年04月28日 /Web/Show/4781 任攀:尹灣漢簡《神烏賦》校釋
2021年04月26日 /Web/Show/4780 "第十二屆中古漢語國際學術研討會"徵集論文通知
2021年04月24日 /Web/Show/4779 聞人軍:周代射侯形制新考
2021年04月23日 /Web/Show/4778 裘錫圭:【囦丮】卣銘文補釋
2021年04月22日 /Web/Show/4776 李愛輝:國圖藏甲骨殘片補考
2021年04月18日 /Web/Show/4775 《半部學術史,一位李先生:李學勤先生學術成就與學術思想國際研討會論文集》出版
2021年04月18日 /Web/Show/4774 《李學勤文集》發佈會在清華大學召開
2021年04月13日 /Web/Show/4773 彭裕商先生著《漢語古文字學概論》出版
2021年04月12日 /Web/Show/4772 湯志彪先生著《晉系璽印彙編》出版
2021年04月10日 /Web/Show/4771 白一平、沙加爾先生著《上古漢語新構擬》出版
2021年04月09日 /Web/Show/4770 陳英傑先生著《金文與青銅器研究論集》出版
2021年04月08日 /Web/Show/4769 馬孟龍先生著《西漢侯國地理》(修訂本)出版
2021年04月02日 /Web/Show/4768 我中心裘錫圭教授著《老子今研》出版
2/98
2021年03月19日 /Web/Show/4767 迪迦:說楚地傳說中"穴熊"與"鬻熊"
2021年03月18日 /Web/Show/4766 抱小:海昏竹書《詩》校字一則
2021年03月05日 /Web/Show/4765 抱小:海昏竹書《詩》異文小札續
2021年03月02日 /Web/Show/4764 陳劍、龐琨:選擇學術就是選擇一種生活方式——專訪陳劍教授
2021年02月25日 /Web/Show/4762 劉海宇:據清華簡《四告》談《師同鼎》銘文首句的釋讀
2021年02月15日 /Web/Show/4761 王寧:海昏侯墓竹簡《易占》淺識
2021年02月08日 /Web/Show/4760 王冰:"虢季爲匽姬媵甗"乃姬姓間通婚證據辨正
2021年02月02日 /Web/Show/4759 劉釗:出土文獻與《山海經》新證
2021年01月30日 /Web/Show/4758 王寧:清華簡拾《四告》之二讀札
2021年01月26日 /Web/Show/4757 抱小:釋海昏木楬之"㧼"字
2021年01月25日 /Web/Show/4756 抱小:古文獻中所見從"勺"從"夕"之字訛誤之例
2021年01月25日 /Web/Show/4755 莊晗:《考工記》"皋陶"新釋
2021年01月21日 /Web/Show/4754 抱小:海昏竹書《保傅》"知=非色"臆解
2021年01月21日 /Web/Show/4753 抱小:《尸子》補校一則
2021年01月20日 /Web/Show/4752 抱小:海昏竹書《詩》異文小札
2021年01月18日 /Web/Show/4751 王寧:清華簡拾《四告》之三讀札二則
2021年01月17日 /Web/Show/4750 出土文獻與古文字研究青年學者訪談070:蕭毅
2021年01月13日 /Web/Show/4748 朱鳳瀚、蘇强先生主編《中國國家博物館館藏文物研究叢書·青銅器卷(商)》出版
2021年01月12日 /Web/Show/4747 出土文獻與古文字研究青年學者訪談069:禤健聰
2021年01月09日 /Web/Show/4745 出土文獻與古文字研究青年學者訪談068:马孟龙
2021年01月06日 /Web/Show/4744 出土文獻與古文字研究青年學者訪談067:方勇
2021年01月05日 /Web/Show/4743 董珊:樂從堂藏銅馬式考
2020年12月31日 /Web/Show/4740 《出土文獻與古文字研究》第九輯出版
2020年12月30日 /Web/Show/4739 出土文獻與古文字研究青年學者訪談066:門藝
2020年12月29日 /Web/Show/4738 李豪:上博簡"羹"字補釋
2020年12月26日 /Web/Show/4737 清華大學王子楊教授來我中心作講座("出土文獻與古文字研究雲講座"第九場)
2020年12月26日 /Web/Show/4736 出土文獻與古文字研究青年學者訪談065:王化平
2020年12月26日 /Web/Show/4735 李永康:春秋曾侯夫婦墓的認定與曾公求"至于桓莊"考
2020年12月25日 /Web/Show/4734 我中心刘钊教授主编《马王堆汉墓简帛文字全编》获评2020年度中华书局双十佳图书
2020年12月24日 /Web/Show/4733 出土文獻與古文字研究青年學者訪談064:吴毅强
