{
    "梅廣:《大學》古本新訂": {
        "author": "梅廣",
        "title": "《大學》古本新訂",
        "url": "http://www.gwz.fudan.edu.cn/lunwen/1796梅廣:《大學》古本新訂.doc",
        "date": "2017-06-12"
    },
    "裘錫圭:由郭店簡〈性自命出〉的「室性者故也」說到《孟子》的「天下之言性也」章": {
        "author": "裘錫圭",
        "title": "由郭店簡〈性自命出〉的「室性者故也」說到《孟子》的「天下之言性也」章",
        "url": "http://www.gwz.fudan.edu.cn/Web/Show/articles/up/0059由郭店簡〈性自命出〉的「室性者故也」說到《孟子》的「天下之言性也」章.doc",
        "date": "2011-04-28"
    }
}
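# The script below scrapes search results from www.gwz.fudan.edu.cn and
# writes records like the sample output above to fudan_search_result.json.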
import re
import time
import json
from datetime import date, datetime
from urllib.parse import unquote

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
    WebDriverException,
)
# CONSTANTS
# XPath selectors for the search-result pages of www.gwz.fudan.edu.cn.
XPATH = {
    "captions": '//tbody/tr/td[2]/a',                       # article title links
    "date_published": '//tbody/tr[position() > 1]/td[4]',   # dates, skipping the header row
    "max_page_num": "//table[2]/tbody/tr/td[1]",            # "共…条记录, 页…/N" pagination cell
    "downloads": "/html/body/div[2]/div[2]/div/div[2]/span/p/a",  # attachment links
}
# Initialize driver
def driver_init(keyword):
    global driver
    url = 'http://www.gwz.fudan.edu.cn/Web/Search?s=' + keyword
    options = Options()
    # 'eager' hands back control once the DOM is ready, without waiting for
    # images and other subresources of this slow-loading site.
    options.page_load_strategy = 'eager'
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(url)
    except WebDriverException:
        driver.refresh()
    return driver
def stop_loading_page_when_element_is_present(xpath):
    global driver
    wait = WebDriverWait(driver, 100)
    wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
    # Once the element we need is in the DOM, abort the rest of the page load.
    driver.execute_script("window.stop();")
def turn_page():
    global driver
    ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)
    wait = WebDriverWait(driver, 30, ignored_exceptions=ignored_exceptions)
    try:
        # 「下页」 is the "next page" link in the pagination bar.
        wait.until(EC.presence_of_element_located((By.LINK_TEXT, "下页")))
        driver.execute_script("window.stop();")
    except TimeoutException:
        print('No button with 「下页」 found.')
        return
    try:
        wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "下页")))
        driver.find_element(By.LINK_TEXT, "下页").click()
        print("Navigating to next page")
    except (TimeoutException, WebDriverException):
        print("Last page reached")
def max_page_num():
    global driver
    elem = driver.find_element(By.XPATH, XPATH['max_page_num'])
    # The pagination cell ends with "…/N"; capture N, the total page count.
    max_pg = re.search("共.+?条记录, 页.+?/(.+)", elem.text).group(1).strip()
    return int(max_pg)
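# Illustrative example (hypothetical cell text, not scraped from the live
# site): if the pagination cell reads "共45条记录, 页1/5", max_page_num()
# returns 5.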
def captions_dict():
    global driver
    ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)
    wait = WebDriverWait(driver, 30, ignored_exceptions=ignored_exceptions)
    captions = []
    links = []
    dates = []
    for _ in range(max_page_num()):
        content = wait.until(
            EC.presence_of_all_elements_located((By.XPATH, XPATH['captions'])))
        time.sleep(3)
        stop_loading_page_when_element_is_present(XPATH['captions'])
        for item in content:
            captions.append(item.text)
            links.append(item.get_attribute('href'))
        for item in driver.find_elements(By.XPATH, XPATH['date_published']):
            dates.append(item.text)
        turn_page()
    # Convert to dictionaries to remove duplicated captions; keying the dates
    # by caption keeps each date aligned with its article after deduplication.
    caption_link = dict(zip(captions, links))
    caption_date = dict(zip(captions, dates))
    driver.close()
    return caption_link, caption_date
def get_article():
    global driver
    try:
        caption = driver.find_element(By.CLASS_NAME, 'title')
        # Captions look like "作者:標題"; split on the first full-width colon
        # so titles that themselves contain a colon stay intact.
        author, title = caption.text.split(":", 1)
        # 「点击下载附件」 is the "click to download attachment" label.
        stop_loading_page_when_element_is_present("//*[contains(text(), '点击下载附件')]")
        # Find the anchor whose surrounding element mentions a .doc or .pdf file.
        dl = driver.find_element(
            By.XPATH,
            "//*[contains(text(), '.doc') or contains(text(), '.pdf')]"
            "/parent::*/parent::*//a")
    except (NoSuchElementException, TimeoutException, ValueError):
        return None
    download_link = unquote(dl.get_attribute('href'))
    if not download_link:
        return None
    print("Article found!")
    if author == "網摘":  # "web digest" reposts name the real author in the body
        author = driver.find_element(
            By.XPATH, '/html/body/div[2]/div[2]/div/div[2]/span/div[1]/p[2]/b').text
        title = driver.find_element(
            By.XPATH, '/html/body/div[2]/div[2]/div/div[2]/span/div[1]/h2/span').text
    return {"author": author, "title": title, "url": download_link}
def loop_through_url(dict_with_url_as_values):
    global driver
    keys_lst = list(dict_with_url_as_values.keys())
    url_lst = list(dict_with_url_as_values.values())
    downloadables = {}
    for i, item in enumerate(url_lst):
        driver = webdriver.Firefox()
        try:
            driver.get(item)
        except WebDriverException:
            driver.refresh()
        stop_loading_page_when_element_is_present(
            "/html/body/div[2]/div[2]/div/div[2]/span")
        print("Visiting", keys_lst[i])
        result = get_article()
        if result:
            downloadables[keys_lst[i]] = result
        driver.close()
    return downloadables
def search(keyword, output_format="json"):
"""Loop through list of search terms and
compile search results together."""
global driver
search_results = []
not_found = []
if isinstance(keyword, list):
print("Searching through a list of", len(keyword), "keywords...\n")
# items=list[map(lambda x: title_search(x), keyword)]
for i, item in enumerate(keyword):
single_search_result=search(item)
if single_search_result:
search_results.extend(single_search_result)
print(i + 1, item)
else:
not_found.append(item)
print("\n", len(not_found)," titles cannot be found:\n")
print(*not_found, sep='\n')
return search_results, not_found
else:
driver_init(keyword)
stop_loading_page_when_element_is_present(XPATH['captions'])
if output_format == "json":
single_search_result, dates = captions_dict()
# elif output_format == "bib": # ignore for now.
# single_search_result = add_to_bib()
# elif output_format == "zot":
# single_search_result = add_to_zotero()
else:
print("Invalid output format.")
# driver.close()
return single_search_result, dates
def main(keyword):
    caption_link, caption_date = search(keyword)
    rslt = loop_through_url(caption_link)
    for k in rslt:
        # Dates on the site are formatted like "2017/6/12"; normalize to ISO.
        rslt[k]['date'] = datetime.strptime(caption_date[k], '%Y/%m/%d').date().isoformat()
    with open('fudan_search_result.json', 'w') as file:
        file.write(str(date.today()))
        file.write("\n")
        json.dump(rslt, file, ensure_ascii=False, indent=4)
    print('Done!')
    return caption_link, rslt


if __name__ == '__main__':
    main('人性論')
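
# A minimal usage sketch: search() also accepts a list of keywords (see the
# isinstance branch above) and then returns the per-keyword results plus the
# keywords that produced no hits. The second keyword below is illustrative
# only, not taken from the original script.
#
#     results, not_found = search(['人性論', '心性論'])
#     for caption_link, caption_date in results:
#         print(len(caption_link), "articles found")
#
# main() itself expects a single keyword string, as in the call above.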