{
    "梅廣:《大學》古本新訂": {
        "author": "梅廣",
        "title": "《大學》古本新訂",
        "url": "http://www.gwz.fudan.edu.cn/lunwen/1796梅廣:《大學》古本新訂.doc",
        "date": "2017-06-12"
    },
    "裘錫圭:由郭店簡〈性自命出〉的「室性者故也」說到《孟子》的「天下之言性也」章": {
        "author": "裘錫圭",
        "title": "由郭店簡〈性自命出〉的「室性者故也」說到《孟子》的「天下之言性也」章",
        "url": "http://www.gwz.fudan.edu.cn/Web/Show/articles/up/0059由郭店簡〈性自命出〉的「室性者故也」說到《孟子》的「天下之言性也」章.doc",
        "date": "2011-04-28"
    }
}
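# The script below scrapes search results from www.gwz.fudan.edu.cn and
# writes records like the sample output above to fudan_search_result.json.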
import re
import time
import json
from datetime import date, datetime
from urllib.parse import unquote

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
    WebDriverException,
)
# CONSTANTS
# XPath selectors for the search-result pages of www.gwz.fudan.edu.cn.
XPATH = {
    "captions": '//tbody/tr/td[2]/a',                       # article title links
    "date_published": '//tbody/tr[position() > 1]/td[4]',   # dates, skipping the header row
    "max_page_num": "//table[2]/tbody/tr/td[1]",            # "共…条记录, 页…/N" pagination cell
    "downloads": "/html/body/div[2]/div[2]/div/div[2]/span/p/a",  # attachment links
}
# Initialize driver
def driver_init(keyword):
    global driver
    url = 'http://www.gwz.fudan.edu.cn/Web/Search?s=' + keyword
    options = Options()
    # 'eager' hands back control once the DOM is ready, without waiting for
    # images and other subresources of this slow-loading site.
    options.page_load_strategy = 'eager'
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(url)
    except WebDriverException:
        driver.refresh()
    return driver
def stop_loading_page_when_element_is_present(xpath):
    global driver
    wait = WebDriverWait(driver, 100)
    wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
    # Once the element we need is in the DOM, abort the rest of the page load.
    driver.execute_script("window.stop();")
def turn_page():
    global driver
    ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)
    wait = WebDriverWait(driver, 30, ignored_exceptions=ignored_exceptions)
    try:
        # 「下页」 is the "next page" link in the pagination bar.
        wait.until(EC.presence_of_element_located((By.LINK_TEXT, "下页")))
        driver.execute_script("window.stop();")
    except TimeoutException:
        print('No button with 「下页」 found.')
        return
    try:
        wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "下页")))
        driver.find_element(By.LINK_TEXT, "下页").click()
        print("Navigating to next page")
    except (TimeoutException, WebDriverException):
        print("Last page reached")
def max_page_num():
    global driver
    elem = driver.find_element(By.XPATH, XPATH['max_page_num'])
    # The pagination cell ends with "…/N"; capture N, the total page count.
    max_pg = re.search("共.+?条记录, 页.+?/(.+)", elem.text).group(1).strip()
    return int(max_pg)
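# Illustrative example (hypothetical cell text, not scraped from the live
# site): if the pagination cell reads "共45条记录, 页1/5", max_page_num()
# returns 5.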
def captions_dict():
    global driver
    ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)
    wait = WebDriverWait(driver, 30, ignored_exceptions=ignored_exceptions)
    captions = []
    links = []
    dates = []
    for _ in range(max_page_num()):
        content = wait.until(
            EC.presence_of_all_elements_located((By.XPATH, XPATH['captions'])))
        time.sleep(3)
        stop_loading_page_when_element_is_present(XPATH['captions'])
        for item in content:
            captions.append(item.text)
            links.append(item.get_attribute('href'))
        for item in driver.find_elements(By.XPATH, XPATH['date_published']):
            dates.append(item.text)
        turn_page()
    # Convert to dictionaries to remove duplicated captions; keying the dates
    # by caption keeps each date aligned with its article after deduplication.
    caption_link = dict(zip(captions, links))
    caption_date = dict(zip(captions, dates))
    driver.close()
    return caption_link, caption_date
def get_article():
    global driver
    try:
        caption = driver.find_element(By.CLASS_NAME, 'title')
        # Captions look like "作者:標題"; split on the first full-width colon
        # so titles that themselves contain a colon stay intact.
        author, title = caption.text.split(":", 1)
        # 「点击下载附件」 is the "click to download attachment" label.
        stop_loading_page_when_element_is_present("//*[contains(text(), '点击下载附件')]")
        # Find the anchor whose surrounding element mentions a .doc or .pdf file.
        dl = driver.find_element(
            By.XPATH,
            "//*[contains(text(), '.doc') or contains(text(), '.pdf')]"
            "/parent::*/parent::*//a")
    except (NoSuchElementException, TimeoutException, ValueError):
        return None
    download_link = unquote(dl.get_attribute('href'))
    if not download_link:
        return None
    print("Article found!")
    if author == "網摘":  # "web digest" reposts name the real author in the body
        author = driver.find_element(
            By.XPATH, '/html/body/div[2]/div[2]/div/div[2]/span/div[1]/p[2]/b').text
        title = driver.find_element(
            By.XPATH, '/html/body/div[2]/div[2]/div/div[2]/span/div[1]/h2/span').text
    return {"author": author, "title": title, "url": download_link}
def loop_through_url(dict_with_url_as_values):
    global driver
    keys_lst = list(dict_with_url_as_values.keys())
    url_lst = list(dict_with_url_as_values.values())
    downloadables = {}
    for i, item in enumerate(url_lst):
        driver = webdriver.Firefox()
        try:
            driver.get(item)
        except WebDriverException:
            driver.refresh()
        stop_loading_page_when_element_is_present(
            "/html/body/div[2]/div[2]/div/div[2]/span")
        print("Visiting", keys_lst[i])
        result = get_article()
        if result:
            downloadables[keys_lst[i]] = result
        driver.close()
    return downloadables
def search(keyword, output_format="json"):
"""Loop through list of search terms and
compile search results together."""
global driver
search_results = []
not_found = []
if isinstance(keyword, list):
print("Searching through a list of", len(keyword), "keywords...\n")
# items=list[map(lambda x: title_search(x), keyword)]
for i, item in enumerate(keyword):
single_search_result=search(item)
if single_search_result:
search_results.extend(single_search_result)
print(i + 1, item)
else:
not_found.append(item)
print("\n", len(not_found)," titles cannot be found:\n")
print(*not_found, sep='\n')
return search_results, not_found
else:
driver_init(keyword)
stop_loading_page_when_element_is_present(XPATH['captions'])
if output_format == "json":
single_search_result, dates = captions_dict()
# elif output_format == "bib": # ignore for now.
# single_search_result = add_to_bib()
# elif output_format == "zot":
# single_search_result = add_to_zotero()
else:
print("Invalid output format.")
# driver.close()
return single_search_result, dates
def main(keyword):
    caption_link, caption_date = search(keyword)
    rslt = loop_through_url(caption_link)
    for k in rslt:
        # Dates on the site are formatted like "2017/6/12"; normalize to ISO.
        rslt[k]['date'] = datetime.strptime(caption_date[k], '%Y/%m/%d').date().isoformat()
    with open('fudan_search_result.json', 'w') as file:
        file.write(str(date.today()))
        file.write("\n")
        json.dump(rslt, file, ensure_ascii=False, indent=4)
    print('Done!')
    return caption_link, rslt


if __name__ == '__main__':
    main('人性論')
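
# A minimal usage sketch: search() also accepts a list of keywords (see the
# isinstance branch above) and then returns the per-keyword results plus the
# keywords that produced no hits. The second keyword below is illustrative
# only, not taken from the original script.
#
#     results, not_found = search(['人性論', '心性論'])
#     for caption_link, caption_date in results:
#         print(len(caption_link), "articles found")
#
# main() itself expects a single keyword string, as in the call above.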