I want to get the whole list of PDF links on the page below:
'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
The problem is that the webpage uses JavaScript internally to render the links, so I could not get the PDF links.
I tried to parse it in various ways found through googling, but I failed. Can you suggest the proper way to solve the problem?
Below is the code I tried, which failed:
import os
import urllib.request
from bs4 import BeautifulSoup

def crawle_kiwoom_mletter():
    if not os.path.exists(dir_output_mletter):
        os.makedirs(dir_output_mletter)
    #urlformat = 'https://www.kiwoom.com/nkw.template.do?m=m0601010101&s_menu=ML&s_sqno=4784'
    urlformat = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
    index = -1
    while True:
        index = index + 1
        # NOTE: urlformat has no '{}' placeholder, so format(index) is a no-op here
        url = urlformat.format(index)
        print('processing {}...'.format(url))
        page = urllib.request.urlopen(url)
        soup = BeautifulSoup(page, 'lxml')
        #print_anchors(soup)
        print(soup.prettify())
        '''
        if browse_mbriefing_linkpages(soup) == False:
            break
        '''
        break
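Since the URL serves a frameset (note templateFrameSet in the path), the soup above contains only <frame> tags, not the letter list itself. A minimal sketch (the helper name is mine) that collects the inner frame URLs so they can be fetched directly:

import urllib.request
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def list_frame_urls(url):
    # A frameset page keeps its real content in child <frame>/<iframe> tags
    soup = BeautifulSoup(urllib.request.urlopen(url), 'lxml')
    return [urljoin(url, f['src'])
            for f in soup.find_all(['frame', 'iframe']) if f.get('src')]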
'''
https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/
'''
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html

class Render(QWebPage):
    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self.app.exec_()

    def _loadFinished(self, result):
        self.frame = self.mainFrame()
        self.app.quit()

def crawl_kiwoom_mletter2():
    url = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
    url = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000&source=&xdr='
    # This does the magic. Loads everything.
    r = Render(url)
    # result is a QString.
    result = r.frame.toHtml()
    print(result)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

'''
http://stackoverflow.com/questions/28289699/python-web-scraping-for-javascript-generated-content
'''

def crawl_kiwoom_mletter3():
    browser = webdriver.Firefox()
    url = 'http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000'
    browser.get(url)
    res = browser.page_source
    print(res)
    browser.close()
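Note that page_source here returns only the outer frameset shell, just like the urllib attempts. A minimal sketch, reusing the imports above and assuming the letter list sits in the first frame with links carrying class="file" as in the answer below (both are assumptions; inspect the page to confirm), that switches into the frame and waits for the links:

def crawl_kiwoom_mletter3_framed():
    browser = webdriver.Firefox()
    browser.get('http://www1.kiwoom.com/nkw.templateFrameSet.do?m=m0601010000')
    browser.switch_to.frame(0)  # assumption: the list lives in the first frame
    links = WebDriverWait(browser, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.file')))
    for link in links:
        print(link.get_attribute('onclick'))
    browser.quit()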
- Surely, Selenium is the way to go. – har07, Feb 21, 2016 at 6:08
- So what exactly are you trying to get? The links for the "Morning Letters" don't actually have links; they are AJAX calls that update the page with the PDF. Are you trying to download the PDFs, or reference the HTML page for the links? – Morgan G, Feb 21, 2016 at 6:41
- @MorganG I'm trying to download all the PDFs I haven't downloaded yet. Thanks to Kenavoz's code, I could download several PDFs. But I want to know how to analyze this kind of webpage in order to make a periodical downloader tool. – user1913171, Feb 21, 2016 at 17:42
1 Answer
Try this code, using Python 2 and BeautifulSoup 4:
from bs4 import BeautifulSoup
import re
import urllib, urllib2

def browse(page):
    # POST to the list endpoint that the site's JavaScript calls via AJAX
    url = 'http://bbn.kiwoom.com/bbn.marketConditionMLList.do'
    values = {
        'pageno': page,
        'basePath': '4',
        's_startdate': '20120822',
        's_enddate': '20200222',
    }
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)
    res = urllib2.urlopen(req)
    soup = BeautifulSoup(res.read())
    aTagAll = soup.find_all('a', {'class': 'file'})
    for aTag in aTagAll:
        downloadFile(getParams(aTag))

def getParams(aTag):
    # Pull the three openFile() arguments out of the link's onclick attribute
    params = {}
    m = re.search(r"openFile\('([^']*)','([^']*)','([^']*)", aTag['onclick'])
    params['realname'] = m.group(1)
    params['filename'] = m.group(2)
    params['snMakedate'] = m.group(3)
    return params

def downloadFile(params):
    print 'Downloading : %s' % params['filename']
    url = 'http://bbn.kiwoom.com/bbn.fileDownload.do'
    values = {
        's_realname': params['realname'],
        's_filename': params['filename'],
        's_snMakedate': params['snMakedate'],
        'pageno': '8',
        'basePath': '4'
    }
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)
    try:
        response = urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        print e.code
        print e.read()
        return  # skip writing if the download request failed
    # Write in binary mode so the PDF bytes are not mangled
    f = open(params['filename'], 'wb')
    f.write(response.read())
    f.close()

for pagenum in range(1, 59):  # pages 1 through 58
    browse(page=pagenum)
It gets all the links from the PDF list pages and parses them with the getParams function.
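For reference, the onclick value that getParams expects has this shape (a made-up example; the real values come from the site's markup):

onclick = "openFile('MM_20160219.pdf','Morning Letter.pdf','20160219');"
m = re.search(r"openFile\('([^']*)','([^']*)','([^']*)", onclick)
print m.groups()  # ('MM_20160219.pdf', 'Morning Letter.pdf', '20160219')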
The params, plus an additional basePath param, are sent to the download URL using the urllib2 Python module.
I suggest adding a delay between requests to avoid overloading the server.
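For example, a minimal way to add that pause between listing pages (the 2-second value is an arbitrary choice):

import time

for pagenum in range(1, 59):
    browse(page=pagenum)
    time.sleep(2)  # be polite to the server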
UPDATE:
It now browses pages 1 to 58 (the actual number of pages) and parses all the links.
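If you want to stay on Python 3, as in your own attempts, the same list-page POST can be written with urllib.request; a minimal sketch using the endpoint and parameters from the code above (the function name is mine):

import urllib.parse
import urllib.request

def browse_py3(page):
    url = 'http://bbn.kiwoom.com/bbn.marketConditionMLList.do'
    values = {
        'pageno': page,
        'basePath': '4',
        's_startdate': '20120822',
        's_enddate': '20200222',
    }
    # Encoding the form data and passing it to Request makes this a POST
    data = urllib.parse.urlencode(values).encode('ascii')
    return urllib.request.urlopen(urllib.request.Request(url, data)).read()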