From d7eeba208ef556b8e49c405e7027228a7fc8fe4d Mon Sep 17 00:00:00 2001 From: zhangzlg Date: 2017年9月15日 21:35:39 +0800 Subject: [PATCH 1/8] =?UTF-8?q?=E7=BD=91=E7=BB=9C=E7=88=AC=E8=99=AB?= =?UTF-8?q?=EF=BC=8C=E7=8E=B0=E5=9C=A8=E5=B7=B2=E7=BB=8F=E7=88=AC=E5=88=B0?= =?UTF-8?q?=E7=BD=91=E9=A1=B5=E5=8F=8A=E7=9B=B8=E5=85=B3MP4=20URL=EF=BC=8C?= =?UTF-8?q?=E4=B8=8B=E4=B8=80=E6=AD=A5=E9=9C=80=E8=A6=81=E5=B0=86=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E4=B8=8B=E8=BD=BD=E4=B8=8B=E6=9D=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebScrapingWithPython/yaoshe1/1.log | 43 ++++++++++++++++++++++++ WebScrapingWithPython/yaoshe1/index.py | 45 ++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 WebScrapingWithPython/yaoshe1/1.log create mode 100644 WebScrapingWithPython/yaoshe1/index.py diff --git a/WebScrapingWithPython/yaoshe1/1.log b/WebScrapingWithPython/yaoshe1/1.log new file mode 100644 index 0000000..62731fa --- /dev/null +++ b/WebScrapingWithPython/yaoshe1/1.log @@ -0,0 +1,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/WebScrapingWithPython/yaoshe1/index.py b/WebScrapingWithPython/yaoshe1/index.py new file mode 100644 index 0000000..e937465 --- /dev/null +++ b/WebScrapingWithPython/yaoshe1/index.py @@ -0,0 +1,45 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +# @Filename: index +# @Date : 2017年09月14日 23:15 +# @Author : zzl +""" + python_version 2.7.11 +""" +from urllib2 import urlopen +# from urllib2 import open +from bs4 import BeautifulSoup +import urllib2 +import re + +def opeVideoUrl(url): + html = urlopen(url).read() + ss = html.replace(" ","") + urls = re.findall(r"(http://www.yaoshe1.com/get_file/.*?mp4).*?",ss,re.I) + for i in urls: + print i + # urllib2.urlretrieve(i, "%s.mp4" % (i, )) + # else: + + # print 'this is over' + # bsObj = BeautifulSoup(html, "html5lib") + # print bsObj + + +html = urlopen("http://www.yaoshe1.com/") +# print(html.read()) +bsObj = BeautifulSoup(html, "html5lib") +itemsDivObj = bsObj.findAll("div",{"class":re.compile("^(item)((?!:).)*$")}) +for obj in itemsDivObj: + videosObjs = obj.findAll("a",{"href":re.compile("^(http://www.yaoshe1.com/videos/)((?!:).)*$")}) + # print("==================") + strHref = videosObjs[0].attrs["href"] + # print strHref + count = 0 + while count < 1: + count = count+1 + opeVideoUrl(strHref) + + + + From 6e562cec93866506b3547f34886baa6b5080475e Mon Sep 17 00:00:00 2001 From: zhangzlg Date: 2017年9月15日 21:36:40 +0800 Subject: [PATCH 2/8] test --- WebScrapingWithPython/python-scraping | 1 + WebScrapingWithPython/remark | 28 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 160000 WebScrapingWithPython/python-scraping diff --git a/WebScrapingWithPython/python-scraping b/WebScrapingWithPython/python-scraping new file mode 160000 index 0000000..cec78b7 --- /dev/null +++ b/WebScrapingWithPython/python-scraping @@ -0,0 +1 @@ +Subproject commit cec78b7cadf16f45249c443248f918d6efaee6d3 diff --git a/WebScrapingWithPython/remark b/WebScrapingWithPython/remark index a2051d1..6df378d 100644 --- a/WebScrapingWithPython/remark +++ b/WebScrapingWithPython/remark @@ -62,3 +62,31 @@ Tesseract 文档: https://github.com/tesseract-ocr/tesseract/wiki EditThis cookie http://www.editthiscookie.com + + +分布式计算 distributed computing +Tor代理服务器 + +PySocks python 代理服务器通信模埠 https://pypi.python.org/pypi/PySocks + +搜索引擎优化: Search Engine Optimization,SEO + robots.txt + +机器人排除标准: Robots Exclusion Standard robots.txt + + +# +# robots.txt for PHPWIND BOARD +# Version 5.x +# + +User-agent: * +Disallow: /admin/ +Disallow: /require/ +Disallow: /hack/ +Disallow: /attachment/ +Disallow: /images/ +Disallow: /data/ +Disallow: /ipdata/ +Disallow: /template/ + From ad70ac10f528c9cb8ca69649a2fac4b431fc7f0c Mon Sep 17 00:00:00 2001 From: zzl Date: 2017年9月24日 11:11:23 +0800 Subject: [PATCH 3/8] ok --- WebScrapingWithPython/yaoshe1/index.py | 35 +++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/WebScrapingWithPython/yaoshe1/index.py b/WebScrapingWithPython/yaoshe1/index.py index e937465..c7a835c 100644 --- a/WebScrapingWithPython/yaoshe1/index.py +++ b/WebScrapingWithPython/yaoshe1/index.py @@ -9,18 +9,46 @@ from urllib2 import urlopen # from urllib2 import open from bs4 import BeautifulSoup -import urllib2 +# import urllib import re +def getFile(url): + file_name = url.split('/')[-1] + u = urlopen(url) + f = open(file_name, 'wb') + + block_sz = 8192 + while True: + buffer = u.read(block_sz) + if not buffer: + break + + f.write(buffer) + f.close() + print "Sucessful to download" + " " + file_name + +def getHtml(url): + page = urllib2.urlopen(url) + html = page.read() + page.close() + return html + +# compile the regular expressions and find +# all stuff we need +def getUrl(html): + reg = r'(?:href|HREF)="?((?:http://)?.+?\.pdf)' + url_re = re.compile(reg) + url_lst = re.findall(url_re,html) + return(url_lst) + def opeVideoUrl(url): html = urlopen(url).read() ss = html.replace(" ","") urls = re.findall(r"(http://www.yaoshe1.com/get_file/.*?mp4).*?",ss,re.I) for i in urls: print i - # urllib2.urlretrieve(i, "%s.mp4" % (i, )) + getFile(i); # else: - # print 'this is over' # bsObj = BeautifulSoup(html, "html5lib") # print bsObj @@ -38,6 +66,7 @@ def opeVideoUrl(url): count = 0 while count < 1: count = count+1 + print("url = " + strHref) opeVideoUrl(strHref) From f243722bd9888f1f010c4059937c28ddf587b7ca Mon Sep 17 00:00:00 2001 From: zzl Date: 2017年9月24日 18:25:53 +0800 Subject: [PATCH 4/8] =?UTF-8?q?=E6=8A=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 3 + WebScrapingWithPython/yaoshe1/index.py | 14 ++- WebScrapingWithPython/yaoshe1/index2.py | 74 +++++++++++++ web_crawle/book/demo1.py | 94 +++++++++++++++++ web_crawle/book/demo2.py | 134 ++++++++++++++++++++++++ 5 files changed, 317 insertions(+), 2 deletions(-) create mode 100644 WebScrapingWithPython/yaoshe1/index2.py create mode 100644 web_crawle/book/demo1.py create mode 100644 web_crawle/book/demo2.py diff --git a/README.md b/README.md index 8df0858..9540b6e 100644 --- a/README.md +++ b/README.md @@ -16,3 +16,6 @@ **技术选型** - python version 2.7.11 - python version 3.4.0 + + +http://ns.giit.us/htm_data/7/1706/2470604.html \ No newline at end of file diff --git a/WebScrapingWithPython/yaoshe1/index.py b/WebScrapingWithPython/yaoshe1/index.py index c7a835c..642b52e 100644 --- a/WebScrapingWithPython/yaoshe1/index.py +++ b/WebScrapingWithPython/yaoshe1/index.py @@ -9,13 +9,23 @@ from urllib2 import urlopen # from urllib2 import open from bs4 import BeautifulSoup +import requests # import urllib import re +downLoadFile = 'H:\\happy\1円\\' ##要下载到的目录 + def getFile(url): + if(requests.get(url).status_code == 404): + print('这是个错误网址') + return [] + print ('正在打开 ',url) file_name = url.split('/')[-1] + file_s = downLoadFile + file_name u = urlopen(url) - f = open(file_name, 'wb') + # u = requests.urlopen(url) + + f = open(file_s, 'wb') block_sz = 8192 while True: @@ -28,7 +38,7 @@ def getFile(url): print "Sucessful to download" + " " + file_name def getHtml(url): - page = urllib2.urlopen(url) + page = urlopen(url) html = page.read() page.close() return html diff --git a/WebScrapingWithPython/yaoshe1/index2.py b/WebScrapingWithPython/yaoshe1/index2.py new file mode 100644 index 0000000..2d4c4b3 --- /dev/null +++ b/WebScrapingWithPython/yaoshe1/index2.py @@ -0,0 +1,74 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +# @Filename: index +# @Date : 2017年09月14日 23:15 +# @Author : zzl +""" + python_version 2.7.11 +""" +from urllib2 import urlopen +# from urllib2 import open +from bs4 import BeautifulSoup +# import urllib +import re + +def getFile(url): + file_name = url.split('/')[-1] + u = urlopen(url) + f = open(file_name, 'wb') + + block_sz = 8192 + while True: + buffer = u.read(block_sz) + if not buffer: + break + + f.write(buffer) + f.close() + print "Sucessful to download" + " " + file_name + +def getHtml(url): + page = urlopen(url) + html = page.read() + page.close() + return html + +# compile the regular expressions and find +# all stuff we need +def getUrl(html): + reg = r'(?:href|HREF)="?((?:http://)?.+?\.pdf)' + url_re = re.compile(reg) + url_lst = re.findall(url_re,html) + return(url_lst) + +def opeVideoUrl(url): + html = urlopen(url).read() + ss = html.replace(" ","") + urls = re.findall(r"(http://www.yaoshe1.com/get_file/.*?mp4).*?",ss,re.I) + for i in urls: + print i + getFile(i); + # else: + # print 'this is over' + # bsObj = BeautifulSoup(html, "html5lib") + # print bsObj + + +html = urlopen("http://www.yaoshe1.com/") +# print(html.read()) +bsObj = BeautifulSoup(html, "html5lib") +itemsDivObj = bsObj.findAll("div",{"class":re.compile("^(item)((?!:).)*$")}) +for obj in itemsDivObj: + videosObjs = obj.findAll("a",{"href":re.compile("^(http://www.yaoshe1.com/videos/)((?!:).)*$")}) + # print("==================") + strHref = videosObjs[0].attrs["href"] + # print strHref + count = 0 + while count < 1: + count = count+1 + print("url = " + strHref) + opeVideoUrl(strHref) + + + + diff --git a/web_crawle/book/demo1.py b/web_crawle/book/demo1.py new file mode 100644 index 0000000..74239fa --- /dev/null +++ b/web_crawle/book/demo1.py @@ -0,0 +1,94 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +# @Filename: demo1 +# @Date : 2017-09-24 11:32 +# @Author : zzl +# from http://blog.csdn.net/actanble/article/details/52347458 + +""" + python_version 3.4 +""" +import re +import urllib.request as request +from bs4 import BeautifulSoup +import requests + +'''全局变量声明, 下载其它小说请注意修改 [下载到的本地目录, 书号, 起始index号]''' +downLoadFile = 'G:\\github.com\\python\\web_crawle\\book\\' ##要下载到的目录 +shuhao = '2_2970' ## 书号就是http://www.biquge.com/2_2970/2456497.html; com后面的那个。 +start, end = 2456497,100000 + +def setSrr(url): + if(requests.get(url).status_code == 404): + print('这是个错误网址') + return [] + print ('正在打开 ',url) + + l = [] + '''''请求响应和不响应的处理''' + response = request.urlopen(url) + + html = response.read() + soup = BeautifulSoup(html,"html5lib") + item = soup.findAll('h1') + title = re.match(r'(.*)

(.*)

(.*)', str(item) ,re.M|re.I).group(2) + l.append(title.split(' ')[0]) + l.append(title) + strings = soup.findAll('div', id="content")[0]; + for string in strings: + st = string.__str__() + if (len(st.split('
'))> 1): + pass + else: + l.append(st) + return l +#strings.split() + +#穿入字符串 写入文件;标题为l[0] +def setDoc(l): + if(len(l) < 2): + return + file_s = downLoadFile + l[0] + '.txt' + file = open(file_s, 'w+', encoding='utf-8') + for i in l: + file.write('\t') + for ii in i.split(' '): + file.write(ii) + file.write('\n') + +#开始自加数值;读取新文档;如果没有;那么跳过 +''''' 最开始设置为1066142,100 ''' +def setNum(num,n): + l = [(num + i) for i in range(n)] + sl = [str(l[i]) for i in range(len(l))] + return sl + +'''''自动产生新的url''' + +''''' 自己观察到: 第一章的地址http://www.biquge.com/2_2970/2456497.html +最后一张的地址 http://www.biquge.com/2_2970/3230837.html''' +def setNewUrl(sl): + urls = [] + for x in sl: + xsr = 'http://www.biquge.com/'+ shuhao +'/'+ x +'.html' #对应的单章html + urls.append(xsr) + return urls + + + +def setTxts(urls): + for url in urls: + setDoc(setSrr(url)) + +print( +''''' +-------------- +开始下载超品相师 +-------------- +——actanble 手打—— +如果要下载其他的txt文件: 请修改—— +URL 和 对应的起始html的index号。 +''' +) +setTxts(setNewUrl(setNum(start, end))) + diff --git a/web_crawle/book/demo2.py b/web_crawle/book/demo2.py new file mode 100644 index 0000000..6ee3783 --- /dev/null +++ b/web_crawle/book/demo2.py @@ -0,0 +1,134 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +# @Filename: demo2 +# @Date : 2017-09-24 11:56 +# @Author : zzl +#from http://blog.csdn.net/cellurs/article/details/69367635s +""" + python_version 3.4 +""" +import time,os,traceback,random +import requests,re +from bs4 import BeautifulSoup + +#define +Agent =['Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1' + ,'Opera/9.27 (Windows NT 5.2; U; zh-cn)' + ,'Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0' + ] +def ProcName(name):#清洗目录名 + pat = r'[<|>|/|\|||:|"|*|?]+|(提示:已启用缓存技术,最新章节可能会延时显示,登录书架即可实时查看。)' + pat = re.compile(pat) + return pat.sub('',name) + +def log(url): + path = "\home\Rullec\log.txt" + with open(path,'w') as f: + f.write(str(url)) + f.close() + +def GetHtmlText(url):#获得HTML页面内容 此处可以增加proxies代理服务器,只不过目前还没有 + try: + r = requests.get(url,headers={'User-Agent':Agent[random.randint(0,Agent.__len__()-1)]},timeout = 20) + r.raise_for_status() + r.encoding = 'utf-8' + return r.text + except requests.exceptions.ReadTimeout|requests.exceptions.ConnectTimeout: + traceback.print_exc() #如果出现超时错误 + #log(url) + waittime = random.randint(10, 20) + print("出现超时错误!等待"+str(waittime)+"秒!\n") + time.sleep(waittime) + return None + +def FindIndex(name):#获得目标小说目录页 + url = "http://zhannei.baidu.com/cse/search?&s=287293036948159515&q="+str(name)+"&click="+str(random.randint(1,3))+'&nsid=' + text = GetHtmlText(url) + if text==None: + print("文本为空,无法解析") + return None + soup = BeautifulSoup(text.encode('utf-8'),'html.parser') + list = soup.find_all(name = 'a',attrs = {"cpos" :"title","title":name})#一个list + url = [] + for i in list: + url.append(i["href"]) + return url + +def ProcTxt(text): + text = text[9:] + pat = r' ' + pat = re.compile(pat) + return pat.sub("\n",text) + +def Write(storpath,tag):#进行小说的存储 + if None==tag: + print('小说写入失败,原因是小说最后一层超链接无法获取') + return 1 + a = tag.a#标签的属性使用tag['title']来获得,标签下的搜索使用tag.children来实现 + storpath += "/"+ProcName(a.string)+".txt" + while os.path.exists(storpath): + if time.time()-os.path.getctime(storpath)<100 : + newpath = storpath.split('.') + storpath = newpath[0] + "#.txt" + else: + return 1 + url = 'http://www.biquge.com/'+a['href'] + text = GetHtmlText(url) + if or len(text)==0: + print("最后一层文本获取失败!") + return 1 + soup = BeautifulSoup(text.encode('utf-8'),'html5lib') + novel = soup.find_all("div",attrs={"id":"content"}) + text = novel[0].text + #开始写入 + with open(storpath,'w',encoding='utf-8') as f: + text = ProcTxt(str(text)) + f.write(text) + f.close() + return 0 + +def Spider(url,path):#爬取小说目录页 + text = "" + #while len(text) or + text = GetHtmlText(url) + num = 0 + soup = BeautifulSoup(text,'html5lib') + list = soup.find_all(["dd","dt"]) + nowpath = "" + flag = 0 + for i in list: + if i.name == "dt": + nowpath = path+"/"+ProcName(i.string) + if os.path.exists(nowpath): + pass + else: + os.mkdir(nowpath) + else: + flag = Write(nowpath,i) + num+=1 + if num%10==0 and (not flag): + time.sleep(random.randint(3,10))#爬虫每爬几个就休眠 + print("\r当前进度: {:.2f}%".format(num * 100 / len(list)), end="") + return "" + +def main():#主函数 + name = ProcName(input("请输入要爬取的小说的名字:")) + url = FindIndex(name)#爬取搜索结果,在其中查找目录页,并且返回 + if 0==len(url): + print("查无此小说") + exit() + path = "E:\小说\\" + #path = "\home\Rullec\小说\\" + if os.path.exists(path): + pass + else: + os.mkdir(path) + path = path +name + if os.path.exists(path): + pass + else: + os.mkdir(path) + Spider(url[0],path) +main() + + From fef16ff80927cc6aa00503f1e9db281a43fa3bcf Mon Sep 17 00:00:00 2001 From: zzl Date: 2017年10月12日 20:04:38 +0800 Subject: [PATCH 5/8] 222 --- WebScrapingWithPython/yaoshe1/1.log | 43 -------------------------- WebScrapingWithPython/yaoshe1/index.py | 41 ++++++++++++++---------- web_crawle/book/file.py | 8 +++++ 3 files changed, 33 insertions(+), 59 deletions(-) delete mode 100644 WebScrapingWithPython/yaoshe1/1.log create mode 100644 web_crawle/book/file.py diff --git a/WebScrapingWithPython/yaoshe1/1.log b/WebScrapingWithPython/yaoshe1/1.log deleted file mode 100644 index 62731fa..0000000 --- a/WebScrapingWithPython/yaoshe1/1.log +++ /dev/null @@ -1,43 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/WebScrapingWithPython/yaoshe1/index.py b/WebScrapingWithPython/yaoshe1/index.py index 642b52e..17d1fa4 100644 --- a/WebScrapingWithPython/yaoshe1/index.py +++ b/WebScrapingWithPython/yaoshe1/index.py @@ -12,16 +12,20 @@ import requests # import urllib import re +import os -downLoadFile = 'H:\\happy\1円\\' ##要下载到的目录 +downLoadFile = 'H:\\happy\4円\\' ##要下载到的目录 def getFile(url): if(requests.get(url).status_code == 404): print('这是个错误网址') return [] - print ('正在打开 ',url) + #print ('正在打开 ',url) file_name = url.split('/')[-1] file_s = downLoadFile + file_name + if os.path.exists(file_s): + print("file exists = " + file_name) + return u = urlopen(url) # u = requests.urlopen(url) @@ -35,7 +39,7 @@ def getFile(url): f.write(buffer) f.close() - print "Sucessful to download" + " " + file_name + print("Sucessful to download = " + file_name) def getHtml(url): page = urlopen(url) @@ -54,30 +58,35 @@ def getUrl(html): def opeVideoUrl(url): html = urlopen(url).read() ss = html.replace(" ","") - urls = re.findall(r"(http://www.yaoshe1.com/get_file/.*?mp4).*?",ss,re.I) + urls = re.findall(r"(http://www.yaoshe2.com/get_file/.*?mp4).*?",ss,re.I) for i in urls: - print i - getFile(i); + print(i) + try: + getFile(i); + except Exception,e: + print e.message # else: # print 'this is over' # bsObj = BeautifulSoup(html, "html5lib") # print bsObj -html = urlopen("http://www.yaoshe1.com/") +html = urlopen("http://www.yaoshe2.com/") # print(html.read()) bsObj = BeautifulSoup(html, "html5lib") itemsDivObj = bsObj.findAll("div",{"class":re.compile("^(item)((?!:).)*$")}) +print "itemsDivObj div item = ",len(itemsDivObj) for obj in itemsDivObj: - videosObjs = obj.findAll("a",{"href":re.compile("^(http://www.yaoshe1.com/videos/)((?!:).)*$")}) - # print("==================") - strHref = videosObjs[0].attrs["href"] - # print strHref - count = 0 - while count < 1: - count = count+1 - print("url = " + strHref) - opeVideoUrl(strHref) + videosObjs = obj.findAll("a",{"href":re.compile("^(http://www.yaoshe2.com/videos/)((?!:).)*$")}) + print "videosObjs a videos = ",len(videosObjs) + if len(videosObjs) != 0: + strHref = videosObjs[0].attrs["href"] + # print strHref + count = 0 + while count < 1: + count = count+1 + print("url = " + strHref) + opeVideoUrl(strHref) diff --git a/web_crawle/book/file.py b/web_crawle/book/file.py new file mode 100644 index 0000000..ccdf2a5 --- /dev/null +++ b/web_crawle/book/file.py @@ -0,0 +1,8 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- +# @Filename: file +# @Date : 2017-09-25 09:35 +# @Author : zzl +""" + python_version 2.7.11 +""" \ No newline at end of file From 27fda965be1e0fbee1ef456eef25ed5f3ceeae6b Mon Sep 17 00:00:00 2001 From: zzl Date: 2017年10月16日 17:16:48 +0800 Subject: [PATCH 6/8] next page --- WebScrapingWithPython/yaoshe1/index.py | 49 +++++++++++++++++--------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/WebScrapingWithPython/yaoshe1/index.py b/WebScrapingWithPython/yaoshe1/index.py index 17d1fa4..b74622f 100644 --- a/WebScrapingWithPython/yaoshe1/index.py +++ b/WebScrapingWithPython/yaoshe1/index.py @@ -14,7 +14,7 @@ import re import os -downLoadFile = 'H:\\happy\4円\\' ##要下载到的目录 +downLoadFile = 'H:\\happy\5円\\' ##要下载到的目录 def getFile(url): if(requests.get(url).status_code == 404): @@ -70,23 +70,38 @@ def opeVideoUrl(url): # bsObj = BeautifulSoup(html, "html5lib") # print bsObj +def eachLatestUpdates(): + print "eachLatestUpdates" + while currPage < 10: + startOpenPage("http://www.yaoshe2.com/latest-updates/" + currPage + "/") + currPage = currPage +1 + + + + +def startOpenPage(url): + html = urlopen(url) + # print(html.read()) + bsObj = BeautifulSoup(html, "html5lib") + itemsDivObj = bsObj.findAll("div",{"class":re.compile("^(item)((?!:).)*$")}) + print "itemsDivObj div item = ",len(itemsDivObj) + for obj in itemsDivObj: + videosObjs = obj.findAll("a",{"href":re.compile("^(http://www.yaoshe2.com/videos/)((?!:).)*$")}) + print "videosObjs a videos = ",len(videosObjs) + if len(videosObjs) != 0: + strHref = videosObjs[0].attrs["href"] + # print strHref + count = 0 + while count < 1: + count = count+1 + print("url = " + strHref) + opeVideoUrl(strHref) + + +currPage = 2 +startOpenPage("http://www.yaoshe2.com/") +eachLatestUpdates() -html = urlopen("http://www.yaoshe2.com/") -# print(html.read()) -bsObj = BeautifulSoup(html, "html5lib") -itemsDivObj = bsObj.findAll("div",{"class":re.compile("^(item)((?!:).)*$")}) -print "itemsDivObj div item = ",len(itemsDivObj) -for obj in itemsDivObj: - videosObjs = obj.findAll("a",{"href":re.compile("^(http://www.yaoshe2.com/videos/)((?!:).)*$")}) - print "videosObjs a videos = ",len(videosObjs) - if len(videosObjs) != 0: - strHref = videosObjs[0].attrs["href"] - # print strHref - count = 0 - while count < 1: - count = count+1 - print("url = " + strHref) - opeVideoUrl(strHref) From 0f4147183f5e73b98a668b94efe510895555a5bd Mon Sep 17 00:00:00 2001 From: zzl Date: 2017年10月16日 19:37:39 +0800 Subject: [PATCH 7/8] =?UTF-8?q?=E8=BF=B7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebScrapingWithPython/yaoshe1/index.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/WebScrapingWithPython/yaoshe1/index.py b/WebScrapingWithPython/yaoshe1/index.py index b74622f..1bfc99c 100644 --- a/WebScrapingWithPython/yaoshe1/index.py +++ b/WebScrapingWithPython/yaoshe1/index.py @@ -16,6 +16,8 @@ downLoadFile = 'H:\\happy\5円\\' ##要下载到的目录 + + def getFile(url): if(requests.get(url).status_code == 404): print('这是个错误网址') @@ -71,10 +73,11 @@ def opeVideoUrl(url): # print bsObj def eachLatestUpdates(): + currPage = 2 print "eachLatestUpdates" while currPage < 10: - startOpenPage("http://www.yaoshe2.com/latest-updates/" + currPage + "/") - currPage = currPage +1 + startOpenPage("http://www.yaoshe2.com/latest-updates/" + str(currPage) + "/") + currPage = currPage + 1 @@ -98,7 +101,6 @@ def startOpenPage(url): opeVideoUrl(strHref) -currPage = 2 startOpenPage("http://www.yaoshe2.com/") eachLatestUpdates() From 268fc58f64e30361ae1fdb612f566addd06294f9 Mon Sep 17 00:00:00 2001 From: zzl Date: 2018年3月30日 12:00:29 +0800 Subject: [PATCH 8/8] test --- WebScrapingWithPython/yaoshe1/index.py | 27 +++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/WebScrapingWithPython/yaoshe1/index.py b/WebScrapingWithPython/yaoshe1/index.py index 1bfc99c..53368a4 100644 --- a/WebScrapingWithPython/yaoshe1/index.py +++ b/WebScrapingWithPython/yaoshe1/index.py @@ -5,6 +5,9 @@ # @Author : zzl """ python_version 2.7.11 + + import package: bs4,requests,html5lib + """ from urllib2 import urlopen # from urllib2 import open @@ -14,8 +17,8 @@ import re import os -downLoadFile = 'H:\\happy\5円\\' ##要下载到的目录 - +downLoadFile = 'D:\\happy\\yaoshe6' ##要下载到的目录 +isdebug = True def getFile(url): @@ -63,10 +66,10 @@ def opeVideoUrl(url): urls = re.findall(r"(http://www.yaoshe2.com/get_file/.*?mp4).*?",ss,re.I) for i in urls: print(i) - try: - getFile(i); - except Exception,e: - print e.message + # try: + getFile(i); + # except Exception,e: + # print e.message # else: # print 'this is over' # bsObj = BeautifulSoup(html, "html5lib") @@ -75,21 +78,23 @@ def opeVideoUrl(url): def eachLatestUpdates(): currPage = 2 print "eachLatestUpdates" - while currPage < 10: - startOpenPage("http://www.yaoshe2.com/latest-updates/" + str(currPage) + "/") + while currPage < 100: + startOpenPage("http://www.yaoshe6.com/latest-updates/" + str(currPage) + "/") currPage = currPage + 1 def startOpenPage(url): + print "base url ============== " + url html = urlopen(url) - # print(html.read()) + if isdebug == True: + print(html.read()) bsObj = BeautifulSoup(html, "html5lib") itemsDivObj = bsObj.findAll("div",{"class":re.compile("^(item)((?!:).)*$")}) print "itemsDivObj div item = ",len(itemsDivObj) for obj in itemsDivObj: - videosObjs = obj.findAll("a",{"href":re.compile("^(http://www.yaoshe2.com/videos/)((?!:).)*$")}) + videosObjs = obj.findAll("a",{"href":re.compile("^(http://www.yaoshe6.com/videos/)((?!:).)*$")}) print "videosObjs a videos = ",len(videosObjs) if len(videosObjs) != 0: strHref = videosObjs[0].attrs["href"] @@ -101,7 +106,7 @@ def startOpenPage(url): opeVideoUrl(strHref) -startOpenPage("http://www.yaoshe2.com/") +startOpenPage("http://www.yaoshe6.com/") eachLatestUpdates()

AltStyle によって変換されたページ (->オリジナル) /