From d7eeba208ef556b8e49c405e7027228a7fc8fe4d Mon Sep 17 00:00:00 2001
From: zhangzlg <zhangzhanliang@micro-view.com>
Date: 2017年9月15日 21:35:39 +0800
Subject: [PATCH 1/8] =?UTF-8?q?=E7=BD=91=E7=BB=9C=E7=88=AC=E8=99=AB?=
 =?UTF-8?q?=EF=BC=8C=E7=8E=B0=E5=9C=A8=E5=B7=B2=E7=BB=8F=E7=88=AC=E5=88=B0?=
 =?UTF-8?q?=E7=BD=91=E9=A1=B5=E5=8F=8A=E7=9B=B8=E5=85=B3MP4=20URL=EF=BC=8C?=
 =?UTF-8?q?=E4=B8=8B=E4=B8=80=E6=AD=A5=E9=9C=80=E8=A6=81=E5=B0=86=E6=96=87?=
 =?UTF-8?q?=E4=BB=B6=E4=B8=8B=E8=BD=BD=E4=B8=8B=E6=9D=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 WebScrapingWithPython/yaoshe1/1.log | 43 ++++++++++++++++++++++++
 WebScrapingWithPython/yaoshe1/index.py | 45 ++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 WebScrapingWithPython/yaoshe1/1.log
 create mode 100644 WebScrapingWithPython/yaoshe1/index.py
diff --git a/WebScrapingWithPython/yaoshe1/1.log b/WebScrapingWithPython/yaoshe1/1.log
new file mode 100644
index 0000000..62731fa
--- /dev/null
+++ b/WebScrapingWithPython/yaoshe1/1.log
@@ -0,0 +1,43 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/WebScrapingWithPython/yaoshe1/index.py b/WebScrapingWithPython/yaoshe1/index.py
new file mode 100644
index 0000000..e937465
--- /dev/null
+++ b/WebScrapingWithPython/yaoshe1/index.py
@@ -0,0 +1,45 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+# @Filename: index
+# @Date : 2017年09月14日 23:15
+# @Author : zzl
+"""
+ python_version 2.7.11
+"""
+from urllib2 import urlopen
+# from urllib2 import open
+from bs4 import BeautifulSoup
+import urllib2
+import re
+
+def opeVideoUrl(url):
+ html = urlopen(url).read()
+ ss = html.replace(" ","")
+ urls = re.findall(r"(http://www.yaoshe1.com/get_file/.*?mp4).*?",ss,re.I)
+ for i in urls:
+ print i
+ # urllib2.urlretrieve(i, "%s.mp4" % (i, ))
+ # else:
+ 
+ # print 'this is over'
+ # bsObj = BeautifulSoup(html, "html5lib")
+ # print bsObj
+
+
+html = urlopen("http://www.yaoshe1.com/")
+# print(html.read())
+bsObj = BeautifulSoup(html, "html5lib")
+itemsDivObj = bsObj.findAll("div",{"class":re.compile("^(item)((?!:).)*$")})
+for obj in itemsDivObj:
+ videosObjs = obj.findAll("a",{"href":re.compile("^(http://www.yaoshe1.com/videos/)((?!:).)*$")})
+ # print("==================")
+ strHref = videosObjs[0].attrs["href"]
+ # print strHref
+ count = 0
+ while count < 1:
+ count = count+1
+ opeVideoUrl(strHref)
+
+
+
+
From 6e562cec93866506b3547f34886baa6b5080475e Mon Sep 17 00:00:00 2001
From: zhangzlg <zhangzhanliang@micro-view.com>
Date: 2017年9月15日 21:36:40 +0800
Subject: [PATCH 2/8] test
---
 WebScrapingWithPython/python-scraping | 1 +
 WebScrapingWithPython/remark | 28 +++++++++++++++++++++++++++
 2 files changed, 29 insertions(+)
 create mode 160000 WebScrapingWithPython/python-scraping
diff --git a/WebScrapingWithPython/python-scraping b/WebScrapingWithPython/python-scraping
new file mode 160000
index 0000000..cec78b7
--- /dev/null
+++ b/WebScrapingWithPython/python-scraping
@@ -0,0 +1 @@
+Subproject commit cec78b7cadf16f45249c443248f918d6efaee6d3
diff --git a/WebScrapingWithPython/remark b/WebScrapingWithPython/remark
index a2051d1..6df378d 100644
--- a/WebScrapingWithPython/remark
+++ b/WebScrapingWithPython/remark
@@ -62,3 +62,31 @@ Tesseract 文档: https://github.com/tesseract-ocr/tesseract/wiki
 
 
 EditThis cookie http://www.editthiscookie.com
+
+
+分布式计算 distributed computing
+Tor代理服务器
+ 
+PySocks python 代理服务器通信模埠 https://pypi.python.org/pypi/PySocks
+
+搜索引擎优化: Search Engine Optimization,SEO
+ robots.txt
+
+机器人排除标准: Robots Exclusion Standard robots.txt
+
+
+#
+# robots.txt for PHPWIND BOARD
+# Version 5.x
+#
+
+User-agent: *
+Disallow: /admin/
+Disallow: /require/
+Disallow: /hack/
+Disallow: /attachment/
+Disallow: /images/
+Disallow: /data/
+Disallow: /ipdata/
+Disallow: /template/
+
From ad70ac10f528c9cb8ca69649a2fac4b431fc7f0c Mon Sep 17 00:00:00 2001
From: zzl <zzl@1noc.cn>
Date: 2017年9月24日 11:11:23 +0800
Subject: [PATCH 3/8] ok
---
 WebScrapingWithPython/yaoshe1/index.py | 35 +++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)
diff --git a/WebScrapingWithPython/yaoshe1/index.py b/WebScrapingWithPython/yaoshe1/index.py
index e937465..c7a835c 100644
--- a/WebScrapingWithPython/yaoshe1/index.py
+++ b/WebScrapingWithPython/yaoshe1/index.py
@@ -9,18 +9,46 @@
 from urllib2 import urlopen
 # from urllib2 import open
 from bs4 import BeautifulSoup
-import urllib2
+# import urllib
 import re
 
+def getFile(url):
+ file_name = url.split('/')[-1]
+ u = urlopen(url)
+ f = open(file_name, 'wb')
+
+ block_sz = 8192
+ while True:
+ buffer = u.read(block_sz)
+ if not buffer:
+ break
+
+ f.write(buffer)
+ f.close()
+ print "Sucessful to download" + " " + file_name
+
+def getHtml(url):
+ page = urllib2.urlopen(url)
+ html = page.read()
+ page.close()
+ return html
+
+# compile the regular expressions and find
+# all stuff we need
+def getUrl(html):
+ reg = r'(?:href|HREF)="?((?:http://)?.+?\.pdf)'
+ url_re = re.compile(reg)
+ url_lst = re.findall(url_re,html)
+ return(url_lst)
+
 def opeVideoUrl(url):
 html = urlopen(url).read()
 ss = html.replace(" ","")
 urls = re.findall(r"(http://www.yaoshe1.com/get_file/.*?mp4).*?",ss,re.I)
 for i in urls:
 print i
- # urllib2.urlretrieve(i, "%s.mp4" % (i, ))
+ getFile(i);
 # else:
- 
 # print 'this is over'
 # bsObj = BeautifulSoup(html, "html5lib")
 # print bsObj
@@ -38,6 +66,7 @@ def opeVideoUrl(url):
 count = 0
 while count < 1:
 count = count+1
+ print("url = " + strHref)
 opeVideoUrl(strHref)
 
 
From f243722bd9888f1f010c4059937c28ddf587b7ca Mon Sep 17 00:00:00 2001
From: zzl <zzl@1noc.cn>
Date: 2017年9月24日 18:25:53 +0800
Subject: [PATCH 4/8] =?UTF-8?q?=E6=8A=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 README.md | 3 +
 WebScrapingWithPython/yaoshe1/index.py | 14 ++-
 WebScrapingWithPython/yaoshe1/index2.py | 74 +++++++++++++
 web_crawle/book/demo1.py | 94 +++++++++++++++++
 web_crawle/book/demo2.py | 134 ++++++++++++++++++++++++
 5 files changed, 317 insertions(+), 2 deletions(-)
 create mode 100644 WebScrapingWithPython/yaoshe1/index2.py
 create mode 100644 web_crawle/book/demo1.py
 create mode 100644 web_crawle/book/demo2.py
diff --git a/README.md b/README.md
index 8df0858..9540b6e 100644
--- a/README.md
+++ b/README.md
@@ -16,3 +16,6 @@
 **技术选型**
 - python version 2.7.11
 - python version 3.4.0
+
+
+http://ns.giit.us/htm_data/7/1706/2470604.html
\ No newline at end of file
diff --git a/WebScrapingWithPython/yaoshe1/index.py b/WebScrapingWithPython/yaoshe1/index.py
index c7a835c..642b52e 100644
--- a/WebScrapingWithPython/yaoshe1/index.py
+++ b/WebScrapingWithPython/yaoshe1/index.py
@@ -9,13 +9,23 @@
 from urllib2 import urlopen
 # from urllib2 import open
 from bs4 import BeautifulSoup
+import requests
 # import urllib
 import re
 
+downLoadFile = 'H:\\happy\1円\\' ##要下载到的目录
+
 def getFile(url):
+ if(requests.get(url).status_code == 404):
+ print('这是个错误网址')
+ return []
+ print ('正在打开 ',url)
 file_name = url.split('/')[-1]
+ file_s = downLoadFile + file_name
 u = urlopen(url)
- f = open(file_name, 'wb')
+ # u = requests.urlopen(url)
+
+ f = open(file_s, 'wb')
 
 block_sz = 8192
 while True:
@@ -28,7 +38,7 @@ def getFile(url):
 print "Sucessful to download" + " " + file_name
 
 def getHtml(url):
- page = urllib2.urlopen(url)
+ page = urlopen(url)
 html = page.read()
 page.close()
 return html
diff --git a/WebScrapingWithPython/yaoshe1/index2.py b/WebScrapingWithPython/yaoshe1/index2.py
new file mode 100644
index 0000000..2d4c4b3
--- /dev/null
+++ b/WebScrapingWithPython/yaoshe1/index2.py
@@ -0,0 +1,74 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+# @Filename: index
+# @Date : 2017年09月14日 23:15
+# @Author : zzl
+"""
+ python_version 2.7.11
+"""
+from urllib2 import urlopen
+# from urllib2 import open
+from bs4 import BeautifulSoup
+# import urllib
+import re
+
+def getFile(url):
+ file_name = url.split('/')[-1]
+ u = urlopen(url)
+ f = open(file_name, 'wb')
+
+ block_sz = 8192
+ while True:
+ buffer = u.read(block_sz)
+ if not buffer:
+ break
+
+ f.write(buffer)
+ f.close()
+ print "Sucessful to download" + " " + file_name
+
+def getHtml(url):
+ page = urlopen(url)
+ html = page.read()
+ page.close()
+ return html
+
+# compile the regular expressions and find
+# all stuff we need
+def getUrl(html):
+ reg = r'(?:href|HREF)="?((?:http://)?.+?\.pdf)'
+ url_re = re.compile(reg)
+ url_lst = re.findall(url_re,html)
+ return(url_lst)
+
+def opeVideoUrl(url):
+ html = urlopen(url).read()
+ ss = html.replace(" ","")
+ urls = re.findall(r"(http://www.yaoshe1.com/get_file/.*?mp4).*?",ss,re.I)
+ for i in urls:
+ print i
+ getFile(i);
+ # else:
+ # print 'this is over'
+ # bsObj = BeautifulSoup(html, "html5lib")
+ # print bsObj
+
+
+html = urlopen("http://www.yaoshe1.com/")
+# print(html.read())
+bsObj = BeautifulSoup(html, "html5lib")
+itemsDivObj = bsObj.findAll("div",{"class":re.compile("^(item)((?!:).)*$")})
+for obj in itemsDivObj:
+ videosObjs = obj.findAll("a",{"href":re.compile("^(http://www.yaoshe1.com/videos/)((?!:).)*$")})
+ # print("==================")
+ strHref = videosObjs[0].attrs["href"]
+ # print strHref
+ count = 0
+ while count < 1:
+ count = count+1
+ print("url = " + strHref)
+ opeVideoUrl(strHref)
+
+
+
+
diff --git a/web_crawle/book/demo1.py b/web_crawle/book/demo1.py
new file mode 100644
index 0000000..74239fa
--- /dev/null
+++ b/web_crawle/book/demo1.py
@@ -0,0 +1,94 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+# @Filename: demo1
+# @Date : 2017-09-24 11:32
+# @Author : zzl
+# from http://blog.csdn.net/actanble/article/details/52347458
+
+"""
+ python_version 3.4
+"""
+import re
+import urllib.request as request
+from bs4 import BeautifulSoup
+import requests
+
+'''全局变量声明, 下载其它小说请注意修改 [下载到的本地目录, 书号, 起始index号]'''
+downLoadFile = 'G:\\github.com\\python\\web_crawle\\book\\' ##要下载到的目录
+shuhao = '2_2970' ## 书号就是http://www.biquge.com/2_2970/2456497.html; com后面的那个。
+start, end = 2456497,100000
+
+def setSrr(url):
+ if(requests.get(url).status_code == 404):
+ print('这是个错误网址')
+ return []
+ print ('正在打开 ',url)
+
+ l = []
+ '''''请求响应和不响应的处理'''
+ response = request.urlopen(url)
+
+ html = response.read()
+ soup = BeautifulSoup(html,"html5lib")
+ item = soup.findAll('h1')
+ title = re.match(r'(.*)<h1> (.*)</h1>(.*)', str(item) ,re.M|re.I).group(2)
+ l.append(title.split(' ')[0]) 
+ l.append(title) 
+ strings = soup.findAll('div', id="content")[0]; 
+ for string in strings: 
+ st = string.__str__() 
+ if (len(st.split('<br/>'))> 1): 
+ pass 
+ else: 
+ l.append(st) 
+ return l 
+#strings.split() 
+ 
+#穿入字符串 写入文件;标题为l[0] 
+def setDoc(l): 
+ if(len(l) < 2):
+ return
+ file_s = downLoadFile + l[0] + '.txt'
+ file = open(file_s, 'w+', encoding='utf-8')
+ for i in l:
+ file.write('\t')
+ for ii in i.split(' '):
+ file.write(ii)
+ file.write('\n')
+
+#开始自加数值;读取新文档;如果没有;那么跳过
+''''' 最开始设置为1066142,100 '''
+def setNum(num,n):
+ l = [(num + i) for i in range(n)]
+ sl = [str(l[i]) for i in range(len(l))]
+ return sl
+
+'''''自动产生新的url'''
+
+''''' 自己观察到: 第一章的地址http://www.biquge.com/2_2970/2456497.html
+最后一张的地址 http://www.biquge.com/2_2970/3230837.html'''
+def setNewUrl(sl):
+ urls = []
+ for x in sl:
+ xsr = 'http://www.biquge.com/'+ shuhao +'/'+ x +'.html' #对应的单章html
+ urls.append(xsr)
+ return urls
+
+
+
+def setTxts(urls):
+ for url in urls:
+ setDoc(setSrr(url))
+
+print(
+'''''
+--------------
+开始下载超品相师
+--------------
+——actanble 手打——
+如果要下载其他的txt文件: 请修改——
+URL 和 对应的起始html的index号。
+'''
+)
+setTxts(setNewUrl(setNum(start, end)))
+
diff --git a/web_crawle/book/demo2.py b/web_crawle/book/demo2.py
new file mode 100644
index 0000000..6ee3783
--- /dev/null
+++ b/web_crawle/book/demo2.py
@@ -0,0 +1,134 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+# @Filename: demo2
+# @Date : 2017-09-24 11:56
+# @Author : zzl
+#from http://blog.csdn.net/cellurs/article/details/69367635s
+"""
+ python_version 3.4
+"""
+import time,os,traceback,random
+import requests,re
+from bs4 import BeautifulSoup
+
+#define
+Agent =['Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1'
+ ,'Opera/9.27 (Windows NT 5.2; U; zh-cn)'
+ ,'Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0'
+ ]
+def ProcName(name):#清洗目录名
+ pat = r'[<|>|/|\|||:|"|*|?]+|(提示:已启用缓存技术,最新章节可能会延时显示,登录书架即可实时查看。)'
+ pat = re.compile(pat)
+ return pat.sub('',name)
+
+def log(url):
+ path = "\home\Rullec\log.txt"
+ with open(path,'w') as f:
+ f.write(str(url))
+ f.close()
+
+def GetHtmlText(url):#获得HTML页面内容 此处可以增加proxies代理服务器,只不过目前还没有
+ try:
+ r = requests.get(url,headers={'User-Agent':Agent[random.randint(0,Agent.__len__()-1)]},timeout = 20)
+ r.raise_for_status()
+ r.encoding = 'utf-8'
+ return r.text
+ except requests.exceptions.ReadTimeout|requests.exceptions.ConnectTimeout:
+ traceback.print_exc() #如果出现超时错误
+ #log(url)
+ waittime = random.randint(10, 20)
+ print("出现超时错误!等待"+str(waittime)+"秒!\n")
+ time.sleep(waittime)
+ return None
+
+def FindIndex(name):#获得目标小说目录页
+ url = "http://zhannei.baidu.com/cse/search?&s=287293036948159515&q="+str(name)+"&click="+str(random.randint(1,3))+'&nsid='
+ text = GetHtmlText(url)
+ if text==None:
+ print("文本为空,无法解析")
+ return None
+ soup = BeautifulSoup(text.encode('utf-8'),'html.parser')
+ list = soup.find_all(name = 'a',attrs = {"cpos" :"title","title":name})#一个list
+ url = []
+ for i in list:
+ url.append(i["href"])
+ return url
+
+def ProcTxt(text):
+ text = text[9:]
+ pat = r'  '
+ pat = re.compile(pat)
+ return pat.sub("\n",text)
+
+def Write(storpath,tag):#进行小说的存储
+ if None==tag:
+ print('小说写入失败,原因是小说最后一层超链接无法获取')
+ return 1
+ a = tag.a#标签的属性使用tag['title']来获得,标签下的搜索使用tag.children来实现
+ storpath += "/"+ProcName(a.string)+".txt"
+ while os.path.exists(storpath):
+ if time.time()-os.path.getctime(storpath)<100 :
+ newpath = storpath.split('.')
+ storpath = newpath[0] + "#.txt"
+ else:
+ return 1
+ url = 'http://www.biquge.com/'+a['href']
+ text = GetHtmlText(url)
+ if or len(text)==0:
+ print("最后一层文本获取失败!")
+ return 1
+ soup = BeautifulSoup(text.encode('utf-8'),'html5lib')
+ novel = soup.find_all("div",attrs={"id":"content"})
+ text = novel[0].text
+ #开始写入
+ with open(storpath,'w',encoding='utf-8') as f:
+ text = ProcTxt(str(text))
+ f.write(text)
+ f.close()
+ return 0
+
+def Spider(url,path):#爬取小说目录页
+ text = ""
+ #while len(text) or
+ text = GetHtmlText(url)
+ num = 0
+ soup = BeautifulSoup(text,'html5lib')
+ list = soup.find_all(["dd","dt"])
+ nowpath = ""
+ flag = 0
+ for i in list:
+ if i.name == "dt":
+ nowpath = path+"/"+ProcName(i.string)
+ if os.path.exists(nowpath):
+ pass
+ else:
+ os.mkdir(nowpath)
+ else:
+ flag = Write(nowpath,i)
+ num+=1
+ if num%10==0 and (not flag):
+ time.sleep(random.randint(3,10))#爬虫每爬几个就休眠
+ print("\r当前进度: {:.2f}%".format(num * 100 / len(list)), end="")
+ return ""
+
+def main():#主函数
+ name = ProcName(input("请输入要爬取的小说的名字:"))
+ url = FindIndex(name)#爬取搜索结果,在其中查找目录页,并且返回
+ if 0==len(url):
+ print("查无此小说")
+ exit()
+ path = "E:\小说\\"
+ #path = "\home\Rullec\小说\\"
+ if os.path.exists(path):
+ pass
+ else:
+ os.mkdir(path)
+ path = path +name
+ if os.path.exists(path):
+ pass
+ else:
+ os.mkdir(path)
+ Spider(url[0],path)
+main()
+
+
From fef16ff80927cc6aa00503f1e9db281a43fa3bcf Mon Sep 17 00:00:00 2001
From: zzl <zzl@1noc.cn>
Date: 2017年10月12日 20:04:38 +0800
Subject: [PATCH 5/8] 222
---
 WebScrapingWithPython/yaoshe1/1.log | 43 --------------------------
 WebScrapingWithPython/yaoshe1/index.py | 41 ++++++++++++++----------
 web_crawle/book/file.py | 8 +++++
 3 files changed, 33 insertions(+), 59 deletions(-)
 delete mode 100644 WebScrapingWithPython/yaoshe1/1.log
 create mode 100644 web_crawle/book/file.py
diff --git a/WebScrapingWithPython/yaoshe1/1.log b/WebScrapingWithPython/yaoshe1/1.log
deleted file mode 100644
index 62731fa..0000000
--- a/WebScrapingWithPython/yaoshe1/1.log
+++ /dev/null
@@ -1,43 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/WebScrapingWithPython/yaoshe1/index.py b/WebScrapingWithPython/yaoshe1/index.py
index 642b52e..17d1fa4 100644
--- a/WebScrapingWithPython/yaoshe1/index.py
+++ b/WebScrapingWithPython/yaoshe1/index.py
@@ -12,16 +12,20 @@
 import requests
 # import urllib
 import re
+import os
 
-downLoadFile = 'H:\\happy\1円\\' ##要下载到的目录
+downLoadFile = 'H:\\happy\4円\\' ##要下载到的目录
 
 def getFile(url):
 if(requests.get(url).status_code == 404):
 print('这是个错误网址')
 return []
- print ('正在打开 ',url)
+ #print ('正在打开 ',url)
 file_name = url.split('/')[-1]
 file_s = downLoadFile + file_name
+ if os.path.exists(file_s):
+ print("file exists = " + file_name)
+ return
 u = urlopen(url)
 # u = requests.urlopen(url)
 
@@ -35,7 +39,7 @@ def getFile(url):
 
 f.write(buffer)
 f.close()
- print "Sucessful to download" + " " + file_name
+ print("Sucessful to download = " + file_name)
 
 def getHtml(url):
 page = urlopen(url)
@@ -54,30 +58,35 @@ def getUrl(html):
 def opeVideoUrl(url):
 html = urlopen(url).read()
 ss = html.replace(" ","")
- urls = re.findall(r"(http://www.yaoshe1.com/get_file/.*?mp4).*?",ss,re.I)
+ urls = re.findall(r"(http://www.yaoshe2.com/get_file/.*?mp4).*?",ss,re.I)
 for i in urls:
- print i
- getFile(i);
+ print(i)
+ try:
+ 	getFile(i);
+ except Exception,e:
+ print e.message
 # else:
 # print 'this is over'
 # bsObj = BeautifulSoup(html, "html5lib")
 # print bsObj
 
 
-html = urlopen("http://www.yaoshe1.com/")
+html = urlopen("http://www.yaoshe2.com/")
 # print(html.read())
 bsObj = BeautifulSoup(html, "html5lib")
 itemsDivObj = bsObj.findAll("div",{"class":re.compile("^(item)((?!:).)*$")})
+print "itemsDivObj div item = ",len(itemsDivObj)
 for obj in itemsDivObj:
- videosObjs = obj.findAll("a",{"href":re.compile("^(http://www.yaoshe1.com/videos/)((?!:).)*$")})
- # print("==================")
- strHref = videosObjs[0].attrs["href"]
- # print strHref
- count = 0
- while count < 1:
- count = count+1
- print("url = " + strHref)
- opeVideoUrl(strHref)
+ videosObjs = obj.findAll("a",{"href":re.compile("^(http://www.yaoshe2.com/videos/)((?!:).)*$")})
+ print "videosObjs a videos = ",len(videosObjs)
+ if len(videosObjs) != 0:
+ strHref = videosObjs[0].attrs["href"]
+ # print strHref
+ count = 0
+ while count < 1:
+ count = count+1
+ print("url = " + strHref)
+ opeVideoUrl(strHref)
 
 
 
diff --git a/web_crawle/book/file.py b/web_crawle/book/file.py
new file mode 100644
index 0000000..ccdf2a5
--- /dev/null
+++ b/web_crawle/book/file.py
@@ -0,0 +1,8 @@
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+# @Filename: file
+# @Date : 2017-09-25 09:35
+# @Author : zzl
+"""
+ python_version 2.7.11
+"""
\ No newline at end of file
From 27fda965be1e0fbee1ef456eef25ed5f3ceeae6b Mon Sep 17 00:00:00 2001
From: zzl <zzl@1noc.cn>
Date: 2017年10月16日 17:16:48 +0800
Subject: [PATCH 6/8] next page
---
 WebScrapingWithPython/yaoshe1/index.py | 49 +++++++++++++++++---------
 1 file changed, 32 insertions(+), 17 deletions(-)
diff --git a/WebScrapingWithPython/yaoshe1/index.py b/WebScrapingWithPython/yaoshe1/index.py
index 17d1fa4..b74622f 100644
--- a/WebScrapingWithPython/yaoshe1/index.py
+++ b/WebScrapingWithPython/yaoshe1/index.py
@@ -14,7 +14,7 @@
 import re
 import os
 
-downLoadFile = 'H:\\happy\4円\\' ##要下载到的目录
+downLoadFile = 'H:\\happy\5円\\' ##要下载到的目录
 
 def getFile(url):
 if(requests.get(url).status_code == 404):
@@ -70,23 +70,38 @@ def opeVideoUrl(url):
 # bsObj = BeautifulSoup(html, "html5lib")
 # print bsObj
 
+def eachLatestUpdates():
+ print "eachLatestUpdates"
+ while currPage < 10:
+ startOpenPage("http://www.yaoshe2.com/latest-updates/" + currPage + "/")
+ currPage = currPage +1
+
+
+
+
+def startOpenPage(url):
+ html = urlopen(url)
+ # print(html.read())
+ bsObj = BeautifulSoup(html, "html5lib")
+ itemsDivObj = bsObj.findAll("div",{"class":re.compile("^(item)((?!:).)*$")})
+ print "itemsDivObj div item = ",len(itemsDivObj)
+ for obj in itemsDivObj:
+ videosObjs = obj.findAll("a",{"href":re.compile("^(http://www.yaoshe2.com/videos/)((?!:).)*$")})
+ print "videosObjs a videos = ",len(videosObjs)
+ if len(videosObjs) != 0:
+ strHref = videosObjs[0].attrs["href"]
+ # print strHref
+ count = 0
+ while count < 1:
+ count = count+1
+ print("url = " + strHref)
+ opeVideoUrl(strHref)
+
+
+currPage = 2
+startOpenPage("http://www.yaoshe2.com/")
+eachLatestUpdates()
 
-html = urlopen("http://www.yaoshe2.com/")
-# print(html.read())
-bsObj = BeautifulSoup(html, "html5lib")
-itemsDivObj = bsObj.findAll("div",{"class":re.compile("^(item)((?!:).)*$")})
-print "itemsDivObj div item = ",len(itemsDivObj)
-for obj in itemsDivObj:
- videosObjs = obj.findAll("a",{"href":re.compile("^(http://www.yaoshe2.com/videos/)((?!:).)*$")})
- print "videosObjs a videos = ",len(videosObjs)
- if len(videosObjs) != 0:
- strHref = videosObjs[0].attrs["href"]
- # print strHref
- count = 0
- while count < 1:
- count = count+1
- print("url = " + strHref)
- opeVideoUrl(strHref)
 
 
 
From 0f4147183f5e73b98a668b94efe510895555a5bd Mon Sep 17 00:00:00 2001
From: zzl <zzl@1noc.cn>
Date: 2017年10月16日 19:37:39 +0800
Subject: [PATCH 7/8] =?UTF-8?q?=E8=BF=B7?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 WebScrapingWithPython/yaoshe1/index.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/WebScrapingWithPython/yaoshe1/index.py b/WebScrapingWithPython/yaoshe1/index.py
index b74622f..1bfc99c 100644
--- a/WebScrapingWithPython/yaoshe1/index.py
+++ b/WebScrapingWithPython/yaoshe1/index.py
@@ -16,6 +16,8 @@
 
 downLoadFile = 'H:\\happy\5円\\' ##要下载到的目录
 
+
+
 def getFile(url):
 if(requests.get(url).status_code == 404):
 print('这是个错误网址')
@@ -71,10 +73,11 @@ def opeVideoUrl(url):
 # print bsObj
 
 def eachLatestUpdates():
+ currPage = 2
 print "eachLatestUpdates"
 while currPage < 10:
- startOpenPage("http://www.yaoshe2.com/latest-updates/" + currPage + "/")
- currPage = currPage +1
+ startOpenPage("http://www.yaoshe2.com/latest-updates/" + str(currPage) + "/")
+ currPage = currPage + 1
 
 
 
@@ -98,7 +101,6 @@ def startOpenPage(url):
 opeVideoUrl(strHref)
 
 
-currPage = 2
 startOpenPage("http://www.yaoshe2.com/")
 eachLatestUpdates()
 
From 268fc58f64e30361ae1fdb612f566addd06294f9 Mon Sep 17 00:00:00 2001
From: zzl <zzl@1noc.cn>
Date: 2018年3月30日 12:00:29 +0800
Subject: [PATCH 8/8] test
---
 WebScrapingWithPython/yaoshe1/index.py | 27 +++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)
diff --git a/WebScrapingWithPython/yaoshe1/index.py b/WebScrapingWithPython/yaoshe1/index.py
index 1bfc99c..53368a4 100644
--- a/WebScrapingWithPython/yaoshe1/index.py
+++ b/WebScrapingWithPython/yaoshe1/index.py
@@ -5,6 +5,9 @@
 # @Author : zzl
 """
 python_version 2.7.11
+
+ import package: bs4,requests,html5lib
+
 """
 from urllib2 import urlopen
 # from urllib2 import open
@@ -14,8 +17,8 @@
 import re
 import os
 
-downLoadFile = 'H:\\happy\5円\\' ##要下载到的目录
-
+downLoadFile = 'D:\\happy\\yaoshe6' ##要下载到的目录
+isdebug = True
 
 
 def getFile(url):
@@ -63,10 +66,10 @@ def opeVideoUrl(url):
 urls = re.findall(r"(http://www.yaoshe2.com/get_file/.*?mp4).*?",ss,re.I)
 for i in urls:
 print(i)
- try:
- 	getFile(i);
- except Exception,e:
- print e.message
+ # try:
+ getFile(i);
+ # except Exception,e:
+ # print e.message
 # else:
 # print 'this is over'
 # bsObj = BeautifulSoup(html, "html5lib")
@@ -75,21 +78,23 @@ def opeVideoUrl(url):
 def eachLatestUpdates():
 currPage = 2
 print "eachLatestUpdates"
- while currPage < 10:
- startOpenPage("http://www.yaoshe2.com/latest-updates/" + str(currPage) + "/")
+ while currPage < 100:
+ startOpenPage("http://www.yaoshe6.com/latest-updates/" + str(currPage) + "/")
 currPage = currPage + 1
 
 
 
 
 def startOpenPage(url):
+ print "base url ============== " + url
 html = urlopen(url)
- # print(html.read())
+ if isdebug == True:
+ print(html.read())
 bsObj = BeautifulSoup(html, "html5lib")
 itemsDivObj = bsObj.findAll("div",{"class":re.compile("^(item)((?!:).)*$")})
 print "itemsDivObj div item = ",len(itemsDivObj)
 for obj in itemsDivObj:
- videosObjs = obj.findAll("a",{"href":re.compile("^(http://www.yaoshe2.com/videos/)((?!:).)*$")})
+ videosObjs = obj.findAll("a",{"href":re.compile("^(http://www.yaoshe6.com/videos/)((?!:).)*$")})
 print "videosObjs a videos = ",len(videosObjs)
 if len(videosObjs) != 0:
 strHref = videosObjs[0].attrs["href"]
@@ -101,7 +106,7 @@ def startOpenPage(url):
 opeVideoUrl(strHref)
 
 
-startOpenPage("http://www.yaoshe2.com/")
+startOpenPage("http://www.yaoshe6.com/")
 eachLatestUpdates()
 
 
</div><div class="naked_ctrl">
<form action="/index.cgi/contrast" method="get" name="gate">
<p><a href="http://altstyle.alfasado.net">AltStyle</a> によって変換されたページ <a href="https://github.com/chier/python/compare/master...develop.patch">(-&gt;オリジナル)</a>
/ <label>アドレス: <input type="text" name="naked_post_url" value="https://github.com/chier/python/compare/master...develop.patch" size="22" /></label> <label>モード: <select name="naked_post_mode">
<option value="default">デフォルト</option>
<option value="speech">音声ブラウザ</option>
<option value="ruby">ルビ付き</option>
<option value="contrast" selected="selected">配色反転</option>
<option value="larger-text">文字拡大</option>
<option value="mobile">モバイル</option>
</select>
<input type="submit" value="表示" />
</p>
</form>
</div>