
Commit eb1e1e6

Commit message: 提交代码 ("Commit code")

1 parent: c104012
18 files changed: +382 −1 lines

‎fans/README.md

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ Python技术 official-account article code repository

 [PyAutoGUI, image uploads made easy!](https://github.com/JustDoPython/python-examples/tree/master/fans/imgupload): PyAutoGUI, image uploads made easy!

-
+[To buy a car, I scraped Dongchedi!](https://github.com/JustDoPython/python-examples/tree/master/fans/scrapydcd): To buy a car, I scraped Dongchedi!

‎fans/scrapydcd/dcd/dcd/__init__.py

Whitespace-only changes.

Four binary files (109 Bytes, 1.51 KB, 361 Bytes, 295 Bytes) not shown.
Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@

# -*- coding: utf-8 -*-
import time
from selenium import webdriver
from scrapy.http.response.html import HtmlResponse


class DcdDownloaderMiddleware(object):

    def __init__(self):
        # Launch the Chrome browser used for rendering
        options = webdriver.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')

        # Hard-coded path to a local chromedriver binary (Selenium 3 style)
        self.driver = webdriver.Chrome(executable_path=r"C:\drf2\drf2\chromedriver.exe", options=options)
        self.driver.maximize_window()

    # Override the process_request method
    def process_request(self, request, spider):
        print('request.url', request.url)
        self.driver.get(request.url)
        js = 'return document.body.scrollHeight;'
        height = 0
        if request.url != 'https://www.dongchedi.com/auto/library/x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x':
            # Keep scrolling until the page height stops growing,
            # so lazily loaded content is fully rendered
            while True:
                new_height = self.driver.execute_script(js)
                if new_height > height:
                    self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                    height = new_height
                    time.sleep(1)
                else:
                    print("The scrollbar has reached the bottom of the page!")
                    break
        source = self.driver.page_source
        # Create a response object and wrap the page content in it
        response = HtmlResponse(url=self.driver.current_url, body=source, request=request, encoding="utf-8")
        return response
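
For this downloader middleware to actually intercept requests, it has to be registered in the project's settings.py. A minimal sketch, assuming the class is the one exported from dcd/middlewares.py below (the module path of this first file is not shown in the capture):

# settings.py -- minimal sketch; 543 is the priority slot Scrapy's
# documentation conventionally uses for a custom downloader middleware
DOWNLOADER_MIDDLEWARES = {
    'dcd.middlewares.DcdDownloaderMiddleware': 543,
}

With this in place, Scrapy routes every request through process_request(), and the HtmlResponse returned there short-circuits the normal downloader.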

‎fans/scrapydcd/dcd/dcd/items.py

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class DcdItem(scrapy.Item):
    # Brand
    brand = scrapy.Field()
    # Model name
    name = scrapy.Field()
    # Rating
    score = scrapy.Field()
    # Highlights
    title = scrapy.Field()
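
The spider that fills DcdItem is not part of this capture, but a minimal sketch of the usual pattern looks like this; the spider name, parse logic, and every CSS selector below are illustrative assumptions, not the repository's actual code:

import scrapy
from dcd.items import DcdItem


class DcdSpider(scrapy.Spider):
    name = 'dcd'  # hypothetical spider name
    # The library URL matches the one special-cased in the middleware
    start_urls = ['https://www.dongchedi.com/auto/library/x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x']

    def parse(self, response):
        # Placeholder selectors -- Dongchedi's real markup will differ
        for car in response.css('div.car-card'):
            item = DcdItem()
            item['brand'] = car.css('.brand::text').get()
            item['name'] = car.css('.series-name::text').get()
            item['score'] = car.css('.score::text').get()
            item['title'] = car.css('.highlight::text').get()
            yield item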

‎fans/scrapydcd/dcd/dcd/middlewares.py

Lines changed: 125 additions & 0 deletions

@@ -0,0 +1,125 @@

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import time
from selenium import webdriver
from scrapy import signals
from scrapy.http.response.html import HtmlResponse

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class DcdSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class DcdDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def __init__(self):
        # Launch the Chrome browser used for rendering
        options = webdriver.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')

        # Hard-coded path to a local chromedriver binary (Selenium 3 style)
        self.driver = webdriver.Chrome(executable_path=r"C:\drf2\drf2\chromedriver.exe", options=options)
        self.driver.maximize_window()

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        print('request.url', request.url)
        self.driver.get(request.url)
        js = 'return document.body.scrollHeight;'
        height = 0
        if request.url != 'https://www.dongchedi.com/auto/library/x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x':
            # Scroll until the page height stops growing so lazily
            # loaded listings are fully rendered
            while True:
                new_height = self.driver.execute_script(js)
                if new_height > height:
                    self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                    height = new_height
                    time.sleep(1)
                else:
                    print("The scrollbar has reached the bottom of the page!")
                    break
        source = self.driver.page_source
        # Create a response object and wrap the page content in it
        response = HtmlResponse(url=self.driver.current_url, body=source, request=request, encoding="utf-8")
        return response

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
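
One gap worth noting: the middleware opens Chrome in __init__ but never quits it, so the browser outlives the crawl. A sketch of the usual fix (an assumed change, not part of this commit) is to subscribe to Scrapy's spider_closed signal as well:

from scrapy import signals


class DcdDownloaderMiddleware:
    # ... __init__, process_request, etc. as above ...

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        # Assumed addition: also listen for spider_closed
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def spider_closed(self, spider):
        # Release the Chrome instance when the crawl finishes
        self.driver.quit()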

‎fans/scrapydcd/dcd/dcd/pipelines.py

Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class DcdPipeline:
    def process_item(self, item, spider):
        return item
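
DcdPipeline is still the project-template stub and passes every item through unchanged. For reference, a sketch of a persisting variant that writes the four DcdItem fields to a local CSV file; the class name, filename, and field order are assumptions, not repository code:

import csv
from itemadapter import ItemAdapter


class DcdCsvPipeline:
    def open_spider(self, spider):
        # Hypothetical output file
        self.file = open('dcd_cars.csv', 'w', newline='', encoding='utf-8')
        self.writer = csv.writer(self.file)
        self.writer.writerow(['brand', 'name', 'score', 'title'])

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        self.writer.writerow([adapter.get('brand'), adapter.get('name'),
                              adapter.get('score'), adapter.get('title')])
        return item

A variant like this would also need an ITEM_PIPELINES entry in settings.py, e.g. {'dcd.pipelines.DcdCsvPipeline': 300}.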
