Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 4c50711

Browse files
authored
添加了 geonodedaili.py 爬取代理 (#186)
* add geonodedaili.py * add headers through crawl function
1 parent 78325d0 commit 4c50711

File tree

1 file changed

+71
-0
lines changed

1 file changed

+71
-0
lines changed
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import time
2+
from retrying import RetryError
3+
from loguru import logger
4+
from proxypool.schemas.proxy import Proxy
5+
from proxypool.crawlers.base import BaseCrawler
6+
import json
7+
8+
# Paged JSON API endpoint of the geonode proxy list; `{page}` is filled in
# per request (500 results per page, newest-checked first).
BASE_URL = 'https://proxylist.geonode.com/api/proxy-list?limit=500&page={page}&sort_by=lastChecked&sort_type=desc'
# Highest page number to fetch (inclusive).
MAX_PAGE = 18
10+
11+
12+
class GeonodeCrawler(BaseCrawler):
    """
    Geonode crawler, https://proxylist.geonode.com/

    Fetches the paginated JSON proxy-list API and yields Proxy objects.
    """
    # One URL per result page, 1..MAX_PAGE.
    urls = [BASE_URL.format(page=page) for page in range(1, MAX_PAGE + 1)]

    def parse(self, html):
        """
        Parse one JSON API response body and yield proxies.

        :param html: raw response text (expected to be JSON with a 'data' list)
        :return: generator of Proxy(host, port)
        """
        try:
            result = json.loads(html)
            # Use .get so a well-formed JSON error payload without 'data'
            # yields nothing instead of raising KeyError.
            proxy_list = result.get('data', [])
            for proxy_item in proxy_list:
                host = proxy_item['ip']
                port = proxy_item['port']
                yield Proxy(host=host, port=port)
        except json.JSONDecodeError:
            # Log through the crawler's logger (was a bare print, which is
            # invisible in the log stream and inconsistent with crawl()).
            logger.error('failed to decode JSON response from geonode')
            return

    def crawl(self):
        """
        Override the base crawl loop to send browser-like headers,
        which the geonode API requires.
        """
        headers = {
            'authority': 'proxylist.geonode.com',
            'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"',
            'accept': 'application/json, text/plain, */*',
            'sec-ch-ua-mobile': '?0',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36',
            'sec-ch-ua-platform': '"macOS"',
            'origin': 'https://geonode.com',
            'sec-fetch-site': 'same-site',
            'sec-fetch-mode': 'cors',
            'sec-fetch-dest': 'empty',
            'referer': 'https://geonode.com/',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7',
            # NOTE(review): a fixed ETag may cause 304 Not Modified responses
            # with empty bodies once the upstream list changes — confirm it is
            # still needed, or drop it.
            'if-none-match': 'W/"c25d-BXjLTmP+/yYXtIz4OEcmdOWSv88"',
        }
        try:
            for url in self.urls:
                logger.info(f'fetching {url}')
                html = self.fetch(url, headers=headers)
                if not html:
                    continue
                # Be polite to the API between page requests.
                time.sleep(.5)
                yield from self.process(html, url)
        except RetryError:
            logger.error(
                f'crawler {self} crawled proxy unsuccessfully, '
                'please check if target url is valid or network issue')
66+
67+
68+
if __name__ == '__main__':
    # Manual smoke test: run this crawler standalone and print each proxy.
    for proxy in GeonodeCrawler().crawl():
        print(proxy)

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /