Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 78b3244

Browse files
authored
add sub proxy pool mechanics (#213)
1 parent 0033586 commit 78b3244

File tree

7 files changed

+111
-30
lines changed

7 files changed

+111
-30
lines changed

‎proxypool/processors/getter.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from proxypool.storages.redis import RedisClient
33
from proxypool.setting import PROXY_NUMBER_MAX
44
from proxypool.crawlers import __all__ as crawlers_cls
5-
5+
from proxypool.testers import __all__ as testers_cls
66

77
class Getter(object):
88
"""
@@ -16,6 +16,8 @@ def __init__(self):
1616
self.redis = RedisClient()
1717
self.crawlers_cls = crawlers_cls
1818
self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]
19+
self.testers_cls = testers_cls
20+
self.testers = [tester_cls() for tester_cls in self.testers_cls]
1921

2022
def is_full(self):
2123
"""
@@ -36,6 +38,7 @@ def run(self):
3638
logger.info(f'crawler {crawler} to get proxy')
3739
for proxy in crawler.crawl():
3840
self.redis.add(proxy)
41+
[self.redis.add(proxy, redis_key=tester.key) for tester in self.testers]
3942

4043

4144
if __name__ == '__main__':

‎proxypool/processors/server.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from flask import Flask, g, request
2+
from proxypool.exceptions import PoolEmptyException
23
from proxypool.storages.redis import RedisClient
3-
from proxypool.setting import API_HOST, API_PORT, API_THREADED, API_KEY, IS_DEV
4+
from proxypool.setting import API_HOST, API_PORT, API_THREADED, API_KEY, IS_DEV, PROXY_RAND_KEY_DEGRADED
45
import functools
56

67
__all__ = ['app']
@@ -53,10 +54,19 @@ def index():
5354
@auth_required
5455
def get_proxy():
5556
"""
56-
get a random proxy
57+
get a random proxy; a specific sub-pool can be queried via its (redis) key
58+
if PROXY_RAND_KEY_DEGRADED is True, falls back to a universal random proxy when no proxy is found in the sub-pool
5759
:return: get a random proxy
5860
"""
61+
key = request.args.get('key')
5962
conn = get_conn()
63+
# return conn.random(key).string() if key else conn.random().string()
64+
if key:
65+
try:
66+
return conn.random(key).string()
67+
except PoolEmptyException:
68+
if not PROXY_RAND_KEY_DEGRADED:
69+
raise
6070
return conn.random().string()
6171

6272

@@ -67,8 +77,10 @@ def get_proxy_all():
6777
get all proxies, optionally from the sub-pool identified by the (redis) key
6878
:return: all proxies
6979
"""
80+
key = request.args.get('key')
81+
7082
conn = get_conn()
71-
proxies = conn.all()
83+
proxies = conn.all(key) if key else conn.all()
7284
proxies_string = ''
7385
if proxies:
7486
for proxy in proxies:
@@ -85,7 +97,8 @@ def get_count():
8597
:return: count, int
8698
"""
8799
conn = get_conn()
88-
return str(conn.count())
100+
key = request.args.get('key')
101+
return str(conn.count(key)) if key else conn.count()
89102

90103

91104
if __name__ == '__main__':

‎proxypool/processors/tester.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
TEST_DONT_SET_MAX_SCORE
88
from aiohttp import ClientProxyConnectionError, ServerDisconnectedError, ClientOSError, ClientHttpProxyError
99
from asyncio import TimeoutError
10+
from proxypool.testers import __all__ as testers_cls
1011

1112
EXCEPTIONS = (
1213
ClientProxyConnectionError,
@@ -30,6 +31,8 @@ def __init__(self):
3031
"""
3132
self.redis = RedisClient()
3233
self.loop = asyncio.get_event_loop()
34+
self.testers_cls = testers_cls
35+
self.testers = [tester_cls() for tester_cls in self.testers_cls]
3336

3437
async def test(self, proxy: Proxy):
3538
"""
@@ -63,8 +66,33 @@ async def test(self, proxy: Proxy):
6366
else:
6467
self.redis.decrease(proxy)
6568
logger.debug(f'proxy {proxy.string()} is invalid, decrease score')
69+
# for each independent tester whose sub-pool (redis key) already contains this proxy, run that tester's extra test
70+
for tester in self.testers:
71+
key = tester.key
72+
if self.redis.exists(proxy, key):
73+
test_url = tester.test_url
74+
headers = tester.headers()
75+
cookies = tester.cookies()
76+
async with session.get(test_url, proxy=f'http://{proxy.string()}',
77+
timeout=TEST_TIMEOUT,
78+
headers=headers,
79+
cookies=cookies,
80+
allow_redirects=False) as response:
81+
resp_text = await response.text()
82+
is_valid = await tester.parse(resp_text, test_url, proxy.string())
83+
if is_valid:
84+
if tester.test_dont_set_max_score:
85+
logger.info(f'key[{key}] proxy {proxy.string()} is valid, remain current score')
86+
else:
87+
self.redis.max(proxy, key, tester.proxy_score_max)
88+
logger.info(f'key[{key}] proxy {proxy.string()} is valid, set max score')
89+
else:
90+
self.redis.decrease(proxy, tester.key, tester.proxy_score_min)
91+
logger.info(f'key[{key}] proxy {proxy.string()} is invalid, decrease score')
92+
6693
except EXCEPTIONS:
6794
self.redis.decrease(proxy)
95+
[self.redis.decrease(proxy, tester.key, tester.proxy_score_min) for tester in self.testers]
6896
logger.debug(f'proxy {proxy.string()} is invalid, decrease score')
6997

7098
@logger.catch

‎proxypool/setting.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@
5656
PROXY_SCORE_MAX = env.int('PROXY_SCORE_MAX', 100)
5757
PROXY_SCORE_MIN = env.int('PROXY_SCORE_MIN', 0)
5858
PROXY_SCORE_INIT = env.int('PROXY_SCORE_INIT', 10)
59+
# whether to fall back to a universal random proxy if no proxy exists in the sub-pool identified by a specific key (NOTE: the value is read from the TEST_ANONYMOUS environment variable)
60+
PROXY_RAND_KEY_DEGRADED = env.bool('TEST_ANONYMOUS', True)
5961

6062
# definition of proxy number
6163
PROXY_NUMBER_MAX = 50000

‎proxypool/storages/redis.py

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db
3434
self.db = redis.StrictRedis(
3535
host=host, port=port, password=password, db=db, decode_responses=True, **kwargs)
3636

37-
def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
37+
def add(self, proxy: Proxy, score=PROXY_SCORE_INIT, redis_key=REDIS_KEY) -> int:
3838
"""
3939
add proxy and set it to init score
4040
:param proxy: proxy, ip:port, like 8.8.8.8:88
@@ -44,12 +44,12 @@ def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
4444
if not is_valid_proxy(f'{proxy.host}:{proxy.port}'):
4545
logger.info(f'invalid proxy {proxy}, throw it')
4646
return
47-
if not self.exists(proxy):
47+
if not self.exists(proxy, redis_key):
4848
if IS_REDIS_VERSION_2:
49-
return self.db.zadd(REDIS_KEY, score, proxy.string())
50-
return self.db.zadd(REDIS_KEY, {proxy.string(): score})
49+
return self.db.zadd(redis_key, score, proxy.string())
50+
return self.db.zadd(redis_key, {proxy.string(): score})
5151

52-
def random(self) -> Proxy:
52+
def random(self, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN, proxy_score_max=PROXY_SCORE_MAX) -> Proxy:
5353
"""
5454
get random proxy
5555
firstly try to get proxy with max score
@@ -59,74 +59,74 @@ def random(self) -> Proxy:
5959
"""
6060
# try to get proxy with max score
6161
proxies = self.db.zrangebyscore(
62-
REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX)
62+
redis_key, proxy_score_max, proxy_score_max)
6363
if len(proxies):
6464
return convert_proxy_or_proxies(choice(proxies))
6565
# else get proxy by rank
6666
proxies = self.db.zrevrange(
67-
REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX)
67+
redis_key, proxy_score_min, proxy_score_max)
6868
if len(proxies):
6969
return convert_proxy_or_proxies(choice(proxies))
7070
# else raise error
7171
raise PoolEmptyException
7272

73-
def decrease(self, proxy: Proxy) -> int:
73+
def decrease(self, proxy: Proxy, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN) -> int:
7474
"""
7575
decrease score of proxy by 1; if the new score is less than or equal to proxy_score_min (default PROXY_SCORE_MIN), delete it
7676
:param proxy: proxy
7777
:return: new score
7878
"""
7979
if IS_REDIS_VERSION_2:
80-
self.db.zincrby(REDIS_KEY, proxy.string(), -1)
80+
self.db.zincrby(redis_key, proxy.string(), -1)
8181
else:
82-
self.db.zincrby(REDIS_KEY, -1, proxy.string())
83-
score = self.db.zscore(REDIS_KEY, proxy.string())
82+
self.db.zincrby(redis_key, -1, proxy.string())
83+
score = self.db.zscore(redis_key, proxy.string())
8484
logger.info(f'{proxy.string()} score decrease 1, current {score}')
85-
if score <= PROXY_SCORE_MIN:
85+
if score <= proxy_score_min:
8686
logger.info(f'{proxy.string()} current score {score}, remove')
87-
self.db.zrem(REDIS_KEY, proxy.string())
87+
self.db.zrem(redis_key, proxy.string())
8888

89-
def exists(self, proxy: Proxy) -> bool:
89+
def exists(self, proxy: Proxy, redis_key=REDIS_KEY) -> bool:
9090
"""
9191
if proxy exists
9292
:param proxy: proxy
9393
:return: if exists, bool
9494
"""
95-
return not self.db.zscore(REDIS_KEY, proxy.string()) is None
95+
return not self.db.zscore(redis_key, proxy.string()) is None
9696

97-
def max(self, proxy: Proxy) -> int:
97+
def max(self, proxy: Proxy, redis_key=REDIS_KEY, proxy_score_max=PROXY_SCORE_MAX) -> int:
9898
"""
9999
set proxy to max score
100100
:param proxy: proxy
101101
:return: new score
102102
"""
103-
logger.info(f'{proxy.string()} is valid, set to {PROXY_SCORE_MAX}')
103+
logger.info(f'{proxy.string()} is valid, set to {proxy_score_max}')
104104
if IS_REDIS_VERSION_2:
105-
return self.db.zadd(REDIS_KEY, PROXY_SCORE_MAX, proxy.string())
106-
return self.db.zadd(REDIS_KEY, {proxy.string(): PROXY_SCORE_MAX})
105+
return self.db.zadd(redis_key, proxy_score_max, proxy.string())
106+
return self.db.zadd(redis_key, {proxy.string(): proxy_score_max})
107107

108-
def count(self) -> int:
108+
def count(self, redis_key=REDIS_KEY) -> int:
109109
"""
110110
get count of proxies
111111
:return: count, int
112112
"""
113-
return self.db.zcard(REDIS_KEY)
113+
return self.db.zcard(redis_key)
114114

115-
def all(self) -> List[Proxy]:
115+
def all(self, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN, proxy_score_max=PROXY_SCORE_MAX) -> List[Proxy]:
116116
"""
117117
get all proxies
118118
:return: list of proxies
119119
"""
120-
return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX))
120+
return convert_proxy_or_proxies(self.db.zrangebyscore(redis_key, proxy_score_min, proxy_score_max))
121121

122-
def batch(self, cursor, count) -> List[Proxy]:
122+
def batch(self, cursor, count, redis_key=REDIS_KEY) -> List[Proxy]:
123123
"""
124124
get batch of proxies
125125
:param cursor: scan cursor
126126
:param count: scan count
127127
:return: list of proxies
128128
"""
129-
cursor, proxies = self.db.zscan(REDIS_KEY, cursor, count=count)
129+
cursor, proxies = self.db.zscan(redis_key, cursor, count=count)
130130
return cursor, convert_proxy_or_proxies([i[0] for i in proxies])
131131

132132

‎proxypool/testers/__init__.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import importlib
import inspect
import pkgutil

from .base import BaseTester


# load classes subclass of BaseTester
# Keep the package name in a private variable: the re-export loop below writes
# into globals(), so the name must not be re-read from there while iterating.
_package_name = __name__
classes = []
for _finder, _module_name, _is_pkg in pkgutil.walk_packages(__path__):
    # Import through the package-qualified name. The finder-level
    # find_module()/load_module() API is gone from Python 3.12, and importing
    # by qualified name also lets tester modules use relative imports.
    module = importlib.import_module(f'{_package_name}.{_module_name}')
    for _member_name, value in inspect.getmembers(module):
        # Re-export module members from the package namespace, but never the
        # module's dunders (__name__, __file__, __doc__, ...), which would
        # overwrite the package's own metadata.
        if not (_member_name.startswith('__') and _member_name.endswith('__')):
            globals()[_member_name] = value
        # Collect concrete testers: subclasses of BaseTester, excluding the
        # base itself and any class flagged with a truthy `ignore` attribute.
        if inspect.isclass(value) and issubclass(value, BaseTester) and value is not BaseTester \
                and not getattr(value, 'ignore', False):
            classes.append(value)
__all__ = __ALL__ = classes
16+

‎proxypool/testers/base.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from proxypool.setting import TEST_DONT_SET_MAX_SCORE, PROXY_SCORE_INIT, PROXY_SCORE_MAX, PROXY_SCORE_MIN
2+
3+
4+
class BaseTester(object):
    """
    Base class for sub-pool testers.

    A subclass describes one extra check that proxies must pass to stay in
    their own sub-pool. The tester processor requests ``test_url`` through the
    proxy, using ``headers()`` and ``cookies()``, and passes the response text
    to ``parse()`` to decide whether the proxy is valid for this sub-pool.
    """
    # url requested through the proxy for the extra test
    test_url = ""
    # redis key that identifies this tester's sub-pool
    key = ""
    # when true, a valid proxy keeps its current score instead of being set to max
    test_dont_set_max_score = TEST_DONT_SET_MAX_SCORE
    # initial score for this sub-pool (defaults to the global setting)
    proxy_score_init = PROXY_SCORE_INIT
    # score assigned to a proxy that passes the test
    proxy_score_max = PROXY_SCORE_MAX
    # a proxy whose score drops to this value or below is removed from the sub-pool
    proxy_score_min = PROXY_SCORE_MIN

    def headers(self):
        """
        request headers for the test request
        :return: headers, None by default (no extra headers)
        """
        return None

    def cookies(self):
        """
        request cookies for the test request
        :return: cookies, None by default (no cookies)
        """
        return None

    async def parse(self, html, url, proxy, expr='{"code":0'):
        """
        judge from the response text whether the proxy is valid
        :param html: response body text
        :param url: tested url, unused by the default implementation
        :param proxy: proxy string, unused by the default implementation
        :param expr: substring that a valid response must contain
        :return: True if expr occurs in html, bool
        """
        # a missing body can never contain expr; report invalid instead of
        # raising TypeError from the `in` operator
        if html is None:
            return False
        return expr in html

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /