Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 69cb60b

Browse files
committed
Version 1.0.0
Beta version.
1 parent 25ff5e7 commit 69cb60b

File tree

7 files changed

+215
-2
lines changed

7 files changed

+215
-2
lines changed

‎.gitignore‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
.idea/
2+
env/

‎README.md‎

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,6 @@
1-
# python-urlsresolver
2-
Python urls resolver library
1+
# urlsresolver
2+
Python URL resolver library with meta refresh support.
3+
4+
You can expand the real address of any shortened URL with the `urlsresolver.resolve_url(url)` function.
5+
6+
Checks for meta refresh html tags & refresh http header.

‎setup.py‎

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/usr/bin/env python -u
from setuptools import setup, find_packages

# Single source of truth for the version: the tuple in urlsresolver.__version__.
version = ".".join(map(str, __import__("urlsresolver").__version__))

# Author and maintainer are the same person; name them once.
author = 'Alexandr I. Shurigin'
email = 'ya@helldude.ru'

setup(
    name='urlsresolver',
    version=version,
    description='Python urls resolver library',
    author=author,
    author_email=email,
    maintainer=author,
    maintainer_email=email,
    url='https://github.com/phpdude/python-urlsresolver',
    packages=find_packages(),
    test_suite='tests',
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
        "Natural Language :: English",
        "Operating System :: OS Independent",
        "Programming Language :: Python",
        "Topic :: Internet :: WWW/HTTP",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "Topic :: System :: Networking",
        "Topic :: Utilities"
    ],
    install_requires=[
        'requests'
    ]
)

‎tests/__init__.py‎

Whitespace-only changes.

‎tests/tags_extractor.py‎

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from unittest import TestCase
2+
3+
from urlsresolver import get_tags
4+
5+
6+
class TestTagsExtraction(TestCase):
    """Exercises urlsresolver.get_tags against tricky attribute syntax."""

    def test_meta_tags_extraction(self):
        # One tag exercising all four attribute forms: double-quoted,
        # unquoted, bare (valueless), and single-quoted spanning two lines.
        extracted = get_tags("""
        <meta attribute1="11211"
              attribute2=10
              attribute3

              attribute4='asdasda
        asdasd'>
        """, 'meta')

        first = next(extracted)

        self.assertEqual(first['attribute1'], '11211')
        self.assertEqual(first['attribute2'], '10')
        # A valueless attribute maps to its own name.
        self.assertEqual(first['attribute3'], 'attribute3')
        # Quoted values keep embedded newlines and indentation verbatim.
        self.assertEqual(first['attribute4'], 'asdasda\n        asdasd')

    def test_meta_tags(self):
        parsed = list(get_tags("""
        <meta charset="utf-8"/>
        <meta http-equiv="refresh" content="0;test=&rarr;; URL=https://mobile.twitter.com/i/nojs_router?path=%2Ftwitter%2Fstatus%2F644156390211125249%2Fvideo%2F1"></meta>
        """, 'meta'))

        for attrs in parsed:
            if 'charset' in attrs:
                self.assertEqual(attrs['charset'], 'utf-8')
            if 'http-equiv' in attrs:
                self.assertEqual(attrs['http-equiv'], 'refresh')
                # The &rarr; entity must come back unescaped as U+2192.
                self.assertEqual(
                    attrs['content'],
                    u'0;test=\u2192; URL=https://mobile.twitter.com/i/nojs_router?'
                    u'path=%2Ftwitter%2Fstatus%2F644156390211125249%2Fvideo%2F1')

‎tests/twitter_urls.py‎

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from random import shuffle, choice
2+
import unittest
3+
import re
4+
5+
import requests
6+
7+
from urlsresolver import URL_REGEX, resolve_url
8+
9+
10+
class TestTwitterTrendsUrls(unittest.TestCase):
    """Live-network tests that expand real t.co short links scraped from Twitter."""

    def setUp(self):
        # Scrape a public Twitter page for t.co short links to test against.
        # NOTE(review): URL_REGEX is imported from urlsresolver but is not
        # visible in this commit's module source — confirm it exists there.
        page = requests.get('https://twitter.com/twitter')

        candidates = re.findall(URL_REGEX, page.content)
        short_links = [
            u for u in candidates
            if re.match('https?://t.co/[a-z0-9]+$', u, re.IGNORECASE)
        ]
        shuffle(short_links)

        self.urls = short_links

    def test_twitter_urls(self):
        # Every t.co link must resolve to something other than itself.
        for short in self.urls[:3]:
            self.assertNotEqual(short, resolve_url(short))

    def test_url_history(self):
        target = choice(self.urls)

        expanded, redirects = resolve_url(target, history=True)

        self.assertNotEqual(expanded, target)
        self.assertIsInstance(redirects, list)

‎urlsresolver/__init__.py‎

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# coding=utf-8
2+
from HTMLParser import HTMLParser
3+
from collections import OrderedDict
4+
from contextlib import closing
5+
import re
6+
from urlparse import urljoin
7+
8+
import requests
9+
10+
# Version tuple; setup.py joins it with "." to build the release string.
__version__ = (1, 0, 0)
__author__ = 'Alexandr Shurigin (https://github.com/phpdude/)'

# HTML tags syntax http://www.w3.org/TR/html-markup/syntax.html
# Matches one tag attribute in any of the four HTML forms, tried in order:
#   name="value" | name='value' | name=value | bare name (no value).
# Each alternative carries its own capture groups (7 in total), so a consumer
# must check which name group matched to know which value group holds the data
# (see get_tags below).
TAG_ATTRIBUTES_REGEX = \
    "(?:\s+%(attr)s\s*=\s*\"%(dqval)s\")|" \
    "(?:\s+%(attr)s\s*=\s*'%(sqval)s')|" \
    "(?:\s+%(attr)s\s*=\s*%(uqval)s)|" \
    "(?:\s+%(attr)s)" % {
        'attr': "([^\s\\x00\"'>/=]+)",  # attribute name: no whitespace, quotes, '>', '/', '='
        'uqval': "([^\s\"'=><`]*)",  # unquoted value: stops at whitespace and specials
        'sqval': "([^'\\x00]*)",  # single-quoted value: anything but ', may span lines
        'dqval': "([^\"\\x00]*)"  # double-quoted value: anything but ", may span lines
    }
24+
25+
26+
def get_tags(html, tag_name):
    """
    Yield a dict of attributes for each occurrence of <tag_name ...> in html.

    Attribute values are HTML-entity-unescaped; a bare (valueless) attribute
    maps to its own name.
    """
    unescape = HTMLParser().unescape

    # TAG_ATTRIBUTES_REGEX has four alternatives; the outer wrapper adds one
    # more group, so in each findall tuple group 0 is the whole match and the
    # (name, value) pairs live at these fixed indexes.  The bare-attribute
    # form (7, 7) uses the name itself as the value.
    group_pairs = ((1, 2), (3, 4), (5, 6), (7, 7))

    for attrs_chunk in re.findall('<%s(\s+[^>]*)/*>' % tag_name, html, re.IGNORECASE):
        tag = {}

        for groups in re.findall('(?:(%s))' % TAG_ATTRIBUTES_REGEX, attrs_chunk, re.UNICODE):
            for name_idx, value_idx in group_pairs:
                if groups[name_idx]:
                    tag[groups[name_idx]] = unescape(groups[value_idx])
                    break

        yield tag
42+
43+
44+
def resolve_url(
        start_url,
        user_agent=False,
        chunk_size=1500,
        max_redirects=30,
        history=False,
        remove_noscript=True,
        **kwargs):
    """
    Helper function for expanding shortened urls.

    Follows plain HTTP redirects (handled by requests) and, additionally,
    meta refresh redirects declared either in a ``Refresh:`` response header
    or in a ``<meta http-equiv="refresh" ...>`` tag found in the first
    ``chunk_size`` bytes of the response body.

    :param start_url: Shortened url to expand.
    :param user_agent: Custom User-Agent header (falsy keeps requests default).
    :param chunk_size: Size of head chunk fetched from the response body for
        searching meta refresh tags.
    :param max_redirects: Maximum meta refresh redirects before giving up.
    :param history: If True, function will return tuple with (url, history)
        where history is list of redirects.
    :param remove_noscript: Remove <noscript></noscript> blocks from head HTML
        (skip redirects for old browsers versions).
    :param kwargs: Custom kwargs for requests.get(**kwargs) function.
    :return: str|tuple
    :raises ValueError: when more than max_redirects meta redirects occur.
    """
    s = requests.session()

    # Ordered so that history preserves the order urls were visited in.
    urls_history = OrderedDict()
    if user_agent:
        s.headers['User-Agent'] = user_agent

    def follow_meta_redirects(url, max_redirects, **kwargs):
        urls_history[url] = True

        if max_redirects < 0:
            raise ValueError("Cannot resolve real url with max_redirects=%s" % max_redirects)

        max_redirects -= 1

        # stream=True so only the head chunk is downloaded; closing() makes
        # sure the connection is released even though the body is unread.
        with closing(s.get(url, allow_redirects=True, stream=True, **kwargs)) as resp:
            # Record every plain HTTP redirect requests already followed.
            for r in resp.history:
                urls_history[r.url] = True

            head, real_url = next(resp.iter_content(chunk_size, decode_unicode=False)), resp.url

            # Removing html blocks in <noscript></noscript>
            if remove_noscript:
                head = re.sub(r'<noscript[^>]*>.*</noscript[^>]*>', '', head, flags=re.DOTALL)

            # The Refresh header wins; otherwise scan meta tags in the head.
            # (Replaces the original dead `elif not redirect:` branch, which
            # was always true when reached.)
            redirect = resp.headers.get('refresh')
            if not redirect:
                for tag in get_tags(head, 'meta'):
                    if tag.get('http-equiv', '') == 'refresh':
                        redirect = tag.get('content', None)

            if redirect:
                m = re.search(r'url\s*=\s*([^\s;]+)', redirect, re.I)
                if m:
                    # BUG FIX: propagate **kwargs into the recursive call so
                    # custom request options (timeout, proxies, ...) apply to
                    # every hop, not only the first request.
                    real_url = follow_meta_redirects(
                        urljoin(resp.url, m.group(1)), max_redirects, **kwargs)

        urls_history[real_url] = True

        return real_url

    real_url = follow_meta_redirects(start_url, max_redirects, **kwargs)
    if history:
        return real_url, urls_history.keys()
    else:
        return real_url

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /