Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 69cb60b

Browse files
committed
Version 1.0.0
Beta version.
1 parent 25ff5e7 commit 69cb60b

File tree

7 files changed

+215
-2
lines changed

7 files changed

+215
-2
lines changed

‎.gitignore‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
.idea/
2+
env/

‎README.md‎

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,6 @@
1-
# python-urlsresolver
2-
Python urls resolver library
1+
# urlsresolver
2+
Python URL resolver library with meta refresh support.
3+
4+
You can expand the real address of any shortened URL with the `urlsresolver.resolve_url(url)` function.
5+
6+
Checks for meta refresh html tags & refresh http header.

‎setup.py‎

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/usr/bin/env python -u
from setuptools import setup, find_packages

# Single source of truth for the version: the tuple in urlsresolver.__version__.
version = ".".join(map(str, __import__("urlsresolver").__version__))

# Author and maintainer are the same person; name them once.
author = 'Alexandr I. Shurigin'
email = 'ya@helldude.ru'

setup(
    name='urlsresolver',
    version=version,
    description='Python urls resolver library',
    author=author,
    author_email=email,
    maintainer=author,
    maintainer_email=email,
    url='https://github.com/phpdude/python-urlsresolver',
    packages=find_packages(),
    test_suite='tests',
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
        "Natural Language :: English",
        "Operating System :: OS Independent",
        "Programming Language :: Python",
        "Topic :: Internet :: WWW/HTTP",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "Topic :: System :: Networking",
        "Topic :: Utilities"
    ],
    install_requires=[
        'requests'
    ]
)

‎tests/__init__.py‎

Whitespace-only changes.

‎tests/tags_extractor.py‎

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from unittest import TestCase
2+
3+
from urlsresolver import get_tags
4+
5+
6+
class TestTagsExtraction(TestCase):
    """Exercises urlsresolver.get_tags against tricky attribute syntax."""

    def test_meta_tags_extraction(self):
        # One tag exercising all four attribute forms: double-quoted,
        # unquoted, bare (valueless), and single-quoted spanning two lines.
        extracted = get_tags("""
        <meta attribute1="11211"
              attribute2=10
              attribute3

              attribute4='asdasda
        asdasd'>
        """, 'meta')

        first = next(extracted)

        self.assertEqual(first['attribute1'], '11211')
        self.assertEqual(first['attribute2'], '10')
        # A valueless attribute maps to its own name.
        self.assertEqual(first['attribute3'], 'attribute3')
        # Quoted values keep embedded newlines and indentation verbatim.
        self.assertEqual(first['attribute4'], 'asdasda\n        asdasd')

    def test_meta_tags(self):
        parsed = list(get_tags("""
        <meta charset="utf-8"/>
        <meta http-equiv="refresh" content="0;test=&rarr;; URL=https://mobile.twitter.com/i/nojs_router?path=%2Ftwitter%2Fstatus%2F644156390211125249%2Fvideo%2F1"></meta>
        """, 'meta'))

        for attrs in parsed:
            if 'charset' in attrs:
                self.assertEqual(attrs['charset'], 'utf-8')
            if 'http-equiv' in attrs:
                self.assertEqual(attrs['http-equiv'], 'refresh')
                # The &rarr; entity must come back unescaped as U+2192.
                self.assertEqual(
                    attrs['content'],
                    u'0;test=\u2192; URL=https://mobile.twitter.com/i/nojs_router?'
                    u'path=%2Ftwitter%2Fstatus%2F644156390211125249%2Fvideo%2F1')

‎tests/twitter_urls.py‎

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from random import shuffle, choice
2+
import unittest
3+
import re
4+
5+
import requests
6+
7+
from urlsresolver import URL_REGEX, resolve_url
8+
9+
10+
class TestTwitterTrendsUrls(unittest.TestCase):
    """Live-network tests that expand real t.co short links scraped from Twitter."""

    def setUp(self):
        # Scrape a public Twitter page for t.co short links to test against.
        # NOTE(review): URL_REGEX is imported from urlsresolver but is not
        # visible in this commit's module source — confirm it exists there.
        page = requests.get('https://twitter.com/twitter')

        candidates = re.findall(URL_REGEX, page.content)
        short_links = [
            u for u in candidates
            if re.match('https?://t.co/[a-z0-9]+$', u, re.IGNORECASE)
        ]
        shuffle(short_links)

        self.urls = short_links

    def test_twitter_urls(self):
        # Every t.co link must resolve to something other than itself.
        for short in self.urls[:3]:
            self.assertNotEqual(short, resolve_url(short))

    def test_url_history(self):
        target = choice(self.urls)

        expanded, redirects = resolve_url(target, history=True)

        self.assertNotEqual(expanded, target)
        self.assertIsInstance(redirects, list)

‎urlsresolver/__init__.py‎

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# coding=utf-8
2+
from HTMLParser import HTMLParser
3+
from collections import OrderedDict
4+
from contextlib import closing
5+
import re
6+
from urlparse import urljoin
7+
8+
import requests
9+
10+
# Version tuple; setup.py joins it with "." to build the release string.
__version__ = (1, 0, 0)
__author__ = 'Alexandr Shurigin (https://github.com/phpdude/)'

# HTML tags syntax http://www.w3.org/TR/html-markup/syntax.html
# Matches one tag attribute in any of the four HTML forms, tried in order:
#   name="value" | name='value' | name=value | bare name (no value).
# Each alternative carries its own capture groups (7 in total), so a consumer
# must check which name group matched to know which value group holds the data
# (see get_tags below).
TAG_ATTRIBUTES_REGEX = \
    "(?:\s+%(attr)s\s*=\s*\"%(dqval)s\")|" \
    "(?:\s+%(attr)s\s*=\s*'%(sqval)s')|" \
    "(?:\s+%(attr)s\s*=\s*%(uqval)s)|" \
    "(?:\s+%(attr)s)" % {
        'attr': "([^\s\\x00\"'>/=]+)",  # attribute name: no whitespace, quotes, '>', '/', '='
        'uqval': "([^\s\"'=><`]*)",  # unquoted value: stops at whitespace and specials
        'sqval': "([^'\\x00]*)",  # single-quoted value: anything but ', may span lines
        'dqval': "([^\"\\x00]*)"  # double-quoted value: anything but ", may span lines
    }
24+
25+
26+
def get_tags(html, tag_name):
    """
    Yield a dict of attributes for each occurrence of <tag_name ...> in html.

    Attribute values are HTML-entity-unescaped; a bare (valueless) attribute
    maps to its own name.
    """
    unescape = HTMLParser().unescape

    # TAG_ATTRIBUTES_REGEX has four alternatives; the outer wrapper adds one
    # more group, so in each findall tuple group 0 is the whole match and the
    # (name, value) pairs live at these fixed indexes.  The bare-attribute
    # form (7, 7) uses the name itself as the value.
    group_pairs = ((1, 2), (3, 4), (5, 6), (7, 7))

    for attrs_chunk in re.findall('<%s(\s+[^>]*)/*>' % tag_name, html, re.IGNORECASE):
        tag = {}

        for groups in re.findall('(?:(%s))' % TAG_ATTRIBUTES_REGEX, attrs_chunk, re.UNICODE):
            for name_idx, value_idx in group_pairs:
                if groups[name_idx]:
                    tag[groups[name_idx]] = unescape(groups[value_idx])
                    break

        yield tag
42+
43+
44+
def resolve_url(
        start_url,
        user_agent=False,
        chunk_size=1500,
        max_redirects=30,
        history=False,
        remove_noscript=True,
        **kwargs):
    """
    Helper function for expanding shortened urls.

    Follows plain HTTP redirects (handled by requests) and, additionally,
    meta refresh redirects declared either in a ``Refresh:`` response header
    or in a ``<meta http-equiv="refresh" ...>`` tag found in the first
    ``chunk_size`` bytes of the response body.

    :param start_url: Shortened url to expand.
    :param user_agent: Custom User-Agent header (falsy keeps requests default).
    :param chunk_size: Size of head chunk fetched from the response body for
        searching meta refresh tags.
    :param max_redirects: Maximum meta refresh redirects before giving up.
    :param history: If True, function will return tuple with (url, history)
        where history is list of redirects.
    :param remove_noscript: Remove <noscript></noscript> blocks from head HTML
        (skip redirects for old browsers versions).
    :param kwargs: Custom kwargs for requests.get(**kwargs) function.
    :return: str|tuple
    :raises ValueError: when more than max_redirects meta redirects occur.
    """
    s = requests.session()

    # Ordered so that history preserves the order urls were visited in.
    urls_history = OrderedDict()
    if user_agent:
        s.headers['User-Agent'] = user_agent

    def follow_meta_redirects(url, max_redirects, **kwargs):
        urls_history[url] = True

        if max_redirects < 0:
            raise ValueError("Cannot resolve real url with max_redirects=%s" % max_redirects)

        max_redirects -= 1

        # stream=True so only the head chunk is downloaded; closing() makes
        # sure the connection is released even though the body is unread.
        with closing(s.get(url, allow_redirects=True, stream=True, **kwargs)) as resp:
            # Record every plain HTTP redirect requests already followed.
            for r in resp.history:
                urls_history[r.url] = True

            head, real_url = next(resp.iter_content(chunk_size, decode_unicode=False)), resp.url

            # Removing html blocks in <noscript></noscript>
            if remove_noscript:
                head = re.sub(r'<noscript[^>]*>.*</noscript[^>]*>', '', head, flags=re.DOTALL)

            # The Refresh header wins; otherwise scan meta tags in the head.
            # (Replaces the original dead `elif not redirect:` branch, which
            # was always true when reached.)
            redirect = resp.headers.get('refresh')
            if not redirect:
                for tag in get_tags(head, 'meta'):
                    if tag.get('http-equiv', '') == 'refresh':
                        redirect = tag.get('content', None)

            if redirect:
                m = re.search(r'url\s*=\s*([^\s;]+)', redirect, re.I)
                if m:
                    # BUG FIX: propagate **kwargs into the recursive call so
                    # custom request options (timeout, proxies, ...) apply to
                    # every hop, not only the first request.
                    real_url = follow_meta_redirects(
                        urljoin(resp.url, m.group(1)), max_redirects, **kwargs)

        urls_history[real_url] = True

        return real_url

    real_url = follow_meta_redirects(start_url, max_redirects, **kwargs)
    if history:
        return real_url, urls_history.keys()
    else:
        return real_url

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /