import argparse
import re
from pprint import pprint

import requests
from bs4 import BeautifulSoup


def get_urls(url):
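    """Fetch `url` and pretty-print every absolute http(s) link found on the page."""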
    try:
        # Get the page, with the timeout set to 5 seconds
        resp = requests.get(url, timeout=5)

        # A 200 status code means the request succeeded
        if resp.status_code == 200:
            # Parse the page with BeautifulSoup
            soup = BeautifulSoup(resp.content, "html.parser")

            # Find all the anchor tags on the page
            links = soup.find_all('a')

            # Extract the href attribute of each anchor that has one
            data = [link.attrs['href'] for link in links if 'href' in link.attrs]

            # Join all the extracted hrefs into one space-separated string
            str_data = " ".join(data)
            # Regex used to keep only absolute http(s) links
            link_regex = re.compile(r"((https?)://[\w:#@%/;$()~_?+\-=.&]+)")

            # Find every match in the joined href string
            regex_links = re.findall(link_regex, str_data)

            # findall returns a tuple per match; group 0 is the full link
            f_links = [link[0] for link in regex_links]
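            # For example, an href list like
            #   ["/about", "mailto:admin@example.com", "https://example.com/page"]
            # is reduced to ["https://example.com/page"]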

            # Pretty-print the extracted links
            pprint(f_links)

        # The site returned something other than 200
        else:
            print(f"Status Code: {resp.status_code}, For URL: {url}")

    # The site could not be reached
    except requests.exceptions.ConnectionError:
        print(f"No Response From URL: {url}")

    # Catch-all for any other error
    except Exception as err:
        print(f"Exception Raised: {err}")


def main():
    parser = argparse.ArgumentParser(description='Extract Links From Webpage')

    # URL to extract links from
    parser.add_argument('-u', dest='url', type=str,
                        help='URL to extract links from', required=True)

    args = parser.parse_args()

    url = args.url

    # Prepend https:// if the URL does not already carry a scheme
    # (checking "http" alone is enough only for full prefixes, so
    # both schemes are tested explicitly)
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    # Extract and print the links
    get_urls(url)


if __name__ == '__main__':
    main()
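
# Example usage (the filename extract_links.py is only an assumption):
#   python extract_links.py -u example.com
# A missing scheme is filled in, so the request above goes to https://example.com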