
Commit 4817e1c

Get Links From Webpage Added
1 parent 2ea2339 commit 4817e1c

File tree

2 files changed: +81 −0 lines changed

‎Links From Webpage/get_links.py

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
import argparse
import re
from pprint import pprint

import requests
from bs4 import BeautifulSoup


def get_urls(url):

    # try/except block around the network call
    try:

        # Get the page, with the timeout set to 5 seconds
        resp = requests.get(url, timeout=5)

        # Check the response code; 200 means the request succeeded
        if resp.status_code == 200:
            # Create the soup
            soup = BeautifulSoup(resp.content, "html.parser")

            # Find all the anchor tags on the page
            links = soup.find_all('a')

            # Extract the href of each link, skipping anchors without one
            data = [link.get('href') for link in links if link.get('href')]

            # Create a single string with all the links found
            str_data = " ".join(data)

            # Regex used to filter out anything that is not an http(s) URL
            link_regex = re.compile(
                r"((https?):((//)|(\\))+([\w:#@%/;$()~_?+\-=\\.&](#!)?)*)",
                re.DOTALL)

            # Get all the valid links
            regex_links = re.findall(link_regex, str_data)

            # Extract the full match from each tuple of regex groups
            f_links = [link[0] for link in regex_links]

            # Pretty-print the result
            pprint(f_links)

        # The site returned a non-200 response code
        else:
            print(f"Status Code: {resp.status_code}, For URL: {url}")

    # The site could not be reached at all
    except requests.exceptions.ConnectionError:
        print(f"No Response From URL: {url}")

    # Generic exception
    except Exception as err:
        print(f"Exception Raised: {err}")


def main():

    parser = argparse.ArgumentParser(description='Extract Links From Webpage')

    # Argument to input the URL to extract links from
    parser.add_argument('-u', dest='url', type=str,
                        help='URL to extract links from', required=True)

    args = parser.parse_args()

    if args.url:

        url = args.url

        # Check if the URL starts with either http:// or https://
        if not url.startswith(("http://", "https://")):

            # Prepend https:// to the URL
            url = "https://" + url

        # Call the extraction function
        get_urls(url)


if __name__ == '__main__':
    main()
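Assuming the file layout from this commit, a typical run from the repository root looks like this (example.com is just an illustrative target):

python "Links From Webpage/get_links.py" -u example.com

When the scheme is omitted, as above, the script prepends https:// before fetching the page, then pretty-prints every http(s) link it finds.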

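To see what the regex filter keeps and discards, here is a minimal, self-contained sketch that runs the same pattern over a hand-written sample string (the hrefs are made up for illustration):

import re
from pprint import pprint

# Same pattern as in get_urls()
link_regex = re.compile(
    r"((https?):((//)|(\\))+([\w:#@%/;$()~_?+\-=\\.&](#!)?)*)",
    re.DOTALL)

# A mix of absolute, relative, and mailto hrefs, joined with spaces
# exactly as get_urls() does with " ".join(data)
str_data = "https://example.com/docs /relative/path mailto:me@example.com http://test.org"

# findall returns one tuple of groups per match; index 0 is the full URL
pprint([match[0] for match in re.findall(link_regex, str_data)])

# Prints: ['https://example.com/docs', 'http://test.org']
# The relative path and the mailto link are filtered out.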
‎Links From Webpage/requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
requests==2.25.1
beautifulsoup4==4.9.3
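To set up the two pinned dependencies before running the script, the usual approach is a virtual environment plus pip (the quotes are needed because the directory name contains a space):

pip install -r "Links From Webpage/requirements.txt"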

0 commit comments
