Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit d71b939

Browse files
[Update] Code
1 parent 80ca605 commit d71b939

File tree

1 file changed

+53
-54
lines changed

1 file changed

+53
-54
lines changed

β€Ž0x23-WebCloner/webcloner.py

Lines changed: 53 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,69 @@
1-
""""
2-
Program name : Website cloner
3-
author : https://github.com/codeperfectplus
4-
How to use : Check README.md
5-
"""
6-
import os
import sys
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup
126

137
class CloneWebsite:
    """Recursively clone a website into a local folder named after its domain.

    Only URLs whose domain matches *website_url* are fetched.  HTML pages are
    crawled for further links and resources; images, scripts and stylesheets
    are streamed to disk under ``<domain>/<url-path>``.
    """

    def __init__(self, website_url):
        # Root URL the clone starts from, e.g. "https://example.com/".
        self.website_url = website_url
        # Domain used both to filter same-site URLs and as the output folder.
        self.domain_name = urlparse(website_url).netloc
        # URLs already processed, so cyclic links do not recurse forever.
        self.visited_urls = set()

    def get_full_url(self, path):
        """Resolve a possibly-relative *path* against the site's root URL."""
        return urljoin(self.website_url, path)

    def valid_url(self, url):
        """Return True if *url* belongs to the cloned site's domain."""
        return urlparse(url).netloc == self.domain_name

    def save_content(self, url, path):
        """Stream *url* into the local file *path*, creating parent folders.

        Errors are printed but never raised, so one broken resource does not
        abort the whole clone.
        """
        try:
            # timeout bounds the request so a stalled server cannot hang us.
            response = requests.get(url, stream=True, timeout=10)
            if response.status_code == 200:
                parent = os.path.dirname(path)
                # dirname is "" for a bare filename; makedirs("") would raise.
                if parent:
                    os.makedirs(parent, exist_ok=True)
                with open(path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
        except Exception as e:
            print(f"Error saving {url}: {e}")

    def crawl_website(self, url=None):
        """Download *url* (default: the root) and all its same-domain links.

        Pages reachable through ``<a>`` links are crawled recursively; other
        resources (``img``/``script``/``link``) are saved as-is.
        """
        if url is None:
            url = self.website_url
        # Drop the #fragment: it addresses a position inside the same page,
        # so URLs differing only by fragment must not be fetched twice.
        url, _ = urldefrag(url)

        if url in self.visited_urls:
            return
        self.visited_urls.add(url)

        try:
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                return
        except Exception as e:
            print(f"Error accessing {url}: {e}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')

        # Save the current page under <domain>/<path>[/index.html].
        path = urlparse(url).path
        if not path.endswith('.html'):
            path = os.path.join(path, 'index.html')
        self.save_content(url, os.path.join(self.domain_name, path.lstrip('/')))

        # Extract and handle every linked same-domain resource.
        for tag, attribute in [('img', 'src'), ('script', 'src'), ('link', 'href'), ('a', 'href')]:
            for resource in soup.find_all(tag):
                if attribute not in resource.attrs:
                    continue
                resource_url = self.get_full_url(resource[attribute])
                if not self.valid_url(resource_url):
                    continue
                # Anchors point at pages: recurse even when the link has no
                # ".html" suffix (e.g. "/about/"); previously such pages were
                # saved as opaque files and never crawled.
                if tag == 'a' or resource_url.endswith('.html'):
                    self.crawl_website(resource_url)
                else:
                    file_path = os.path.join(
                        self.domain_name, urlparse(resource_url).path.lstrip('/'))
                    self.save_content(resource_url, file_path)
6565

6666
if __name__ == "__main__":
    # Usage: python webcloner.py <website-url>
    target = sys.argv[1]
    CloneWebsite(target).crawl_website()

0 commit comments

Comments
(0)

AltStyle γ«γ‚ˆγ£γ¦ε€‰ζ›γ•γ‚ŒγŸγƒšγƒΌγ‚Έ (->γ‚ͺγƒͺγ‚ΈγƒŠγƒ«) /