- """"
- Program name : Website cloner
- author : https://github.com/codeperfectplus
- How to use : Check README.md
- """
-
import os
import sys
import requests
from bs4 import BeautifulSoup
-
+ from urllib.parse import urljoin, urlparse

class CloneWebsite:
-     def __init__(self, website_name):
-         self.website_name = website_name
+     def __init__(self, website_url):
+         self.website_url = website_url
+         self.domain_name = urlparse(website_url).netloc
+         self.visited_urls = set()

-     def crawl_website(self):
-         """ This function will crawl website and return content"""
-         content = requests.get(website_name)
-         if content.status_code == 200:
-             return content
+     def get_full_url(self, path):
+         return urljoin(self.website_url, path)

-     def create_folder(self):
-         """ This function will create folder for website """
-         folder_name = (website_name.split("/"))[2]
+     def valid_url(self, url):
+         return urlparse(url).netloc == self.domain_name
+
+     def save_content(self, url, path):
        try:
-             os.makedirs(folder_name)
+             response = requests.get(url, stream=True)
+             if response.status_code == 200:
+                 os.makedirs(os.path.dirname(path), exist_ok=True)
+                 with open(path, 'wb') as file:
+                     for chunk in response.iter_content(chunk_size=8192):
+                         file.write(chunk)
        except Exception as e:
-             print(e)
-         return folder_name
+             print(f"Error saving {url}: {e}")
+
+     def crawl_website(self, url=None):
+         if url is None:
+             url = self.website_url

-     def save_website(self):
-         """ This function will save website to respective folder """
-         folder_name = self.create_folder()
-         content = self.crawl_website()
-         with open(
-             f"{folder_name}/index.html", "w", encoding="ascii", errors="ignore"
-         ) as file:
-             file.write(content.text)
+         if url in self.visited_urls:
+             return
+         self.visited_urls.add(url)
+
+         try:
+             response = requests.get(url)
+             if response.status_code != 200:
+                 return
+         except Exception as e:
+             print(f"Error accessing {url}: {e}")
+             return

-     def save_image(self):
-         folder_name = self.create_folder()
-         os.chdir(folder_name)
-         data = requests.get(website_name).text
-         soup = BeautifulSoup(data, "html.parser")
-         for img in soup.find_all("img"):
-             src = img["src"]
-             print(src)
-             image_name = src.split("/")[-1]
-             path = src.split("/")[:-1]
-             path = "/".join(path)
-             try:
-                 os.makedirs(path)
-             except Exception:
-                 print("File Exists")
+         soup = BeautifulSoup(response.text, 'html.parser')

-             if "/" == src[:1]:
-                 print(src)
-                 src = website_name + src
-             img_data = requests.get(src).content
-             with open(f"{path}/{image_name}", "wb") as file:
-                 file.write(img_data)
-         print("complete")
+         # Save the current page
+         path = urlparse(url).path
+         if not path.endswith('.html'):
+             path = os.path.join(path, 'index.html')
+         self.save_content(url, os.path.join(self.domain_name, path.lstrip('/')))

+         # Extract and save all linked resources
+         for tag, attribute in [('img', 'src'), ('script', 'src'), ('link', 'href'), ('a', 'href')]:
+             for resource in soup.find_all(tag):
+                 if attribute in resource.attrs:
+                     resource_url = self.get_full_url(resource[attribute])
+                     if self.valid_url(resource_url):
+                         file_path = os.path.join(self.domain_name, urlparse(resource_url).path.lstrip('/'))
+                         if resource_url.endswith('.html'):
+                             self.crawl_website(resource_url)
+                         else:
+                             self.save_content(resource_url, file_path)


if __name__ == "__main__":
-     website_name = sys.argv[1]
-     clone = CloneWebsite(website_name)
-     clone.save_website()
-     clone.save_image()
+     website_url = sys.argv[1]
+     clone = CloneWebsite(website_url)
+     clone.crawl_website()
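
A minimal sketch of the urllib.parse behaviour that the new get_full_url and valid_url helpers rely on (standard-library calls only; the example URLs are illustrative, not taken from this commit):

from urllib.parse import urljoin, urlparse

# urljoin resolves a relative src/href against a base URL;
# get_full_url() uses the start URL (self.website_url) as that base.
print(urljoin("https://example.com/blog/", "img/logo.png"))
# https://example.com/blog/img/logo.png

# urlparse(...).netloc extracts the host; valid_url() compares it with the
# starting domain so the crawl never leaves the original site.
print(urlparse("https://example.com/blog/post.html").netloc)
# example.com

With the rewritten crawl_website, downloads land under a directory named after that domain (self.domain_name), and pages whose URL path does not end in .html are written as index.html inside the matching folder.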