My download habit is notorious, and my Downloads folder ends up piled with random files. I created a Python script to maintain my Downloads folder on Windows: when run, it moves each file with a recognized extension into its designated folder.
.mp4 files will be moved to video folder
more extension and folder can be added by editing json file
import json
import os
import random
import shutil
# Root folder to organize; replace <USER> with your Windows user name.
download_folder_path = "C:\\Users\\<USER>\\Downloads"

# extensions.json holds the category -> folder-name / extension-list mapping.
# NOTE(review): assumed layout is a list whose first element is a dict of
# categories -- confirm against the actual JSON file.
with open("extensions.json",'r') as file:
    data = json.load(file)

# Absolute destination folder for each extension category.
program_folder_name = f"{download_folder_path}\\{data[0]['windows_program_extensions']['folder_name']}"
compressed_folder_name = f"{download_folder_path}\\{data[0]['compressed_extensions']['folder_name']}"
image_folder_name = f"{download_folder_path}\\{data[0]['image_extensions']['folder_name']}"
video_folder_name = f"{download_folder_path}\\{data[0]['video_extensions']['folder_name']}"
documents_folder_name = f"{download_folder_path}\\{data[0]['document_extensions']['folder_name']}"
code_folder_name = f"{download_folder_path}\\{data[0]['code_extensions']['folder_name']}"
# Fallback folder for files whose extension matches no category.
unknown_files = f"{download_folder_path}\\Unknown_Files"

folder_names = [program_folder_name,compressed_folder_name,image_folder_name,video_folder_name,documents_folder_name,code_folder_name,unknown_files]

# Collect only regular files (sub-folders are skipped) at import time.
file_names = []
for file in os.listdir(download_folder_path):
    if os.path.isfile(os.path.join(download_folder_path,file)):
        file_names.append(file)

# Create any destination folder that does not exist yet (import-time side
# effect; a reviewer suggests moving this into main() for import safety).
for folder in folder_names:
    if not os.path.exists(folder):
        os.mkdir(folder)
        print(f"{folder} has been created successfully")
    else:
        print("Folder is already there")
def handle_duplicate(file, destination_folder):
    """Move *file* from the download folder into *destination_folder*
    under a new name that does not already exist there.

    The original implementation picked one random single-digit suffix
    without checking whether that name was free, so it could still
    collide. This version probes suffixes 0, 1, 2, ... until an unused
    name is found, then moves in a single step (no intermediate rename
    inside the download folder, which could itself collide).
    """
    print("File already exists!!!")
    print("Renaming...")
    file_name, extension = os.path.splitext(file)
    suffix = 0
    new_name = f"{file_name}-{suffix}{extension}"
    # Keep counting until the candidate name is free in the destination.
    while os.path.exists(os.path.join(destination_folder, new_name)):
        suffix += 1
        new_name = f"{file_name}-{suffix}{extension}"
    shutil.move(os.path.join(download_folder_path, file),
                os.path.join(destination_folder, new_name))
def file_mover(file, file_path, destination_folder):
    """Move *file* (currently at *file_path*) into *destination_folder*.

    Delegates to handle_duplicate() when a file of the same name already
    exists at the destination. The two leftover debug prints (one of
    which printed the unrelated compressed_folder_name global) have been
    removed.
    """
    if not os.path.exists(os.path.join(destination_folder, file)):
        shutil.move(file_path, destination_folder)
    else:
        handle_duplicate(file, destination_folder)
def main():
    """Route every collected download into its category folder."""
    # (category key in extensions.json, destination folder), checked in
    # the same order as the original if/elif chain.
    routing = [
        ('windows_program_extensions', program_folder_name),
        ('compressed_extensions', compressed_folder_name),
        ('image_extensions', image_folder_name),
        ('video_extensions', video_folder_name),
        ('document_extensions', documents_folder_name),
        ('code_extensions', code_folder_name),
    ]
    for file in file_names:
        file_extension = os.path.splitext(file)[1]
        file_path = f"{download_folder_path}\\{file}"
        print(f"Moving... {file} in ")
        for category, folder in routing:
            if file_extension in data[0][category]['extensions']:
                file_mover(file, file_path, folder)
                break
        else:
            # No category matched this extension.
            file_mover(file, file_path, unknown_files)
Things I'm looking to do next:
- Handling big file transfer,
- Usage of Python multithreading
- Scheduling script so it runs on given timeframe
- Improving coding style usage of mapping and different data structure and reduce the use of for loops
2 Answers 2
Thank you for having sorted those imports.
It improves legibility,
and makes future git merge conflicts less likely.
Path
download_folder_path = "C:\\Users\\<USER>\\Downloads"
Clearly this works. But prefer to use a Path.
Separately, prefer a raw-string such as r"C:\Users\<USER>\Downloads"
,
if you feel the need to represent \
backwhacks in your source text.
Or prefer /
forward-slash "C:/Users/<USER>/Downloads"
if the
libraries you use, e.g. pathlib
, treat both kinds of slash the same way.
With a less redundant identifier, this gives us:
from pathlib import Path
download_folder = Path("C:/Users/<USER>/Downloads")
Now you can conveniently construct new names in this way:
unknown_files = download_folder / "Unknown_Files"
You may find that assigning Path("~/Downloads").expanduser()
suffices.
dict
There's a tedious amount of repeated FOO_folder_name
variables.
Prefer to use folder_name[foo]
dictionary mappings.
Define the following list and loop over it (or derive it from data
):
extension_categories = [
'windows_program',
'compressed',
'image',
'video',
'document',
'code_extensions',
]
Even if you adopt none of these suggestions, instead of
data = json.load(file)
prefer
data = json.load(file)[0]
so you can drop that copy-n-pasted zero de-ref from half a dozen expressions.
In general, leaning on the CMD-v Paste key is a code smell. The first four times you did it in this file, maybe it's not so bad. But by the time you notice the keycap has a barely legible "V" due to repeated use, it's time to think about factoring out some common parameter and maybe extracting a helper function.
exists OK
if not os.path.exists(folder):
os.mkdir(folder)
With both the
function
you used, and the
method
I recommend you use,
we can pass in ... , exist_ok=True)
to avoid this common existence test.
comprehension
file_names = []
for file in os.listdir(download_folder_path):
if os.path.isfile(os.path.join(download_folder_path, file)):
file_names.append(file)
This is fine as-is. We could more conveniently construct it with a glob list comprehension:
download_folder = Path( ...
file_names = [path
for path in download_folder.glob("*")
if path.is_file()]
if __name__
guard
def main():
Thank you for having packaged up most of your business logic in main
.
I confess I don't exactly see how you run it.
I think you maybe missed pasting the last couple of lines into the OP:
if __name__ == "__main__":
main()
Why do we use such a guard?
In case another module, such as your
unit tests,
wants to import
this module,
which should occur without any annoying side effects happening.
The import should just define your functions and maybe some globals.
So constructing file_names
, and doing mkdir on folder_names
,
should probably also be within main
,
or within a helper that it calls.
noun? or verb?
def file_mover(file, file_path, destination_folder):
That's a slightly odd identifier.
Prefer verbs for function names, e.g. def move_file
,
and nouns for classes, e.g. class FileMover:
generate unique name
You didn't explicitly write it down, but the responsibility of these lines is to invent a "new" name which does not appear in the FS:
def handle_duplicate(file, destination_folder):
...
suffix = int(random.random() * 10)
new_name = f"{file_name}-{suffix}{extension}"
The caller asked whether os.path.exists()
,
but this code never does, and it needs to.
You should keep inventing random suffixes until
you find one that does not appear in the filesystem.
If you exhaust all ten digits with no joy, consider using an
ISO timestamp as part of the filename.
ToDo
- Handling big file transfer,
- Usage of Python multithreading
- Scheduling script so it runs on given timeframe
- Improving coding style usage of mapping and different data structure and reduce the use of for loops
- shutil already copes nicely with big files.
- I can't imagine how threads (with the GIL!) would help, and multiprocessing seems like overkill for this.
- Use
cron
or one of its replacements. - This critique has suggested several datastructure improvements. Also, using "$ black -S *.py" wouldn't hurt.
Adding the occasional unit test for your helper functions wouldn't hurt.
The biggest refactoring opportunity is to make the code "import safe"
without side effects.
Creating a class instance object which stores common params
like self.download_folder
would also be a worthy aim.
This codebase achieves its design goals.
I would be willing to delegate or accept maintenance tasks on it.
To handle big file transfers, you can utilize the shutil
module in Python. The shutil
module provides a high-level interface for copying files and directories. You can use the shutil.copy2()
function to copy files while preserving the original metadata.
To use Python multithreading, you can use the threading
module. The threading
module allows you to create and manage multiple threads. By running certain tasks in separate threads, you can achieve parallel execution and potentially improve performance. However, keep in mind that not all tasks are suitable for parallel execution, and you should carefully consider the synchronization and potential race conditions when using multiple threads.
To schedule the script to run on a given timeframe, you can use the schedule
library. The schedule
library provides a simple and intuitive way to schedule tasks in Python. You can define functions for specific tasks and schedule them to run at specific times or intervals using the provided decorators.
To improve the coding style and reduce the use of for loops, you can consider using mapping functions like map()
, list comprehensions, and generator expressions. These constructs can help make your code more concise and expressive. Additionally, consider using appropriate data structures like dictionaries, sets, and namedtuples to organize and manipulate your data more efficiently.
Here's an updated version of your script incorporating these changes:
import json
import os
import shutil
import threading
import schedule
import time
# Root folder to organize; replace <USER> with your Windows user name.
download_folder_path = "C:\\Users\\<USER>\\Downloads"

# Load the category -> folder-name / extension-list configuration.
with open("extensions.json", 'r') as file:
    data = json.load(file)

# Category key in extensions.json -> bare folder name for that category.
folder_mapping = {
    'windows_program_extensions': data[0]['windows_program_extensions']['folder_name'],
    'compressed_extensions': data[0]['compressed_extensions']['folder_name'],
    'image_extensions': data[0]['image_extensions']['folder_name'],
    'video_extensions': data[0]['video_extensions']['folder_name'],
    'document_extensions': data[0]['document_extensions']['folder_name'],
    'code_extensions': data[0]['code_extensions']['folder_name']
}

# Fallback destination for files matching no category.
unknown_files_folder = f"{download_folder_path}\\Unknown_Files"
def handle_big_file_transfer(file):
    """Copy *file* from the download folder to its category folder."""
    destination_folder = determine_destination_folder(file)
    # NOTE(review): shutil.copy2 COPIES the file (preserving metadata) --
    # the original script moved files, so the source copy is left behind
    # in the download folder. Confirm copy-not-move is intended.
    # NOTE(review): destination_folder is already an absolute path, so on
    # Windows the outer os.path.join discards download_folder_path; the
    # second join is redundant.
    shutil.copy2(os.path.join(download_folder_path, file), os.path.join(download_folder_path, destination_folder))
def determine_destination_folder(file):
    """Return the absolute destination folder for *file* based on its
    extension, or the Unknown_Files folder when no category matches.

    Bug fix: the previous version stripped the leading dot
    (``splitext(file)[1][1:]``), but the extension lists in
    extensions.json store dotted extensions (the original script
    compares ``os.path.splitext(file)[1]`` directly), so no extension
    could ever match and everything landed in Unknown_Files.
    """
    # Keep the leading dot; lowercase for case-insensitive matching of
    # Windows filenames like "MOVIE.MP4".
    file_extension = os.path.splitext(file)[1].lower()
    for category, folder in folder_mapping.items():
        if file_extension in data[0][category]['extensions']:
            return f"{download_folder_path}\\{folder}"
    return unknown_files_folder
def create_folders():
    """Create every category folder plus Unknown_Files, skipping any
    that already exist.

    The previous version duplicated the mkdir/print logic for the
    Unknown_Files folder; it is now handled by the same loop, in the
    same order and with the same output.
    """
    targets = [f"{download_folder_path}\\{folder}"
               for folder in folder_mapping.values()]
    targets.append(unknown_files_folder)
    for folder_path in targets:
        if not os.path.exists(folder_path):
            os.mkdir(folder_path)
            print(f"{folder_path} has been created successfully")
        else:
            print("Folder is already there")
def organize_downloads():
    """One organizing pass: ensure folders exist, then dispatch each
    regular file in the download folder to a worker thread."""
    create_folders()
    file_names = [file for file in os.listdir(download_folder_path)
                  if os.path.isfile(os.path.join(download_folder_path, file))]
    # One worker thread per file; the copies are I/O-bound so threads
    # can overlap despite the GIL.
    for file in file_names:
        threading.Thread(target=handle_big_file_transfer, args=(file,)).start()


def main():
    """Run one pass immediately, then repeat daily at 12:00.

    Bug fix: the previous version scheduled ``main`` itself, so every
    12:00 trigger registered yet another daily job and entered a nested
    ``while True`` loop. Scheduling the job function instead keeps
    exactly one recurring job and one run loop.
    """
    organize_downloads()
    schedule.every().day.at("12:00").do(organize_downloads)
    while True:
        schedule.run_pending()
        time.sleep(1)


if __name__ == "__main__":
    main()
That recurring-run logic is exactly what schedulers like cron exist for, so you don't have to reimplement it yourself.