I have written a fully functional, simple command-line multi-connection downloader in Python 3, using just `threading`, `requests` and `pathlib` for its core functionality, plus 19 other libraries for some extra features.
I have added resume support: it automatically resumes an interrupted download when downloading the same URL to the same file path where a partially downloaded file is, and I have fixed all the bugs I found in the code.
The core code is extremely simple: it uses `requests.get` with `stream` enabled to download the file, and it uses the `Range` HTTP header to download the file in 32 segments, using one `Thread` to download each.
However, the script is extremely complex, with many additional features and poorly formatted code, making it a headache to extend.
I have added many features to the code: for example, it validates the Windows file path, the URL format, the ping latency of the URL, and the accessibility of the target resource, and checks whether or not the server supports the `Range` header; it also shows download speed, a progress bar, time elapsed and an ETA.
All the features are necessary, but they make the code convoluted and hard to extend.
I plan to add GUI functionality to the script, and I mainly intend to use it as a library inside other scripts; I also want to add `argparse` to it, and to make it behave differently depending on whether it is launched from a command shell or not (it should have a GUI if it is launched via `explorer.exe`, but not if it is launched via `pwsh.exe`). I am able to do these things, but I don't know exactly where to start...
Here is the code:
import json
import keyboard
import os
import psutil
import random
import re
import requests
import sys
import time
import validators
from collections import deque
from datetime import datetime
from math import inf
from pathlib import Path
from ping3 import ping
from reprint import output
from requests.sessions import Session
from requests.adapters import HTTPAdapter
from threading import Thread
from urllib3.poolmanager import PoolManager
from win32gui import GetForegroundWindow
from win32process import GetWindowThreadProcessId


def is_active():
    active = GetWindowThreadProcessId(GetForegroundWindow())[1]
    parents = psutil.Process().parents()
    for p in parents:
        if p.pid == active:
            return True
    return False


def timestring(sec):
    sec = int(sec)
    m, s = divmod(sec, 60)
    h, m = divmod(m, 60)
    return f'{h:02d}:{m:02d}:{s:02d}'


class Port_Getter:
    @staticmethod
    def busyports():
        return set(i.laddr.port for i in psutil.net_connections())

    def __init__(self):
        self.assigned = set()

    def randomport(self):
        port = random.randint(1, 65535)
        while port in Port_Getter.busyports() or port in self.assigned:
            port = random.randint(1, 65535)
        self.assigned.add(port)
        return port


class Adapter(HTTPAdapter):
    def __init__(self, port, *args, **kwargs):
        self._source_port = port
        super(Adapter, self).__init__(*args, **kwargs)

    def init_poolmanager(self, connections, maxsize, block=False):
        self.poolmanager = PoolManager(
            num_pools=connections, maxsize=maxsize,
            block=block, source_address=('', self._source_port))


class USession(Session):
    portassigner = Port_Getter()

    def __init__(self, *args, **kwargs):
        super(USession, self).__init__(*args, **kwargs)
        self.headers.update(
            {'connection': 'close', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0'})
        self.setport()

    def setport(self):
        port = USession.portassigner.randomport()
        self.mount('http://', Adapter(port))
        self.mount('https://', Adapter(port))


class Multidown:
    def __init__(self, dic, id):
        self.count = 0
        self.completed = False
        self.id = id
        self.dic = dic
        self.position = self.getval('position')

    def getval(self, key):
        return self.dic[self.id][key]

    def setval(self, key, val):
        self.dic[self.id][key] = val

    def worker(self):
        interrupted = True
        filepath = self.getval('filepath')
        path = Path(filepath)
        end = self.getval('end')
        if not path.exists():
            start = self.getval('start')
        else:
            self.count = path.stat().st_size
            start = self.getval('start') + self.count
        url = self.getval('url')
        self.position = start
        f = path.open(mode='ab+')
        if self.count != self.getval('length'):
            interrupted = False
            s = USession()
            r = s.get(
                url, headers={'range': 'bytes={0}-{1}'.format(start, end)}, stream=True)
            while True:
                if self.dic['paused']:
                    r.connection.close()
                    r.close()
                    s.close()
                    interrupted = True
                    break
                if (chunk := next(r.iter_content(131072), None)):
                    f.write(chunk)
                    self.count += len(chunk)
                    self.position += len(chunk)
                    self.setval('count', self.count)
                    self.setval('position', self.position)
                else:
                    break
        f.close()
        if not interrupted:
            r.close()
            s.close()
        if self.count == self.getval('length'):
            self.completed = 1
            self.setval('completed', 1)


class Singledown:
    def __init__(self):
        self.count = 0

    def worker(self, url, path):
        with requests.get(url, stream=True) as r:
            with path.open('wb') as file:
                for chunk in r.iter_content(1048576):
                    if chunk:
                        self.count += len(chunk)
                        file.write(chunk)


class Verifier:
    @staticmethod
    def validate_filepath(path):
        path = path.replace('\\', '/')
        if (not re.match('^[a-zA-Z]:/(((?![<>:"/|?*]).)+((?<![ .])/)?)*$', path) or
                not Path(path[:3]).exists()):
            print('Invalid windows file path has been inputted, process will now stop.')
            return False
        return True

    @staticmethod
    def validate_url(url):
        if not validators.url(url):
            print('Invalid url been inputted, process will now stop.')
            return False
        if url.lower().startswith('ftp://'):
            print(
                "`requests` module doesn't suport File Transfer Protocol, process will now stop")
            return False
        return True

    @staticmethod
    def confirm_overwrite(path, overwrite):
        filepath = str(path)
        if not path.exists():
            return True
        if path.is_file():
            if overwrite:
                return True
            while True:
                answer = input(
                    f'`{filepath}` already exists, do you want to overwrite it? \n(Yes, No):').lower()
                if answer in ['y', 'yes', 'n', 'no']:
                    if answer.startswith('y'):
                        os.remove(filepath)
                        return True
                    break
                print('Invalid input detected, retaking input.')
        print(f'Overwritting {filepath} has been aborted, process will now stop.')
        return False

    @staticmethod
    def test_connection(url):
        server = url.split('/')[2]
        ok = ping(server, timeout=2)
        if ok == False:
            print(
                'The server of the inputted url is non-existent, process will now stop.')
            return False
        if ok:
            return True
        if not ok:
            print('Connection has timed out, will reattempt to ping server 5 times.')
            for i in range(5):
                print(
                    f'Reattempting to ping server, retrying {i + 1} out of 5')
                ok = ping(server, timeout=2)
                if ok:
                    print(
                        f'Connection successful on retry {i + 1}, process will now continue.')
                    return True
                print(f'Retry {i + 1} out of 5 timed out' + (i != 4)
                      * ', reattempting in 1 second.' + (i == 4) * '.')
                time.sleep(1)
            print('Failed to connect server, connection timed out, process will now stop')
            return False

    @staticmethod
    def validate_accessible(url):
        head = requests.head(url)
        if head.status_code == 200:
            return True
        for i in range(5):
            print(f'Server responce is invalid, retrying {i + 1} out of 5')
            head = requests.head(url)
            if head.status_code == 200:
                print(
                    f'Connection successful on retry {i + 1}, process will now continue.')
                return True
            print(f'Retry {i + 1} out of 5 failed to access data' +
                  (i != 4) * ', reattempting in 1 second.' + (i == 4) * '.')
            time.sleep(1)
        print("Can't establish a connection with access to data, can't download target file, process will now stop.")
        return False


class Downloader:
    def __init__(self):
        self.recent = deque([0] * 12, maxlen=12)
        self.recentspeeds = deque([0] * 200, maxlen=200)
        self.dic = dict()
        self.workers = []

    def download(self, url, filepath, num_connections=32, overwrite=False):
        bcontinue = Path(filepath + '.progress.json').exists()
        singlethread = False
        threads = []
        path = Path(filepath)
        if not Verifier.validate_filepath(filepath):
            raise ValueError()
        if not Verifier.validate_url(url):
            raise ValueError()
        if not bcontinue:
            if not Verifier.confirm_overwrite(path, overwrite):
                raise InterruptedError()
        if not Verifier.test_connection(url):
            raise TimeoutError()
        if not Verifier.validate_accessible(url):
            raise PermissionError()
        head = requests.head(url)
        folder = '/'.join(filepath.split('/')[:-1])
        Path(folder).mkdir(parents=True, exist_ok=True)
        headers = head.headers
        total = headers.get('content-length')
        if not total:
            print(
                f'Cannot find the total length of the content of {url}, the file will be downloaded using a single thread.')
            started = datetime.now()
            print('Task started on %s.' %
                  started.strftime('%Y-%m-%d %H:%M:%S'))
            sd = Singledown()
            th = Thread(target=sd.worker, args=(url, path))
            self.workers.append(sd)
            th.start()
            total = inf
            singlethread = True
        else:
            total = int(total)
            if not headers.get('accept-ranges'):
                print(
                    'Server does not support the `range` parameter, the file will be downloaded using a single thread.')
                started = datetime.now()
                print('Task started on %s.' %
                      started.strftime('%Y-%m-%d %H:%M:%S'))
                sd = self.Singledown()
                th = Thread(target=sd.singledown, args=(url, path))
                self.workers.append(sd)
                th.start()
                singlethread = True
            else:
                if bcontinue:
                    progress = json.loads(Path(filepath + '.progress.json').read_text(),
                                          object_hook=lambda d: {int(k) if k.isdigit() else k: v for k, v in d.items()})
                segment = total / num_connections
                started = datetime.now()
                lastpressed = started
                print('Task started on %s.' %
                      started.strftime('%Y-%m-%d %H:%M:%S'))
                self.dic['total'] = total
                self.dic['connections'] = num_connections
                self.dic['paused'] = False
                for i in range(num_connections):
                    if not bcontinue:
                        start = int(segment * i)
                        end = int(segment * (i + 1)) - (i != num_connections - 1)
                        position = start
                        length = end - start + (i != num_connections - 1)
                    else:
                        start = progress[i]['start']
                        end = progress[i]['end']
                        position = progress[i]['position']
                        length = progress[i]['length']
                    self.dic[i] = {
                        'start': start,
                        'position': position,
                        'end': end,
                        'filepath': filepath + '.' + str(i).zfill(2) + '.part',
                        'count': 0,
                        'length': length,
                        'url': url,
                        'completed': False
                    }
                for i in range(num_connections):
                    md = Multidown(self.dic, i)
                    th = Thread(target=md.worker)
                    threads.append(th)
                    th.start()
                    self.workers.append(md)
                Path(filepath + '.progress.json').write_text(json.dumps(self.dic, indent=4))
        downloaded = 0
        totalMiB = total / 1048576
        speeds = []
        interval = 0.04
        with output(initial_len=5, interval=0) as dynamic_print:
            while True:
                Path(filepath + '.progress.json').write_text(json.dumps(self.dic, indent=4))
                status = sum([i.completed for i in self.workers])
                downloaded = sum(i.count for i in self.workers)
                self.recent.append(downloaded)
                done = int(100 * downloaded / total)
                doneMiB = downloaded / 1048576
                gt0 = len([i for i in self.recent if i])
                if not gt0:
                    speed = 0
                else:
                    recent = list(self.recent)[12 - gt0:]
                    if len(recent) == 1:
                        speed = recent[0] / 1048576 / interval
                    else:
                        diff = [b - a for a, b in zip(recent, recent[1:])]
                        speed = sum(diff) / len(diff) / 1048576 / interval
                speeds.append(speed)
                self.recentspeeds.append(speed)
                nzspeeds = [i for i in speeds if i]
                if nzspeeds:
                    minspeed = min(nzspeeds)
                else:
                    minspeed = 0
                maxspeed = max(speeds)
                now = datetime.now()
                elapsed = (now - started).total_seconds()
                meanspeed = downloaded / elapsed / 1048576
                remaining = totalMiB - doneMiB
                dynamic_print[0] = '[{0}{1}] {2}'.format(
                    '\u2588' * done, '\u00b7' * (100-done), str(done)) + '% completed' + (not singlethread) * ', paused: {0}'.format(self.dic['paused'])
                dynamic_print[1] = 'Download mode: ' + singlethread * \
                    'Single-thread' + (not singlethread) * 'Multi-thread (press Space to pause/resume, press Escape to stop)'
                dynamic_print[2] = '{0:.2f} MiB downloaded, {1:.2f} MiB total, {2:.2f} MiB remaining, download speed: {3:.2f} MiB/s'.format(
                    doneMiB, totalMiB, remaining, speed)
                if speed and total != inf:
                    eta = timestring(remaining / speed)
                else:
                    eta = '99:59:59'
                dynamic_print[3] = 'Minimum speed: {0:.2f} MiB/s, average speed: {1:.2f} MiB/s, maximum speed: {2:.2f} MiB/s'.format(
                    minspeed, meanspeed, maxspeed)
                dynamic_print[4] = 'Task started on {0}, {1} elapsed, ETA: {2}'.format(
                    started.strftime('%Y-%m-%d %H:%M:%S'), timestring(elapsed), eta)
                if keyboard.is_pressed('space') and is_active():
                    if not singlethread:
                        pressed = datetime.now()
                        if (pressed - lastpressed).total_seconds() > 0.5:
                            lastpressed = pressed
                            if self.dic['paused']:
                                for md in self.workers:
                                    if not md.completed:
                                        th = Thread(target=md.worker)
                                        th.start()
                                        threads.append(th)
                            self.dic['paused'] = not self.dic['paused']
                            if self.dic['paused']:
                                time.sleep(0.1)
                                while threads:
                                    th = threads.pop(0)
                                    th.join()
                if keyboard.is_pressed('esc'):
                    if not singlethread:
                        ended = datetime.now()
                        self.dic['paused'] = True
                        break
                if status == len(self.workers):
                    if not singlethread:
                        BLOCKSIZE = 4096
                        BLOCKS = 1024
                        CHUNKSIZE = BLOCKSIZE * BLOCKS
                        with path.open('wb') as dest:
                            for i in range(32):
                                file = filepath + '.' + str(i).zfill(2) + '.part'
                                with Path(file).open('rb') as f:
                                    while (chunk := f.read(CHUNKSIZE)):
                                        dest.write(chunk)
                                Path(file).unlink()
                    ended = datetime.now()
                    break
                time.sleep(interval)
        time_spent = (ended - started).total_seconds()
        meanspeed = total / time_spent / 1048576
        status = sum([i.completed for i in self.workers])
        if status == len(self.workers):
            print('Task completed on {0}, total time elapsed: {1}, average speed: {2:.2f} MiB/s'.format(
                ended.strftime('%Y-%m-%d %H:%M:%S'), timestring(time_spent), meanspeed))
            Path(filepath + '.progress.json').unlink()
        else:
            print('Task interrupted on {0}, total time elapsed: {1}, average speed: {2:.2f} MiB/s'.format(
                ended.strftime('%Y-%m-%d %H:%M:%S'), timestring(time_spent), meanspeed))


if __name__ == '__main__':
    d = Downloader()
    d.download(*sys.argv[1:])
Example:
downloader.py 'http://ipv4.download.thinkbroadband.com/1GB.zip' 'D:/Downloads/1GB.zip'
The downloaded file's SHA-256 hash should be: 5674e59283d95efe8c88770515a9bbc80cbb77cb67602389fd91def26d26aed2.
How should I format my code?
In particular, what to do with the text prompts? What to do with the if conditions? And how should the "main loop" be correctly realized? Should I split my code into multiple files? If I should, then what should they be like?
Comment: can you share the full formatted code? I wanted a downloader like this for a hobby project; if you can share the GitHub repo link I would appreciate it! - user260264, Jun 29, 2022 at 17:26
3 Answers
Too bad I can't run it because of the Windows parts, but otherwise from the first look of it, it's pretty nice.
How should I format my code?
I'd consider using `black` for auto-formatting, and running a PEP 8 linter every so often.
In particular, what to do with the text prompts?
You mean with a GUI? You'd have to do (modal) GUI prompts. The whole architecture will be pretty different and it's generally not going to be the case that each method can drive user input like you're doing here. Or at least it would be detrimental to the overall design since you'd always be locked into this mode of operation (prompting the user).
What to do with the if conditions?
I don't know exactly what you're referring to here.
And how should the "main loop" be correctly realized?
The main loop is, depending on which framework you'll use (not raw win32 windows, right? I'd really suggest avoiding going that low-level), simply going to be a framework method that you'll call and which only returns when the application shuts down. Everything else will be events and callbacks.
For anything that needs to be processed concurrently you'll have to add a thread - that's probably what you'll have to do for the actual downloads. That, or go fully event-driven, which would mean that the event loop has to listen for data being available on the sockets and react appropriately.
That is, by the way, also how you could already restructure the main loop here: instead of busy-looping, check for input from standard input (or wherever the keyboard events come from), then use the same mechanism for events from each worker thread - something like an event queue.
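A minimal sketch of that event-queue idea, using only the standard library (the event tuples and the `worker` function here are made up for illustration - they stand in for the real download threads):

```python
import queue
import threading

events = queue.Queue()

def worker(worker_id, total_chunks):
    # each download thread posts events instead of mutating shared state
    for chunk in range(total_chunks):
        events.put(('progress', worker_id, chunk + 1))
    events.put(('done', worker_id))

threads = [threading.Thread(target=worker, args=(i, 3)) for i in range(2)]
for t in threads:
    t.start()

finished = 0
progress_seen = 0
while finished < len(threads):
    event = events.get()  # blocks until an event arrives - no busy loop
    if event[0] == 'done':
        finished += 1
    else:
        progress_seen += 1

for t in threads:
    t.join()
```

The main loop then only wakes up when something actually happened, and keyboard input could be fed into the same queue from its own thread.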
Should I split my code into multiple files? If I should, then what should they be like?
There's arguments for it, mostly just related to being able to put related things together and the ability to navigate. If it works for you, it's not a necessity to have multiple files.
That being said, the `Downloader.download` method is huge. I need roughly four screens to go through it; that's impossible to keep all in my head! So, I'd suggest splitting it up into more manageable chunks.
- `is_active`: consider `any`, like `return any(p.pid == active for p in parents)`, instead of a manual loop.
- `Port_Getter` should be `PortGetter`.
- `USession` could do with a better name, maybe `UserAgentSession` or just `UserSession`.
- `Verifier.validate_filepath`'s regular expression is interesting; I wouldn't know whether it's too strict or too loose.
- `Verifier.test_connection` has three checks for `ok`, which is at least one too many. And as you did correctly later, `ok == False` is better written as `not ok`.
- I'd let the validate methods immediately throw exceptions. That'd convey more information than a `True`/`False` can, anyway. Also consider adding some useful information for the reader into those exceptions, like what's currently in the `print` statements! This would also go a long way towards making the change to a GUI version - a `print` you can't catch easily, but an exception you can.
- `test_connection` and `validate_accessible` do ... two HTTP requests? On top of the one to actually download things? That feels excessive to me.
- `self.dic` is an anti-pattern; if you need it for dumping/reading to/from JSON, make `to_json`/`from_json` methods and implement serialisation there. That way you're not restricted to any particular shape of that data, and things like IDEs can make a more educated guess - even typing the values might be possible. If you decide to stick with it, consider implementing dictionary access on the `Multidown` class, so that `self.count` would directly access the dictionary, or so that `self["count"]` would be possible too.
- I see a lot of `close` calls - consider implementing `with` support, and generally be aware of exception safety.
- `131072` is a magic constant, so make it one; since it's `128 * 1024`, aka 128 KiB, mention that, or at least write it as `CHUNK_SIZE = 128 * 1024` (maybe even `KILO_BYTE = 1024; CHUNK_SIZE = 128 * KILO_BYTE`, but that's a bit excessive too - most people know what 1024 means). Oh, I see you did that later - then reuse that; I doubt there need to be multiple differing chunk sizes.
Just a few remarks from me. Not a comprehensive review. I appreciate your effort, I have not tested your code and I am on Linux anyway.
But the thing that surprised me the most was the lack of comments. Also the lack of line spacing in the functions. I know that the code is long already, but that cannot be an excuse.
I think you should split the project by putting all your classes in a separate file that you then import. Then you can concentrate on the core functionality of your project.
There are some minor improvements you can make, for example use context managers
with files like you did on line 142:
with path.open('wb') as file:
Do the same for line 107 and elsewhere:
f = path.open(mode='ab+')
An alternative is to enclose the snippet in a try/finally block - the catch clause is optional (but it's good to have a generic exception handler in place). Then put the cleanup code in the finally section.
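A minimal sketch of the try/finally variant (the file name `example.part` is made up for illustration):

```python
from pathlib import Path

path = Path('example.part')  # hypothetical file name
f = path.open(mode='ab+')
try:
    f.write(b'chunk')
finally:
    f.close()  # runs even if the write raises
```

The context-manager form (`with path.open(mode='ab+') as f:`) does exactly this for you, which is why it is usually preferred.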
Using pathlib for handling file paths/names was a sound decision, even though your application is not cross-platform.
As mentioned already, the naming conventions can be improved a bit. For example, at line 24 there is a function called `is_active`, but it's not immediately obvious whether we are talking about a download, a process, a requests session or something else. Function names should be more intuitive and instantly convey the intended purpose.
In class `USession` the user agent is hard-coded. Providing a default value makes sense, but it may be useful to override the value for specific needs. Also provide additional headers if needed.
I would replace the prints with the logging module and send output to both console and a log file. Then you can more easily review the execution of your script, especially if it's going to run unattended or if you miss output due to limited screen buffer size. A decent tutorial: Logging in Python
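A minimal sketch of console-plus-file logging with the standard `logging` module (the logger name and file name are made up for illustration):

```python
import logging

logger = logging.getLogger('downloader')
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())              # console output
logger.addHandler(logging.FileHandler('download.log'))  # persistent log file
logger.info('Task started')
```

Swapping handlers (or adding a `Formatter` with timestamps) later requires no changes at the call sites.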
Some of your tests may fail; for example, in function `test_connection`, a web server may be running fine but not reply to a ping because of a firewall. I would ditch that test. You could run a DNS query instead, to verify that the domain name can be resolved. That does not mean the server is reachable, but it is a rough test that doesn't cost much, and the result should be cached by your DNS resolver for the next request - it will have to do a DNS query anyway, unless your download URL contains an IP address instead of a domain name.
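A minimal sketch of that DNS pre-flight check, using only the standard library (`host_resolves` is a made-up helper name):

```python
import socket
from urllib.parse import urlparse

def host_resolves(url):
    # rough pre-flight check: can the host name be resolved at all?
    host = urlparse(url).hostname
    try:
        socket.getaddrinfo(host, None)
        return True
    except socket.gaierror:
        return False
```

Unlike a ping, this cannot be blocked by a firewall on the target server, since it only talks to your resolver.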
In that ping function you can simplify this:
if ok == False:
    print(
        'The server of the inputted url is non-existent, process will now stop.')
    return False
if ok:
    return True
If ok is a boolean value, then it can be directly returned as such regardless of whether it is true or false:
return ok
or:
if not ok:
    print('The server of the inputted url is non-existent, process will now stop.')
return ok
The thing is, do you really need to do all those tests that are slowing you down? You could just cut to the chase and attempt the download right away after a minimum of validation.
And in that function, instead of doing this:
server = url.split('/')[2]
you could use the URL parser module instead to extract the URL components.
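For instance, with `urllib.parse` from the standard library (using the example URL from the question):

```python
from urllib.parse import urlparse

url = 'http://ipv4.download.thinkbroadband.com/1GB.zip'
server = urlparse(url).hostname
```

This also handles edge cases (ports, credentials in the URL) that naive string splitting does not.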
In function `validate_accessible` you first try to determine whether the resource is available by performing a HEAD request. I don't have any stats to back up this statement, but I think a good number of web applications do not implement the method in their pages. For example, I often see Django/Flask applications that only respond to GET and POST requests, because these are the only valid methods defined for their routes.
The problem with HEAD nowadays, is that a lot of content is generated dynamically, so the CPU cost of handling a HEAD request server-side may be pretty much the same as a regular GET request. HEAD is useful for static content, when the web server can determine the content length for a file right away, and also provides the information in the returned headers. What you are doing makes sense but is probably not so useful in this age. I would discard that test probably.
A web server could return a content length of zero, then you'll have a division by zero exception at line 344:
done = int(100 * downloaded / total)
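A minimal guard for that case (the values here are hypothetical; the choice to treat a zero-length file as 100% complete is one reasonable convention):

```python
downloaded = 0
total = 0  # hypothetical: server reported 'content-length: 0'

# guard the percentage computation instead of dividing blindly
done = int(100 * downloaded / total) if total else 100
```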
It would be interesting to measure the execution time of individual bits of code. Another link for you: Python Timer Functions: Three Ways to Monitor Your Code. You could use their codetiming class to measure download time too.
`Port_Getter`

Since `randomport` checks both whether a port is listed in `busyports` and whether this getter has already been assigned that port, I'm led to assume it's possible for one to be true but not the other - which would seem to imply that a port that has been assigned to one getter and is not currently busy might end up assigned to another getter. Is that intentional?
By convention, short-lived connections tend to get assigned port numbers above 49152, so as to not conflict with applications and protocols that expect to be able to operate on specific port numbers that are known in advance. It might be good to respect that.
The repeated-random-selection approach to picking a random available port works, but feels a bit ugly. Granted, the only alternative I can think of would be to maintain a set of all available ports and sample from that, which might not be ideal.
Conventionally, this'd be called `PortGetter` instead. The methods `busyports` and `randomport` would be called `busy_ports` and `random_port`.
65535 is a magic number, and should probably be given a named constant. 49152 would need one as well.
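A sketch of that set-sampling alternative, restricted to the ephemeral range (the function and constant names are made up for illustration; in practice `busy_ports` would come from `psutil.net_connections()`):

```python
import random

# IANA dynamic/private ("ephemeral") port range
EPHEMERAL_MIN = 49152
EPHEMERAL_MAX = 65535

def random_free_port(busy_ports):
    # sample directly from the non-busy ephemeral ports
    free = set(range(EPHEMERAL_MIN, EPHEMERAL_MAX + 1)) - set(busy_ports)
    if not free:
        raise RuntimeError('no free ephemeral ports')
    return random.choice(sorted(free))
```

This terminates in one pass instead of retrying randomly, at the cost of building the set.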
`Adapter`

`init_poolmanager` doesn't seem to ever get called. Does it?
I believe `super(Adapter, self)` can be simplified to just `super()`.
`USession`

The name is a bit unclear - what does the U stand for? Also, `setport` would conventionally be `set_port`.
What's with the Firefox user agent? I mean, user agents are fairly meaningless so you aren't wrong to send it, but do you need to?
`Multidown`

Using a shared dictionary for `Multidown`s to communicate back with their creator feels a bit strange to me. Having that data stored on the `Multidown` itself could offer more control, might make it clearer what data is actually available, and would prevent one thread from accidentally changing another thread's data (which would probably have weird consequences). `self.dic[self.id]['start']` does look a lot messier than `self.start`.
`worker` could maybe use some breaking up - at least a blank line or two, if not actually breaking it into multiple pieces. It's a bit on the long side.
Side note: this class exists only to be a context for the `worker` function to run in - so in terms of naming, might it be more useful to refer to this object as a `Worker` or `DownloadWorker` or something? Or maybe something like `ParallelWorker` to distinguish it from `Singledown`?
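A sketch of what per-worker state could look like with a dataclass (`DownloadWorker` and its fields mirror the keys of `self.dic[i]`, but the class itself is made up for illustration):

```python
from dataclasses import dataclass

@dataclass
class DownloadWorker:
    # hypothetical per-worker state replacing self.dic[self.id][...]
    start: int
    end: int
    position: int = 0
    count: int = 0
    completed: bool = False

w = DownloadWorker(start=0, end=1023)
w.count += 512      # reads like self.count, not self.dic[self.id]['count']
w.position = w.start + w.count
```

Serialising to JSON for the progress file then becomes a `dataclasses.asdict(w)` call rather than the shape of the runtime state.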
`Singledown`

This feels like just a special case of `Multidown` - is it even necessary to have this?
If it is, I'm not a huge fan of how this, unlike `Multidown`, does not seem to allow itself to be interrupted ahead of time. Keep in mind that this'll also probably be slower than a pile of `Multidown`s, so a user would probably be more likely to need to interrupt it.
Like `Multidown`, this is also a type of worker, and could maybe be renamed accordingly.
1048576 is a magic number.
`Verifier`

This is just a bag of static methods that don't even interact with each other. Why is this a class instead of just standalone functions? When would you want a `Verifier` object? How would two `Verifier` objects be different? If the answers are "you wouldn't" and "they wouldn't", and it looks like they are, the class isn't really doing anything by being a class, is it?
I'm not sure the validation functions should be responsible for printing the error messages - you may want to use them in other ways. It'd be more flexible to let (or, ideally, force) the caller decide what to do, ideally by raising an exception.
What's with the `Path(path[:3]).exists()` check? I assume it's to check for device files, but it seems to have both false positives (`CONNECTION.txt` starts with `CON` but is a legal file name) and false negatives (I believe `LPT1` is a device file but I don't think `LPT` is). I think `Path(path).is_file()` may be closer to what you're looking for?
`validate_url` will accept any protocol not specifically rejected. I think the opposite would make more sense, since the number of supported protocols is finite.
`confirm_overwrite` feels like it has two jobs - prompting the user for input until they provide a yes/no answer is a nicely defined task that could probably be separated out into its own function.
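That prompt loop could be extracted into something like this (the function name and the injectable `read` parameter are made up for illustration - the latter makes the loop testable without a console):

```python
def ask_yes_no(prompt, read=input):
    # loop until the user gives a recognisable yes/no answer
    while True:
        answer = read(prompt).lower()
        if answer in ('y', 'yes'):
            return True
        if answer in ('n', 'no'):
            return False
        print('Invalid input detected, retaking input.')
```

`confirm_overwrite` then shrinks to checking the path and acting on the boolean.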
`Downloader`

`download` is very long, and could almost certainly be divided into more manageable pieces - it's directly responsible for all of the following things:
- Creating the folder to download into
- Checking whether the server reports a content length and accepts ranges, and deciding which download method to use
- Creating the worker threads
- Figuring out whether to resume a previous download
- Calculating download speed
- Combining the pieces of a multi-part download into a single file
- Providing feedback to the user
- Managing keyboard input from the user

That's a lot of different responsibilities. They're pretty interwoven right now, but I think untangling and separating them (where possible) could go a long way towards making this code easier to read and modify.
I'm pretty sure `Singledown` is not part of the `Downloader` class, so I doubt creating a `self.Singledown` will work.
`"99:59:59"` is not a very good ETA for cases where you might not even know how much you're downloading. If you don't know the ETA, just say you don't know.
You only combine data from the first 32 parts, even if you have more than 32 workers.
You also assume the chunks are all in order and all of equal size, which might not be the case if you start a download, cancel it, and then later resume it with a different number of workers - your program as a whole will never do that, but `download` doesn't know that and probably shouldn't assume it.
You keep constructing `filepath + '.progress.json'` by hand in something like seven different places. If you do it once and pass it around, any future changes to how that data is stored become much easier to make.
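For example, a single helper (the name `progress_file` is made up for illustration) centralises that decision:

```python
from pathlib import Path

def progress_file(filepath):
    # the single place that decides where progress metadata lives
    return Path(filepath + '.progress.json')

marker = progress_file('D:/Downloads/1GB.zip')
```

Renaming the metadata file, or moving it into a temp directory, then touches one line instead of seven.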