\$\begingroup\$
\$\endgroup\$
I recently posted this script:
VIM colors downloader in Python
But since I'm not allowed to update the code there, I wanted to get an idea on this version, that uses multiprocessing:
import os
import json
import requests
from bs4 import BeautifulSoup
from multiprocessing import Process
class Spider:
def __init__(self, total_pages=40):
self.base_url = "http://vimcolors.com"
self.total_pages = total_pages
self.download_dir = 'colors'
# If we don't have the download directory
if not os.path.isdir(self.download_dir):
print(self.download_dir, 'does not exist, trying to create it...')
# create it...
os.mkdir(self.download_dir)
def download(self, name, url):
try:
full_path = os.path.join(self.download_dir, name)
# If we have already downloaded this file, just skip
if os.path.isfile(full_path):
raise Exception('File: {} already exists; skipping.'.format(name))
# Get the response
response = requests.get(url)
if response.status_code == 404:
raise Exception('File not found: {}'.format(url))
# Try downloading the file
with open(full_path, 'wb') as file_path:
file_path.write(response.content)
except Exception as e:
print(e)
else:
print('Downloaded', name)
def crawl(self):
def repo_formatter(scheme):
base_url = scheme['github_repo']['address'].replace('github.com', 'raw.githubusercontent.com')
return '{}/master/colors/'.format(base_url)
# Loop over all the pages
for page in range(self.total_pages):
page_source = requests.get(self.base_url, params={'page': page + 1})
plain_text = page_source.text
soup = BeautifulSoup(plain_text, 'lxml')
# Get the data
json_data = json.loads(soup.find('div', {'id': 'data'}).attrs['data-colorschemes'])
# Download the files
for data in json_data['colorschemes']:
file_name = data['name'] + '.vim'
process = Process(target=self.download, args=(file_name, repo_formatter(data),))
process.daemon = True
process.start()
print('\n\nAll done.')
colors_spider = Spider()
colors_spider.crawl()
Everything is working. I just wanted to know, is multiprocessing a good choice in this case?
lang-py