I have server-side log files written to various directories throughout the course of a week. I need to optionally compress these files and archive them to AWS.
The script I have come up with takes a config file defining the include glob used to find matches, and offers optional removal of the source files once the process has completed.
python aws_bkup.py path/to/config.cfg
A sample config file can be found in the project directory: https://github.com/twanas/aws-bkup
I would be most appreciative of feedback on the code style and areas it can be improved.
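For reference, the config file looks roughly like this (the section name, bucket, and paths below are placeholders; the real sample is in the repository):

[aws]
access_id = YOUR_ACCESS_KEY_ID
secret_key = YOUR_SECRET_ACCESS_KEY
region = eu-west-1
s3root = s3://my-backup-bucket/logs

[apache]
include = /var/log/apache2/*.log
exclude = .*error.*
categorize_weekly = true
compress = true
remove_source = false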
from __future__ import print_function
import re
import errno
import os
from glob import glob
from os.path import join, basename, splitext
from os import environ, remove
from shutil import copy, rmtree
from uuid import uuid4
import gzip
from configparser import ConfigParser
from datetime import date, timedelta
import subprocess
def gz(src, dest):
    """ Compresses a file to *.gz

    Parameters
    ----------
    src: filepath of file to be compressed
    dest: destination filepath
    """
    filename = splitext(basename(src))[0]
    destpath = join(dest, '{}.gz'.format(filename))
    blocksize = 1 << 16  # 64kB

    with open(src) as f_in:
        f_out = gzip.open(destpath, 'wb')
        while True:
            block = f_in.read(blocksize)
            if block == '':
                break
            f_out.write(block)
        f_out.close()
def aws_sync(src, dest):
    """ Synchronise a local directory to aws

    Parameters
    ----------
    src: local path
    dest: aws bucket
    """
    cmd = 'aws s3 sync {} {}'.format(src, dest)
    push = subprocess.call(cmd, shell=True)
def today():
    """ Returns a string format of today's date """
    return date.today().strftime('%Y%m%d')
def fwe():
    """ Returns a string format of the next Friday's date """
    d = date.today()
    while d.weekday() != 4:
        d += timedelta(1)
    return d
def regex_match(string, pattern):
    """ Returns if there is a match between parameter and regex pattern """
    pattern = re.compile(pattern)
    return pattern.match(string)
def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
def aws_bkup(section, include, exclude, s3root, categorize_weekly=True, compress=True, remove_source=True):
    """ Transfers a backup of any local files matching the user's criteria to AWS.

    Parameters
    ----------
    include: regex pattern to use for the file inclusion(s)
    exclude: regex pattern to use for the file exclusion(s)
    s3root: AWS root in which to send the backup
    categorize_weekly: switch between daily and weekly folder groupings
    compress: switch to compress outbound files to AWS
    """
    folder = '{}'.format(fwe() if categorize_weekly else today())
    tmp_root = join('/tmp', str(uuid4()))
    tmp_dir = join(tmp_root, folder)
    mkdir_p(tmp_dir)

    for file in glob(include):
        if regex_match(file, exclude):
            continue
        print('Processing: {}'.format(file))
        if compress:
            gz(file, tmp_dir)
        else:
            copy(file, tmp_dir)
        if remove_source:
            remove(file)

    aws_dest = join(s3root, section)
    print('Synchronizing {} to s3'.format(tmp_dir))
    aws_sync(tmp_root, aws_dest)

    if os.path.exists(tmp_root):
        rmtree(tmp_root)

    print('Done')
if __name__ == "__main__":
    import sys

    args = sys.argv
    if len(args) < 2:
        print("Usage: python -m aws-bkup /path/to/config.cfg")
        sys.exit()

    config = ConfigParser()
    config.read(args[1])

    environ['AWS_ACCESS_KEY_ID'] = config.get('aws', 'access_id')
    environ['AWS_SECRET_ACCESS_KEY'] = config.get('aws', 'secret_key')
    environ['AWS_DEFAULT_REGION'] = config.get('aws', 'region')

    for section in config.sections():
        if section != 'aws':
            print('Starting {}'.format(section))
            aws_bkup(
                section,
                config.get(section, 'include'),
                config.get(section, 'exclude'),
                config.get('aws', 's3root'),
                config.getboolean(section, 'categorize_weekly'),
                config.getboolean(section, 'compress'),
                config.getboolean(section, 'remove_source')
            )
2 Answers
After a quick read-through, I've spotted a few items:
with not used for f_out
The code:
with open(src) as f_in:
    f_out = gzip.open(destpath, 'wb')
    #...
    f_out.close()
should be replaced with:
with open(src) as f_in, gzip.open(destpath, 'wb') as f_out:
    #...

so that both files are guaranteed to be closed, even if an exception is raised mid-copy.
Reg-ex pattern repeatedly compiled
The function regex_match() takes a string and compiles it to a pattern, and then matches a string to that pattern. The same pattern string is repeatedly passed to regex_match(). This string should be compiled to a pattern by the caller, and the resulting pattern reused for each match. This means the calls to regex_match() could be replaced by exclude_pattern.match(file).
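A minimal sketch of that idea, using a hypothetical filter_files helper (the name and structure are mine, not from the original script):

import re
from glob import glob

def filter_files(include, exclude):
    """ Yield files matching the include glob that do not match the exclude regex """
    exclude_pattern = re.compile(exclude)  # compiled once, up front
    for file in glob(include):
        if exclude_pattern.match(file):    # reuse the compiled pattern
            continue
        yield file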
Argument quoting
If src or dest contain spaces, this command may become confused.
cmd = 'aws s3 sync {} {}'.format(src, dest)
push = subprocess.call(cmd, shell=True)
Since you are using the shell=True argument, it may also be a vector for arbitrary command injection!
Instead of formatting the command into a string with proper quoting and requiring the .call() command to parse it, you can simply pass in a list of arguments to the call. There is no need to worry about spaces or proper escaping/quoting, and arbitrary command injection becomes much harder:

cmd = ['aws', 's3', 'sync', src, dest]
push = subprocess.call(cmd)

Note that shell=True must also be dropped here: when a list is passed with shell=True on POSIX systems, only the first element is run as the command and the remaining elements become arguments to the shell itself.
Additional notes:
push is neither returned nor used.
Also, while subprocess.call(...) is still acceptable, as of Python 3.5 subprocess.run(...) is the preferred interface.
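For example, here is a sketch of aws_sync using subprocess.run; check=True raises CalledProcessError on a non-zero exit status, which also addresses the unused return value (this rewrite is my suggestion, not code from the question):

import subprocess

def aws_sync(src, dest):
    """ Synchronise a local directory to AWS S3, raising on failure """
    # The argument list is passed directly to the aws binary, so no shell
    # is involved: paths with spaces are safe and injection is not a concern.
    subprocess.run(['aws', 's3', 'sync', src, dest], check=True)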
Thank you for the feedback @AJNeufeld. I have made the recommended adjustments. I appreciate your help. – awaugh, Aug 22, 2018 at 9:02
Convert while to for
Any time you see a function being called in a while loop with some sort of predicate for break, you can probably turn it into a for loop:
def gz(src, dest):
    """ Compresses a file to *.gz

    Parameters
    ----------
    src: filepath of file to be compressed
    dest: destination filepath
    """
    filename = splitext(basename(src))[0]
    destpath = join(dest, '{}.gz'.format(filename))
    blocksize = 1 << 16  # 64kB

    # this can be altered to a for loop
    with open(src) as f_in:
        f_out = gzip.open(destpath, 'wb')
        while True:
            block = f_in.read(blocksize)
            if block == '':
                break
            f_out.write(block)
        f_out.close()
This function can be turned into:
from functools import partial

def gz(src, dest):
    """ Compresses a file to *.gz

    Parameters
    ----------
    src: filepath of file to be compressed
    dest: destination filepath
    """
    filename = splitext(basename(src))[0]
    destpath = join(dest, '{}.gz'.format(filename))
    blocksize = 1 << 16  # 64kB

    with open(src) as f_in, gzip.open(destpath, 'wb') as f_out:
        reader = partial(f_in.read, blocksize)
        for block in iter(reader, ''):
            f_out.write(block)
iter can take two arguments: a callable that takes no arguments, and a sentinel value that signals that the loop should end. In order for f_in.read to take no args, you can bind blocksize to the size argument using functools.partial, returning a new function that takes no arguments.
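As a standalone illustration of the two-argument iter form (using an in-memory buffer purely to show the mechanics):

from functools import partial
from io import BytesIO

f = BytesIO(b'abcdefghij')
reader = partial(f.read, 4)       # a callable that takes no arguments
for block in iter(reader, b''):   # stops once read() returns b''
    print(block)                  # b'abcd', then b'efgh', then b'ij'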
Continuing on with this function, you should be opening src in bytes mode so that you are writing bytes back:
def gz(src, dest):
    """ Compresses a file to *.gz

    Parameters
    ----------
    src: filepath of file to be compressed
    dest: destination filepath
    """
    filename = splitext(basename(src))[0]
    # let's use an f-string here
    destpath = join(dest, f'{filename}.gz')
    blocksize = 1 << 16  # 64kB

    # open src in rb mode
    with open(src, 'rb') as f_in, gzip.open(destpath, 'wb') as f_out:
        reader = partial(f_in.read, blocksize)
        # change your sentinel value to an empty byte string
        for block in iter(reader, b''):
            f_out.write(block)
mkdir_p
This could be covered by using the os.makedirs function with exist_ok=True. Any other OSError would then be raised accordingly:
def aws_bkup(section, include, exclude, s3root, categorize_weekly=True, compress=True, remove_source=True):
    """ Transfers a backup of any local files matching the user's criteria to AWS.

    Parameters
    ----------
    include: regex pattern to use for the file inclusion(s)
    exclude: regex pattern to use for the file exclusion(s)
    s3root: AWS root in which to send the backup
    categorize_weekly: switch between daily and weekly folder groupings
    compress: switch to compress outbound files to AWS
    """
    folder = '{}'.format(fwe() if categorize_weekly else today())
    tmp_root = join('/tmp', str(uuid4()))
    tmp_dir = join(tmp_root, folder)
    os.makedirs(tmp_dir, exist_ok=True)
Date formatting
You use two different date formats, one for fwe and one for today:
# Returns %Y%m%d
def today():
    """ Returns a string format of today's date """
    return date.today().strftime('%Y%m%d')

# Returns %Y-%m-%d
def fwe():
    """ Returns a string format of the next Friday's date """
    d = date.today()
    while d.weekday() != 4:
        d += timedelta(1)
    return d
Keep it consistent. I would prefer the less cluttered format of date, so just collapse it into one function:
def get_date(weekly=False):
    d = date.today()
    if not weekly:
        return d
    while d.weekday() != 4:
        d += timedelta(days=1)
    return d
def aws_bkup(section, include, exclude, s3root, categorize_weekly=True, compress=True, remove_source=True):
    # pass in your categorize_weekly variable as a param on the
    # get_date function
    folder = str(get_date(weekly=categorize_weekly))
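Since get_date returns a date object, str() gives the ISO %Y-%m-%d form in both cases, for example (dates invented):

folder = str(get_date())             # e.g. '2018-08-22' (today)
folder = str(get_date(weekly=True))  # e.g. '2018-08-24' (the next Friday)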