I've not used this site before, I would like to get opinions on the code I've completed for my first 'program' in Python. I would like to understand how 'good' it is and where it can be improved, anything that says "you're clearly a beginner and haven't learned xxx" would be helpful.
Background to project: My payslips are in a weird format with info dotted around all over a spreadsheet and have slight variations in positioning month on month. For example the value of a field might not be in the adjacent cell it might be below it for one field and to the right for another (although this relative positioning is consistent month on month) I combined manually all the months into one Excel and wanted to structure the data so I could eventually run calculations on it to ensure it was correct and my taxes were correct (still need to do that part).
# -*- coding: utf-8 -*-
import openpyxl
def get_data(ws):
data = []
for y in range(1,ws.max_row + 1):
rows = []
for x in range(1,ws.max_column + 1):
rows.append(ws.cell(column = x, row = y).value)
data.append(rows)
return data
def search_data(data, criteria):
location = []
for row in range(len(data)):
for column in range(len(data[row])):
if data[row][column] == criteria:
location.append([row,column])
if len(location) == 0:
print('Not present on sheet')
return None
return location
def offset(location,rows,columns):
location[0] = location[0] + rows
location[1] = location[1] + columns
return location
def get_table(data, startloc, columns=None):
count = 1
rows = 1
table = []
temprow = startloc[0]
tempcol = startloc[1]
while count != 0:
if data[temprow][tempcol] != None:
temprow = temprow + 1
if temprow == len(data):
break
else:
count = 0
rows = temprow - startloc[0]
if columns == None:
temprow = startloc[0]
tempcol = startloc[1]
count = 1
while count != 0:
if data[temprow][tempcol] != None:
tempcol = tempcol + 1
if tempcol == len(data[temprow]):
break
else:
count = 0
columns = tempcol - startloc[1]
for y in range(rows):
table_rows = []
for x in range(columns):
table_rows.append(data[startloc[0]+y][startloc[1]+x])
table.append(table_rows)
return table
def get_value(data, location):
return data[location[0]][location[1]]
def get_tax_info(ws):
data = get_data(ws)
tax_info = []
#gets individual values
tax_period = get_value(data,offset(search_data(data,'Calendar Month')[0],0,1))
tax_period_ending = get_value(data,offset(search_data(data,'Calendar Month')[0],0,2))
tax_code = get_value(data,offset(search_data(data, 'Tax Code')[0],1,0))
tax_basis = get_value(data,offset(search_data(data, 'Tax Basis')[0],1,0))
NI_category = get_value(data,offset(search_data(data,'NI Category')[0],1,0))
total_payments = get_value(data,offset(search_data(data,'Total Payments')[0],1,0))
net_pay = get_value(data,offset(search_data(data,'Net Pay')[0],1,0))
#gets tables
payments = get_table(data,offset(search_data(data,'Payments')[0],4,0))
deductions = get_table(data,offset(search_data(data,'Deductions')[0],4,0))
balances = get_table(data,offset(search_data(data,'Balances')[0],4,0))
#combine
tax_info.extend([tax_period, tax_period_ending, tax_code, tax_basis, NI_category, total_payments, net_pay])
tax_info.append(payments)
tax_info.append(deductions)
tax_info.append(balances)
return tax_info
class payslip:
def __init__(self, tax_period, tax_period_ending, tax_code, tax_basis, NI_category, total_payments, net_pay, payments, deductions, balances):
'Creates a payslip object with all the fields of the payslip'
self.tax_period_ending = tax_period_ending
self.tax_code = tax_code
self.tax_basis = tax_basis
self.NI_category = NI_category
self.total_payments = total_payments
self.net_pay = net_pay
self.payments = payments
self.deductions = deductions
self.balances = balances
wb = openpyxl.load_workbook('.../Payslips.xlsx')
sheet_names = wb.sheetnames
payslips = []
for i in range(0,len(sheet_names)):
each = sheet_names[i]
ws = wb[each]
tax_info = get_tax_info(ws)
payslips.append(payslip(tax_info[0], tax_info[1], tax_info[2], tax_info[3], tax_info[4], tax_info[5], tax_info[6], tax_info[7],tax_info[8],tax_info[9]))
wb2 = openpyxl.Workbook()
ws2 = wb2.active
ws2.title = 'Payslips Summary'
ws2['A1'],ws2['B1'],ws2['C1'],ws2['D1'],ws2['E1'], ws2['F1'] = 'tax_period_ending', 'total_payments', 'net_pay', 'tax_code', 'NI_category', 'tax_basis'
for i in range(0,len(payslips)):
ws2.cell(row=i+2, column=1).value = payslips[i].tax_period_ending
ws2.cell(row=i+2, column=2).value = payslips[i].total_payments
ws2.cell(row=i+2, column=3).value = payslips[i].net_pay
ws2.cell(row=i+2, column=4).value = payslips[i].tax_code
ws2.cell(row=i+2, column=5).value = payslips[i].NI_category
ws2.cell(row=i+2, column=6).value = payslips[i].tax_basis
wb2.save('Payslips_Summary.xlsx')
-
\$\begingroup\$ Is this Python 3? \$\endgroup\$Sunny Patel– Sunny Patel2018年07月19日 18:12:50 +00:00Commented Jul 19, 2018 at 18:12
-
\$\begingroup\$ Yes it should be \$\endgroup\$Toby Bishop– Toby Bishop2018年07月19日 19:03:05 +00:00Commented Jul 19, 2018 at 19:03
-
1\$\begingroup\$ It should be? You don't know? \$\endgroup\$t3chb0t– t3chb0t2018年07月19日 19:07:53 +00:00Commented Jul 19, 2018 at 19:07
-
1\$\begingroup\$ @t3chb0t - I've set it up as Python 3, most resources I have used to learn this far have been Python 3 but it's not impossible that I have strayed into Python 2.7 as other than a few things like the print statement I'm not sure I would recognise the differences immediately and sometimes I still find resources in Python 2.7 which I'm still reading about to understand things. \$\endgroup\$Toby Bishop– Toby Bishop2018年07月19日 19:17:11 +00:00Commented Jul 19, 2018 at 19:17
1 Answer 1
for x in range(1,ws.max_column + 1): rows.append(ws.cell(column = x, row = y).value)
One of the biggest savers in overhead is grabbing multiple cells at once, instead of making multiple calls to get adjacent cell data.
location = [] for row in range(len(data)): for column in range(len(data[row])): if data[row][column] == criteria: location.append([row,column])
Couple things here. enumerate()
function is super helpful in getting both the index and the item itself.
location = []
for r, row in enumerate(data):
for c, cell in enumerate(row):
if cell == criteria:
location.append([r,c])
Taking this a step further; since you are just creating a list, you can use a list comprehentsion to accomplish this in one go (with a nested for
):
location = [[r,c] for r, row in enumerate(data) for c, cell in enumerate(row) if cell == criteria]
Almost looks like a mouthful, but it's easy to follow with the basic building blocks.
if len(location) == 0:
Python understands that lists can be empty, so this can be simplified to just:
if not location:
def offset(location,rows,columns): location[0] = location[0] + rows location[1] = location[1] + columns return location
Makes sense to turn these into incrementors:
def offset(location,rows,columns):
location[0] += rows
location[1] += columns
return location
count = 1 # Flag can be utilized differently, not needed rows = 1 # Gets reassigned later anyway, not needed table = [] # Can be used in a list comprehension assignment temprow = startloc[0] tempcol = startloc[1]
You can get away with some tuple assignment here:
temprow, tempcol = startloc
while count != 0: if data[temprow][tempcol] != None: temprow = temprow + 1 if temprow == len(data): break else: count = 0
Seems like your actual loop condition is based on the first if
:
while data[temprow][tempcol] != None:
temprow += 1
if temprow == len(data):
break
if columns == None: temprow = startloc[0] tempcol = startloc[1] count = 1 while count != 0: if data[temprow][tempcol] != None: tempcol = tempcol + 1 if tempcol == len(data[temprow]): break else: count = 0
Combining some of the previous items:
if columns == None:
temprow, tempcol = startloc
while data[temprow][tempcol] != None:
tempcol += 1
if tempcol == len(data[temprow]):
break
for y in range(rows): table_rows = [] for x in range(columns): table_rows.append(data[startloc[0]+y][startloc[1]+x]) table.append(table_rows)
Another way to setup list comprehension by combining your offset
and get_value
functions:
table = [[get_value(data, offset(startloc, y, x)) for x in range(columns)] for y in range(rows)]
tax_period = get_value(data,offset(search_data(data,'Calendar Month')[0],0,1)) tax_period_ending = get_value(data,offset(search_data(data,'Calendar Month')[0],0,2)) tax_code = get_value(data,offset(search_data(data, 'Tax Code')[0],1,0)) tax_basis = get_value(data,offset(search_data(data, 'Tax Basis')[0],1,0)) NI_category = get_value(data,offset(search_data(data,'NI Category')[0],1,0)) total_payments = get_value(data,offset(search_data(data,'Total Payments')[0],1,0)) net_pay = get_value(data,offset(search_data(data,'Net Pay')[0],1,0)) #gets tables payments = get_table(data,offset(search_data(data,'Payments')[0],4,0)) deductions = get_table(data,offset(search_data(data,'Deductions')[0],4,0)) balances = get_table(data,offset(search_data(data,'Balances')[0],4,0))
These repeated calls to get_table
, get_value
, and search_data
, perhaps you can refactor by adding a function that helps get the values you want with only the unique line differences.
tax_info.extend([tax_period, tax_period_ending, tax_code, tax_basis, NI_category, total_payments, net_pay]) tax_info.append(payments) tax_info.append(deductions) tax_info.append(balances) return tax_info
You may as well just return it all at once:
return [tax_period, tax_period_ending, tax_code, tax_basis, NI_category, total_payments, net_pay, payments, deductions, balances]
class payslip: def __init__(self, tax_period, tax_period_ending, tax_code, tax_basis, NI_category, total_payments, net_pay, payments, deductions, balances): 'Creates a payslip object with all the fields of the payslip' self.tax_period_ending = tax_period_ending self.tax_code = tax_code self.tax_basis = tax_basis self.NI_category = NI_category self.total_payments = total_payments self.net_pay = net_pay self.payments = payments self.deductions = deductions self.balances = balances
Ideally, with all this data you're retreiving, seems like you'd be better suited to extend this class, that ingests data
, and sets the fields into Class properties like you're already doing. Also, class names should be capitalized.
class Payslip:
def __init__(self, worksheet):
'Creates a payslip object with all the fields of the payslip'
self.data = self.get_data(worksheet)
self.get_tax_info()
def get_data(ws):
# code for get_data here.
def search(self, criteria):
# search_data code here, substituting data with self.data
# Should throw exception if not found.
# Also good idea to incorporate offset function by adding params to this method
def value(self, location):
return self.data[location[0]][location[1]]
def table(self, startloc, columns=None):
# Get_table code here, substituting data with self.data
@staticmethod
def offset(location,rows,columns):
# offset code here. This static method doesn't need data, but nice to be encapsulated
def get_tax_info(self)
# Add logic to figure out tax infos
self.tax_period, self.tax_period_ending = tax_period, tax_period_ending
self.tax_code, self.tax_basis, self.NI_category = tax_code, tax_basis, NI_category
self.total_payments, self.net_pay = total_payments, net_pay
self.payments, self.deductions, self.balances = payments, deductions, balances
for i in range(0,len(sheet_names)): each = sheet_names[i] ws = wb[each]
Can just loop over sheetnames here:
for sheet in wb.sheetnames:
ws = wb[sheet]
payslips.append(payslip(tax_info[0], tax_info[1], tax_info[2], tax_info[3], tax_info[4], tax_info[5], tax_info[6], tax_info[7],tax_info[8],tax_info[9]))
If you didn't expand your class definitions:
payslips.append(payslip(*tax_info))
But if you expanded your class definitions to do the processing, you can make this look so much cleaner by:
payslips.append(Payslip(ws))
for i in range(0,len(payslips)): ws2.cell(row=i+2, column=1).value = payslips[i].tax_period_ending ws2.cell(row=i+2, column=2).value = payslips[i].total_payments ws2.cell(row=i+2, column=3).value = payslips[i].net_pay ws2.cell(row=i+2, column=4).value = payslips[i].tax_code ws2.cell(row=i+2, column=5).value = payslips[i].NI_category ws2.cell(row=i+2, column=6).value = payslips[i].tax_basis
As said earlier with accessing many cells, you should be able to grab the cells and assign them all in one swoop.
for rows in ws2['A2':'F'+len(payslips)+2]:
This will iterate through each row, setting you up for something nice. Next, you'll want to extend your Payslips
class again with another method to return each of the field values:
def summary_data(self):
yield self.tax_period_ending
yield self.total_payments
yield self.net_pay
yield self.tax_code
yield self.NI_category
yield self.tax_basis
Using the yield
keyword to create a generator that will return each of these items one at a time when you iterate through it.
Since we'll want to step through each payslip
at the same time as each row, we can combine both by using the zip
function and accessing both together.
for rows, ps in zip(ws2['A2':'F'+len(payslips)+2], payslips):
for cell, value in zip(rows, payslips.summary_data()):
cell.value = value
Using the slice notation to grab the cells, we transformed your repeated calls, into a single call to the sheet to get A2:F#
where # is the number of payslips + 2 (offset). We iterate over the rows to get cell data as well as the summary data values at the same time so that we can save it to the sheet.
Then finally you can wrap up your summary as you have with:
wb2.save('Payslips_Summary.xlsx')
No issues there. I hope this helps, as this is one of my first reviews on here.