Flask web normalizer CSV app

Question 1

I've created a Flask application that processes Excel files uploaded by users. The application reads specific data based on client-specific keywords and generates a CSV file which users can download. I'm looking for feedback on improving the structure, efficiency, and security of this application.

The whole app idea was to normalize the XLSX/XLS files and extract the most important data. The program is not perfect, I want to make it more dynamic, when it comes to sheet-name or other mechanisms inside.

This is only backend code.

The goal was to create the standardized version of XLSX file, so the user will be able to import order into the Prodio.

The important thing that I couldn't achieve is that, when our header columns has different name, for example Reference should also be Referencja etc.... I was getting a lot of errors, but the rest isn't that bad... is it?

from flask import Flask, request, send_file, render_template, redirect, url_for, session, make_response, flash
import re
from werkzeug.utils import secure_filename
import os
import pandas as pd
from io import StringIO
import logging
from dataclasses import dataclass, field
from typing import Optional, Dict
from datetime import datetime
from gevent.pywsgi import WSGIServer
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'uploads/'
app.config['ALLOWED_EXTENSIONS'] = {'xlsx', 'xls'}
app.secret_key = 'super secret key'
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def load_product_mapping():
 product_mapping_df = pd.read_csv('products_data.csv')
 product_mapping_df['Dodatkowe oznaczenia'] = product_mapping_df['Dodatkowe oznaczenia'].fillna('').astype(str).str.strip()
 product_mapping_df['Nazwa'] = product_mapping_df['Nazwa'].str.strip()
 product_mapping = {k: v for k, v in product_mapping_df.set_index('Nazwa')['Dodatkowe oznaczenia'].items() if v}
 return product_mapping
@dataclass
class ExcelProcessorConfig:
 client_keyword: str = 'Valeo Electric'
 date_keywords: Dict[str, str] = field(default_factory=lambda: {
 'pick_up': 'Pick-up',
 'receiving': 'Receiving'
 })
 columns: list = field(default_factory=lambda: [
 'Planner', 'Reference', 'Description', 'Location', 'Unit', 'Details'
 ])
 column_mapping: Dict[str, str] = field(default_factory=lambda: {
 'Reference': ['Reference', 'REFERENCJA'],
 'Description': ['Description', 'OPIS'],
 })
 
 exclusion_keyword: str = 'Transportation mode'
 final_columns: list = field(default_factory=lambda: [
 'Klient', 'Oczekiwany termin realizacji', 'Termin potwierdzony', 'Reference', 'Produkt', 'Sztuk',
 'Uwagi dla wszystkich', 'Uwagi niewidoczne dla produkcji', 'Atrybut 1 (opcjonalnie)', 
 'Atrybut 2 (opcjonalnie)', 'Atrybut 3 (opcjonalnie)'
 ])
class ExcelProcessor:
 def __init__(self, config: ExcelProcessorConfig):
 self.config = config
 def extract_client_name(self, data: pd.DataFrame) -> Optional[str]:
 try:
 row = data[data.iloc[:, 3].str.contains(self.config.client_keyword, na=False)].iloc[0, 3]
 return row.split('\n')[0].split(',')[0]
 except IndexError as e:
 logging.error(f"Failed to extract client name: {e}")
 return None
 
 def find_relevant_sheet(self, excel_data: pd.ExcelFile) -> Optional[pd.DataFrame]:
 priority_sheets = ['Antalis', 'Pick-up order ATK', 'ATK', 'Sheet1']
 for sheet_name in priority_sheets:
 if sheet_name in excel_data.sheet_names:
 return pd.read_excel(excel_data, sheet_name=sheet_name)
 for sheet_name in excel_data.sheet_names:
 df = pd.read_excel(excel_data, sheet_name=sheet_name)
 if self.config.client_keyword in df.to_string():
 return df
 return None
 def extract_date(self, data: pd.DataFrame, keyword: str) -> str:
 try:
 logging.debug(f"Searching in DataFrame column: {data.iloc[:, 0].tolist()}")
 pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
 
 for index, row in data.iterrows():
 for col_index, cell_value in enumerate(row):
 if pattern.search(str(cell_value)):
 desired_data = None
 for col in range(col_index + 1, len(row)):
 if pd.notna(data.iloc[index, col]):
 desired_data = str(data.iloc[index, col])
 break
 if desired_data is not None:
 return desired_data
 else:
 return "Unknown date"
 
 logging.warning(f"No matches found for keyword '{keyword}'. Data might not contain the expected keywords.")
 return "Unknown date"
 
 except Exception as e:
 logging.error(f"An error occurred while extracting the date: {e}")
 return "Unknown date"
 def prepare_order_data(self, data: pd.DataFrame) -> pd.DataFrame:
 try:
 start_row = data[data.iloc[:, 0].str.contains('DB', na=False)].index[0]
 order_data = data.iloc[start_row:, :6]
 order_data.columns = self.config.columns
 return order_data.dropna(subset=['Reference']).query(f"Reference != '{self.config.exclusion_keyword}'")
 except IndexError as e:
 logging.error(f"Error processing order data: {e}")
 return pd.DataFrame()
 
 def process_file(self, file_path: str) -> pd.DataFrame:
 try:
 excel_data = pd.ExcelFile(file_path)
 antalis_data = self.find_relevant_sheet(excel_data)
 client_name = 'Valeo'
 pick_up_date = '19.08.2024'
 receiving_date = self.extract_date(antalis_data, self.config.date_keywords['receiving'])
 order_data = self.prepare_order_data(antalis_data)
 if not order_data.empty:
 order_data['Klient'] = client_name
 order_data['Oczekiwany termin realizacji'] = pick_up_date
 order_data['Termin potwierdzony'] = ""
 order_data['Produkt'] = order_data['Description'] + ' ' + order_data['Location'].fillna('')
 # product_mapping = load_product_mapping()
 # product_mapping = {k.strip(): v.strip() for k, v in product_mapping.items()}
 # order_data['Produkt'] = order_data['Produkt'].str.strip()
 # print("Data Before Mapping:", order_data['Produkt'].tolist())
 # order_data['Produkt'] = order_data['Produkt'].map(product_mapping).fillna(order_data['Produkt'])
 # print("Data After Mapping:", order_data['Produkt'].tolist())
 order_data['Sztuk'] = order_data.apply(lambda row: row['Details'] if pd.notna(row['Details']) and row['Details'] != 'NA' else '', axis=1)
 order_data = order_data[order_data['Sztuk'] != '']
 for column in self.config.final_columns:
 if column not in order_data.columns:
 order_data[column] = ''
 final_data = order_data[self.config.final_columns]
 final_data.columns = ['Klient', 'Oczekiwany termin realizacji', 'Termin potwierdzony', 'Zewn. nr zamówienia', 'Produkt', 'Sztuk', 'Uwagi dla wszystkich', 'Uwagi niewidoczne dla produkcji', 'Atrybut 1 (opcjonalnie)', 'Atrybut 2 (opcjonalnie)', 'Atrybut 3 (opcjonalnie)']
 return final_data
 else:
 return pd.DataFrame()
 except Exception as e:
 logging.error(f"An error occurred while processing the Excel file: {e}")
 return pd.DataFrame()
@app.route('/', methods=['GET', 'POST'])
def upload_file():
 if request.method == 'POST':
 if 'file' not in request.files:
 flash('Brak pliku!', 'error')
 return redirect(url_for('upload_file'))
 file = request.files['file']
 if file.filename == '':
 flash('Nie wybrałeś pliku!', 'error')
 return redirect(url_for('upload_file'))
 if file and allowed_file(file.filename):
 filename = secure_filename(file.filename)
 file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
 file.save(file_path)
 processed_data = processor.process_file(file_path)
 if not processed_data.empty:
 session['data'] = processed_data.to_csv(index=False, sep=';')
 return render_template('upload.html', data=processed_data.to_html(classes='data', index=False), has_data=True)
 else:
 flash('Brak informacji, bądź nie można ich znaleźć.', 'error')
 return redirect(url_for('upload_file'))
 else:
 flash('Nieprawidłowy typ pliku', 'error')
 return redirect(url_for('upload_file'))
 else:
 return render_template('upload.html', data=None, has_data=False)
@app.route('/normalizer', methods=['GET'])
def display_vr():
 if request.method == 'GET':
 return render_template('normalizer.html'); 
@app.route('/pobierz', methods=['GET'])
def download_file():
 if 'data' in session:
 csv_data = pd.read_csv(StringIO(session['data']), sep=';')
 client_name = csv_data['Klient'].iloc[0] if not csv_data['Klient'].empty else 'default'
 today = datetime.now().strftime('%Y-%m-%d')
 filename = f"{client_name}_{today}.csv"
 response = make_response(session['data'])
 response.headers["Content-Disposition"] = f"attachment; filename={filename}"
 response.headers["Content-Type"] = "text/csv"
 return response
 else:
 return redirect(url_for('upload_file'))
@app.route('/edycja', methods=['POST'])
def adjust_data():
 if 'data' in session:
 client_name = request.form.get('client_name', None)
 expected_date = request.form.get('expected_date', None)
 confirmed_date = request.form.get('confirmed_date', None)
 csv_data = pd.read_csv(StringIO(session['data']), sep=';')
 if client_name is not None and client_name.strip():
 csv_data['Klient'] = client_name.strip()
 if expected_date is not None and expected_date.strip():
 expected_date = datetime.strptime(expected_date.strip(), '%d.%m.%Y').strftime('%Y-%m-%d')
 csv_data['Oczekiwany termin realizacji'] = expected_date
 if confirmed_date is not None and confirmed_date.strip():
 confirmed_date = datetime.strptime(confirmed_date.strip(), '%d.%m.%Y').strftime('%Y-%m-%d')
 csv_data['Termin potwierdzony'] = confirmed_date
 updated_csv = csv_data.to_csv(index=False, sep=';', na_rep='')
 session['data'] = updated_csv
 return render_template('upload.html', data=csv_data.to_html(classes='data', index=False, na_rep=''), has_data=True)
 else:
 return redirect(url_for('upload_file'))
@app.route('/adjust_client_name', methods=['POST'])
def adjust_client_name():
 if 'data' in session:
 client_name = request.form.get('client_name', None)
 
 csv_data = pd.read_csv(StringIO(session['data']))
 if client_name is not None and client_name.strip():
 csv_data['Klient'] = client_name.strip()
 updated_csv = csv_data.to_csv(index=False, na_rep='')
 session['data'] = updated_csv
 return render_template('upload.html', data=csv_data.to_html(classes='data', index=False, na_rep=''), has_data=True)
 else:
 return redirect(url_for('upload_file'))
def allowed_file(filename):
 return '.' in filename and filename.rsplit('.', 1)[1].lower() in app.config['ALLOWED_EXTENSIONS']
if __name__ == '__main__':
 config = ExcelProcessorConfig()
 processor = ExcelProcessor(config)
 # http_server = WSGIServer(('', 3000), app)
 # http_server.serve_forever()
 app.run(host="0.0.0.0", port=3000)

Example data:

https://docs.google.com/spreadsheets/d/1tJY9LebEV8q6Yn_sbm0_Ocl6xNTxJ2EPCLx-aP73zqM/edit?usp=sharing

How can I improve the separation of concerns, especially separating the web handling logic from the data processing?

Are there any obvious security flaws in the way I handle file uploads and session management?

How can I make the error handling more robust and less invasive?

What improvements can be made to the configuration management to make the app more maintainable?

Question 2

Disclaimer: I haven't run your code, so this is a static analysis exercise based on what I have noticed while perusing it.

Routes

This method only accepts GET requests:

@app.route('/normalizer', methods=['GET'])
def display_vr():
 if request.method == 'GET':
 return render_template('normalizer.html');

Therefore, the if request.method == 'GET': statement is redundant.

Likewise, you could streamline your routes a bit. Instead of having a route that accepts both GET and POST:

@app.route('/', methods=['GET', 'POST'])
def upload_file():

Just split the code in two routes:

@app.route('/', methods=['POST'])
def upload_file():
@app.route('/', methods=['GET'])
def show_upload_file():

Then you can remove a few conditional statements, like if request.method == 'POST':. It is perfectly fine to handle the requested route on a per-method basis, but the Python function names have to be unique of course. Not many tutorials show that pattern, so many Flask developers are not aware probably.

File validation

Function allowed_file could be replaced with pathlib: PurePath(filename).suffix. Pathlib can also replace os.path.join, save etc

It is good that you are using secure_filename to sanitize the file name. But you could still overwrite an existing file. Maybe this is not a concern to you.

Logging

Good points: usage of logging, including for debugging.

Comments and naming

A few docstrings/comments would be welcome. For instance, looking at functions like prepare_order_data, the function name is so broad that it does not tell us what it does exactly. The only way is to parse the code to figure it out. find_relevant_sheet is mysterious as well, because the definition of "relevant" in this context is not stated explicitly.

On the other hand, extract_date is quite telling, and we even have the type hint which is nice. Ditto for extract_client_name.

Looping

Some loops could be made more succinct, consider this:

 if pattern.search(str(cell_value)):
 desired_data = None
 for col in range(col_index + 1, len(row)):
 if pd.notna(data.iloc[index, col]):
 desired_data = str(data.iloc[index, col])
 break
 if desired_data is not None:
 return desired_data
 else:
 return "Unknown date"

You can return early instead of breaking out of the loop. break is used when you intend to continue running some code past the loop. But there is no more code in this function. The additional if statements are unnecessary and just bring bloat and distraction. Just take care of indentation:

 if pattern.search(str(cell_value)):
 for col in range(col_index + 1, len(row)):
 if pd.notna(data.iloc[index, col]):
 return str(data.iloc[index, col])
 return "Unknown date"

In fact, you are already using that pattern in find_relevant_sheet. You know this already.

This is still not optimal. Since you are looking up for a specific value/pattern in a dataframe, there are filtering expressions you can use in Pandas. You normally wouldn't do a loop for this. See for example: 8 Ways to Filter Pandas DataFrames

More Pandas stuff

I am not a Pandas expert, but I can see a few areas where it is underutilized. For example, in load_product_mapping you do some trimming of whitespace, which should be fast enough in its own right because Pandas is fast and you don't seem to be handling large files.

def load_product_mapping():
 product_mapping_df = pd.read_csv('products_data.csv')
 product_mapping_df['Dodatkowe oznaczenia'] = product_mapping_df['Dodatkowe oznaczenia'].fillna('').astype(str).str.strip()
 product_mapping_df['Nazwa'] = product_mapping_df['Nazwa'].str.strip()
 product_mapping = {k: v for k, v in product_mapping_df.set_index('Nazwa')['Dodatkowe oznaczenia'].items() if v}
 return product_mapping

pandas.read_csv has many arguments, but you are not using any (beyond the separator), yet I am sure that some would be useful, like skipinitialspace. I would try this rather than apply strip() etc. The idea is to acquire the file at the source in the most optimal way possible, rather than do cleanup afterwards.

Class

+1 for using @dataclass. Since you are already familiar with, other classes like ExcelProcessor could be upgraded as well.

Structure

Here it is normal to post MRE (Minimal reproducible example) code. In production, you would split the code in a few files, that you import as required. For example, the ExcelProcessor class can and should go to a separate file. It's easier to maintain code that is short and well delimited.

In fact, the routes can have their own file too, especially when you use Flask blueprints. When the app is small like in this case, you don't bother. If you have a lot of routes, then it can make sense to externalize that part.

On the other hand, one thing that could be detached from the main code is the configuration options like secret key, file paths etc. The idea is to be able to make changes easily, or have different settings for dev and prod, dev machine and server, without touching the codebase. Also, if you use versioning software like git, you don't want to commit secrets, especially when the code is pushed to a public repo like Github. Environment variables (using .env files) can be used for storing sensitive values.

Recommended reading: Configuring Your Flask App

Session

Instead of using session to pass data like session['data'], you could just pass a reference to a file name. The uploaded file should be stored in the temp directory.

Misc

You have multiple bits of code like this:

if client_name is not None and client_name.strip():
 csv_data['Klient'] = client_name.strip()

client_name.strip() need not be in the if statement. Just strip it.

has_data is probably redundant:

return render_template('upload.html', data=None, has_data=False)
return render_template('upload.html', data=processed_data.to_html(classes='data', index=False), has_data=True)

Your Jinja template can evaluate the data being passed is None or zero length.

Validation

In adjust_data, you are processing form data:

expected_date = request.form.get('expected_date', None)
confirmed_date = request.form.get('confirmed_date', None)

But further down, you apply strptime on the dates. This code could crash if the input format is not correct. Your Flask page should make it clear what format is expected, unless you use a datepicker in your UI. But you still have to check that incoming data. A small dedicated function could be written to validates dates, not just the format but the date value. For example, 30 February should be rejected.

Question 3

Thank you @Kate, good review! "I haven't run your code" That's normal. The responsibility of Author is to convince Reviewer not merely that the code is Correct, but that it is Obviously Correct. Feel free to do more with the code beyond reading it, but for many reviews just that much should suffice. // Suppose we're interested in e.g. performance. Then a ReadMe or similar document should summarize the Author's measurements and observations, and be part of the review submission. Why? So there's a record in git a year from now when some maintainer revisits "why is this slow?".

Kate Kate 8,1089 silver badges21 bronze badges · Answer 1 · 2024-04-24 21:10:16Z

Disclaimer: I haven't run your code, so this is a static analysis exercise based on what I have noticed while perusing it.

Routes

This method only accepts GET requests:

@app.route('/normalizer', methods=['GET'])
def display_vr():
 if request.method == 'GET':
 return render_template('normalizer.html');

Therefore, the if request.method == 'GET': statement is redundant.

Likewise, you could streamline your routes a bit. Instead of having a route that accepts both GET and POST:

@app.route('/', methods=['GET', 'POST'])
def upload_file():

Just split the code in two routes:

@app.route('/', methods=['POST'])
def upload_file():
@app.route('/', methods=['GET'])
def show_upload_file():

Then you can remove a few conditional statements, like if request.method == 'POST':. It is perfectly fine to handle the requested route on a per-method basis, but the Python function names have to be unique of course. Not many tutorials show that pattern, so many Flask developers are not aware probably.

File validation

Function allowed_file could be replaced with pathlib: PurePath(filename).suffix. Pathlib can also replace os.path.join, save etc

It is good that you are using secure_filename to sanitize the file name. But you could still overwrite an existing file. Maybe this is not a concern to you.

Logging

Good points: usage of logging, including for debugging.

Comments and naming

A few docstrings/comments would be welcome. For instance, looking at functions like prepare_order_data, the function name is so broad that it does not tell us what it does exactly. The only way is to parse the code to figure it out. find_relevant_sheet is mysterious as well, because the definition of "relevant" in this context is not stated explicitly.

On the other hand, extract_date is quite telling, and we even have the type hint which is nice. Ditto for extract_client_name.

Looping

Some loops could be made more succinct, consider this:

 if pattern.search(str(cell_value)):
 desired_data = None
 for col in range(col_index + 1, len(row)):
 if pd.notna(data.iloc[index, col]):
 desired_data = str(data.iloc[index, col])
 break
 if desired_data is not None:
 return desired_data
 else:
 return "Unknown date"

You can return early instead of breaking out of the loop. break is used when you intend to continue running some code past the loop. But there is no more code in this function. The additional if statements are unnecessary and just bring bloat and distraction. Just take care of indentation:

 if pattern.search(str(cell_value)):
 for col in range(col_index + 1, len(row)):
 if pd.notna(data.iloc[index, col]):
 return str(data.iloc[index, col])
 return "Unknown date"

In fact, you are already using that pattern in find_relevant_sheet. You know this already.

This is still not optimal. Since you are looking up for a specific value/pattern in a dataframe, there are filtering expressions you can use in Pandas. You normally wouldn't do a loop for this. See for example: 8 Ways to Filter Pandas DataFrames

More Pandas stuff

I am not a Pandas expert, but I can see a few areas where it is underutilized. For example, in load_product_mapping you do some trimming of whitespace, which should be fast enough in its own right because Pandas is fast and you don't seem to be handling large files.

def load_product_mapping():
 product_mapping_df = pd.read_csv('products_data.csv')
 product_mapping_df['Dodatkowe oznaczenia'] = product_mapping_df['Dodatkowe oznaczenia'].fillna('').astype(str).str.strip()
 product_mapping_df['Nazwa'] = product_mapping_df['Nazwa'].str.strip()
 product_mapping = {k: v for k, v in product_mapping_df.set_index('Nazwa')['Dodatkowe oznaczenia'].items() if v}
 return product_mapping

pandas.read_csv has many arguments, but you are not using any (beyond the separator), yet I am sure that some would be useful, like skipinitialspace. I would try this rather than apply strip() etc. The idea is to acquire the file at the source in the most optimal way possible, rather than do cleanup afterwards.

Class

+1 for using @dataclass. Since you are already familiar with, other classes like ExcelProcessor could be upgraded as well.

Structure

Here it is normal to post MRE (Minimal reproducible example) code. In production, you would split the code in a few files, that you import as required. For example, the ExcelProcessor class can and should go to a separate file. It's easier to maintain code that is short and well delimited.

In fact, the routes can have their own file too, especially when you use Flask blueprints. When the app is small like in this case, you don't bother. If you have a lot of routes, then it can make sense to externalize that part.

On the other hand, one thing that could be detached from the main code is the configuration options like secret key, file paths etc. The idea is to be able to make changes easily, or have different settings for dev and prod, dev machine and server, without touching the codebase. Also, if you use versioning software like git, you don't want to commit secrets, especially when the code is pushed to a public repo like Github. Environment variables (using .env files) can be used for storing sensitive values.

Recommended reading: Configuring Your Flask App

Session

Instead of using session to pass data like session['data'], you could just pass a reference to a file name. The uploaded file should be stored in the temp directory.

Misc

You have multiple bits of code like this:

if client_name is not None and client_name.strip():
 csv_data['Klient'] = client_name.strip()

client_name.strip() need not be in the if statement. Just strip it.

has_data is probably redundant:

return render_template('upload.html', data=None, has_data=False)
return render_template('upload.html', data=processed_data.to_html(classes='data', index=False), has_data=True)

Your Jinja template can evaluate the data being passed is None or zero length.

Validation

In adjust_data, you are processing form data:

expected_date = request.form.get('expected_date', None)
confirmed_date = request.form.get('confirmed_date', None)

But further down, you apply strptime on the dates. This code could crash if the input format is not correct. Your Flask page should make it clear what format is expected, unless you use a datepicker in your UI. But you still have to check that incoming data. A small dedicated function could be written to validates dates, not just the format but the date value. For example, 30 February should be rejected.

Thank you @Kate, good review! "I haven't run your code" That's normal. The responsibility of Author is to convince Reviewer not merely that the code is Correct, but that it is Obviously Correct. Feel free to do more with the code beyond reading it, but for many reviews just that much should suffice. // Suppose we're interested in e.g. performance. Then a ReadMe or similar document should summarize the Author's measurements and observations, and be part of the review submission. Why? So there's a record in git a year from now when some maintainer revisits "why is this slow?".

Stack Exchange Network

Flask web normalizer CSV app

1 Answer 1

Routes

File validation

Logging

Comments and naming

Looping

More Pandas stuff

Class

Structure

Session

Misc

Validation

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Flask web normalizer CSV app

1 Answer 1

Routes

File validation

Logging

Comments and naming

Looping

More Pandas stuff

Class

Structure

Session

Misc

Validation

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions