Return to Revisions

2 of 7

took @freda at his word that these aren't errors, but style warnings

edited Oct 31, 2014 at 14:46

31.1k
6
73
176

Python package for handwriting recognition

I am currently writing my bachelor's thesis about on-line handwriting recognition. This is not OCR, because I have the information how a symbol is written as a list of pen trajectory coordinates (x, y).

I call this hwrt - handwriting recognition toolkit. It has a documentation and a friend of mine got the "first steps" to work on his computer.

However, it is the first time I wrote a Python package of which I hope that others might use it. I hope to get general feedback about this project.

The project structure is hosted at GitHub and has the following structure:

.
├── bin
├── dist
├── docs
├── hwrt
│  ├── misc
│  └── templates
└── tests
 └── symbols

I has some nosetests (not enough, I am working on it).

One of the files in bin is view.py. It allows users to take a look at the data they have previously downloaded (see "first steps" in my documentation).

setup.py

try:
 from setuptools import setup
except ImportError:
 from distutils.core import setup
config = {
 'name': 'hwrt',
 'version': '0.1.125',
 'author': 'Martin Thoma',
 'author_email': '[email protected]',
 'packages': ['hwrt'],
 'scripts': ['bin/backup.py', 'bin/view.py', 'bin/download.py',
 'bin/test.py', 'bin/train.py', 'bin/analyze_data.py',
 'bin/hwrt', 'bin/record.py'],
 'package_data': {'hwrt': ['templates/*', 'misc/*']},
 'url': 'https://github.com/MartinThoma/hwrt',
 'license': 'MIT',
 'description': 'Handwriting Recognition Tools',
 'long_description': """A tookit for handwriting recognition. It was
 developed as part of the bachelors thesis of Martin Thoma.""",
 'install_requires': [
 "argparse",
 "theano",
 "nose",
 "natsort",
 "PyYAML",
 "matplotlib",
 "shapely"
 ],
 'keywords': ['HWRT', 'recognition', 'handwriting', 'on-line'],
 'download_url': 'https://github.com/MartinThoma/hwrt',
 'classifiers': ['Development Status :: 3 - Alpha',
 'Environment :: Console',
 'Intended Audience :: Developers',
 'Intended Audience :: Science/Research',
 'License :: OSI Approved :: MIT License',
 'Natural Language :: English',
 'Programming Language :: Python :: 2.7',
 'Programming Language :: Python :: 3',
 'Topic :: Scientific/Engineering :: Artificial Intelligence',
 'Topic :: Software Development',
 'Topic :: Utilities'],
 'zip_safe': False,
 'test_suite': 'nose.collector'
}
setup(**config)

view.py

#!/usr/bin/env python
"""
Display a recorded handwritten symbol as well as the preprocessing methods
and the data multiplication steps that get applied.
"""
import sys
import os
import logging
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
 level=logging.DEBUG,
 stream=sys.stdout)
import yaml
try: # Python 2
 import cPickle as pickle
except ImportError: # Python 3
 import pickle
# My modules
import hwrt
from hwrt import HandwrittenData
sys.modules['HandwrittenData'] = HandwrittenData
import hwrt.utils as utils
import hwrt.preprocessing as preprocessing
import hwrt.features as features
import hwrt.data_multiplication as data_multiplication
def _fetch_data_from_server(raw_data_id):
 """Get the data from raw_data_id from the server.
 :returns: The ``data`` if fetching worked, ``None`` if it failed."""
 import MySQLdb
 import MySQLdb.cursors
 # Import configuration file
 cfg = utils.get_database_configuration()
 if cfg is None:
 return None
 # Establish database connection
 connection = MySQLdb.connect(host=cfg[args.mysql]['host'],
 user=cfg[args.mysql]['user'],
 passwd=cfg[args.mysql]['passwd'],
 db=cfg[args.mysql]['db'],
 cursorclass=MySQLdb.cursors.DictCursor)
 cursor = connection.cursor()
 # Download dataset
 sql = ("SELECT `id`, `data` "
 "FROM `wm_raw_draw_data` WHERE `id`=%i") % raw_data_id
 cursor.execute(sql)
 return cursor.fetchone()
def _get_data_from_rawfile(path_to_data, raw_data_id):
 """Get a HandwrittenData object that has ``raw_data_id`` from a pickle file
 ``path_to_data``.
 :returns: The HandwrittenData object if ``raw_data_id`` is in
 path_to_data, otherwise ``None``."""
 loaded = pickle.load(open(path_to_data))
 raw_datasets = loaded['handwriting_datasets']
 for raw_dataset in raw_datasets:
 if raw_dataset['handwriting'].raw_data_id == raw_data_id:
 return raw_dataset['handwriting']
 return None
def _list_ids(path_to_data):
 """List raw data IDs grouped by symbol ID from a pickle file
 ``path_to_data``."""
 loaded = pickle.load(open(path_to_data))
 raw_datasets = loaded['handwriting_datasets']
 raw_ids = {}
 for raw_dataset in raw_datasets:
 raw_data_id = raw_dataset['handwriting'].raw_data_id
 if raw_dataset['formula_id'] not in raw_ids:
 raw_ids[raw_dataset['formula_id']] = [raw_data_id]
 else:
 raw_ids[raw_dataset['formula_id']].append(raw_data_id)
 for symbol_id in sorted(raw_ids):
 print("%i: %s" % (symbol_id, sorted(raw_ids[symbol_id])))
def _get_description(prev_description):
 """Get the parsed description file (a dictionary) from another
 parsed description file."""
 current_desc_file = os.path.join(utils.get_project_root(),
 prev_description['data-source'],
 "info.yml")
 if not os.path.isfile(current_desc_file):
 logging.error("You are probably not in the folder of a model, because "
 "%s is not a file.", current_desc_file)
 sys.exit(-1)
 with open(current_desc_file, 'r') as ymlfile:
 current_description = yaml.load(ymlfile)
 return current_description
def _get_system(model_folder):
 """Return the preprocessing description, the feature description and the
 model description."""
 # Get model description
 model_description_file = os.path.join(model_folder, "info.yml")
 if not os.path.isfile(model_description_file):
 logging.error("You are probably not in the folder of a model, because "
 "%s is not a file. (-m argument)",
 model_description_file)
 sys.exit(-1)
 with open(model_description_file, 'r') as ymlfile:
 model_desc = yaml.load(ymlfile)
 # Get the feature and the preprocessing description
 feature_desc = _get_description(model_desc)
 preprocessing_desc = _get_description(feature_desc)
 return (preprocessing_desc, feature_desc, model_desc)
def display_data(raw_data_string, raw_data_id, model_folder):
 """Print ``raw_data_id`` with the content ``raw_data_string`` after
 applying the preprocessing of ``model_folder`` to it."""
 print("## Raw Data (ID: %i)" % raw_data_id)
 print("```")
 print(raw_data_string)
 print("```")
 preprocessing_desc, feature_desc, _ = _get_system(model_folder)
 # Print model
 print("## Model")
 print("%s\n" % model_folder)
 # Print preprocessing queue
 print("## Preprocessing")
 print("```")
 tmp = preprocessing_desc['queue']
 preprocessing_queue = preprocessing.get_preprocessing_queue(tmp)
 for algorithm in preprocessing_queue:
 print("* " + str(algorithm))
 print("```")
 feature_list = features.get_features(feature_desc['features'])
 input_features = sum(map(lambda n: n.get_dimension(), feature_list))
 print("## Features (%i)" % input_features)
 print("```")
 for algorithm in feature_list:
 print("* %s" % str(algorithm))
 print("```")
 # Get Handwriting
 recording = HandwrittenData.HandwrittenData(raw_data_string,
 raw_data_id=raw_data_id)
 # Get the preprocessing queue
 tmp = preprocessing_desc['queue']
 preprocessing_queue = preprocessing.get_preprocessing_queue(tmp)
 recording.preprocessing(preprocessing_queue)
 # Get feature values as list of floats, rounded to 3 decimal places
 tmp = feature_desc['features']
 feature_list = features.get_features(tmp)
 feature_values = recording.feature_extraction(feature_list)
 feature_values = [round(el, 3) for el in feature_values]
 print("Features:")
 print(feature_values)
 # Get the list of data multiplication algorithms
 mult_queue = data_multiplication.get_data_multiplication_queue(
 feature_desc['data-multiplication'])
 # Multiply traing_set
 training_set = [recording]
 for algorithm in mult_queue:
 new_trning_set = []
 for recording in training_set:
 samples = algorithm(recording)
 for sample in samples:
 new_trning_set.append(sample)
 training_set = new_trning_set
 # Display it
 for recording in training_set:
 recording.show()
def get_parser():
 """Return the parser object for this script."""
 from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
 parser = ArgumentParser(description=__doc__,
 formatter_class=ArgumentDefaultsHelpFormatter)
 parser.add_argument("-i", "--id", dest="id", default=292293,
 type=int,
 help="which RAW_DATA_ID do you want?")
 parser.add_argument("--mysql", dest="mysql", default='mysql_online',
 help="which mysql configuration should be used?")
 parser.add_argument("-m", "--model",
 dest="model",
 help="where is the model folder (with a info.yml)?",
 metavar="FOLDER",
 type=lambda x: utils.is_valid_folder(parser, x),
 default=utils.default_model())
 parser.add_argument("-l", "--list",
 dest="list",
 help="list all raw data IDs / symbol IDs",
 action='store_true',
 default=False)
 parser.add_argument("-s", "--server",
 dest="server",
 help="contact the MySQL server",
 action='store_true',
 default=False)
 return parser
if __name__ == '__main__':
 args = get_parser().parse_args()
 if args.list:
 preprocessing_desc, _, _ = _get_system(args.model)
 raw_datapath = os.path.join(utils.get_project_root(),
 preprocessing_desc['data-source'])
 _list_ids(raw_datapath)
 else:
 if args.server:
 data = _fetch_data_from_server(args.id)
 print("hwrt version: %s" % hwrt.__version__)
 display_data(data['data'], data['id'], args.model)
 else:
 logging.info("RAW_DATA_ID %i does not exist or "
 "database connection did not work.", args.id)
 # The data was not on the server / the connection to the server did
 # not work. So try it again with the model data
 preprocessing_desc, _, _ = _get_system(args.model)
 raw_datapath = os.path.join(utils.get_project_root(),
 preprocessing_desc['data-source'])
 handwriting = _get_data_from_rawfile(raw_datapath, args.id)
 if handwriting is None:
 logging.info("Recording with ID %i was not found in %s",
 args.id,
 raw_datapath)
 else:
 print("hwrt version: %s" % hwrt.__version__)
 display_data(handwriting.raw_data_json,
 handwriting.formula_id,
 args.model)

As I wrote, I would like to get general Feedback about the project. However, I am not experienced with packaging, so I copied the setup.py. I am especially not sure if my choice zip_safe: False was correct.

I think I follow PEP8 everywhere and I use pylint to improve my code. However, for view.py I don't understand the following style warnings / I don't know how to fix them (in a good way):

W:115, 4: Redefining name 'preprocessing_desc' from outer scope (line 218) (redefined-outer-name)
W:128, 4: Redefining name 'preprocessing_desc' from outer scope (line 218) (redefined-outer-name)
R:120, 0: Too many local variables (19/15) (too-many-locals)
C:216, 4: Invalid constant name "args" (invalid-name)
C:218, 8: Invalid constant name "preprocessing_desc" (invalid-name)
C:219, 8: Invalid constant name "raw_datapath" (invalid-name)
C:224,12: Invalid constant name "data" (invalid-name)
C:232,12: Invalid constant name "preprocessing_desc" (invalid-name)
C:233,12: Invalid constant name "raw_datapath" (invalid-name)
C:235,12: Invalid constant name "handwriting" (invalid-name)

python

asked Oct 31, 2014 at 13:57

Martin Thoma

1.3k
11
24

lang-py