Dataset: Labelled epidemic data consisting of the number of infectious individuals per unit time.
Challenge: Use supervised classification via a recurrent neural network to classify each epidemic as belonging to one of eight classes.
My problem: I have working code, but I have a feeling it's not the best way to approach the problem. In particular, I have assumed that hyperparameters like number of units per layer, learning rate, batch size, etc. come from a discrete set, and I run a different neural network for each setting. There must be a standard (better) way of doing this?
Relevant section of working code: (Disclaimer: huge debt of gratitude to http://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/)
#!/usr/bin/env python
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN
from keras.preprocessing import sequence
from sklearn.preprocessing import MinMaxScaler
from keras.utils import np_utils
import itertools, argparse
def network_simple_rnn(data_in, out_dim, optim_type, b_size, save_file, num_classes, epochs, default_val):
        X_train = data_in[0]
        dummy_y = data_in[1]
        X_test = data_in[2]
        dummy_y_test = data_in[3]
        model = Sequential()
        model.add(SimpleRNN(out_dim, input_shape = (X_train.shape[1], X_train.shape[2]), return_sequences = False))
        model.add(Dense(num_classes, activation='sigmoid'))
        optim_type = ["rmsprop", "adam", "sgd"]
        s_in = save_file
        for optim_val in optim_type:
                if optim_val == "sgd" and default_val == False:
                        lr_ = [0.001, 0.01, 0.05]
                        momentum_in = [0., 0.8, 0.9, 0.99]
                        decay_in = [0., 0.01, 0.1, 0.5]
                        nest_in = [True, False]
                        paras_in = itertools.product(lr_, momentum_in, decay_in, nest_in)
                        for l_in, m_in, d_in, n_in in paras_in:
                                save_file = s_in
                                optim_use = keras.optimizers.sgd(lr = l_in, momentum = m_in, decay = d_in, nesterov = n_in)
                                model.compile(loss='categorical_crossentropy', optimizer = optim_use, metrics = ['accuracy'])
                                hist = model.fit(X_train, dummy_y, validation_data=(X_test, dummy_y_test), nb_epoch = epochs, batch_size = b_size)
                                scores = model.evaluate(X_train, dummy_y)
                                print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
                                h1 = hist.history
                                acc_ = np.asarray(h1['acc']) #ndarray
                                loss_ = np.asarray(h1['loss']) #ndarray
                                val_loss_ = np.asarray(h1['val_loss'])
                                val_acc_ = np.asarray(h1['val_acc'])
                                acc_and_loss = np.column_stack((acc_, loss_, val_acc_, val_loss_))
                                save_file = save_file + str(l_in) + str(m_in) + str(d_in) + str(n_in) + str(epochs) + ".txt"
                                print 'saving file'
                                #Write the scores to a file
                                with open(save_file, 'w') as f:
                                        np.savetxt(save_file, acc_and_loss, delimiter=" ")
                                print 'saved file', save_file
                else:
                        model.compile(loss='categorical_crossentropy', optimizer = optim_val, metrics = ['accuracy'])
                        hist = model.fit(X_train, dummy_y, validation_data=(X_test, dummy_y_test), nb_epoch = epochs, batch_size = b_size)
                        scores = model.evaluate(X_train, dummy_y)
                        print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
                        save_file = s_in
                        h1 = hist.history
                        acc_ = np.asarray(h1['acc']) #ndarray
                        loss_ = np.asarray(h1['loss']) #ndarray
                        val_loss_ = np.asarray(h1['val_loss'])
                        val_acc_ = np.asarray(h1['val_acc'])
                        acc_and_loss = np.column_stack((acc_, loss_, val_acc_, val_loss_))
                        save_file = save_file + str(optim_val) + str(epochs) + ".txt"
                        print 'saving file'
                        with open(save_file, 'w') as f:
                                np.savetxt(save_file, acc_and_loss, delimiter=" ")
                        print 'saved file', save_file
if __name__ == '__main__':
    #This section reads in command line arguments from a separate file
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_file')
    parser.add_argument('--test_file')
    parser.add_argument('--out_dim')
    parser.add_argument('--optim_type')
    parser.add_argument('--batch_size')
    parser.add_argument('--save_file')
    parser.add_argument('--num_classes')
    parser.add_argument('--epochs')
    parser.add_argument('--default_val')
    args = parser.parse_args()
    train_file = str(args.train_file)
    test_file = str(args.test_file)
    out_dim = int(args.out_dim)
    optim_type = str(args.optim_type)
    b_size = int(args.batch_size)
    save_file = str(args.save_file)
    num_classes = int(args.num_classes)
    epochs = int(args.epochs)
    default_val = bool(args.default_val)
    data_in = read_data(train_file, test_file)
    network_simple_rnn(data_in, out_dim, optim_type, b_size, save_file, num_classes, epochs, default_val)
1 Answer
This is going to be a style review only, because after that the actual review becomes that much easier.
- Use a consistent number of spaces. Currently you have 8 spaces per level in your function, but 4 spaces in your if __name__ == "__main__": part (which is a good thing to have). PEP 8, Python's official style guide, recommends 4 spaces per indentation level.

- Use tuple unpacking.
X_train = data_in[0]
dummy_y = data_in[1]
X_test = data_in[2]
dummy_y_test = data_in[3]
Can be more succinctly written as:
X_train, dummy_y, X_test, dummy_y_test = data_in
- Use better names. dummy_y_test, s_in and optim_use are all not very descriptive names. Try to come up with better ones.
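For instance, something like the following (these renames are only suggestions, not names from the original post):

# Hypothetical renames, for illustration only:
# dummy_y_test -> y_test_onehot    (one-hot encoded test labels)
# s_in         -> base_save_path   (the prefix result file names are built from)
# optim_use    -> optimizer        (the configured Keras optimizer)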
- Use str.format to build your save file name:
save_file = "{save_file}{l_in}{m_in}{d_in}{n_in}{epochs}.txt".format(**locals())
Or, in Python 3.6+, using f-strings:
save_file = f"{save_file}{l_in}{m_in}{d_in}{n_in}{epochs}.txt"
This way you don't need to call str on all of them, because format does that for you. Also, whenever you do "str1" + "str2", you create a new string (because strings are immutable in Python). For long chains of long strings, this becomes quite inefficient.
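As a small illustration of that last point (a generic sketch, not code from the post): repeated + copies the accumulated string over and over, while str.join builds the result in a single pass:

parts = ["x"] * 10000

# Repeated concatenation: each += may copy everything accumulated so far
result = ""
for part in parts:
    result += part

# join: computes the total length once and copies each piece exactly once
result = "".join(parts)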
- Use str.format everywhere. It is the more modern, recommended way to do string formatting.
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
becomes
print("{}: {:.2f}%".format(model.metrics_names[1], scores[1]*100))
or, again for Python 3.6+:
print(f"{model.metrics_names[1]}: {scores[1]*100:.2f}%")
- Be consistent with your print. Right now you mix print(X) and print X. To be future-proof, use only the former, or consistently use the latter (you might want to add the python-2.7 tag if you only want recommendations that take Python 2.7 into account).

- Use more tuples. There are a lot of places where you do something like optim_type = ["rmsprop", "adam", "sgd"]. Since you never add to it or modify it, and the only thing you do is iterate over it, you can save a tiny bit of space and use a tuple here: optim_type = "rmsprop", "adam", "sgd".
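A quick way to see the (small) saving, as a generic check; the exact byte counts depend on the Python version and platform:

import sys

# The tuple reports fewer bytes than the equivalent list
print(sys.getsizeof(["rmsprop", "adam", "sgd"]))
print(sys.getsizeof(("rmsprop", "adam", "sgd")))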
- Move argument parsing to its own function. Just put all your argument parsing into a parse_args function; this way it does not clutter your code. Note that argparse is quite sophisticated and can take a type argument for each argument (the default type is str). Note also that you can use action='store_true' to make an argument behave like a flag. This way ./script.py --default_val will make args.default_val == True and ./script.py will give args.default_val == False:
def parse_args():
    """Reads in command line arguments from a separate file"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_file')
    parser.add_argument('--test_file')
    parser.add_argument('--out_dim', type=int)
    parser.add_argument('--optim_type')
    parser.add_argument('--batch_size', type=int)
    parser.add_argument('--save_file')
    parser.add_argument('--num_classes', type=int)
    parser.add_argument('--epochs', type=int)
    parser.add_argument('--default_val', action='store_true')
    return parser.parse_args()
if __name__ == '__main__':
    args = parse_args()
    data_in = read_data(args.train_file, args.test_file)
    network_simple_rnn(data_in, args.out_dim, args.optim_type, args.batch_size,
                       args.save_file, args.num_classes, args.epochs,
                       args.default_val)
This still looks somewhat messy, mostly due to all the args.X in there. If you renamed the parameters of network_simple_rnn to exactly match the argument names, you could do:
def network_simple_rnn(data_in, out_dim, optim_type, batch_size, save_file,
                       num_classes, epochs, default_val, **kwargs):
    ...

if __name__ == '__main__':
    args = parse_args()
    data_in = read_data(args.train_file, args.test_file)
    network_simple_rnn(data_in, **vars(args))
The **kwargs is needed to catch all superfluous keyword arguments: vars(args) also contains train_file and test_file, which network_simple_rnn does not otherwise accept.
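A minimal sketch of that mechanism, with hypothetical names (not from the post):

def train(data_in, epochs, **kwargs):
    # Extra keyword arguments (e.g. train_file) land in kwargs
    # instead of raising a TypeError.
    print(epochs, sorted(kwargs))

params = {'data_in': [1, 2, 3], 'epochs': 5, 'train_file': 'a.txt'}
train(**params)  # prints: 5 ['train_file']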
The read_data function does not seem to be defined currently.