Python command-line program to convert genomic data file

Question 1

Background:

I have written this code to transforme a .csv file exported from a software called Geneious containing SNPs and concatenate them into a DNA sequence.

So basically take fields from .csv file to make strings.

The code itself is just a bunch of functions that perform small tasks, some functions call others and in the end the result is printed to a file. I used argparse because this is going to be a command line tool, and is useful to have obligatory arguments and default values for the others.

I am inexperienced in coding and have noone to review my code. I feel that needing to call each argument for each function is really awkward.

My questions:

Is this the best structure? Is creating a "chain" of functions like this the Best practice?

Code

import argparse
import collections
import csv
def cleaning(file_as_list, snp, names):
 """From input file get the SNPS."""
 with open(file_as_list, 'r') as input_file:
 reader = csv.reader(input_file)
 file = list(reader)
 have_SNP = [x for x in file if x[snp] == '1']
 for i in range(len(have_SNP)):
 mult_names = have_SNP[i][names].replace(':', ',').replace(', ', ',')
 sep_names = mult_names.split(',')
 only_names = [x for x in sep_names if ' ' not in x]
 have_SNP[i][names] = only_names
 return have_SNP
def reference_dic(file_as_list, snp, names, col_ref, pos):
 """Creates the dict with all positions and reference nucleotides."""
 have_SNP = cleaning(file_as_list, snp, names)
 ref_dic = {}
 for i in have_SNP:
 ref_dic[int(i[pos].replace(',', ''))] = i[col_ref]
 return ref_dic
def pos_list(file_as_list, snp, names, col_ref, pos):
 """Creates a list with all the ehxisting positions in reference."""
 ref_dic = reference_dic(file_as_list, snp, names, col_ref, pos)
 list_pos = []
 for key in ref_dic:
 list_pos.append(key)
 sorted_pos_lis = sorted(list_pos)
 return sorted_pos_lis
def genomes_list(file_as_list, snp, names, col_ref, pos):
 """Identifies the genomes present in the input file."""
 have_SNP = cleaning(file_as_list, snp, names)
 genomes_dic = {}
 for i in have_SNP:
 for j in i[names]:
 genomes_dic[j] = ""
 genomes_list = []
 for key in genomes_dic:
 genomes_list.append(key)
 return genomes_list
def identify_genomes(file_as_list, snp, names, col_ref, pos, col_genome):
 """Creates a list of tuples with genome name and respesctive SNPs."""
 have_SNP = cleaning(file_as_list, snp, names)
 genomes = genomes_list(file_as_list, snp, names, col_ref, pos)
 entrys_per_genome = []
 pos_genomes_in_dict = []
 for i in genomes:
 sub_tup = ()
 sub_list = []
 sub_dict = {}
 for j in have_SNP:
 if i in j[names]:
 sub_sub_list = [int(j[pos].replace(',', '')), j[col_genome]]
 sub_list.append(sub_sub_list)
 sub_dict[int(j[pos].replace(',', ''))] = j[col_genome]
 sub_tup = (i, sub_list)
 sub_dic_tup = (i, sub_dict)
 entrys_per_genome.append(sub_tup)
 pos_genomes_in_dict.append(sub_dic_tup)
 return entrys_per_genome, pos_genomes_in_dict
def remove_dupli_pos(file_as_list, snp, names, col_ref, pos, col_genome):
 """Creates a list without SNPs that appear 2 times for one genome."""
 entrys_per_genome = identify_genomes(file_as_list, snp, names, col_ref,
 pos, col_genome)[0]
 all_genomes_pos = []
 for i in entrys_per_genome:
 genome_pos = []
 for j in i[1]:
 genome_pos.append(j[0])
 all_genomes_pos.append(genome_pos)
 list_dup_pos = []
 for i in all_genomes_pos:
 duplicated = [k for k, v in collections.Counter(i).items() if v > 1]
 list_dup_pos.extend(duplicated)
 no_dup_list_dup_pos = set(list_dup_pos)
 all_positions = pos_list(file_as_list, snp, names, col_ref, pos)
 pos_no_dup = [x for x in all_positions if x not in no_dup_list_dup_pos]
 return pos_no_dup
def get_ref(file_as_list, snp, names, col_ref, pos, col_genome):
 """Creates the reference sequence based on all SNPs."""
 ref_dic = reference_dic(file_as_list, snp, names, col_ref, pos)
 pos_no_dup = remove_dupli_pos(file_as_list, snp, names, col_ref,
 pos, col_genome)
 reference_snps_list = ""
 for i in pos_no_dup:
 reference_snps_list += str(ref_dic[i])
 return reference_snps_list
def get_genomes(file_as_list, snp, names, col_ref, pos, col_genome):
 """Uses the SNPs for each genome and 'N's to build each genome sequence."""
 ref_dic = reference_dic(file_as_list, snp, names, col_ref, pos)
 pos_no_dup = remove_dupli_pos(file_as_list, snp, names, col_ref, pos,
 col_genome)
 genomes_pos = identify_genomes(file_as_list, snp, names, col_ref, pos,
 col_genome)[1]
 genomes = []
 for i in genomes_pos:
 dic_of_genome = i[1]
 this_genome = ""
 for j in pos_no_dup:
 if j in dic_of_genome.keys():
 this_genome += str(dic_of_genome[j])
 elif j in ref_dic:
 this_genome += 'N'
 else:
 print("ERROR!!!!")
 break
 genomes.append(">{0}".format(i[0]))
 genomes.append(this_genome)
 return genomes
def main(file_as_list, snp, names, col_ref, pos, col_genome):
 """Creates 'files.fasta' with the ref and genomes in fasta format."""
 ref_genome = get_ref(file_as_list, snp, names, col_ref, pos, col_genome)
 genomes = get_genomes(file_as_list, snp, names, col_ref, pos, col_genome)
 with open("files.fasta", "w") as out_file:
 out_file.write(">reference_sequence\n")
 out_file.write("{0}\n".format(ref_genome))
 for i in genomes:
 out_file.write("{0}\n".format(i))
if __name__ == '__main__':
 parser = argparse.ArgumentParser()
 parser.add_argument("input",
 help="name of the input file")
 parser.add_argument("-r", "--col_ref_genome_nuc", default=2,
 help="""number of the column with the reference genome
 nucleotides""")
 parser.add_argument("-g", "--col_genomes_nuc", default=8,
 help="""number of the column with the genomes
 nucleotides""")
 parser.add_argument("-p", "--position", default=3,
 help="""number of the column with the position in the
 genome""")
 parser.add_argument("-n", "--genome_names", default=10,
 help="number of the column with the genomes names")
 parser.add_argument("-s", "--is_snp", default=7,
 help="number of the column with lenght")
 args = parser.parse_args()
 print("""Columns:\n[Reference genome:{0}]\n[Genomes:{1}]
[Position of the SNP:{2}]\n[Genomes name:{3}]
[Is SNP:{4}]""" .format(args.col_ref_genome_nuc, args.col_genomes_nuc,
 args.position, args.genome_names, args.is_snp))
 col_ref = int(args.col_ref_genome_nuc) - 1
 col_genome = int(args.col_genomes_nuc) - 1
 pos = int(args.position) - 1
 names = int(args.genome_names) - 1
 snp = int(args.is_snp) - 1
 file_as_list = str(args.input)
 print("\nProcessing...")
 main(file_as_list, snp, names, col_ref, pos, col_genome)
 print("\nJob Done. Output written as <files.fasta>")

Question 2

This sounds like a good use case to use a class where the arguments you currently pass through the chain of functions would be class attributes, for instance, if we would have a single Converter class, we may have it initialized this way:

class Converter:
 def __init__(self, filename, snp, names, col_ref, pos, col_genome):
 self.filename = filename
 self.snp = snp
 self.names = names
 self.col_ref = col_ref
 self.pos = pos
 self.col_genome = col_genome

Then, your functions will become instance methods where you'll access the instance attributes via self.<attribute> instead of taking an argument.

Think of a class as a way to group related things logically, providing a shared access to the common variables and methods.

There are also some other things to improve:

instead of converting your arguments to int, you can define them with type=int
you can make use of dictionary and list comprehensions in multiple places
you can make use of str.join() - for example, when defining reference_snps_list:
```
reference_snps_list = "".join(str(ref_dic[i]) for i in pos_no_dup)
```
you can use the special argparse.FileType for the input file argument

FYI, since this is a controversial topic in general:

Question 3

Thanks for the answer. Appreciated that you gave links to the two sides of the story about classes. Being each function a instance method would it be require to call each subfunction (other functions used by this function) as self.<attribute> ? Feels strange to have that code in a function... may be just my inexperience talking.

Question 4

@PmmA right, since your functions swill now be instance methods, you would need to call them as self.function_name() from inside the class and instance.function_name() if called outside the class where instance = Converter(). Hope that helps.

alecxe alecxe 17.5k8 gold badges52 silver badges93 bronze badges · Accepted Answer · 2017-04-05 13:54:53Z

This sounds like a good use case to use a class where the arguments you currently pass through the chain of functions would be class attributes, for instance, if we would have a single Converter class, we may have it initialized this way:

class Converter:
 def __init__(self, filename, snp, names, col_ref, pos, col_genome):
 self.filename = filename
 self.snp = snp
 self.names = names
 self.col_ref = col_ref
 self.pos = pos
 self.col_genome = col_genome

Then, your functions will become instance methods where you'll access the instance attributes via self.<attribute> instead of taking an argument.

Think of a class as a way to group related things logically, providing a shared access to the common variables and methods.

There are also some other things to improve:

instead of converting your arguments to int, you can define them with type=int
you can make use of dictionary and list comprehensions in multiple places
you can make use of str.join() - for example, when defining reference_snps_list:
```
reference_snps_list = "".join(str(ref_dic[i]) for i in pos_no_dup)
```
you can use the special argparse.FileType for the input file argument

FYI, since this is a controversial topic in general:

Thanks for the answer. Appreciated that you gave links to the two sides of the story about classes. Being each function a instance method would it be require to call each subfunction (other functions used by this function) as self.<attribute> ? Feels strange to have that code in a function... may be just my inexperience talking.
@PmmA right, since your functions swill now be instance methods, you would need to call them as self.function_name() from inside the class and instance.function_name() if called outside the class where instance = Converter(). Hope that helps.

Stack Exchange Network

Python command-line program to convert genomic data file

Background:

My questions:

Code

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Python command-line program to convert genomic data file

Background:

My questions:

Code

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions