Return to Question

Commonmark migration

edited Jun 10, 2020 at 13:24

Background:

#Background: I have written this code to transform a .csv file exported from a software called Geneious containing SNPs and concatenate them into a DNA sequence.

#My questions: Is this the best structure? Is creating a "chain" of functions like this the Best practice?

My questions:

#Code import argparse import collections import csv

Is this the best structure? Is creating a "chain" of functions like this the best practice?

Code

import argparse
import collections
import csv
def cleaning(file_as_list, snp, names):
    """Read the CSV export and return only the rows flagged as SNPs.

    Args:
        file_as_list: Path of the CSV file to read (despite the name, this
            value is handed straight to ``open``).
        snp: Zero-based index of the column that must equal ``'1'`` for a
            row to be kept.
        names: Zero-based index of the column holding the genome names.

    Returns:
        The kept rows (lists of strings) in file order.  In every kept row
        the ``names`` cell is replaced by a list of names: the cell is split
        on ``':'`` and ``','`` and every piece that still contains a space
        is discarded.

    Raises:
        IndexError: If a kept row has no ``names`` column.
    """
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are interpreted by the reader itself.
    with open(file_as_list, 'r', newline='') as input_file:
        rows = list(csv.reader(input_file))

    # csv.reader returns [] for blank lines; a row too short to contain the
    # flag column cannot be a SNP row, so skip it rather than raising
    # IndexError on row[snp].
    have_SNP = [row for row in rows if len(row) > snp and row[snp] == '1']
    for row in have_SNP:
        normalised = row[names].replace(':', ',').replace(', ', ',')
        row[names] = [name for name in normalised.split(',')
                      if ' ' not in name]
    return have_SNP
def reference_dic(file_as_list, snp, names, col_ref, pos):
    """Map every SNP position to its reference nucleotide.

    Args:
        file_as_list: Path of the CSV file, forwarded to ``cleaning``.
        snp: Zero-based index of the SNP flag column.
        names: Zero-based index of the genome-names column.
        col_ref: Zero-based index of the reference-nucleotide column.
        pos: Zero-based index of the position column.

    Returns:
        A dict keyed by integer position.  When a position occurs in more
        than one row, the value from the last such row wins.
    """
    snp_rows = cleaning(file_as_list, snp, names)
    # Commas are stripped before int(); presumably they are thousands
    # separators in the export -- confirm against a real file.
    return {int(row[pos].replace(',', '')): row[col_ref] for row in snp_rows}
def pos_list(file_as_list, snp, names, col_ref, pos):
    """Return every SNP position found in the reference, in ascending order.

    The positions are the keys of the dict built by ``reference_dic`` for
    the same arguments, so each position appears exactly once.
    """
    positions = reference_dic(file_as_list, snp, names, col_ref, pos)
    # Sorting a dict sorts its keys and returns a new list.
    return sorted(positions)
def genomes_list(file_as_list, snp, names, col_ref, pos):
    """Return the distinct genome names that occur in the SNP rows.

    Names are listed once each, in order of first appearance.  ``col_ref``
    and ``pos`` are accepted but not used by this function.
    """
    snp_rows = cleaning(file_as_list, snp, names)
    # A dict is used as an insertion-ordered set of names.
    unique_names = dict.fromkeys(
        name for row in snp_rows for name in row[names])
    return list(unique_names)
def identify_genomes(file_as_list, snp, names, col_ref, pos, col_genome):
    """Collect, for every genome, the SNPs reported for it.

    Returns:
        A pair ``(entrys_per_genome, pos_genomes_in_dict)``; both lists
        follow the genome order given by ``genomes_list``.

        * ``entrys_per_genome`` holds ``(genome, [[position, nucleotide],
          ...])`` tuples and keeps every matching row, repeats included.
        * ``pos_genomes_in_dict`` holds ``(genome, {position: nucleotide})``
          tuples; for a repeated position the last row wins.
    """
    snp_rows = cleaning(file_as_list, snp, names)
    entrys_per_genome = []
    pos_genomes_in_dict = []
    for genome in genomes_list(file_as_list, snp, names, col_ref, pos):
        # Parse each matching row once and derive both views from it.
        hits = [(int(row[pos].replace(',', '')), row[col_genome])
                for row in snp_rows if genome in row[names]]
        entrys_per_genome.append((genome, [list(hit) for hit in hits]))
        pos_genomes_in_dict.append((genome, dict(hits)))
    return entrys_per_genome, pos_genomes_in_dict
def remove_dupli_pos(file_as_list, snp, names, col_ref, pos, col_genome):
    """Return the sorted SNP positions that no genome reports more than once.

    A position is dropped for all genomes as soon as a single genome has
    two or more entries at that position.
    """
    per_genome = identify_genomes(file_as_list, snp, names, col_ref,
                                  pos, col_genome)[0]
    repeated = set()
    for _genome, entries in per_genome:
        tally = collections.Counter(position for position, _nuc in entries)
        repeated.update(position for position, count in tally.items()
                        if count > 1)
    return [position
            for position in pos_list(file_as_list, snp, names, col_ref, pos)
            if position not in repeated]
def get_ref(file_as_list, snp, names, col_ref, pos, col_genome):
    """Build the reference sequence from the retained SNP positions.

    Returns:
        One string made of the reference nucleotides at the positions
        returned by ``remove_dupli_pos``, concatenated in that order.
    """
    reference = reference_dic(file_as_list, snp, names, col_ref, pos)
    kept_positions = remove_dupli_pos(file_as_list, snp, names, col_ref,
                                      pos, col_genome)
    return "".join(str(reference[position]) for position in kept_positions)
def get_genomes(file_as_list, snp, names, col_ref, pos, col_genome):
    """Build the FASTA lines (header, sequence) for every genome.

    For each retained position the genome's own nucleotide is used when the
    genome has a SNP there, and ``'N'`` otherwise.

    Returns:
        A flat list alternating ``'>name'`` header lines and sequence
        strings, in the genome order produced by ``identify_genomes``.
    """
    pos_no_dup = remove_dupli_pos(file_as_list, snp, names, col_ref, pos,
                                  col_genome)
    genomes_pos = identify_genomes(file_as_list, snp, names, col_ref, pos,
                                   col_genome)[1]
    genomes = []
    for name, calls in genomes_pos:
        genomes.append(">{0}".format(name))
        # Every retained position comes from the reference position list,
        # so a position missing from this genome always means "no SNP
        # here".  No separate reference lookup (and the extra file parse it
        # required) is needed to decide on 'N'.
        genomes.append(
            "".join(str(calls.get(position, 'N')) for position in pos_no_dup))
    return genomes
def main(file_as_list, snp, names, col_ref, pos, col_genome,
         out_path="files.fasta"):
    """Write the reference and all genome sequences in FASTA format.

    Args:
        file_as_list: Path of the CSV file to convert.
        snp: Zero-based index of the SNP flag column.
        names: Zero-based index of the genome-names column.
        col_ref: Zero-based index of the reference-nucleotide column.
        pos: Zero-based index of the position column.
        col_genome: Zero-based index of the genome-nucleotide column.
        out_path: File to create or overwrite; defaults to ``files.fasta``,
            the name that was previously hard-coded.
    """
    ref_genome = get_ref(file_as_list, snp, names, col_ref, pos, col_genome)
    genomes = get_genomes(file_as_list, snp, names, col_ref, pos, col_genome)
    with open(out_path, "w") as out_file:
        out_file.write(">reference_sequence\n")
        out_file.write("{0}\n".format(ref_genome))
        out_file.writelines("{0}\n".format(line) for line in genomes)
if __name__ == '__main__':
    # Command-line entry point.  Column options are 1-based on the command
    # line and converted to 0-based indices before calling main().
    parser = argparse.ArgumentParser()
    parser.add_argument("input",
                        help="name of the input file")
    # type=int lets argparse reject non-numeric values with a usage message
    # instead of a ValueError traceback later on.
    parser.add_argument("-r", "--col_ref_genome_nuc", type=int, default=2,
                        help="""number of the column with the reference genome
                        nucleotides""")
    parser.add_argument("-g", "--col_genomes_nuc", type=int, default=8,
                        help="""number of the column with the genomes
                        nucleotides""")
    parser.add_argument("-p", "--position", type=int, default=3,
                        help="""number of the column with the position in the
                        genome""")
    parser.add_argument("-n", "--genome_names", type=int, default=10,
                        help="number of the column with the genomes names")
    parser.add_argument("-s", "--is_snp", type=int, default=7,
                        help="number of the column with lenght")
    args = parser.parse_args()

    columns = (args.col_ref_genome_nuc, args.col_genomes_nuc, args.position,
               args.genome_names, args.is_snp)
    # A column number below 1 would turn into a negative index and silently
    # address columns from the end of each row.
    if min(columns) < 1:
        parser.error("column numbers are 1-based and must be >= 1")

    print("Columns:\n[Reference genome:{0}]\n[Genomes:{1}]\n"
          "[Position of the SNP:{2}]\n[Genomes name:{3}]\n"
          "[Is SNP:{4}]".format(*columns))
    col_ref = args.col_ref_genome_nuc - 1
    col_genome = args.col_genomes_nuc - 1
    pos = args.position - 1
    names = args.genome_names - 1
    snp = args.is_snp - 1
    file_as_list = args.input
    print("\nProcessing...")
    main(file_as_list, snp, names, col_ref, pos, col_genome)
    print("\nJob Done. Output written as <files.fasta>")

#Background: I have written this code to transform a .csv file exported from a software called Geneious containing SNPs and concatenate them into a DNA sequence.

#My questions: Is this the best structure? Is creating a "chain" of functions like this the Best practice?

#Code import argparse import collections import csv

def cleaning(file_as_list, snp, names):
    """Read the CSV export and return only the rows flagged as SNPs.

    Args:
        file_as_list: Path of the CSV file to read (despite the name, this
            value is handed straight to ``open``).
        snp: Zero-based index of the column that must equal ``'1'`` for a
            row to be kept.
        names: Zero-based index of the column holding the genome names.

    Returns:
        The kept rows (lists of strings) in file order.  In every kept row
        the ``names`` cell is replaced by a list of names: the cell is split
        on ``':'`` and ``','`` and every piece that still contains a space
        is discarded.

    Raises:
        IndexError: If a kept row has no ``names`` column.
    """
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are interpreted by the reader itself.
    with open(file_as_list, 'r', newline='') as input_file:
        rows = list(csv.reader(input_file))

    # csv.reader returns [] for blank lines; a row too short to contain the
    # flag column cannot be a SNP row, so skip it rather than raising
    # IndexError on row[snp].
    have_SNP = [row for row in rows if len(row) > snp and row[snp] == '1']
    for row in have_SNP:
        normalised = row[names].replace(':', ',').replace(', ', ',')
        row[names] = [name for name in normalised.split(',')
                      if ' ' not in name]
    return have_SNP
def reference_dic(file_as_list, snp, names, col_ref, pos):
    """Map every SNP position to its reference nucleotide.

    Args:
        file_as_list: Path of the CSV file, forwarded to ``cleaning``.
        snp: Zero-based index of the SNP flag column.
        names: Zero-based index of the genome-names column.
        col_ref: Zero-based index of the reference-nucleotide column.
        pos: Zero-based index of the position column.

    Returns:
        A dict keyed by integer position.  When a position occurs in more
        than one row, the value from the last such row wins.
    """
    snp_rows = cleaning(file_as_list, snp, names)
    # Commas are stripped before int(); presumably they are thousands
    # separators in the export -- confirm against a real file.
    return {int(row[pos].replace(',', '')): row[col_ref] for row in snp_rows}
def pos_list(file_as_list, snp, names, col_ref, pos):
    """Return every SNP position found in the reference, in ascending order.

    The positions are the keys of the dict built by ``reference_dic`` for
    the same arguments, so each position appears exactly once.
    """
    positions = reference_dic(file_as_list, snp, names, col_ref, pos)
    # Sorting a dict sorts its keys and returns a new list.
    return sorted(positions)
def genomes_list(file_as_list, snp, names, col_ref, pos):
    """Return the distinct genome names that occur in the SNP rows.

    Names are listed once each, in order of first appearance.  ``col_ref``
    and ``pos`` are accepted but not used by this function.
    """
    snp_rows = cleaning(file_as_list, snp, names)
    # A dict is used as an insertion-ordered set of names.
    unique_names = dict.fromkeys(
        name for row in snp_rows for name in row[names])
    return list(unique_names)
def identify_genomes(file_as_list, snp, names, col_ref, pos, col_genome):
    """Collect, for every genome, the SNPs reported for it.

    Returns:
        A pair ``(entrys_per_genome, pos_genomes_in_dict)``; both lists
        follow the genome order given by ``genomes_list``.

        * ``entrys_per_genome`` holds ``(genome, [[position, nucleotide],
          ...])`` tuples and keeps every matching row, repeats included.
        * ``pos_genomes_in_dict`` holds ``(genome, {position: nucleotide})``
          tuples; for a repeated position the last row wins.
    """
    snp_rows = cleaning(file_as_list, snp, names)
    entrys_per_genome = []
    pos_genomes_in_dict = []
    for genome in genomes_list(file_as_list, snp, names, col_ref, pos):
        # Parse each matching row once and derive both views from it.
        hits = [(int(row[pos].replace(',', '')), row[col_genome])
                for row in snp_rows if genome in row[names]]
        entrys_per_genome.append((genome, [list(hit) for hit in hits]))
        pos_genomes_in_dict.append((genome, dict(hits)))
    return entrys_per_genome, pos_genomes_in_dict
def remove_dupli_pos(file_as_list, snp, names, col_ref, pos, col_genome):
    """Return the sorted SNP positions that no genome reports more than once.

    A position is dropped for all genomes as soon as a single genome has
    two or more entries at that position.
    """
    per_genome = identify_genomes(file_as_list, snp, names, col_ref,
                                  pos, col_genome)[0]
    repeated = set()
    for _genome, entries in per_genome:
        tally = collections.Counter(position for position, _nuc in entries)
        repeated.update(position for position, count in tally.items()
                        if count > 1)
    return [position
            for position in pos_list(file_as_list, snp, names, col_ref, pos)
            if position not in repeated]
def get_ref(file_as_list, snp, names, col_ref, pos, col_genome):
    """Build the reference sequence from the retained SNP positions.

    Returns:
        One string made of the reference nucleotides at the positions
        returned by ``remove_dupli_pos``, concatenated in that order.
    """
    reference = reference_dic(file_as_list, snp, names, col_ref, pos)
    kept_positions = remove_dupli_pos(file_as_list, snp, names, col_ref,
                                      pos, col_genome)
    return "".join(str(reference[position]) for position in kept_positions)
def get_genomes(file_as_list, snp, names, col_ref, pos, col_genome):
    """Build the FASTA lines (header, sequence) for every genome.

    For each retained position the genome's own nucleotide is used when the
    genome has a SNP there, and ``'N'`` otherwise.

    Returns:
        A flat list alternating ``'>name'`` header lines and sequence
        strings, in the genome order produced by ``identify_genomes``.
    """
    pos_no_dup = remove_dupli_pos(file_as_list, snp, names, col_ref, pos,
                                  col_genome)
    genomes_pos = identify_genomes(file_as_list, snp, names, col_ref, pos,
                                   col_genome)[1]
    genomes = []
    for name, calls in genomes_pos:
        genomes.append(">{0}".format(name))
        # Every retained position comes from the reference position list,
        # so a position missing from this genome always means "no SNP
        # here".  No separate reference lookup (and the extra file parse it
        # required) is needed to decide on 'N'.
        genomes.append(
            "".join(str(calls.get(position, 'N')) for position in pos_no_dup))
    return genomes
def main(file_as_list, snp, names, col_ref, pos, col_genome,
         out_path="files.fasta"):
    """Write the reference and all genome sequences in FASTA format.

    Args:
        file_as_list: Path of the CSV file to convert.
        snp: Zero-based index of the SNP flag column.
        names: Zero-based index of the genome-names column.
        col_ref: Zero-based index of the reference-nucleotide column.
        pos: Zero-based index of the position column.
        col_genome: Zero-based index of the genome-nucleotide column.
        out_path: File to create or overwrite; defaults to ``files.fasta``,
            the name that was previously hard-coded.
    """
    ref_genome = get_ref(file_as_list, snp, names, col_ref, pos, col_genome)
    genomes = get_genomes(file_as_list, snp, names, col_ref, pos, col_genome)
    with open(out_path, "w") as out_file:
        out_file.write(">reference_sequence\n")
        out_file.write("{0}\n".format(ref_genome))
        out_file.writelines("{0}\n".format(line) for line in genomes)
if __name__ == '__main__':
    # Command-line entry point.  Column options are 1-based on the command
    # line and converted to 0-based indices before calling main().
    parser = argparse.ArgumentParser()
    parser.add_argument("input",
                        help="name of the input file")
    # type=int lets argparse reject non-numeric values with a usage message
    # instead of a ValueError traceback later on.
    parser.add_argument("-r", "--col_ref_genome_nuc", type=int, default=2,
                        help="""number of the column with the reference genome
                        nucleotides""")
    parser.add_argument("-g", "--col_genomes_nuc", type=int, default=8,
                        help="""number of the column with the genomes
                        nucleotides""")
    parser.add_argument("-p", "--position", type=int, default=3,
                        help="""number of the column with the position in the
                        genome""")
    parser.add_argument("-n", "--genome_names", type=int, default=10,
                        help="number of the column with the genomes names")
    parser.add_argument("-s", "--is_snp", type=int, default=7,
                        help="number of the column with lenght")
    args = parser.parse_args()

    columns = (args.col_ref_genome_nuc, args.col_genomes_nuc, args.position,
               args.genome_names, args.is_snp)
    # A column number below 1 would turn into a negative index and silently
    # address columns from the end of each row.
    if min(columns) < 1:
        parser.error("column numbers are 1-based and must be >= 1")

    print("Columns:\n[Reference genome:{0}]\n[Genomes:{1}]\n"
          "[Position of the SNP:{2}]\n[Genomes name:{3}]\n"
          "[Is SNP:{4}]".format(*columns))
    col_ref = args.col_ref_genome_nuc - 1
    col_genome = args.col_genomes_nuc - 1
    pos = args.position - 1
    names = args.genome_names - 1
    snp = args.is_snp - 1
    file_as_list = args.input
    print("\nProcessing...")
    main(file_as_list, snp, names, col_ref, pos, col_genome)
    print("\nJob Done. Output written as <files.fasta>")

Background:

I have written this code to transform a .csv file exported from a software called Geneious containing SNPs and concatenate them into a DNA sequence.

My questions:

Is this the best structure? Is creating a "chain" of functions like this the Best practice?

Code

import argparse
import collections
import csv
def cleaning(file_as_list, snp, names):
    """Read the CSV export and return only the rows flagged as SNPs.

    Args:
        file_as_list: Path of the CSV file to read (despite the name, this
            value is handed straight to ``open``).
        snp: Zero-based index of the column that must equal ``'1'`` for a
            row to be kept.
        names: Zero-based index of the column holding the genome names.

    Returns:
        The kept rows (lists of strings) in file order.  In every kept row
        the ``names`` cell is replaced by a list of names: the cell is split
        on ``':'`` and ``','`` and every piece that still contains a space
        is discarded.

    Raises:
        IndexError: If a kept row has no ``names`` column.
    """
    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are interpreted by the reader itself.
    with open(file_as_list, 'r', newline='') as input_file:
        rows = list(csv.reader(input_file))

    # csv.reader returns [] for blank lines; a row too short to contain the
    # flag column cannot be a SNP row, so skip it rather than raising
    # IndexError on row[snp].
    have_SNP = [row for row in rows if len(row) > snp and row[snp] == '1']
    for row in have_SNP:
        normalised = row[names].replace(':', ',').replace(', ', ',')
        row[names] = [name for name in normalised.split(',')
                      if ' ' not in name]
    return have_SNP
def reference_dic(file_as_list, snp, names, col_ref, pos):
    """Map every SNP position to its reference nucleotide.

    Args:
        file_as_list: Path of the CSV file, forwarded to ``cleaning``.
        snp: Zero-based index of the SNP flag column.
        names: Zero-based index of the genome-names column.
        col_ref: Zero-based index of the reference-nucleotide column.
        pos: Zero-based index of the position column.

    Returns:
        A dict keyed by integer position.  When a position occurs in more
        than one row, the value from the last such row wins.
    """
    snp_rows = cleaning(file_as_list, snp, names)
    # Commas are stripped before int(); presumably they are thousands
    # separators in the export -- confirm against a real file.
    return {int(row[pos].replace(',', '')): row[col_ref] for row in snp_rows}
def pos_list(file_as_list, snp, names, col_ref, pos):
    """Return every SNP position found in the reference, in ascending order.

    The positions are the keys of the dict built by ``reference_dic`` for
    the same arguments, so each position appears exactly once.
    """
    positions = reference_dic(file_as_list, snp, names, col_ref, pos)
    # Sorting a dict sorts its keys and returns a new list.
    return sorted(positions)
def genomes_list(file_as_list, snp, names, col_ref, pos):
    """Return the distinct genome names that occur in the SNP rows.

    Names are listed once each, in order of first appearance.  ``col_ref``
    and ``pos`` are accepted but not used by this function.
    """
    snp_rows = cleaning(file_as_list, snp, names)
    # A dict is used as an insertion-ordered set of names.
    unique_names = dict.fromkeys(
        name for row in snp_rows for name in row[names])
    return list(unique_names)
def identify_genomes(file_as_list, snp, names, col_ref, pos, col_genome):
    """Collect, for every genome, the SNPs reported for it.

    Returns:
        A pair ``(entrys_per_genome, pos_genomes_in_dict)``; both lists
        follow the genome order given by ``genomes_list``.

        * ``entrys_per_genome`` holds ``(genome, [[position, nucleotide],
          ...])`` tuples and keeps every matching row, repeats included.
        * ``pos_genomes_in_dict`` holds ``(genome, {position: nucleotide})``
          tuples; for a repeated position the last row wins.
    """
    snp_rows = cleaning(file_as_list, snp, names)
    entrys_per_genome = []
    pos_genomes_in_dict = []
    for genome in genomes_list(file_as_list, snp, names, col_ref, pos):
        # Parse each matching row once and derive both views from it.
        hits = [(int(row[pos].replace(',', '')), row[col_genome])
                for row in snp_rows if genome in row[names]]
        entrys_per_genome.append((genome, [list(hit) for hit in hits]))
        pos_genomes_in_dict.append((genome, dict(hits)))
    return entrys_per_genome, pos_genomes_in_dict
def remove_dupli_pos(file_as_list, snp, names, col_ref, pos, col_genome):
    """Return the sorted SNP positions that no genome reports more than once.

    A position is dropped for all genomes as soon as a single genome has
    two or more entries at that position.
    """
    per_genome = identify_genomes(file_as_list, snp, names, col_ref,
                                  pos, col_genome)[0]
    repeated = set()
    for _genome, entries in per_genome:
        tally = collections.Counter(position for position, _nuc in entries)
        repeated.update(position for position, count in tally.items()
                        if count > 1)
    return [position
            for position in pos_list(file_as_list, snp, names, col_ref, pos)
            if position not in repeated]
def get_ref(file_as_list, snp, names, col_ref, pos, col_genome):
    """Build the reference sequence from the retained SNP positions.

    Returns:
        One string made of the reference nucleotides at the positions
        returned by ``remove_dupli_pos``, concatenated in that order.
    """
    reference = reference_dic(file_as_list, snp, names, col_ref, pos)
    kept_positions = remove_dupli_pos(file_as_list, snp, names, col_ref,
                                      pos, col_genome)
    return "".join(str(reference[position]) for position in kept_positions)
def get_genomes(file_as_list, snp, names, col_ref, pos, col_genome):
    """Build the FASTA lines (header, sequence) for every genome.

    For each retained position the genome's own nucleotide is used when the
    genome has a SNP there, and ``'N'`` otherwise.

    Returns:
        A flat list alternating ``'>name'`` header lines and sequence
        strings, in the genome order produced by ``identify_genomes``.
    """
    pos_no_dup = remove_dupli_pos(file_as_list, snp, names, col_ref, pos,
                                  col_genome)
    genomes_pos = identify_genomes(file_as_list, snp, names, col_ref, pos,
                                   col_genome)[1]
    genomes = []
    for name, calls in genomes_pos:
        genomes.append(">{0}".format(name))
        # Every retained position comes from the reference position list,
        # so a position missing from this genome always means "no SNP
        # here".  No separate reference lookup (and the extra file parse it
        # required) is needed to decide on 'N'.
        genomes.append(
            "".join(str(calls.get(position, 'N')) for position in pos_no_dup))
    return genomes
def main(file_as_list, snp, names, col_ref, pos, col_genome,
         out_path="files.fasta"):
    """Write the reference and all genome sequences in FASTA format.

    Args:
        file_as_list: Path of the CSV file to convert.
        snp: Zero-based index of the SNP flag column.
        names: Zero-based index of the genome-names column.
        col_ref: Zero-based index of the reference-nucleotide column.
        pos: Zero-based index of the position column.
        col_genome: Zero-based index of the genome-nucleotide column.
        out_path: File to create or overwrite; defaults to ``files.fasta``,
            the name that was previously hard-coded.
    """
    ref_genome = get_ref(file_as_list, snp, names, col_ref, pos, col_genome)
    genomes = get_genomes(file_as_list, snp, names, col_ref, pos, col_genome)
    with open(out_path, "w") as out_file:
        out_file.write(">reference_sequence\n")
        out_file.write("{0}\n".format(ref_genome))
        out_file.writelines("{0}\n".format(line) for line in genomes)
if __name__ == '__main__':
    # Command-line entry point.  Column options are 1-based on the command
    # line and converted to 0-based indices before calling main().
    parser = argparse.ArgumentParser()
    parser.add_argument("input",
                        help="name of the input file")
    # type=int lets argparse reject non-numeric values with a usage message
    # instead of a ValueError traceback later on.
    parser.add_argument("-r", "--col_ref_genome_nuc", type=int, default=2,
                        help="""number of the column with the reference genome
                        nucleotides""")
    parser.add_argument("-g", "--col_genomes_nuc", type=int, default=8,
                        help="""number of the column with the genomes
                        nucleotides""")
    parser.add_argument("-p", "--position", type=int, default=3,
                        help="""number of the column with the position in the
                        genome""")
    parser.add_argument("-n", "--genome_names", type=int, default=10,
                        help="number of the column with the genomes names")
    parser.add_argument("-s", "--is_snp", type=int, default=7,
                        help="number of the column with lenght")
    args = parser.parse_args()

    columns = (args.col_ref_genome_nuc, args.col_genomes_nuc, args.position,
               args.genome_names, args.is_snp)
    # A column number below 1 would turn into a negative index and silently
    # address columns from the end of each row.
    if min(columns) < 1:
        parser.error("column numbers are 1-based and must be >= 1")

    print("Columns:\n[Reference genome:{0}]\n[Genomes:{1}]\n"
          "[Position of the SNP:{2}]\n[Genomes name:{3}]\n"
          "[Is SNP:{4}]".format(*columns))
    col_ref = args.col_ref_genome_nuc - 1
    col_genome = args.col_genomes_nuc - 1
    pos = args.position - 1
    names = args.genome_names - 1
    snp = args.is_snp - 1
    file_as_list = args.input
    print("\nProcessing...")
    main(file_as_list, snp, names, col_ref, pos, col_genome)
    print("\nJob Done. Output written as <files.fasta>")

deleted 1 character in body; edited tags; edited title

Source Link

edited Apr 5, 2017 at 14:00

200_success

edited Apr 5, 2017 at 14:00

200_success

145.5k
22
190
479

Structure for a multiple functions script in Python command-line program to convert genomic data file

The code itself is just a bunch of functions that perform small tasks, some functions call others and in the end the result is printed to a file. I used argparse because this is going to be a command line tool, and is useful to have obligatory arguments and default values for the others.

Structure for a multiple functions script in Python

The code itself is just a bunch of functions that perform small tasks, some functions call others and in the end the result is printed to a file. I used argparse because this is going to be a command line tool, and is useful to have obligatory arguments and default values for the others.

Python command-line program to convert genomic data file

The code itself is just a bunch of functions that perform small tasks, some functions call others and in the end the result is printed to a file. I used argparse because this is going to be a command line tool, and is useful to have obligatory arguments and default values for the others.