I came across this BMC Genomics paper: Analysis of intra-genomic GC content homogeneity within prokaryotes
And I implemented some Python functions to make this available as part of a personal project. The functions are these:
import numpy as np
from collections import defaultdict
def get_sequence_chunks(sequence, window, step, as_overlap=False):
"""GC Content in a DNA/RNA sub-sequence length k. In
overlap windows of length k, other wise non overlapping.
Inputs:
sequence - a string representing a DNA sequence.
as_overlap - boolean that represents if overlap is needed.
k - a integer representing the lengths of overlapping bases.
Default is 20.
Outputs:
subseq - a string representing a slice of the sequence with or without
overlapping characters.
"""
# make sequence upper case and getting the length of it
sequence, seq_len = sequence.upper(), len(sequence)
# non overlap sequence length
non_overlap = range(0, seq_len - window + 1, step)
# overlap sequence length
overlap = range(0, seq_len - window + 1)
# overlap is needed
if as_overlap:
# iterates to the overlap region
for i in overlap:
# creates the substring
subseq = sequence[i:i + window]
yield subseq
# if non overlap is choosed
else:
# iterates to the non overlap region
for j in non_overlap:
# creates the sub-string to count the gc_content
subseq = sequence[j:j + window]
yield subseq
def gc_content(seq, percent=True):
"""
Calculate G+C content, return percentage (as float between 0 and 100).
Copes mixed case sequences, and with the ambiguous nucleotide S (G or C)
when counting the G and C content. The percentage is calculated against
the full length.
Inputs:
sequence - a string representing a sequence (DNA/RNA/Protein)
Outputs:
gc - a float number or a percent value representing the gc content of the sequence.
"""
seq_len = len(seq)
gc = sum(seq.count(x) for x in ["G", "C", "g", "c", "S", "s"])
if percent:
try:
return gc * 100.0 / seq_len
except ZeroDivisionError:
return 0.0
else:
return gc / seq_len
def get_gc_content_slide_window(sequence, window, step):
"""
Calculate teh GC (G+C) content along a sequence. Returns a dictionary -like
object mapping the sequence window to the gc value (floats).
Returns 0 for windows without any G/C by handling zero division errors, and
does NOT look at any ambiguous nucleotides.
Inputs:
sequence - a string representing a sequence (DNA/RNA/Protein)
window_size - a integer representing the length of sequence to be analyzed
at each step in the full sequence.
step - a integer representing the length of oerlapping sequence allowed.
Outputs:
gc- a dictionary-like object mapping the window size sequence to it gc values
as floating numbers
"""
gc = defaultdict(float)
for s in get_sequence_chunks(sequence, window, step, as_overlap=False):
gc[s] = gc.get(s, 0.0) + gc_content(s)
return gc
def difference_gc(mean_gc, gc_dict):
"""
Calculates the difference between the mean GC content of window i, and the
mean chromosomal GC content, as Di = GC i − GC
"""
# get the keys and default values in the dictionary
d_i = defaultdict(float, [(k, 0.0) for k in gc_dict.keys()])
# iterate through all keys, gc calculated values for the
# genome chunk
for chunk, gc_cnt in gc_dict.items():
# get the difference between the chromosomal mean and the chunks
dif = gc_cnt - mean_gc
# add the difference to the appropriate chunk
d_i[chunk] = dif
return d_i
def get_chromosomal_gc_variantion(difference_dic):
"""
Calculates the chromosomal GC variation defined as the log-transformed average
of the absolute value of the difference between the mean GC content of each
non-overlapping sliding window i and mean chromosomal GC content.
chromosomal_gc_variantion = log(1/N*sum(|Di|), where N is the maximum number of
non-overlapping window size in bp in a sliding windows.
"""
# get the number of chunks
n = len(difference_dic.keys())
arr = np.array(list(difference_dic.values()))
var = np.log((1/n) * sum(abs(arr)))
return var
I worked with e. coli genome: Escherichia coli strain RM13322 chromosome, complete genome
gc = gc_content(ec, percent=True) = 50.76819424506625
It is similar to found here: Genome Assembly and Annotation report (26286) ~ 50.73.
The slide window gave it non overlapping results like, where ec = escherichia coli genome, 100 size window, and step = 100:
get_gc_content_slide_window(ec, 100, 100)
defaultdict(float,
{'GTTGGCATGCCATGGTGATGTTTGTTAGGAAAGCAAAGATGGCAAAACTGCTGGGGGTTTTGTGGTTGAGTATGCCAATATAATTAATAGATTAAAGAGT': 39.0,
'TAGTTGTGAAGAAAATATGGATAAACAGGACGACGAATGCTTTCACCGATAAGGACAACTTTCCATAACTCAGTAAATATAGTGCAGAGTTCACCCTGTC': 39.0,
'AAACGGTTTCTTTTGCAGGAAAGGAATATGAGTTAAAGGTCATTGATGAAAAAACGCCTATTCTTTTTCAGTGGTTTGAACCTAATCCTGAACGATATAA': 34.0,
'GAAAGATGAGGTTCCAATAGTTAATACTAAGCAGCATCCCTATTTAGATAATGTCACAAATGCGGCAAGGATAGAGAGTGATCGTATGATAGGTATTTTT': 36.0,
'GTTGATGGCGATTTTTCAGTCAACCAAAAGACTGCTTTTTCAAAATTGGAACGAGATTTTGAAAATGTAATGATAATCTATCGGGAAGATGTTGACTTCA': 34.0,
'GTATGTATGACAGAAAACTATCAGATATTTATCATGATATTATATGTGAACAAAGGTTACGAACTGAAGACAAAAGAGATGAATACTTGTTGAATCTGTT': 29.0,...}
The difference function (slide window gc content - gc content all sequence), where ec_gc = gc content of e.coli genome and ec_non = gc content in each subsequence of size 100 non overlapped:
difference_gc(ec_gc, ec_non)
defaultdict(float,
{'GTTGGCATGCCATGGTGATGTTTGTTAGGAAAGCAAAGATGGCAAAACTGCTGGGGGTTTTGTGGTTGAGTATGCCAATATAATTAATAGATTAAAGAGT': -11.768194245066248,
'TAGTTGTGAAGAAAATATGGATAAACAGGACGACGAATGCTTTCACCGATAAGGACAACTTTCCATAACTCAGTAAATATAGTGCAGAGTTCACCCTGTC': -11.768194245066248,
'AAACGGTTTCTTTTGCAGGAAAGGAATATGAGTTAAAGGTCATTGATGAAAAAACGCCTATTCTTTTTCAGTGGTTTGAACCTAATCCTGAACGATATAA': -16.768194245066248,
'GAAAGATGAGGTTCCAATAGTTAATACTAAGCAGCATCCCTATTTAGATAATGTCACAAATGCGGCAAGGATAGAGAGTGATCGTATGATAGGTATTTTT': -14.768194245066248,
'GTTGATGGCGATTTTTCAGTCAACCAAAAGACTGCTTTTTCAAAATTGGAACGAGATTTTGAAAATGTAATGATAATCTATCGGGAAGATGTTGACTTCA': -16.768194245066248,...}
And the (ec_dif = difference on gc content from total sequence and from subsequences):
get_chromosomal_gc_variantion(ec_dif)
1.7809591767493207
My principal concern It is to know if this function get_chromosomal_gc_variantion(ec_dif) was implemented as in the paper above (GCVAR):
As suggested, a toy example:
s="TAGTTGTGAAGAAAATATGGATAAACAGGACGACGAATGCTTTCACCGATAAGGACAACTTTCCATAACTCAGTAAATATAGTGCAGAGTTCACCCTGTC"
seq_gc_non_overllap = get_gc_content_slide_window(s, 20, 20)
seq_gc_non_overllap
defaultdict(float,
{'TAGTTGTGAAGAAAATATGG': 30.0,
'ATAAACAGGACGACGAATGC': 45.0,
'TTTCACCGATAAGGACAACT': 40.0,
'TTCCATAACTCAGTAAATAT': 25.0,
'AGTGCAGAGTTCACCCTGTC': 55.0})
seq_gc_total = gc_content(s, percent=True)
seq_gc_total
39.0
seq_dif_gctot_gc_slidewindow = difference_gc(seq_gc_total, seq_gc_non_overllap)
defaultdict(float,
{'TAGTTGTGAAGAAAATATGG': -9.0,
'ATAAACAGGACGACGAATGC': 6.0,
'TTTCACCGATAAGGACAACT': 1.0,
'TTCCATAACTCAGTAAATAT': -14.0,
'AGTGCAGAGTTCACCCTGTC': 16.0})
chromosome_gc_variation = get_chromosomal_gc_variantion(seq_dif_gctot, gc_slidewindow)
chromosome_gc_variation
2.2192034840549946
Any tip to improve the code would be very welcome, once I only have been working with python for a couple of years and I am not very skilled, and much less with numpy.
1 Answer 1
Quite interesting!
You will benefit from introducing PEP484 type hints.
Your docstring for get_sequence_chunks
is way out of whack; you'll need to revisit your parameter names.
There isn't a whole lot of benefit from tuple-combination assignment for your sequence
and seq_len
so I suggest just doing them separately.
Your overlap logic can be greatly simplified: notice that the only difference between the two code paths is the step size.
Minor spelling issues such as other wise
that should be one word, overllap
-> overlap
, and variantion
-> variation
.
Comments such as
# make sequence upper case and getting the length of it
are less helpful than not having a comment at all. Note the difference between that and
# get the difference between the chromosomal mean and the chunks
which has content that can't necessarily be inferred from the code alone (though even this will be redundant with a sufficiently descriptive variable name).
This comprehension:
gc = sum(seq.count(x) for x in ["G", "C", "g", "c", "S", "s"])
will iterate through the entire seq
string for every potential character. It's likely that refactoring this to use set membership checks and one single iteration through the string will be more efficient.
This code to prevent division-by-zero:
seq_len = len(seq)
gc = sum(seq.count(x) for x in ["G", "C", "g", "c", "S", "s"])
if percent:
try:
return gc * 100.0 / seq_len
except ZeroDivisionError:
return 0.0
else:
return gc / seq_len
first of all can trivially avoid logic-by-exception by checking if the sequence length is zero; and also should extend that check to the case where a non-percent figure is returned.
In difference_gc
the initial defaultdict
initialization is not needed, since you overwrite every single key. You can replace the whole function with one dictionary comprehension.
Try to avoid abbreviating words like count
to cnt
.
The len
of a dict's keys is equal to the len
of a dict itself, so you can drop the .keys()
in len(difference_dic.keys())
.
You can avoid the construction of a list here:
arr = np.array(list(difference_dic.values()))
by using fromiter
instead.
Your example is kind of broken. get_chromosomal_gc_variation
only accepts one parameter.
Consider reworking your example into a basic unit test.
Suggested
import math
from typing import Iterator, DefaultDict, Dict
import numpy as np
from collections import defaultdict
def get_sequence_chunks(
sequence: str,
window: int,
step: int,
as_overlap: bool = False,
) -> Iterator[str]:
"""GC Content in a DNA/RNA sub-sequence length k. In
overlap windows of length k, other wise non overlapping.
Inputs:
sequence - a string representing a DNA sequence.
as_overlap - boolean that represents if overlap is needed.
k - a integer representing the lengths of overlapping bases.
Default is 20.
Outputs:
subseq - a string representing a slice of the sequence with or without
overlapping characters.
"""
sequence = sequence.upper()
if as_overlap:
# overlap sequence length
step = 1
# iterates to the overlap region
for i in range(0, len(sequence) - window + 1, step):
# creates the substring
yield sequence[i:i + window]
def gc_content(seq: str, percent: bool = True) -> float:
"""
Calculate G+C content, return percentage (as float between 0 and 100).
Copes mixed case sequences, and with the ambiguous nucleotide S (G or C)
when counting the G and C content. The percentage is calculated against
the full length.
Inputs:
sequence - a string representing a sequence (DNA/RNA/Protein)
Outputs:
gc - a float number or a percent value representing the gc content of the sequence.
"""
seq_len = len(seq)
gc = sum(1 for x in seq.upper() if x in {"G", "C", "S"})
if seq_len == 0:
return 0
gc /= seq_len
if percent:
return gc * 100
return gc
def get_gc_content_slide_window(sequence: str, window: int, step: int) -> DefaultDict[str, float]:
"""
Calculate the GC (G+C) content along a sequence. Returns a dictionary-like
object mapping the sequence window to the gc value (floats).
Returns 0 for windows without any G/C by handling zero division errors, and
does NOT look at any ambiguous nucleotides.
Inputs:
sequence - a string representing a sequence (DNA/RNA/Protein)
window_size - a integer representing the length of sequence to be analyzed
at each step in the full sequence.
step - a integer representing the length of oerlapping sequence allowed.
Outputs:
gc- a dictionary-like object mapping the window size sequence to it gc values
as floating numbers
"""
gc = defaultdict(float)
for s in get_sequence_chunks(sequence, window, step, as_overlap=False):
gc[s] += gc_content(s)
return gc
def difference_gc(mean_gc: float, gc_dict: Dict[str, float]) -> Dict[str, float]:
"""
Calculates the difference between the mean GC content of window i, and the
mean chromosomal GC content, as Di = GC i − GC
"""
# iterate through all keys, gc calculated values for the
# genome chunk
# get the difference between the chromosomal mean and the chunks
# add the difference to the appropriate chunk
d_i = {
chunk: gc_cnt - mean_gc
for chunk, gc_cnt in gc_dict.items()
}
return d_i
def get_chromosomal_gc_variation(difference_dic: Dict[str, float]) -> float:
"""
Calculates the chromosomal GC variation defined as the log-transformed average
of the absolute value of the difference between the mean GC content of each
non-overlapping sliding window i and mean chromosomal GC content.
chromosomal_gc_variantion = log(1/N*sum(|Di|), where N is the maximum number of
non-overlapping window size in bp in a sliding windows.
"""
# get the number of chunks
n = len(difference_dic)
arr = np.fromiter(
difference_dic.values(),
dtype=np.float32,
count=len(difference_dic),
)
var = np.log(np.sum(np.abs(arr)) / n)
return var
def test() -> None:
s = (
"TAGTTGTGAAGAAAATATGGATAAACAGGACGACGAATGCTTTCACCGAT"
"AAGGACAACTTTCCATAACTCAGTAAATATAGTGCAGAGTTCACCCTGTC"
)
seq_gc_non_overlap = get_gc_content_slide_window(s, 20, 20)
expected = {
'TAGTTGTGAAGAAAATATGG': 30,
'ATAAACAGGACGACGAATGC': 45,
'TTTCACCGATAAGGACAACT': 40,
'TTCCATAACTCAGTAAATAT': 25,
'AGTGCAGAGTTCACCCTGTC': 55,
}
assert expected.keys() == seq_gc_non_overlap.keys()
for k, v in expected.items():
assert math.isclose(seq_gc_non_overlap[k], v)
seq_gc_total = gc_content(s, percent=True)
assert math.isclose(seq_gc_total, 39)
seq_dif_gctot_gc_slidewindow = difference_gc(seq_gc_total, seq_gc_non_overlap)
expected = {
'TAGTTGTGAAGAAAATATGG': -9,
'ATAAACAGGACGACGAATGC': 6,
'TTTCACCGATAAGGACAACT': 1,
'TTCCATAACTCAGTAAATAT': -14,
'AGTGCAGAGTTCACCCTGTC': 16,
}
assert expected.keys() == seq_dif_gctot_gc_slidewindow.keys()
for k, v in expected.items():
assert math.isclose(seq_dif_gctot_gc_slidewindow[k], v)
chromosome_gc_variation = get_chromosomal_gc_variation(seq_dif_gctot_gc_slidewindow)
assert math.isclose(chromosome_gc_variation, 2.2192034840549946)
if __name__ == '__main__':
test()
-
2\$\begingroup\$ thank you for your suggestions! I will check it out and sorry because English it is not my native language...some times I do a lot of mistakes. But for me it is much important to check if the statistics function chromosome_gc_variation is doing what it is need to do. But, Is always excellent to learn to write a good/efficient and beautiful code. Thank you for your time and tips. They are awesome. Paulo \$\endgroup\$Paulo Sergio Schlogl– Paulo Sergio Schlogl2021年11月07日 13:12:04 +00:00Commented Nov 7, 2021 at 13:12
Explore related questions
See similar questions with these tags.
ec
,ec_gc
,ec_non
,ec_dif
? \$\endgroup\$