Consider this code where I read an input file with 6 columns (0-5):
- Initialize a variable
history_ends
to 5000. - When the
column0
value (i.e. job[0] < 5000) I add 5000 lines of the input file in a list (historyjobs
) else the rest of the lines until the eof in another list (targetjobs
). - All the
historyjobs
list all contents initem3
,item4
,item5
is equal totargetjobs
. First listitem3
,item4
,item5
when this condition is satisfied. Add thosehistoryjobs
allitem1
to listlistsub
. - Find the running mean of the items in
listsub
and reverse the list, store it in list- Check the condition if items in
listsub > a*0.9
which satisfies the condition. Stores the result items in listcondsub
.
- Check the condition if items in
- Reopen the
inputfile
and check whethercolumn0
is equal to items incondsub
. If it satisfies, then add thecolumn1
to a listcondrun
. - Open the output file and write in
colum0
the second item of first list intargetjobs
i.e.j
, incolumn1
, write the average of listcondrun
,column2
is(j-avg)/j
,column3
is the maximum item in listcondrun
,column4
is the minimum item in listcondrun
,column5
is the length of the listcondrun
, the last four columns is based on the condition. - I am repeating the whole procedure using a
while
loop by assigning the variablehistoryends
to the next itemint(targetjobs[1][0])
.
from __future__ import division
import itertools
history_begins = 1; history_ends = 5000; n = 0; total = 0
historyjobs = []; targetjobs = []
listsub = []; listrun = []; listavg = [] ; F = [] ; condsub = [] ;condrun = [] ;mlistsub = []; a = []
def check(inputfile):
f = open(inputfile,'r') #reads the inputfile
lines = f.readlines()
for line in lines:
job = line.split()
if( int(job[0]) < history_ends ): #if the column0 is less then history_ends(i,e 5000 initially)
historyjobs.append(job) #historyjobs list contains all the lines from the list whose column1 < history_ends
else:
targetjobs.append(job) #historyjobs list contains all the lines from the list whose column1 > history_ends
k = 0
for i, element in enumerate(historyjobs):
if( (int(historyjobs[i][3]) == int(targetjobs[k][3])) and (int(historyjobs[i][4]) == int(targetjobs[k][4])) and (int(historyjobs[i][5]) == int(targetjobs[k][5])) ): #historyjobs list all contents in column3,column4,column5 is equal to targetjobs first list column3,column4,column5
listsub.append(historyjobs[i][1]) #when if condition true add those historyjobs column1 to list listsub
def runningMean(iterable):
"""A generator, yielding a cumulative average of its input."""
num = 0
denom = 0
for x in iterable:
num += x
denom += 1
yield num / denom
def newfun(results):
results.reverse() # put them back in regular order
for value, average in results:
a.append(value)
return a #to return the value
def runcheck(subseq):
f = open('newfileinput','r') #again read the same inputfile
lines = f.readlines()
for line in lines:
job = line.split()
for i, element in enumerate(subseq):
if(int(job[1]) == int(subseq[i])): # if the column1 value of the inputfile becomes equal to list obtained
condrun.append(str(job[2])) #return the value of column2 which satisfies the if condition
return condrun
def listcreate(condrun,condsub):
f1 = open('outputfile','a') #outputfile to append the result
s = map(int,condrun)
j = int(targetjobs[0][2])
targetsub = int(targetjobs[0][1])
if(condsub != []):
try:
convertsub = int(condsub[-1])
a=sum(s)/len(s)
c=max(s)
d=min(s)
e1=abs(j-a)
er1=e1/j
g=len(s)
h=abs(convertsub-targetsub)
f1.write(str(j))
f1.write('\t')
f1.write('\t')
f1.write(str(round(a,2)))
f1.write('\t')
f1.write('\t')
f1.write(str(round(er1,3)))
f1.write('\t')
f1.write('\t')
f1.write(str(c))
f1.write('\t')
f1.write('\t')
f1.write(str(d))
f1.write('\t')
f1.write('\t')
f1.write(str(g))
f1.write('\t')
f1.write('\t')
f1.write(str(h))
f1.write('\t')
f1.write("\t")
if (float(er1) < 0.20):
f1.write("good")
f1.write("\t")
else :
f1.write("bad")
f1.write("\t")
if (float(er1) < 0.30):
f1.write("good")
f1.write("\t")
else :
f1.write("bad")
f1.write("\t")
if (float(er1) < 0.40):
f1.write("good")
f1.write("\t")
else :
f1.write("bad")
f1.write("\t")
if (float(er1) < 0.50):
f1.write("good")
f1.write("\n")
else :
f1.write("bad")
f1.write("\n")
except ZeroDivisionError :
print 'dem 0'
else:
print '0'
f1.close()
def new():
global history_ends
while 1: #To repeat the process untill the EOF(end of input file)
check('newfileinput') #First function call
if(len(targetjobs) != 1):
history_ends = int(targetjobs[1][0]) #initialize historyends to targetjobs second lines first item
mlistsub = map(int,listsub)
results = list(itertools.takewhile(lambda x: x[0] > 0.9 * x[1],
itertools.izip(reversed(mlistsub),
runningMean(reversed(mlistsub)))))#call runningmean function & check the condition
condsub = newfun(results) #function to reverse back the result
condrun=runcheck(condsub) #functionto match & return the value
listcreate(condrun,condsub) #function to write result to output file
del condrun[0:len(condrun)]#to delete the values in list
del condsub[0:len(condsub)]#to delete the values in list
del listsub[0:len(listsub)]#to delete the values in list
del targetjobs[0:len(targetjobs)]#to delete the values in list
del historyjobs[0:len(historyjobs)]#to delete the values in list
else:
break
def main():
new()
if __name__ == '__main__':
main()
The sample input file (whole file contains 200,000 lines):
1 0 9227 1152 34 2 2 111 7622 1120 34 2 3 68486 710 1024 14 2 6 265065 3389 800 22 2 7 393152 48438 64 132 3 8 412251 46744 64 132 3 9 430593 50866 256 95 4 10 430730 10770 256 95 4 11 433750 12701 256 14 3 12 437926 2794 64 34 2 13 440070 43 32 96 3
The sample output file contents:
930 1389.14 0.494 3625 977 7 15 bad bad bad good 4348 1331.75 0.694 3625 930 8 164 bad bad bad bad 18047 32237.0 0.786 61465 17285 3 325774 bad bad bad bad 1607 1509.0 0.061 1509 1509 1 6508 good good good good 304 40.06 0.868 80 32 35 53472 bad bad bad bad 7246 7247.0 0.0 7247 7247 1 9691 good good good good 95 1558.0 15.4 1607 1509 2 2148 bad bad bad bad 55 54.33 0.012 56 53 3 448142 good good good good 31 76.38 1.464 392 35 13 237152 bad bad bad bad 207 55.0 0.734 55 55 1 370 bad bad bad bad
If anyone could suggest some changes to help make the code run faster, that'd be helpful.
2 Answers 2
Comments about coding style:
- inconsistent indentation makes the code harder to read
- preferred indentation width in python is 4 spaces
Why is the question tagged python3 ? This is python2 code.
history_begins = 1; history_ends = 5000; n = 0; total = 0
historyjobs = []; targetjobs = []
listsub = []; listrun = []; listavg = [] ; F = [] ; condsub = [] ;condrun = [] ;mlistsub = []; a = []
There are variables defined here that aren't actually used in the script, and global variables shouldn't have one-letter names.
f = open(inputfile,'r') #reads the inputfile
No it doesn't, it just creates a file handle.
lines = f.readlines()
for line in lines:
However, this does read the file, it even loads it all in RAM at once, which is a waste because you don't actually need to, so do this instead:
for line in f:
f1.write(str(j))
f1.write('\t')
f1.write('\t')
...
if (float(er1) < 0.50):
f1.write("good")
f1.write("\n")
else:
f1.write("bad")
f1.write("\n")
That's redundant, factor it:
print(j, round(a,2), round(er1,3), c, d, g, h, sep='\t\t', end='\t\t', file=out)
w = ('bad', 'good')
print(w[er1 < .2], w[er1 < .3], w[er1 < .4], w[er1 < .5], sep='\t', file=out)
f = open('newfileinput','r') #again read the same inputfile
Why read the same file multiple times ? That's inefficient...
OK, your code is a bit of a mess. I take it you are fairly new to python. Read the python style guide and stick to it. It will make it easier for people to read your code.
It is not easy to work out what you are trying to do here. As it stands the code doesn't run. However, here are some thoughts.
building on @Changaco's answer you can iterate through the file and process each row into integers like this.
def check(inputfile):
f = open(inputfile,'r')
for line in f:
job = [int(s) for s in line.split()]
...
Doing the conversion once up front you can forget all the other int() conversions that clutter up your code. This includes removing the need for your mlistsub global entirely as far as I can tell as it seems to be an integer version of the listsub variable.
I suspect the following reference to job[0] should reference job[1 ] as it is column1, not column0 that you talk about in other comments.
if( int(job[0]) < history_ends ): #if the column0 is less then history_ends(i,e 5000 initially)
...
Also, brackets are not necessary and with the above int() conversion already done the line becomes
if job[1] < history_ends:
...
I would reduce the global variables to the absolute minimum (which is usually none). I cannot work out the detail but the only reason I can see to keep globals here would be if they are continually being appended to and this is not happening as far as I can tell.
For example, I think the global variable a = [] can be removed.
Your original function.
def newfun(results):
results.reverse() # put them back in regular order
for value, average in results:
a.append(value)
return a #to return the value
is called like this
condsub = newfun(results) #function to reverse back the result
Aside from the terrible function name, this function seems to do very little. You could replace the function call with something like this
condsub = [value for value, average in reversed(results)] #reverse back the result
The following code
k = 0
for i, element in enumerate(historyjobs):
if( (int(historyjobs[i][3]) == int(targetjobs[k][3])) and (int(historyjobs[i][4]) == int(targetjobs[k][4])) and (int(historyjobs[i][5]) == int(targetjobs[k][5])) ): #historyjobs list all contents in column3,column4,column5 is equal to targetjobs first list column3,column4,column5
listsub.append(historyjobs[i][1]) #when if condition true add those historyjobs column1 to list listsub
can be significantly cleaned up. There is no need to use enumerate() just to get the i variable. The idiomatic approach is to use 'for hjob in historyjobs:'. With the int() conversion already done you can replace the complicated comparison with a simple comparison of the slice of interest.
tjob = targetjobs[0]
for hjob in historyjobs:
if hjob[3:6] == tjob[3:6]:
listsub.append(hjob[1])
If I understand your intent, you can also create a simple list and return it from the function as a way to remove the listsub global variable.
result = []
tjob = targetjobs[0]
for hjob in historyjobs:
if hjob[3:6] == tjob[3:6]:
result.append(hjob[1])
return result
A conversion and filter operation can usually be achieved with a list comprehension. Something like this should do the trick.
return [hjob[1] for hjob in historyjobs if hjob[3:6] == targetjobs[0][3:6]]
Since you are deleting all your globals at the end of each loop, I am assuming you don't need them at all.
del condrun[0:len(condrun)]#to delete the values in list
del condsub[0:len(condsub)]#to delete the values in list
del listsub[0:len(listsub)]#to delete the values in list
del targetjobs[0:len(targetjobs)]#to delete the values in list
del historyjobs[0:len(historyjobs)]#to delete the values in list
This is not necessary if you return data from your functions as above and assign the result of the function calls to variables which are created in the scope of your new() function.
As I say, I'm not sure what your doing exactly here so I can't advise on the logic. If you can tidy it up a bit then I'm sure we can help you get to the bottom of it. My feeling is that the code can become much shorter and much clearer than your current version.