Please help me reduce the time complexity of this code; it is taking a long time to read its Excel input.
```python
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 9 14:26:22 2016
@author: DalalNS
"""
from openpyxl import load_workbook
import os
import shutil

filename = 'test.xlsx'
wb = load_workbook(filename, use_iterators=True)
sheetlist = wb.get_sheet_names()
searclist = []
for i in range(len(sheetlist)):
    ws = wb[sheetlist[i]]
    for row in ws.iter_rows():
        for cell in row:
            if (cell[1] == 'B'):
                searclist.append((cell.internal_value))
searclist = [os.path.splitext(each)[0] for each in searclist]

def search(searclist):
    for root, dirs, files in os.walk('C://Users/DalalNS/Desktop/Mapping/a/b'):
        for _file in files:
            try:
                print(_file[:-4])
                if _file[:-4] in searclist:
                    # If we find it, notify us about it and copy it to C:\NewPath\
                    print('Found file in: ' + str(root))
                    shutil.copy(os.path.abspath(root + '/' + _file), 'C://Users/DalalNS/Desktop/Mapping/output')
                elif _file[:-5] in searclist:
                    # If we find it, notify us about it and copy it to C:\NewPath\
                    print('Found file in: ' + str(root))
                    shutil.copy(os.path.abspath(root + '/' + _file), 'C://Users/DalalNS/Desktop/Mapping/output')
                else:
                    print("not found")
            except:
                print("runtime exception")

print(search(searclist))
```
- *Comment by Navin Dalal (Mar 14, 2016):* @200_success Is there any way to make this algorithm faster? The content in the Excel file is very large, and traversing it in \$O(n^3)\$ is taking a lot of time. Can anyone help?
2 Answers
Building a list using `append` in a `for` loop is not only slow, it is an anti-pattern. Use list comprehensions instead:
```python
searclist = [cell.internal_value for i in range(len(sheetlist)) for row in wb[sheetlist[i]].iter_rows() for cell in row if cell[1] == 'B']
```
This is not as readable as the previous form; let's try to fix that by iterating over sheet names directly instead of using an index, which will also speed things up:
```python
searclist = [cell.internal_value for sheet in sheetlist for row in wb[sheet].iter_rows() for cell in row if cell[1] == 'B']
```
Not entirely better. In such cases you can try using a generator: simply wrap your loops in a function that `yield`s results instead of `append`ing them to a list:
```python
def files_in_xl(filename):
    workbook = load_workbook(filename, use_iterators=True)
    for sheet in workbook.get_sheet_names():
        worksheet = workbook[sheet]
        for row in worksheet.iter_rows():
            for cell in row:
                if cell[1] == 'B':
                    yield cell.internal_value
```
You can then easily create `searclist` using:

```python
searclist = list(files_in_xl('test.xlsx'))
```
But we’re not done. First off, you iterate over that list again to remove file extensions; why not do that when getting the file name:
```python
yield os.path.splitext(cell.internal_value)[0]
```
Second, the `openpyxl` documentation suggests using read-only mode for a reduced memory footprint:
```python
def files_in_xl(filename):
    workbook = load_workbook(filename, read_only=True)
    for sheet in workbook.get_sheet_names():
        worksheet = workbook[sheet]
        for row in worksheet.rows:
            for cell in row:
                if cell[1] == 'B':
                    yield os.path.splitext(cell.value)[0]
```
Third, you will use `searclist` only to check whether values are in it. Lists are terrible for that purpose since the `in` operation is \$O(n)\$. Use a `set` instead, for which this operation is \$O(1)\$ on average:
```python
patterns = set(files_in_xl('test.xlsx'))  # Renamed searclist to patterns
```
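To see the difference in practice, here is a quick, illustrative micro-benchmark using the standard library's `timeit` (the sizes and repeat counts are arbitrary choices of mine):

```python
import timeit

setup = "data = list(range(100000)); as_list = data; as_set = set(data)"
# Worst case for the list: the value we look for sits at the very end.
print(timeit.timeit("99999 in as_list", setup=setup, number=1000))  # O(n) scan per lookup
print(timeit.timeit("99999 in as_set", setup=setup, number=1000))   # O(1) hash lookup
```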
Now on to your `search` function, which is poorly named since it both searches and copies:
- You used `os.path.splitext` previously to separate the filename from its extension, yet here you try to do it manually: that is both inefficient and leads to duplicated code (see the short sketch after this list).
- You swallow meaningful information by using a bare `except`: if something goes wrong, the user is only left with your "there was an error" message and no hint of what the error actually was. Never use a bare `except`; always specify the kind of exception you're expecting, and only catch exceptions you are ready to handle (printing a message is useless here since the traceback carries much more information).
- Use `os.path.join` to build paths instead of manually concatenating a `'/'`.
- You should also allow the user to pass the "in" and "out" paths as parameters.
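For illustration, a tiny sketch of how `os.path.splitext` already covers both the `.xls` and `.xlsx` cases that the original code tried to handle with the `[:-4]` and `[:-5]` slices (the file names here are made up):

```python
import os

for name in ('report.xls', 'report.xlsx', 'archive.backup.xlsx'):
    stem, extension = os.path.splitext(name)
    print(stem, extension)
# report .xls
# report .xlsx
# archive.backup .xlsx    <- only the last extension is split off
```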
Proposed improvements
```python
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 9 14:26:22 2016
@author: DalalNS
"""
from openpyxl import load_workbook
import os
import shutil
import traceback


def files_in_xl(filename):
    workbook = load_workbook(filename, read_only=True)
    for sheet in workbook.get_sheet_names():
        worksheet = workbook[sheet]
        for row in worksheet.rows:
            for cell in row:
                if cell[1] == 'B':
                    yield os.path.splitext(cell.value)[0]


def filter_and_copy(in_dir, out_dir, search_patterns):
    for root, dirs, files in os.walk(in_dir):
        absolute_root_path = os.path.abspath(root)
        for _file in files:
            filename = os.path.splitext(_file)[0]
            if filename in search_patterns:
                src_path = os.path.join(absolute_root_path, _file)
                dest_path = os.path.join(out_dir, _file)
                try:
                    shutil.copyfile(src_path, dest_path)
                except (OSError, shutil.SameFileError):
                    # Alert the user if we couldn't copy and continue with
                    # the other files (you may want something better than that).
                    print('Error while copying', src_path, 'to', dest_path)
                    traceback.print_exc()
                else:
                    print(src_path, 'copied to', dest_path)
            else:
                print(filename, "not found")


if __name__ == '__main__':
    patterns = set(files_in_xl('test.xlsx'))
    filter_and_copy(
        'C://Users/DalalNS/Desktop/Mapping/a/b',
        'C://Users/DalalNS/Desktop/Mapping/output',
        patterns)
```
I also changed `shutil.copy` to `shutil.copyfile`, as it might be a little bit faster. Check the documentation for your version of Python and adjust the exceptions you want to handle accordingly (`shutil.SameFileError`, for instance, only exists since Python 3.4).
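For reference, the behavioral difference between the two, as a small sketch with made-up file names: `shutil.copy` copies the permission bits in addition to the contents, while `shutil.copyfile` copies only the contents.

```python
import shutil

shutil.copyfile('src.xlsx', 'dst.xlsx')  # contents only
shutil.copy('src.xlsx', 'dst.xlsx')      # contents plus permission bits (like chmod)
```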
- *Comment by DSM (Apr 26, 2016):* FWIW, many Python programmers -- myself included -- categorically reject the idea that "building a list using `append` in a `for` loop is [...] an anti-pattern." Sure, if something's trivial, why not use a listcomp? But by the time you're dealing with multiple loops and conditions, you're just making something you can't easily read or debug.
Two suggestions:

- Abort/break early: that cuts down the size of \$n\$ in \$O(n^3)\$.
- Your file searches are atomic (each file can be processed in isolation), so run them using multiprocessing.
Abort early (an example of the idea; note that the `break` must sit in the row loop rather than the cell loop, otherwise it would only stop iterating over the cells of the current row):
```python
for i in range(len(sheetlist)):
    ws = wb[sheetlist[i]]
    blankrows = 0                                 # ------------------ New
    for row in ws.iter_rows():
        for cell in row:
            if (cell[1] == 'B'):
                searclist.append((cell.internal_value))
            elif cell[1] == '':                   # ------ New
                blankrows += 1                    # ---- New
        if blankrows > 100:                       # -------- New
            break                                 # ------ New: stop scanning this sheet
```
Run in parallel:

Each file can be searched in isolation, so why not use multiprocessing? Below is a translation of your code into a producer-consumer pattern, which runs in the following manner:
- The `manager` finds the files.
- The `manager` adds each file path to a `work_queue`.
- A `worker` picks a file from the `work_queue` and processes it.
- The `worker` posts results during processing to the manager's `result_queue`.
- The `manager` reads the `result_queue` and stores all results in a list.
- Finally, the results are printed.
Please note that you cannot run this code from the interpreter, but have to run it from the command line.
```python
import multiprocessing
import os

from openpyxl import load_workbook


def get_target_cells(excelworkbook_file, pattern):
    wb = load_workbook(excelworkbook_file, use_iterators=True)
    sheetnames = wb.get_sheet_names()
    for sheetname in sheetnames:
        ws = wb[sheetname]
        for row in ws.iter_rows():
            for cell in row:
                if cell[1] == pattern:
                    yield (excelworkbook_file, sheetname, row, cell)


def get_files(path):
    for root, dirs, files in os.walk(path):
        for _file in files:
            yield os.path.join(root, _file)


def manager(path, pattern, n_workers=4):
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    for file in get_files(path):
        work_queue.put(file)
    for _ in range(n_workers):
        work_queue.put(None)  # one sentinel per worker, so every worker can exit
    print("workqueue loaded")

    workers = []
    for n_worker in range(n_workers):
        worker = multiprocessing.Process(target=searchprocess,
                                         args=(work_queue, result_queue, pattern))
        workers.append(worker)
        worker.start()

    counts_of_none = 0
    results = []
    while True:
        result = result_queue.get()
        if result is not None:
            results.append(result)
            print(".", end='', flush=True)  # prints a dot every time there's a result...
        else:
            counts_of_none += 1
            if counts_of_none == len(workers):
                break
    _ = [worker.join() for worker in workers]
    return results


def searchprocess(work_queue, result_queue, pattern):
    while True:
        item = work_queue.get()
        if item is None:
            result_queue.put(None)
            break
        for result in get_target_cells(item, pattern):
            result_queue.put(result)


if __name__ == "__main__":
    path = 'C://Users/DalalNS/Desktop/Mapping/a/b'
    pattern = 'B'
    results = manager(path=path, pattern=pattern)
    for row in results:
        print(row)
```
- *Comment by 301_Moved_Permanently (Apr 25, 2016):* You could use `Pool.map` instead of manually managing `Queue`s and `Process`es.
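A minimal sketch of what that comment suggests, reusing the `get_target_cells` and `get_files` functions from the answer above; the `search_one_file` wrapper and the pool size are my own assumptions:

```python
import multiprocessing


def search_one_file(excelworkbook_file):
    # Hypothetical wrapper: Pool.map passes a single argument, so the
    # pattern is fixed here instead of travelling through a queue.
    return list(get_target_cells(excelworkbook_file, pattern='B'))


if __name__ == "__main__":
    path = 'C://Users/DalalNS/Desktop/Mapping/a/b'
    with multiprocessing.Pool(processes=4) as pool:
        # map() spreads the files over the worker processes and
        # gathers each file's list of matching cells.
        for file_results in pool.map(search_one_file, get_files(path)):
            for row in file_results:
                print(row)
```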