Please help me reduce the time complexity of this code; it is taking a long time to read its Excel input.
```python
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 9 14:26:22 2016
@author: DalalNS
"""
from openpyxl import load_workbook
import os
import shutil

filename = 'test.xlsx'
wb = load_workbook(filename, use_iterators=True)
sheetlist = wb.get_sheet_names()
searclist = []
for i in range(len(sheetlist)):
    ws = wb[sheetlist[i]]
    for row in ws.iter_rows():
        for cell in row:
            if (cell[1] == 'B'):
                searclist.append((cell.internal_value))
searclist = [os.path.splitext(each)[0] for each in searclist]

def search(searclist):
    for root, dirs, files in os.walk('C://Users/DalalNS/Desktop/Mapping/a/b'):
        for _file in files:
            try:
                print(_file[:-4])
                if _file[:-4] in searclist:
                    # If we find it, notify us about it and copy it to C:\NewPath\
                    print('Found file in: ' + str(root))
                    shutil.copy(os.path.abspath(root + '/' + _file), 'C://Users/DalalNS/Desktop/Mapping/output')
                elif _file[:-5] in searclist:
                    # If we find it, notify us about it and copy it to C:\NewPath\
                    print('Found file in: ' + str(root))
                    shutil.copy(os.path.abspath(root + '/' + _file), 'C://Users/DalalNS/Desktop/Mapping/output')
                else:
                    print("not found")
            except:
                print("runtime exception")

print(search(searclist))
```
- *Comment by Navin Dalal (Mar 14, 2016):* @200_success Is there any way to make this algorithm faster? The content in the Excel file is very large, and traversing it in \$O(n^3)\$ is taking a lot of time. Can anyone help?
2 Answers
Building a list using `append` in a `for` loop is not only slow, it is an anti-pattern. Use list comprehensions instead:
```python
searclist = [cell.internal_value for i in range(len(sheetlist)) for row in wb[sheetlist[i]].iter_rows() for cell in row if cell[1] == 'B']
```
This is not as readable as the previous form; let's try to fix that by iterating over sheet names directly instead of using an index, which will also speed things up:
```python
searclist = [cell.internal_value for sheet in sheetlist for row in wb[sheet].iter_rows() for cell in row if cell[1] == 'B']
```
Not entirely better. In such cases you can try using a generator: simply wrap your loops in a function that `yield`s results instead of `append`ing them to a list:
```python
def files_in_xl(filename):
    workbook = load_workbook(filename, use_iterators=True)
    for sheet in workbook.get_sheet_names():
        worksheet = workbook[sheet]
        for row in worksheet.iter_rows():
            for cell in row:
                if cell[1] == 'B':
                    yield cell.internal_value
```
You can then easily create `searclist` using:

```python
searclist = list(files_in_xl('test.xlsx'))
```
But we’re not done. First off, you iterate over that list again to remove file extensions; why not do that when getting the file name:
```python
yield os.path.splitext(cell.internal_value)[0]
```
Second, the `openpyxl` documentation suggests using read-only mode for a reduced memory footprint:
```python
def files_in_xl(filename):
    workbook = load_workbook(filename, read_only=True)
    for sheet in workbook.get_sheet_names():
        worksheet = workbook[sheet]
        for row in worksheet.rows:
            for cell in row:
                if cell[1] == 'B':
                    yield os.path.splitext(cell.value)[0]
```
Third, you will use `searclist` only to check whether values are in it. Lists are terrible for that purpose since the `in` operation is \$O(n)\$. Use a `set` instead, for which this operation is \$O(1)\$ on average:
```python
patterns = set(files_in_xl('test.xlsx'))  # Renamed searclist to patterns
```
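To see the difference in practice, here is a quick, illustrative micro-benchmark using the standard library's `timeit` (the sizes and repeat counts are arbitrary choices of mine):

```python
import timeit

setup = "data = list(range(100000)); as_list = data; as_set = set(data)"
# Worst case for the list: the value we look for sits at the very end.
print(timeit.timeit("99999 in as_list", setup=setup, number=1000))  # O(n) scan per lookup
print(timeit.timeit("99999 in as_set", setup=setup, number=1000))   # O(1) hash lookup
```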
Now on to your `search` function, which is poorly named since it both searches and copies:
- You used `os.path.splitext` previously to separate the filename from its extension, yet here you try to do it manually: that is both inefficient and leads to duplicated code (see the short sketch after this list).
- You swallow meaningful information by using a bare `except`: if something goes wrong, the user is only left with your "there was an error" message and no hint of what the error actually was. Never use a bare `except`; always specify the kind of exception you're expecting, and only catch exceptions you are ready to handle (printing a message is useless here since the traceback carries much more information).
- Use `os.path.join` to build paths instead of manually concatenating a `'/'`.
- You should also allow the user to pass the "in" and "out" paths as parameters.
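For illustration, a tiny sketch of how `os.path.splitext` already covers both the `.xls` and `.xlsx` cases that the original code tried to handle with the `[:-4]` and `[:-5]` slices (the file names here are made up):

```python
import os

for name in ('report.xls', 'report.xlsx', 'archive.backup.xlsx'):
    stem, extension = os.path.splitext(name)
    print(stem, extension)
# report .xls
# report .xlsx
# archive.backup .xlsx    <- only the last extension is split off
```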
Proposed improvements
```python
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 9 14:26:22 2016
@author: DalalNS
"""
from openpyxl import load_workbook
import os
import shutil
import traceback


def files_in_xl(filename):
    workbook = load_workbook(filename, read_only=True)
    for sheet in workbook.get_sheet_names():
        worksheet = workbook[sheet]
        for row in worksheet.rows:
            for cell in row:
                if cell[1] == 'B':
                    yield os.path.splitext(cell.value)[0]


def filter_and_copy(in_dir, out_dir, search_patterns):
    for root, dirs, files in os.walk(in_dir):
        absolute_root_path = os.path.abspath(root)
        for _file in files:
            filename = os.path.splitext(_file)[0]
            if filename in search_patterns:
                src_path = os.path.join(absolute_root_path, _file)
                dest_path = os.path.join(out_dir, _file)
                try:
                    shutil.copyfile(src_path, dest_path)
                except (OSError, shutil.SameFileError):
                    # Alert the user if we couldn't copy and continue with
                    # the other files (you may want something better than that).
                    print('Error while copying', src_path, 'to', dest_path)
                    traceback.print_exc()
                else:
                    print(src_path, 'copied to', dest_path)
            else:
                print(filename, "not found")


if __name__ == '__main__':
    patterns = set(files_in_xl('test.xlsx'))
    filter_and_copy(
        'C://Users/DalalNS/Desktop/Mapping/a/b',
        'C://Users/DalalNS/Desktop/Mapping/output',
        patterns)
```
I also changed `shutil.copy` to `shutil.copyfile`, as it might be a little bit faster. Check the documentation for your version of Python and adjust the exceptions you want to handle accordingly (`shutil.SameFileError`, for instance, only exists since Python 3.4).
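For reference, the behavioral difference between the two, as a small sketch with made-up file names: `shutil.copy` copies the permission bits in addition to the contents, while `shutil.copyfile` copies only the contents.

```python
import shutil

shutil.copyfile('src.xlsx', 'dst.xlsx')  # contents only
shutil.copy('src.xlsx', 'dst.xlsx')      # contents plus permission bits (like chmod)
```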
- *Comment by DSM (Apr 26, 2016):* FWIW, many Python programmers -- myself included -- categorically reject the idea that "building a list using `append` in a `for` loop is [...] an anti-pattern." Sure, if something's trivial, why not use a listcomp? But by the time you're dealing with multiple loops and conditions, you're just making something you can't easily read or debug.
Two suggestions:

- Abort/break early: that cuts down the size of \$n\$ in \$O(n^3)\$.
- Your file searches are atomic (each file can be processed in isolation), so run them using multiprocessing.
Abort early (an example of the idea; note that the `break` must sit in the row loop rather than the cell loop, otherwise it would only stop iterating over the cells of the current row):
```python
for i in range(len(sheetlist)):
    ws = wb[sheetlist[i]]
    blankrows = 0                                 # ------------------ New
    for row in ws.iter_rows():
        for cell in row:
            if (cell[1] == 'B'):
                searclist.append((cell.internal_value))
            elif cell[1] == '':                   # ------ New
                blankrows += 1                    # ---- New
        if blankrows > 100:                       # -------- New
            break                                 # ------ New: stop scanning this sheet
```
Run in parallel:

Each file can be searched in isolation, so why not use multiprocessing? Below is a translation of your code into a producer-consumer pattern, which runs in the following manner:
- The `manager` finds the files.
- The `manager` adds each file path to a `work_queue`.
- A `worker` picks a file from the `work_queue` and processes it.
- The `worker` posts results during processing to the manager's `result_queue`.
- The `manager` reads the `result_queue` and stores all results in a list.
- Finally, the results are printed.
Please note that you cannot run this code from the interpreter, but have to run it from the command line.
```python
import multiprocessing
import os

from openpyxl import load_workbook


def get_target_cells(excelworkbook_file, pattern):
    wb = load_workbook(excelworkbook_file, use_iterators=True)
    sheetnames = wb.get_sheet_names()
    for sheetname in sheetnames:
        ws = wb[sheetname]
        for row in ws.iter_rows():
            for cell in row:
                if cell[1] == pattern:
                    yield (excelworkbook_file, sheetname, row, cell)


def get_files(path):
    for root, dirs, files in os.walk(path):
        for _file in files:
            yield os.path.join(root, _file)


def manager(path, pattern, n_workers=4):
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    for file in get_files(path):
        work_queue.put(file)
    for _ in range(n_workers):
        work_queue.put(None)  # one sentinel per worker, so every worker can exit
    print("workqueue loaded")

    workers = []
    for n_worker in range(n_workers):
        worker = multiprocessing.Process(target=searchprocess,
                                         args=(work_queue, result_queue, pattern))
        workers.append(worker)
        worker.start()

    counts_of_none = 0
    results = []
    while True:
        result = result_queue.get()
        if result is not None:
            results.append(result)
            print(".", end='', flush=True)  # prints a dot every time there's a result...
        else:
            counts_of_none += 1
            if counts_of_none == len(workers):
                break
    _ = [worker.join() for worker in workers]
    return results


def searchprocess(work_queue, result_queue, pattern):
    while True:
        item = work_queue.get()
        if item is None:
            result_queue.put(None)
            break
        for result in get_target_cells(item, pattern):
            result_queue.put(result)


if __name__ == "__main__":
    path = 'C://Users/DalalNS/Desktop/Mapping/a/b'
    pattern = 'B'
    results = manager(path=path, pattern=pattern)
    for row in results:
        print(row)
```
- *Comment by 301_Moved_Permanently (Apr 25, 2016):* You could use `Pool.map` instead of manually managing `Queue`s and `Process`es.
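A minimal sketch of what that comment suggests, reusing the `get_target_cells` and `get_files` functions from the answer above; the `search_one_file` wrapper and the pool size are my own assumptions:

```python
import multiprocessing


def search_one_file(excelworkbook_file):
    # Hypothetical wrapper: Pool.map passes a single argument, so the
    # pattern is fixed here instead of travelling through a queue.
    return list(get_target_cells(excelworkbook_file, pattern='B'))


if __name__ == "__main__":
    path = 'C://Users/DalalNS/Desktop/Mapping/a/b'
    with multiprocessing.Pool(processes=4) as pool:
        # map() spreads the files over the worker processes and
        # gathers each file's list of matching cells.
        for file_results in pool.map(search_one_file, get_files(path)):
            for row in file_results:
                print(row)
```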