Everything works fine except the timing: it takes a long time for my file, which contains 1000 pages of which 100 are of interest.
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
import glob, os

# find pages
def findText(f, slist):
    file = open(f, 'rb')
    pdfDoc = PdfFileReader(file)
    pages = []
    for i in range(pdfDoc.getNumPages()):
        content = pdfDoc.getPage(i).extractText().lower()
        for s in slist:
            if re.search(s.lower(), content) is not None:
                if i not in pages:
                    pages.append(i)
    return pages

# extract pages
def extractPage(f, fOut, pages):
    file = open(f, 'rb')
    output = PdfFileWriter()
    pdfOne = PdfFileReader(file)
    for i in pages:
        output.addPage(pdfOne.getPage(i))
    outputStream = open(fOut, "wb")
    output.write(outputStream)
    outputStream.close()
    return

os.chdir(r"path\to\mydir")
for pdfFile in glob.glob("*.pdf"):
    print(pdfFile)
    outPdfFile = pdfFile.replace(".pdf", "_searched_extracted.pdf")
    stringList = ["string1", "string2"]
    extractPage(pdfFile, outPdfFile, findText(pdfFile, stringList))
The updated code after these suggestions is at:
https://gist.github.com/pra007/099f10b07be5b7126a36438c67ad7a1f
You could try profiling, but the code is simple enough that I think you're spending most of the time inside PyPDF2. Two options:
- You can preprocess your PDF files to store their extracted text somewhere, which makes the search phase much faster, especially if you run multiple queries on the same PDF files (see the sketch after this list).
- You can try another parser, such as a Python 3 version of PDFMiner, or even a parser written in a faster language.
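A minimal sketch of the preprocessing option, keeping the old PyPDF2 API from the question; the cache naming (a JSON file next to each PDF) and the helper names cachedPageTexts and findTextCached are hypothetical, not part of the original code:

import json, os, re
from PyPDF2 import PdfFileReader

def cachedPageTexts(f):
    # Extract each page's text once and store it next to the PDF;
    # later queries read the JSON instead of re-parsing with PyPDF2.
    cachePath = f + ".pages.json"
    if not os.path.exists(cachePath):
        with open(f, 'rb') as pdf:
            doc = PdfFileReader(pdf)
            texts = [doc.getPage(i).extractText().lower()
                     for i in range(doc.getNumPages())]
        with open(cachePath, 'w') as out:
            json.dump(texts, out)
    with open(cachePath) as cached:
        return json.load(cached)

def findTextCached(f, slist):
    # Same search logic as the question, but over the cached text.
    searches = [re.compile(s.lower()) for s in slist]
    return [i for i, content in enumerate(cachedPageTexts(f))
            if any(s.search(content) for s in searches)]

The first query on a file still pays the PyPDF2 extraction cost, but every query after that only reads the JSON file.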
- Thanks. I thought PDFMiner is dead; let me test pdfminer3k. – Rahul Patel, Sep 7, 2016 at 9:34
- @Rahul Preprocessing sounds better. It's not an option for you? – Quentin Pradet, Sep 7, 2016 at 10:14
One thing that might help a lot is to compile your regexes just once. Instead of
def findText(f, slist):
    file = open(f, 'rb')
    pdfDoc = PdfFileReader(file)
    pages = []
    for i in range(pdfDoc.getNumPages()):
        content = pdfDoc.getPage(i).extractText().lower()
        for s in slist:
            if re.search(s.lower(), content) is not None:
                if i not in pages:
                    pages.append(i)
    return pages
try:
def findText(f, slist):
    file = open(f, 'rb')
    pdfDoc = PdfFileReader(file)
    pages = []
    searches = [re.compile(s.lower()) for s in slist]
    for i in range(pdfDoc.getNumPages()):
        content = pdfDoc.getPage(i).extractText().lower()
        for s in searches:
            if s.search(content) is not None:
                if i not in pages:
                    pages.append(i)
    return pages
Also, you can short-circuit out of the inner loop a lot sooner than you do: once any pattern matches a page, record it and move on to the next page, which also makes the "i not in pages" membership test unnecessary:
def findText(f, slist):
    file = open(f, 'rb')
    pdfDoc = PdfFileReader(file)
    pages = []
    searches = [re.compile(s.lower()) for s in slist]
    for i in range(pdfDoc.getNumPages()):
        content = pdfDoc.getPage(i).extractText().lower()
        for s in searches:
            if s.search(content) is not None:
                pages.append(i)
                break
    return pages
- Instead of running python file.py, use python -m cProfile -s cumtime file.py and post the functions that took the most time.
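For reference, a minimal sketch of running the profiler programmatically over just the search phase; "sample.pdf" and the search strings are placeholders, and findText is the function from the question:

import cProfile
import pstats

# Profile only the search phase and dump the stats to a file.
cProfile.run('findText("sample.pdf", ["string1", "string2"])', "search.prof")

# Print the ten entries with the highest cumulative time.
pstats.Stats("search.prof").sort_stats("cumtime").print_stats(10)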