How to Extract Fonts from a PDF

dothinking edited this page Apr 16, 2021 · 4 revisions

This script extracts every font referenced by any page of a PDF. Each extractable font is written once, to its own file in the current working directory.

from __future__ import print_function
import fitz
# Extract every embedded font of a PDF into individual files.
#
# Each font is identified by its xref number. Fonts that appear on several
# pages are processed only once. Output files are named
# "<basename>-<xref>.<ext>" and are created in the current working directory.

# Open the PDF
doc = fitz.open("some.pdf")
try:
    # A set gives constant-time membership tests while skipping fonts that
    # were already handled on an earlier page.
    xref_visited = set()
    num = 0  # number of font files actually written

    for page in doc:
        # NOTE(review): getFontList / extractFont are the method names this
        # script was written against; newer PyMuPDF releases appear to expose
        # them as get_fonts / extract_font -- confirm for the installed version.
        for f in page.getFontList():  # list of fonts of page
            xref = f[0]  # xref of font
            if xref in xref_visited:
                continue  # skip if already processed
            xref_visited.add(xref)  # do not process a second time

            # extract font buffer
            basename, ext, _, buffer = doc.extractFont(xref)
            if ext == "n/a":
                continue  # the font is not extractable

            num += 1
            foutname = "%s-%i.%s" % (basename, xref, ext)  # build the filename
            # The context manager closes the file even if the write fails.
            with open(foutname, "wb") as fout:
                fout.write(buffer)
            print("extracted", foutname)

    # output some protocol
    footer = "extracted %i font files from %s." % (num, doc.name)
    footer_line = "-" * len(footer)
    print(footer_line)
    print(footer)
    print(footer_line)
finally:
    # Close the PDF, also when extraction stopped with an error.
    doc.close()