Return to Question

No need to get emotional here

1.3k
4
15

I am not proud of this...

grammar

Source Link

edited Jan 5, 2024 at 8:24

Bob

edited Jan 5, 2024 at 8:24

Bob

I am lacking creativity to get this to an elegant solution and thought I would add it here to see if there are any fresh minds that want to rework it.

I am lacking creativity to get this to an elegant solution and thought I would add it here to see if there any fresh minds that what to rework it.

I am lacking creativity to get this to an elegant solution and thought I would add it here to see if there are any fresh minds that want to rework it.

Source Link

asked Jan 5, 2024 at 8:18

Bob

asked Jan 5, 2024 at 8:18

Bob

Parsing PDFs into Python structures

I am not proud of this...

Take a resume (or "CV" outside of the US) and return all the text within it in a formatted way. Right now the below script outputs it to a txt file. I used this guide as inspiration and have looked to improve on it, by using slightly saner control flow and functions. Although the script works as intended, there are quite a lot of things that smell very bad.

Bad Things:

Horrendous main()
Kind of crazy looping (lots of nested indentations which is usually a bad sign).
2D structures and LOTS of lists (again! A bad sign).
No use of yield so materialising a lot in memory.
No use of @dataclass/NamedTuple (I feel like I should be modelling the PDFPage at least).
Could this be vectorised?
Converting it to an object-oriented design seems like an OK idea.
Dumb statements like using pass and if table_in_page == -1
PEP8 Violations

I am lacking creativity to get this to an elegant solution and thought I would add it here to see if there are any fresh minds that want to rework it.

The Code

from typing import Any, Optional
import pdfplumber
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTPage, LTTextContainer, LTChar
def text_extraction(element: LTTextContainer) -> tuple[str, list[str]]:
    """
    Extract the text of an element and the distinct formats used inside it.

    A format is the string "<fontname>, <size>" of a character. Each format
    is reported once, in the order it is first encountered, so the result is
    the same on every run.

    Parameters:
        element (LTTextContainer): The element from which text and formats are extracted.

    Returns:
        tuple[str, list[str]]: The element's text and the list of unique formats.
    """
    line_text = element.get_text()
    # Dict keys de-duplicate like a set but keep insertion order, so the
    # returned list is deterministic (list(set(...)) is not).
    line_formats: dict[str, None] = {}
    for text_line in element:
        if not isinstance(text_line, LTTextContainer):
            continue
        for character in text_line:
            if isinstance(character, LTChar):
                line_formats[f"{character.fontname}, {character.size}"] = None
    return line_text, list(line_formats)
def extract_table(pdf_path: str, page_num: int, table_num: int) -> Optional[list[list[str]]]:
    """
    Read one table from one page of a PDF document.

    Any failure (bad path, page or table index out of range, parser error)
    is reported on stdout and turned into a None result instead of being
    propagated to the caller.

    Parameters:
        pdf_path (str): The file path of the PDF document.
        page_num (int): Zero-based index of the page holding the table.
        table_num (int): Zero-based index of the table on that page.

    Returns:
        Optional[list[list[str]]]: The table as a list of rows, or None if an error occurs.
    """
    try:
        with pdfplumber.open(pdf_path) as document:
            # Reject page indices outside the document
            if page_num < 0 or page_num >= len(document.pages):
                raise ValueError("Page number out of range.")
            selected_page = document.pages[page_num]
            page_tables = selected_page.extract_tables()
            # Reject table indices outside the page
            if table_num < 0 or table_num >= len(page_tables):
                raise ValueError("Table number out of range.")
            return page_tables[table_num]
    except Exception as error:
        print(f"An error occurred: {error}")
        return None
def table_converter(table: Optional[list[list[Optional[str]]]]) -> str:
    """
    Convert a 2D table into a string where each cell is separated by '|'
    and each row is on its own line.

    Newline characters inside cells are replaced with spaces and None cells
    are rendered as the string 'None'. A missing table (None, which is what
    extract_table returns on failure) or an empty table yields ''.

    Parameters:
        table (Optional[list[list[Optional[str]]]]): The 2D table to convert.

    Returns:
        str: The string representation of the table.

    Example usage:
        table = [['Name', 'Age'], ['Alice', '23'], ['Bob', None]]
        print(table_converter(table))
    """
    if not table:
        # Nothing to render; avoids a TypeError when iterating over None.
        return ''
    converted_rows = []
    for row in table:
        cleaned_row = [
            item.replace('\n', ' ') if item is not None else 'None'
            for item in row
        ]
        converted_rows.append('|' + '|'.join(cleaned_row) + '|')
    return '\n'.join(converted_rows)
def is_element_inside_any_table(element, page: LTPage, tables: list[Any]) -> bool:
    """
    Check whether a given element is inside any of the tables on a PDF page.

    This is a thin wrapper around find_table_for_element so the containment
    geometry lives in exactly one place.

    Parameters:
        element: The element to check; must expose a ``bbox`` of (x0, y0, x1, y1).
        page (LTPage): The PDF page; its ``bbox[3]`` is used as the page height.
        tables (list[Any]): A list of tables, where each table is an object with a bounding box.

    Returns:
        bool: True if the element is inside any of the tables, False otherwise.
    """
    return find_table_for_element(element, page, tables) is not None


def find_table_for_element(element, page: LTPage, tables: list[Any]) -> Optional[int]:
    """
    Find the index of the first table that fully contains a given element.

    The element's vertical coordinates are flipped against the page height
    before the comparison, i.e. the element bbox is treated as measured from
    the bottom of the page and the table bboxes as measured from the top.
    Elements with zero width or zero height never match, because the
    comparison requires x0 < x1 and y0 < y1.

    Parameters:
        element: The element to check; must expose a ``bbox`` of (x0, y0, x1, y1).
        page (LTPage): The PDF page; its ``bbox[3]`` is used as the page height.
        tables (list[Any]): A list of tables, where each table is an object with a bounding box.

    Returns:
        Optional[int]: The index of the table that contains the element, or None if not found.
    """
    x0, y0_up, x1, y1_up = element.bbox
    page_height = page.bbox[3]
    # Transform coordinates: flip the y axis so both bboxes share one origin
    y0, y1 = page_height - y1_up, page_height - y0_up
    for index, table in enumerate(tables):
        tx0, ty0, tx1, ty1 = table.bbox
        # The element bbox must lie entirely within the table bbox
        if tx0 <= x0 < x1 <= tx1 and ty0 <= y0 < y1 <= ty1:
            return index
    return None
def process_tables(tables, pdf_path, pagenum, text_from_tables):
    """
    Extract every table of a page and append its string form to a list.

    One entry is appended per table, in table order, so that
    ``text_from_tables[i]`` always corresponds to ``tables[i]``. When a table
    cannot be extracted (extract_table returns None) an empty string is
    appended as a placeholder instead of crashing.

    Parameters:
        tables: The tables found on the page; only their count is used here.
        pdf_path: The file path of the PDF document.
        pagenum: Zero-based index of the page being processed.
        text_from_tables: List that receives one string per table (mutated in place).
    """
    for table_num in range(len(tables)):
        # Extract the information of the table
        table = extract_table(pdf_path, pagenum, table_num)
        # Convert the table into its structured string format; keep a
        # placeholder on failure so indices stay aligned with `tables`
        table_string = table_converter(table) if table is not None else ''
        text_from_tables.append(table_string)
def process_text_element(element, page_text, line_format, page_content):
    """
    Record the text and formats of a text element; ignore anything else.

    For an LTTextContainer the extracted text is appended to both
    ``page_text`` and ``page_content`` and its format list is appended to
    ``line_format``. All three lists are mutated in place.

    Parameters:
        element: The layout element to inspect.
        page_text: List collecting the text of each text element.
        line_format: List collecting the formats of each text element.
        page_content: List collecting the page content in reading order.

    Returns:
        The ``line_format`` and ``page_content`` lists that were passed in.
    """
    # Non-text elements contribute nothing
    if not isinstance(element, LTTextContainer):
        return line_format, page_content
    text, formats = text_extraction(element)
    page_text.append(text)
    line_format.append(formats)
    page_content.append(text)
    return line_format, page_content
def main(filepath: str, output_path: str = "/path/to/processed-resume.pdf.txt") -> None:
    """
    Extract the text and tables of a PDF and write the content to a text file.

    Every page is walked from top to bottom. Elements that lie inside a table
    are replaced by the table's string form (emitted once per table); all
    other text elements contribute their own text. The content of all pages
    is concatenated, in page order, and written to ``output_path``.

    Parameters:
        filepath (str): The file path of the PDF document to read.
        output_path (str): Where the extracted text is written (UTF-8).
    """
    text_per_page = {}
    # Open the pdfplumber document once for the whole run and let the
    # context manager close it, instead of re-opening it for every page.
    with pdfplumber.open(filepath) as plumber_pdf:
        # We extract the pages from the PDF
        for pagenum, page in enumerate(extract_pages(filepath)):
            # Initialize the variables needed for the text extraction from the page
            page_text = []
            line_format = []
            text_from_images = []
            text_from_tables = []
            page_content = []

            tables = plumber_pdf.pages[pagenum].find_tables()
            if tables:
                process_tables(tables, filepath, pagenum, text_from_tables)
            # Index of the next table whose text has not been emitted yet
            next_table = 0

            # Sort the elements as they appear in the page, top to bottom
            elements = sorted(page, key=lambda item: item.y1, reverse=True)
            for element in elements:
                table_found = find_table_for_element(element, page, tables)
                if table_found is not None:
                    # Emit the table text the first time one of its elements is met
                    if table_found == next_table:
                        page_content.append(text_from_tables[next_table])
                        page_text.append('table')
                        line_format.append('table')
                        next_table += 1
                    # Skip the element: its content was extracted from the tables
                    continue
                line_format, page_content = process_text_element(
                    element, page_text, line_format, page_content
                )

            # Add the list of lists as value of the page key
            text_per_page[f'Page_{pagenum}'] = [
                page_text, line_format, text_from_images, text_from_tables, page_content,
            ]

    # For now just write to file. The dict keeps insertion order, so pages
    # are written in document order; a PDF with no pages yields an empty file.
    result = ''.join(
        chunk
        for page_data in text_per_page.values()
        for chunk in page_data[4]
    )
    with open(output_path, "w", encoding="utf-8") as text_file:
        text_file.write(result)
# TODO: this needs a lot of refinement.
if __name__ == "__main__":
 # Placeholder location; point this at a real PDF before running the script.
 pdf_path = '/path/to/any/test-resume.pdf'
 main(pdf_path)

python

lang-py