How to extract header, paragraph, table structure from pdf using azure form recognizer in python

Question 1

I would like to extract elements such as headers, paragraphs, tables, page numbers, and page footers from a PDF into a DataFrame, using Azure Form Recognizer from Python.

Please find the expected output below.

enter image description here

I have tried using the layout model, but from the response I am not able to identify which content is a header, a paragraph, or a table.

https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api?view=doc-intel-3.1.0&viewFallbackFrom=form-recog-3.0.0&preserve-view=true&pivots=programming-language-python

import os
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
# Placeholder for the service endpoint passed to DocumentAnalysisClient; replace before running.
endpoint = "<your-endpoint>"
# Placeholder for the API key wrapped in AzureKeyCredential; replace before running.
key = "<your-key>"
def format_polygon(polygon):
    """Render a polygon as a comma-separated list of ``[x, y]`` pairs.

    Args:
        polygon: Iterable of points exposing ``x`` and ``y`` attributes.
            May be ``None`` or empty.

    Returns:
        ``"N/A"`` when ``polygon`` is falsy; otherwise a string such as
        ``"[0, 1], [2, 3]"``.
    """
    if not polygon:
        return "N/A"
    return ", ".join("[{}, {}]".format(p.x, p.y) for p in polygon)
def analyze_layout():
    """Analyze the sample PDF with the "prebuilt-layout" model and print the result.

    Builds a DocumentAnalysisClient from the module-level ``endpoint`` and
    ``key``, submits the sample document URL, waits for the poller to finish,
    and prints: whether each style is handwritten, each page's dimensions,
    lines and words, selection marks, and every table with its cells.

    Returns:
        None. All output goes to stdout.
    """
    # sample form document
    form_url = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    poller = document_analysis_client.begin_analyze_document_from_url(
        "prebuilt-layout", form_url
    )
    # Blocks until the analysis operation has completed.
    result = poller.result()

    for style in result.styles:
        print(
            "Document contains {} content".format(
                "handwritten" if style.is_handwritten else "no handwritten"
            )
        )

    for page in result.pages:
        print("----Analyzing layout from page #{}----".format(page.page_number))
        print(
            "Page has width: {} and height: {}, measured with unit: {}".format(
                page.width, page.height, page.unit
            )
        )

        for line_idx, line in enumerate(page.lines):
            words = line.get_words()
            print(
                "...Line # {} has word count {} and text '{}' within bounding box '{}'".format(
                    line_idx,
                    len(words),
                    line.content,
                    format_polygon(line.polygon),
                )
            )

            for word in words:
                print(
                    "......Word '{}' has a confidence of {}".format(
                        word.content, word.confidence
                    )
                )

        for selection_mark in page.selection_marks:
            print(
                "...Selection mark is '{}' within bounding box '{}' and has a confidence of {}".format(
                    selection_mark.state,
                    format_polygon(selection_mark.polygon),
                    selection_mark.confidence,
                )
            )

    # Tables are reported on the result as a whole, not per page; each
    # bounding region carries the page number it belongs to.
    for table_idx, table in enumerate(result.tables):
        print(
            "Table # {} has {} rows and {} columns".format(
                table_idx, table.row_count, table.column_count
            )
        )
        for region in table.bounding_regions:
            print(
                "Table # {} location on page: {} is {}".format(
                    table_idx,
                    region.page_number,
                    format_polygon(region.polygon),
                )
            )
        for cell in table.cells:
            print(
                "...Cell[{}][{}] has content '{}'".format(
                    cell.row_index,
                    cell.column_index,
                    cell.content,
                )
            )
            for region in cell.bounding_regions:
                print(
                    "...content on page {} is within bounding box '{}'".format(
                        region.page_number,
                        format_polygon(region.polygon),
                    )
                )

    print("----------------------------------------")
# Run the layout analysis only when this file is executed as a script.
if __name__ == "__main__":
 analyze_layout()

Question 2

I realise this only answers part of your question, but it may help a bit. Here is a function that extracts the tables and stores them in a list of DataFrames:

def get_tables(result):
    """Convert every table in an analysis result into a pandas DataFrame.

    Each cell is placed by its own ``row_index`` / ``column_index`` rather
    than by its position in the cell list, so tables with merged or missing
    cells (fewer cells than ``row_count * column_count``) still produce a
    rectangular frame; unfilled positions are left as empty strings.

    Args:
        result: Object whose ``to_dict()`` returns a mapping with a
            ``"tables"`` entry; each table provides ``"row_count"``,
            ``"column_count"`` and ``"cells"``, and each cell provides
            ``"row_index"``, ``"column_index"`` and ``"content"``.

    Returns:
        A list with one DataFrame per table. The first table row is used as
        the column header and removed from the data rows.
    """
    # Local import: the snippet has no module-level pandas import.
    import pandas as pd

    all_tables = []
    # A missing or empty "tables" entry yields an empty list, not an error.
    for table in result.to_dict().get("tables") or []:
        row_count = table["row_count"]
        column_count = table["column_count"]
        grid = [[""] * column_count for _ in range(row_count)]
        for cell in table["cells"]:
            grid[cell["row_index"]][cell["column_index"]] = cell["content"]
        df = pd.DataFrame(grid)
        if row_count:
            # Promote the first row to the header, then drop it from the body.
            df.columns = df.iloc[0]
            df = df.drop(df.index[0])
        all_tables.append(df)
    return all_tables

You get the whole text content by using result.content

If anyone has a quick way to get the content paragraph by paragraph, I would be very interested :)

SophieG SophieG 1011 silver badge6 bronze badges · Accepted Answer · 2023-09-07 17:38:21Z

I realise this only answers part of your question, but it may help a bit. Here is a function that extracts the tables and stores them in a list of DataFrames:

def get_tables(result):
    """Convert every table in an analysis result into a pandas DataFrame.

    Each cell is placed by its own ``row_index`` / ``column_index`` rather
    than by its position in the cell list, so tables with merged or missing
    cells (fewer cells than ``row_count * column_count``) still produce a
    rectangular frame; unfilled positions are left as empty strings.

    Args:
        result: Object whose ``to_dict()`` returns a mapping with a
            ``"tables"`` entry; each table provides ``"row_count"``,
            ``"column_count"`` and ``"cells"``, and each cell provides
            ``"row_index"``, ``"column_index"`` and ``"content"``.

    Returns:
        A list with one DataFrame per table. The first table row is used as
        the column header and removed from the data rows.
    """
    # Local import: the snippet has no module-level pandas import.
    import pandas as pd

    all_tables = []
    # A missing or empty "tables" entry yields an empty list, not an error.
    for table in result.to_dict().get("tables") or []:
        row_count = table["row_count"]
        column_count = table["column_count"]
        grid = [[""] * column_count for _ in range(row_count)]
        for cell in table["cells"]:
            grid[cell["row_index"]][cell["column_index"]] = cell["content"]
        df = pd.DataFrame(grid)
        if row_count:
            # Promote the first row to the header, then drop it from the body.
            df.columns = df.iloc[0]
            df = df.drop(df.index[0])
        all_tables.append(df)
    return all_tables

You get the whole text content by using result.content

If anyone has a quick way to get the content paragraph by paragraph, I would be very interested :)

CollectivesTM on Stack Overflow

How to extract header, paragraph, table structure from pdf using azure form recognizer in python

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

CollectivesTM on Stack Overflow

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related