Following Google's documentation, I am trying to perform a Document AI OCR batch request (async), and I constantly receive an error. I tried both with gcs_input_uri and gcs_input_prefix. I can not find any relevant logs in the GCP console, and no operations in the destination bucket 'operations' tab.
The error and code follow. I'd appreciate any assistance!
Error:
google.api_core.exceptions.InvalidArgument: 400 Failed to process all
documents. 3: Failed to process all documents.
Code:
import re
from typing import Optional
from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import InternalServerError
from google.api_core.exceptions import RetryError
from google.cloud import documentai # type: ignore
from google.cloud import storage
project_id = "document-ocr-xxxxx"
location = "us"
processor_id = "7797cdfaxxxxxx" # Create processor before running sample
field_mask = "entities.id, entities.confidence, entities.type, entities.mentionText"
gcs_output_uri = "gs://score/"
input_mime_type = "application/pdf"
gcs_input_prefix = "gs://score_input/"
def batch_process_documents(
project_id: str,
location: str,
processor_id: str,
gcs_output_uri: str,
gcs_input_prefix: Optional[str] = None,
timeout: int = 400,
) -> None:
opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
client = documentai.DocumentProcessorServiceClient(client_options=opts)
gcs_input_uri = 'gs://score_input/278040xxxxx.pdf'
if gcs_input_uri:
gcs_document = documentai.GcsDocument(
gcs_uri=gcs_input_uri, mime_type=input_mime_type
)
gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)
else:
gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_prefix)
input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)
gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
gcs_uri=gcs_output_uri, field_mask=field_mask
)
output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)
name = client.processor_path(project_id, location, processor_id)
request = documentai.BatchProcessRequest(
name=name,
input_documents=input_config,
document_output_config=output_config,
)
operation = client.batch_process_documents(request)
try:
print(f"Waiting for operation {operation.operation.name} to complete...")
operation.result(timeout=timeout)
except (RetryError, InternalServerError) as e:
print(e.message)
batch_process_documents(project_id, location, processor_id, gcs_output_uri, gcs_input_prefix)
Output:
(venv) ubuntu@MacBook-Pro-2 playground % python doc_batch.py
Waiting for operation projects/106038xxxxxx/locations/us/operations/1505725931527xxxxxx to complete...
Traceback (most recent call last):
File "/opt/homebrew/Cellar/[email protected]/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/runpy.py", line 198, in _run_module_as_main
return _run_code(code, main_globals, None,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/homebrew/Cellar/[email protected]/3.12.10/Frameworks/Python.framework/Versions/3.12/lib/python3.12/runpy.py", line 88, in _run_code
exec(code, run_globals)
File "/Users/ubuntu/.cursor/extensions/ms-python.debugpy-202460-darwin-arm64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/__main__.py", line 39, in <module>
cli.main()
File "/Users/ubuntu/.cursor/extensions/ms-python.debugpy-202460-darwin-arm64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py", line 430, in main
run()
File "/Users/ubuntu/.cursor/extensions/ms-python.debugpy-202460-darwin-arm64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py", line 284, in run_file
runpy.run_path(target, run_name="__main__")
File "/Users/ubuntu/.cursor/extensions/ms-python.debugpy-202460-darwin-arm64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 321, in run_path
return _run_module_code(code, init_globals, run_name,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/ubuntu/.cursor/extensions/ms-python.debugpy-202460-darwin-arm64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 135, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "/Users/ubuntu/.cursor/extensions/ms-python.debugpy-202460-darwin-arm64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 124, in _run_code
exec(code, run_globals)
File "/Users/ubuntu/source/playground/doc_batch.py", line 151, in <module>
batch_process_documents(project_id, location, processor_id, gcs_output_uri, gcs_input_prefix)
File "/Users/ubuntu/source/playground/doc_batch.py", line 90, in batch_process_documents
operation.result(timeout=timeout)
File "/Users/ubuntu/source/playground/venv/lib/python3.12/site-packages/google/api_core/future/polling.py", line 261, in result
raise self._exception
google.api_core.exceptions.InvalidArgument: 400 Failed to process all documents. 3: Failed to process all documents.
(venv) ubuntu@MacBook-Pro-2 playground %
2 Answers 2
Your error 400 could be one of the following:
You might exceed your limit in Document AI or the PDF you uploaded might be invalid or corrupted, you can confirm it by checking the file or re-uploading it.
Also try to check your IAM permission for Document AI if it has access. Also double check if the processor ID is correct.
2 Comments
Firstly, the problem was with my list of field to include
field_mask = "entities.id, entities.confidence, entities.type, entities.mentionText"
after looking into the API which was not easy to come by, I realized that the entities.* element does not support retrieving its children. only top level element is possible to retrieve "entities" (unlike the page element for instance).
Unfortunately, the batch process request did not show this error, but after using the API to retrieve the operation status (get_operation), I could see the full error message.
Comments
Explore related questions
See similar questions with these tags.
POSTrequest (see:processors.batchProcess. This eliminates your code and the library from the problem path (it's very unlikely to be the library) and allows you to focus on generating the correct request.