from typing import Optional, Sequence
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
# TODO(developer): Uncomment these variables before running the sample.
# project_id = "YOUR_PROJECT_ID"
# location = "YOUR_PROCESSOR_LOCATION" # Format is "us" or "eu"
# processor_id = "YOUR_PROCESSOR_ID" # Create processor before running sample
# processor_version = "rc" # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information
# file_path = "/path/to/local/pdf"
# mime_type = "application/pdf" # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
def process_document_splitter_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> None:
    """Process a local file with a splitter/classifier processor and print the result.

    The file is sent to Document AI through ``process_document`` and every
    entity of the returned document is reported as one subdocument, together
    with its confidence and the pages it covers.

    Args:
        project_id: Google Cloud project ID.
        location: Processor location, e.g. "us" or "eu".
        processor_id: ID of an already-created processor.
        processor_version: Processor version to run, e.g. "rc".
        file_path: Path of the local file to process.
        mime_type: MIME type of the file, e.g. "application/pdf".
    """
    # Online processing request to Document AI
    document = process_document(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        processor_version=processor_version,
        file_path=file_path,
        mime_type=mime_type,
    )

    # Read the splitter output from a document splitter/classifier processor:
    # e.g. https://cloud.google.com/document-ai/docs/processors-list#processor_procurement-document-splitter
    # Such a processor only returns the document text plus information on how
    # to split the document on logical boundaries. To identify and extract
    # text, form elements, and entities, use other processors such as the OCR,
    # form, and specialized processors.
    print(f"Found {len(document.entities)} subdocuments:")

    for entity in document.entities:
        conf_percent = f"{entity.confidence:.1%}"
        pages_range = page_refs_to_string(entity.page_anchor.page_refs)

        # No subdocument type available: report only the confidence and pages.
        if not entity.type_:
            print(f"{conf_percent} confident that {pages_range} a subdocument.")
            continue

        print(
            f"{conf_percent} confident that {pages_range} a '{entity.type_}' subdocument."
        )
def page_refs_to_string(
    page_refs: Sequence[documentai.Document.PageAnchor.PageRef],
) -> str:
    """Describe the pages referenced by ``page_refs`` as a sentence fragment.

    Each ref's ``page`` value is converted to an int and incremented by one
    before display (presumably turning a zero-based index into a
    human-friendly page number).

    Returns:
        ``"page N is"`` when exactly one ref is given; otherwise
        ``"pages A, B, ... are"`` with the numbers joined by ", ". The
        trailing verb lets callers embed the result directly in a sentence.
    """
    pages = []
    for ref in page_refs:
        pages.append(str(int(ref.page) + 1))

    # Any count other than one (including zero) uses the plural wording.
    if len(pages) != 1:
        return f"pages {', '.join(pages)} are"
    return f"page {pages[0]} is"
def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.Document:
    """Send a local file to a Document AI processor version and return the result.

    Args:
        project_id: Google Cloud project ID.
        location: Processor location; also used to build the API endpoint.
        processor_id: ID of the processor. It must be created beforehand.
        processor_version: ID of the processor version to invoke.
        file_path: Path of the local file, read in binary mode.
        mime_type: MIME type of the file content.
        process_options: Optional extra processing options, passed through
            unchanged to the request.

    Returns:
        The ``document`` attribute of the service's process response.
    """
    # You must set the `api_endpoint` if you use a location other than "us".
    endpoint_options = ClientOptions(
        api_endpoint=f"{location}-documentai.googleapis.com"
    )
    client = documentai.DocumentProcessorServiceClient(
        client_options=endpoint_options
    )

    # The full resource name of the processor version, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
    name = client.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    # Load the whole file into memory as raw bytes.
    with open(file_path, "rb") as source_file:
        file_bytes = source_file.read()

    raw_document = documentai.RawDocument(content=file_bytes, mime_type=mime_type)

    # Assemble the process request.
    request = documentai.ProcessRequest(
        name=name,
        raw_document=raw_document,
        # Only supported for Document OCR processor
        process_options=process_options,
    )

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    return client.process_document(request=request).document