从 Cloud Storage 加载已处理的文档(或文档分片)以进行后处理。
深入探索
如需查看包含此代码示例的详细文档,请参阅以下内容:
代码示例
Python
如需了解详情,请参阅 Document AI Python API 参考文档。
如需向 Document AI 进行身份验证,请设置应用默认凭据。 如需了解详情,请参阅为本地开发环境设置身份验证。
from typing import Optional
from google.cloud import documentai
from google.cloud.documentai_toolbox import document, gcs_utilities
# TODO(developer): Uncomment these variables before running the sample.
# Given a Document JSON or sharded Document JSON in path gs://bucket/path/to/folder
# gcs_bucket_name = "bucket"
# gcs_prefix = "path/to/folder"
# Or, given a Document JSON in path gs://bucket/path/to/folder/document.json
# gcs_uri = "gs://bucket/path/to/folder/document.json"
# Or, given a Document JSON in path local/path/to/folder/document.json
# document_path = "local/path/to/folder/document.json"
# Or, given a Document object from Document AI
# documentai_document = documentai.Document()
# Or, given a BatchProcessMetadata object from Document AI
# operation = client.batch_process_documents(request)
# operation.result(timeout=timeout)
# batch_process_metadata = documentai.BatchProcessMetadata(operation.metadata)
# Or, given a BatchProcessOperation name from Document AI
# batch_process_operation = "projects/project_id/locations/location/operations/operation_id"
def quickstart_sample(
    gcs_bucket_name: Optional[str] = None,
    gcs_prefix: Optional[str] = None,
    gcs_uri: Optional[str] = None,
    document_path: Optional[str] = None,
    documentai_document: Optional[documentai.Document] = None,
    batch_process_metadata: Optional[documentai.BatchProcessMetadata] = None,
    batch_process_operation: Optional[str] = None,
    location: str = "us",
) -> document.Document:
    """Load a processed Document AI document, print its contents, and return it.

    Exactly one document source is used. Sources are checked in the order
    listed below and the first one that is provided wins.

    Args:
        gcs_bucket_name: Cloud Storage bucket holding the Document JSON
            (or sharded Document JSON). Only used together with `gcs_prefix`.
        gcs_prefix: Folder prefix inside `gcs_bucket_name`.
        gcs_uri: Cloud Storage URI of a single Document JSON file.
        document_path: Path to a local Document JSON file.
        documentai_document: An in-memory `documentai.Document` object.
        batch_process_metadata: Metadata of a batch processing operation;
            only the first resulting document is used.
        batch_process_operation: Name of a batch processing operation;
            only the first resulting document is used.
        location: Location passed to the toolbox when resolving
            `batch_process_operation`. Defaults to "us".

    Returns:
        The wrapped document that was loaded and printed.

    Raises:
        ValueError: If no document source was provided.
    """
    if gcs_bucket_name and gcs_prefix:
        # Load from Google Cloud Storage Directory
        print("Document structure in Cloud Storage")
        gcs_utilities.print_gcs_document_tree(
            gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
        )

        wrapped_document = document.Document.from_gcs(
            gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
        )
    elif gcs_uri:
        # Load a single Document from a Google Cloud Storage URI
        wrapped_document = document.Document.from_gcs_uri(gcs_uri=gcs_uri)
    elif document_path:
        # Load from local `Document` JSON file
        wrapped_document = document.Document.from_document_path(document_path)
    elif documentai_document:
        # Load from `documentai.Document` object
        wrapped_document = document.Document.from_documentai_document(
            documentai_document
        )
    elif batch_process_metadata:
        # Load Documents from `BatchProcessMetadata` object
        wrapped_documents = document.Document.from_batch_process_metadata(
            metadata=batch_process_metadata
        )
        # The sample only inspects the first document of the batch.
        wrapped_document = wrapped_documents[0]
    elif batch_process_operation:
        # Load Documents from a batch processing operation name
        wrapped_documents = document.Document.from_batch_process_operation(
            location=location, operation_name=batch_process_operation
        )
        # The sample only inspects the first document of the batch.
        wrapped_document = wrapped_documents[0]
    else:
        raise ValueError("No document source provided.")

    # For all properties and methods, refer to:
    # https://cloud.google.com/python/docs/reference/documentai-toolbox/latest/google.cloud.documentai_toolbox.wrappers.document.Document

    print("Document Successfully Loaded!")
    print(f"\t Number of Pages: {len(wrapped_document.pages)}")
    print(f"\t Number of Entities: {len(wrapped_document.entities)}")

    for page in wrapped_document.pages:
        print(f"Page {page.page_number}")
        for block in page.blocks:
            print(block.text)
        for paragraph in page.paragraphs:
            print(paragraph.text)
        for line in page.lines:
            print(line.text)
        for token in page.tokens:
            print(token.text)

        # Only supported with Form Parser processor
        # https://cloud.google.com/document-ai/docs/form-parser
        for form_field in page.form_fields:
            print(f"{form_field.field_name} : {form_field.field_value}")

        # Only supported with Enterprise Document OCR version `pretrained-ocr-v2.0-2023-06-02`
        # https://cloud.google.com/document-ai/docs/process-documents-ocr#enable_symbols
        for symbol in page.symbols:
            print(symbol.text)

        # Only supported with Enterprise Document OCR version `pretrained-ocr-v2.0-2023-06-02`
        # https://cloud.google.com/document-ai/docs/process-documents-ocr#math_ocr
        for math_formula in page.math_formulas:
            print(math_formula.text)

    # Only supported with Entity Extraction processors
    # https://cloud.google.com/document-ai/docs/processors-list
    for entity in wrapped_document.entities:
        print(f"{entity.type_} : {entity.mention_text}")
        if entity.normalized_text:
            print(f"\tNormalized Text: {entity.normalized_text}")

    # Only supported with Layout Parser
    for chunk in wrapped_document.chunks:
        print(f"Chunk {chunk.chunk_id}: {chunk.content}")

    for block in wrapped_document.document_layout_blocks:
        print(f"Document Layout Block {block.block_id}")

        if block.text_block:
            print(f"{block.text_block.type_}: {block.text_block.text}")
        if block.list_block:
            print(f"{block.list_block.type_}: {block.list_block.list_entries}")
        if block.table_block:
            print(block.table_block.header_rows, block.table_block.body_rows)

    # Honor the declared return type so callers can keep working with the
    # loaded document instead of receiving None.
    return wrapped_document
后续步骤
如需搜索和过滤其他 Google Cloud 产品的代码示例,请参阅 Google Cloud 示例浏览器。