Document AI Toolbox 客户端库

本页介绍了如何开始使用 Document AI Toolbox API 的 Cloud 客户端库。通过客户端库,您可以更轻松地使用支持的语言访问Google Cloud API。虽然您可以通过向服务器发出原始请求来直接使用Google Cloud API,但客户端库可实现简化,从而显著减少您需要编写的代码量。

请参阅客户端库说明,详细了解 Cloud 客户端库和旧版 Google API 客户端库。

安装客户端库

Python

pip install --upgrade google-cloud-documentai-toolbox

如需了解详情,请参阅设置 Python 开发环境

设置身份验证

为了对 Google Cloud API 的调用进行身份验证,客户端库支持应用默认凭据 (ADC);这些库会在一组指定的位置查找凭据,并使用这些凭据对发送到 API 的请求进行身份验证。借助 ADC,您可以在各种环境(例如本地开发或生产环境)中为您的应用提供凭据,而无需修改应用代码。

对于生产环境,设置 ADC 的方式取决于服务和上下文。如需了解详情,请参阅设置应用默认凭据

对于本地开发环境,您可以使用与您的 Google 账号关联的凭据设置 ADC:

  1. Install the Google Cloud CLI, then initialize it by running the following command:

    gcloud init
  2. If you're using a local shell, then create local authentication credentials for your user account:

    gcloud auth application-default login

    You don't need to do this if you're using Cloud Shell.

    登录屏幕随即出现。在您登录后,您的凭据会存储在 ADC 使用的本地凭据文件中。

使用客户端库

Document AI Toolbox 是一个适用于 Python 的 SDK,提供实用函数来管理、处理和提取文档响应中的信息。它会根据 Cloud Storage 中的 JSON 文件、本地 JSON 文件或直接从 process_document() 方法输出的处理完毕的文档响应创建“封装”文档对象。

它可以执行以下操作:

代码示例

以下代码示例演示了如何使用 Document AI Toolbox。

快速入门

from typing import Optional

from google.cloud import documentai
from google.cloud.documentai_toolbox import document, gcs_utilities

# TODO(developer): Uncomment these variables before running the sample.
# Given a Document JSON or sharded Document JSON in path gs://bucket/path/to/folder
# gcs_bucket_name = "bucket"
# gcs_prefix = "path/to/folder"

# Or, given a Document JSON in path gs://bucket/path/to/folder/document.json
# gcs_uri = "gs://bucket/path/to/folder/document.json"

# Or, given a Document JSON in path local/path/to/folder/document.json
# document_path = "local/path/to/folder/document.json"

# Or, given a Document object from Document AI
# documentai_document = documentai.Document()

# Or, given a BatchProcessMetadata object from Document AI
# operation = client.batch_process_documents(request)
# operation.result(timeout=timeout)
# batch_process_metadata = documentai.BatchProcessMetadata(operation.metadata)

# Or, given a BatchProcessOperation name from Document AI
# batch_process_operation = "projects/project_id/locations/location/operations/operation_id"


def quickstart_sample(
    gcs_bucket_name: Optional[str] = None,
    gcs_prefix: Optional[str] = None,
    gcs_uri: Optional[str] = None,
    document_path: Optional[str] = None,
    documentai_document: Optional[documentai.Document] = None,
    batch_process_metadata: Optional[documentai.BatchProcessMetadata] = None,
    batch_process_operation: Optional[str] = None,
) -> document.Document:
    if gcs_bucket_name and gcs_prefix:
        # Load from Google Cloud Storage Directory
        print("Document structure in Cloud Storage")
        gcs_utilities.print_gcs_document_tree(
            gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
        )

        wrapped_document = document.Document.from_gcs(
            gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
        )
    elif gcs_uri:
        # Load a single Document from a Google Cloud Storage URI
        wrapped_document = document.Document.from_gcs_uri(gcs_uri=gcs_uri)
    elif document_path:
        # Load from local `Document` JSON file
        wrapped_document = document.Document.from_document_path(document_path)
    elif documentai_document:
        # Load from `documentai.Document` object
        wrapped_document = document.Document.from_documentai_document(
            documentai_document
        )
    elif batch_process_metadata:
        # Load Documents from `BatchProcessMetadata` object
        wrapped_documents = document.Document.from_batch_process_metadata(
            metadata=batch_process_metadata
        )
        wrapped_document = wrapped_documents[0]
    elif batch_process_operation:
        wrapped_documents = document.Document.from_batch_process_operation(
            location="us", operation_name=batch_process_operation
        )
        wrapped_document = wrapped_documents[0]
    else:
        raise ValueError("No document source provided.")

    # For all properties and methods, refer to:
    # https://cloud.google.com/python/docs/reference/documentai-toolbox/latest/google.cloud.documentai_toolbox.wrappers.document.Document

    print("Document Successfully Loaded!")
    print(f"\t Number of Pages: {len(wrapped_document.pages)}")
    print(f"\t Number of Entities: {len(wrapped_document.entities)}")

    for page in wrapped_document.pages:
        print(f"Page {page.page_number}")
        for block in page.blocks:
            print(block.text)
        for paragraph in page.paragraphs:
            print(paragraph.text)
        for line in page.lines:
            print(line.text)
        for token in page.tokens:
            print(token.text)

        # Only supported with Form Parser processor
        # https://cloud.google.com/document-ai/docs/form-parser
        for form_field in page.form_fields:
            print(f"{form_field.field_name} : {form_field.field_value}")

        # Only supported with Enterprise Document OCR version `pretrained-ocr-v2.0-2023-06-02`
        # https://cloud.google.com/document-ai/docs/process-documents-ocr#enable_symbols
        for symbol in page.symbols:
            print(symbol.text)

        # Only supported with Enterprise Document OCR version `pretrained-ocr-v2.0-2023-06-02`
        # https://cloud.google.com/document-ai/docs/process-documents-ocr#math_ocr
        for math_formula in page.math_formulas:
            print(math_formula.text)

    # Only supported with Entity Extraction processors
    # https://cloud.google.com/document-ai/docs/processors-list
    for entity in wrapped_document.entities:
        print(f"{entity.type_} : {entity.mention_text}")
        if entity.normalized_text:
            print(f"\tNormalized Text: {entity.normalized_text}")

    # Only supported with Layout Parser
    for chunk in wrapped_document.chunks:
        print(f"Chunk {chunk.chunk_id}: {chunk.content}")

    for block in wrapped_document.document_layout_blocks:
        print(f"Document Layout Block {block.block_id}")

        if block.text_block:
            print(f"{block.text_block.type_}: {block.text_block.text}")
        if block.list_block:
            print(f"{block.list_block.type_}: {block.list_block.list_entries}")
        if block.table_block:
            print(block.table_block.header_rows, block.table_block.body_rows)


from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# Given a local document.proto or sharded document.proto in path
# document_path = "path/to/local/document.json"
# output_file_prefix = "output/table"


def table_sample(document_path: str, output_file_prefix: str) -> None:
    wrapped_document = document.Document.from_document_path(document_path=document_path)

    print("Tables in Document")
    for page in wrapped_document.pages:
        for table_index, table in enumerate(page.tables):
            # Convert table to Pandas Dataframe
            # Refer to https://pandas.pydata.org/docs/reference/frame.html for all supported methods
            df = table.to_dataframe()
            print(df)

            output_filename = f"{output_file_prefix}-{page.page_number}-{table_index}"

            # Write Dataframe to CSV file
            df.to_csv(f"{output_filename}.csv", index=False)

            # Write Dataframe to HTML file
            df.to_html(f"{output_filename}.html", index=False)

            # Write Dataframe to Markdown file
            df.to_markdown(f"{output_filename}.md", index=False)

BigQuery Export


from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder
# gcs_bucket_name = "bucket"
# gcs_prefix = "path/to/folder"
# dataset_name = "test_dataset"
# table_name = "test_table"
# project_id = "YOUR_PROJECT_ID"


def entities_to_bigquery_sample(
    gcs_bucket_name: str,
    gcs_prefix: str,
    dataset_name: str,
    table_name: str,
    project_id: str,
) -> None:
    wrapped_document = document.Document.from_gcs(
        gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
    )

    job = wrapped_document.entities_to_bigquery(
        dataset_name=dataset_name, table_name=table_name, project_id=project_id
    )

    # Also supported:
    # job = wrapped_document.form_fields_to_bigquery(
    #     dataset_name=dataset_name, table_name=table_name, project_id=project_id
    # )

    print("Document entities loaded into BigQuery")
    print(f"Job ID: {job.job_id}")
    print(f"Table: {job.destination.path}")

PDF 拆分


from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# Given a local document.proto or sharded document.proto from a splitter/classifier in path
# document_path = "path/to/local/document.json"
# pdf_path = "path/to/local/document.pdf"
# output_path = "resources/output/"


def split_pdf_sample(document_path: str, pdf_path: str, output_path: str) -> None:
    wrapped_document = document.Document.from_document_path(document_path=document_path)

    output_files = wrapped_document.split_pdf(
        pdf_path=pdf_path, output_path=output_path
    )

    print("Document Successfully Split")
    for output_file in output_files:
        print(output_file)

图片提取


from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# Given a local document.proto or sharded document.proto from an identity processor in path
# document_path = "path/to/local/document.json"
# output_path = "resources/output/"
# output_file_prefix = "exported_photo"
# output_file_extension = "png"


def export_images_sample(
    document_path: str,
    output_path: str,
    output_file_prefix: str,
    output_file_extension: str,
) -> None:
    wrapped_document = document.Document.from_document_path(document_path=document_path)

    output_files = wrapped_document.export_images(
        output_path=output_path,
        output_file_prefix=output_file_prefix,
        output_file_extension=output_file_extension,
    )
    print("Images Successfully Exported")
    for output_file in output_files:
        print(output_file)

视觉转化


from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder
# gcs_bucket_name = "bucket"
# gcs_prefix = "path/to/folder"


def convert_document_to_vision_sample(
    gcs_bucket_name: str,
    gcs_prefix: str,
) -> None:
    wrapped_document = document.Document.from_gcs(
        gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
    )

    # Converting wrapped_document to vision AnnotateFileResponse
    annotate_file_response = (
        wrapped_document.convert_document_to_annotate_file_response()
    )

    print("Document converted to AnnotateFileResponse!")
    print(
        f"Number of Pages : {len(annotate_file_response.responses[0].full_text_annotation.pages)}"
    )

hOCR 转换


from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder
# document_path = "path/to/local/document.json"
# document_title = "your-document-title"


def convert_document_to_hocr_sample(document_path: str, document_title: str) -> str:
    wrapped_document = document.Document.from_document_path(document_path=document_path)

    # Converting wrapped_document to hOCR format
    hocr_string = wrapped_document.export_hocr_str(title=document_title)

    print("Document converted to hOCR!")
    return hocr_string

第三方转化


from google.cloud.documentai_toolbox import converter

# TODO(developer): Uncomment these variables before running the sample.
# This sample will convert external annotations to the Document.json format used by Document AI Workbench for training.
# To process this the external annotation must have these type of objects:
#       1) Type
#       2) Text
#       3) Bounding Box (bounding boxes must be 1 of the 3 optional types)
#
# This is the bare minimum requirement to convert the annotations but for better accuracy you will need to also have:
#       1) Document width & height
#
# Bounding Box Types:
#   Type 1:
#       bounding_box:[{"x":1,"y":2},{"x":2,"y":2},{"x":2,"y":3},{"x":1,"y":3}]
#   Type 2:
#       bounding_box:{ "Width": 1, "Height": 1, "Left": 1, "Top": 1}
#   Type 3:
#       bounding_box: [1,2,2,2,2,3,1,3]
#
#   Note: If these types are not sufficient you can propose a feature request or contribute the new type and conversion functionality.
#
# Given a folders in gcs_input_path with the following structure :
#
# gs://path/to/input/folder
#   ├──test_annotations.json
#   ├──test_config.json
#   └──test.pdf
#
# An example of the config is in sample-converter-configs/Azure/form-config.json
#
# location = "us",
# processor_id = "my_processor_id"
# gcs_input_path = "gs://path/to/input/folder"
# gcs_output_path = "gs://path/to/input/folder"


def convert_external_annotations_sample(
    location: str,
    processor_id: str,
    project_id: str,
    gcs_input_path: str,
    gcs_output_path: str,
) -> None:
    converter.convert_from_config(
        project_id=project_id,
        location=location,
        processor_id=processor_id,
        gcs_input_path=gcs_input_path,
        gcs_output_path=gcs_output_path,
    )

文档批次


from google.cloud import documentai
from google.cloud.documentai_toolbox import gcs_utilities

# TODO(developer): Uncomment these variables before running the sample.
# Given unprocessed documents in path gs://bucket/path/to/folder
# gcs_bucket_name = "bucket"
# gcs_prefix = "path/to/folder"
# batch_size = 50


def create_batches_sample(
    gcs_bucket_name: str,
    gcs_prefix: str,
    batch_size: int = 50,
) -> None:
    # Creating batches of documents for processing
    batches = gcs_utilities.create_batches(
        gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix, batch_size=batch_size
    )

    print(f"{len(batches)} batch(es) created.")
    for batch in batches:
        print(f"{len(batch.gcs_documents.documents)} files in batch.")
        print(batch.gcs_documents.documents)

        # Use as input for batch_process_documents()
        # Refer to https://cloud.google.com/document-ai/docs/send-request
        # for how to send a batch processing request
        request = documentai.BatchProcessRequest(
            name="processor_name", input_documents=batch
        )
        print(request)

合并文档分片


from google.cloud import documentai
from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder
# gcs_bucket_name = "bucket"
# gcs_prefix = "path/to/folder"
# output_file_name = "path/to/folder/file.json"


def merge_document_shards_sample(
    gcs_bucket_name: str, gcs_prefix: str, output_file_name: str
) -> None:
    wrapped_document = document.Document.from_gcs(
        gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
    )

    merged_document = wrapped_document.to_merged_documentai_document()

    with open(output_file_name, "w") as f:
        f.write(documentai.Document.to_json(merged_document))

    print(f"Document with {len(wrapped_document.shards)} shards successfully merged.")

其他资源

Python

以下列表包含与 Python 版客户端库相关的更多资源的链接: