Module gcs_utilities (0.10.1a0)

Google Cloud Storage utilities.

Modules Functions

_get_client_info

_get_client_info(
    module: typing.Optional[str] = None,
) -> google.api_core.client_info.ClientInfo

Returns a custom user agent header.

_get_storage_client

_get_storage_client(
    module: typing.Optional[str] = None,
) -> google.cloud.storage.client.Client

Returns a Storage client with custom user agent header.

create_batches

create_batches(
    gcs_bucket_name: str, gcs_prefix: str, batch_size: int = 1000
) -> typing.List[
    google.cloud.documentai_v1.types.document_io.BatchDocumentsInputConfig
]

Create batches of documents in Cloud Storage to process with batch_process_documents().

Parameters
NameDescription
gcs_bucket_name str

Required. The name of the gcs bucket. Format: gs://bucket/optional_folder/target_folder/ where gcs_bucket_name=bucket.

gcs_prefix str

Required. The prefix of the json files in the target_folder. Format: gs://bucket/optional_folder/target_folder/ where gcs_prefix=optional_folder/target_folder.

batch_size int

Optional. Size of each batch of documents. Default is 1000.

Returns
TypeDescription
List[documentai.BatchDocumentsInputConfig]A list of BatchDocumentsInputConfig, each corresponding to one batch.

create_gcs_uri

create_gcs_uri(gcs_bucket_name: str, gcs_prefix: str) -> str

Creates a Cloud Storage uri from the bucket_name and prefix.

Parameters
NameDescription
gcs_bucket_name str

Required. The name of the gcs bucket. Format: gs://{bucket_name}/{optional_folder}/{target_folder}/ where gcs_bucket_name={bucket_name}.

gcs_prefix str

Required. The prefix of the files in the target_folder. Format: gs://{bucket_name}/{optional_folder}/{target_folder}/ where gcs_prefix={optional_folder}/{target_folder}.

get_blob

get_blob(
    gcs_uri: str, module: typing.Optional[str] = "get-bytes"
) -> google.cloud.storage.blob.Blob

Returns a blob from Cloud Storage.

Parameters
NameDescription
gcs_uri str

Required: The fully-qualified Google Cloud Storage URI. Format: gs://{bucket_name}/{optional_folder}/{target_folder}/{target_file}.{ext}

module Optional[str]

Optional. The module for a custom user agent header.

Returns
TypeDescription
storage.blob.BlobThe blob at the given Cloud Storage URI.

get_blobs

get_blobs(
    gcs_uri: typing.Optional[str] = None,
    gcs_bucket_name: typing.Optional[str] = None,
    gcs_prefix: typing.Optional[str] = "/",
    module: typing.Optional[str] = "get-bytes",
) -> typing.List[google.cloud.storage.blob.Blob]

Returns a list of blobs from Cloud Storage.

Parameters
NameDescription
gcs_uri Optional[str]

Optional: The fully-qualified Google Cloud Storage URI. You must provide either gcs_uri or both gcs_bucket_name and gcs_prefix. Format: gs://{bucket_name}/{optional_folder}/{target_folder}/

gcs_bucket_name Optional[str]

Optional. The name of the gcs bucket. You must provide either gcs_uri or both gcs_bucket_name and gcs_prefix. Format: gs://{bucket_name}/{optional_folder}/{target_folder}/ where gcs_bucket_name={bucket_name}.

gcs_prefix Optional[str]

Optional. The prefix of the files in the target_folder. You must provide either gcs_uri or both gcs_bucket_name and gcs_prefix. Format: gs://{bucket_name}/{optional_folder}/{target_folder}/ where gcs_prefix={optional_folder}/{target_folder}.

module Optional[str]

Optional. The module for a custom user agent header.

Returns
TypeDescription
List[storage.blob.Blob]A list of the blobs in the Cloud Storage path.

get_bytes

get_bytes(gcs_bucket_name: str, gcs_prefix: str) -> typing.List[bytes]

Returns a list of bytes of json files from Cloud Storage.

Parameters
NameDescription
gcs_bucket_name str

Required. The name of the gcs bucket. Format: gs://{bucket_name}/{optional_folder}/{target_folder}/ where gcs_bucket_name={bucket_name}.

gcs_prefix str

Required. The prefix of the json files in the target_folder. Format: gs://{bucket_name}/{optional_folder}/{target_folder}/ where gcs_prefix={optional_folder}/{target_folder}.

Returns
TypeDescription
List[bytes]A list of bytes.

list_gcs_document_tree

list_gcs_document_tree(
    gcs_bucket_name: str, gcs_prefix: str
) -> typing.Dict[str, typing.List[str]]

Returns the paths to the files in a Cloud Storage folder.

Parameters
NameDescription
gcs_bucket_name str

Required. The name of the gcs bucket. Format: gs://{bucket_name}/{optional_folder}/{target_folder}/ where gcs_bucket_name={bucket_name}.

gcs_prefix str

Required. The prefix of the json files in the target_folder. Format: gs://{bucket_name}/{optional_folder}/{target_folder}/ where gcs_prefix={optional_folder}/{target_folder}.

Returns
TypeDescription
Dict[str, List[str]]The paths to documents in gs://{gcs_bucket_name}/{gcs_prefix}.

print_gcs_document_tree

print_gcs_document_tree(
    gcs_bucket_name: str, gcs_prefix: str, files_to_display: int = 4
) -> None

Prints a tree of filenames in a Cloud Storage folder.

Parameters
NameDescription
gcs_bucket_name str

Required. The name of the gcs bucket. Format: gs://{bucket_name}/{optional_folder}/{target_folder}/ where gcs_bucket_name={bucket_name}.

gcs_prefix str

Required. The prefix of the json files in the target_folder. Format: gs://{bucket_name}/{optional_folder}/{target_folder}/ where gcs_prefix={optional_folder}/{target_folder}.

files_to_display int

Optional. The number of files to display. Default is 4.

split_gcs_uri

split_gcs_uri(gcs_uri: str) -> typing.Tuple[str, str]

Splits a Cloud Storage uri into the bucket_name and prefix.

Parameter
NameDescription
gcs_uri str

Required. The full Cloud Storage URI. Format: gs://{bucket_name}/{gcs_prefix}.

Returns
TypeDescription
Tuple[str, str]The Cloud Storage Bucket and Prefix.

upload_file

upload_file(
    gcs_output_directory: str,
    file_name: str,
    file_content: str,
    content_type: str = "application/json",
    module: typing.Optional[str] = "upload-file",
) -> None

Uploads the converted docproto to gcs.

Parameters
NameDescription
gcs_output_directory str

Required: The Google Cloud Storage directory to output the file. Format: gs://{bucket}/{optional_folder}

file_name str

Required. The name of the file with extension.

file_content str

Required. The docproto file in string format.

content_type str

Optional. The Media Type (MIME Type) of the file to upload. Default: application/json