检测 Cloud Storage PDF 文件中的文本

对 Cloud Storage 中存储的 PDF 文件执行光学字符识别 (OCR)。

深入探索

如需查看包含此代码示例的详细文档，请参阅以下内容：

代码示例

Go

试用此示例之前，请按照《Vision 快速入门：使用客户端库》中的 Go 设置说明进行操作。如需了解详情，请参阅 Vision Go API 参考文档。

如需向 Vision 进行身份验证，请设置应用默认凭证。如需了解详情，请参阅为本地开发环境设置身份验证。


// detectAsyncDocumentURI performs Optical Character Recognition (OCR) on a
// PDF file stored in GCS.
func detectAsyncDocumentURI(w io.Writer, gcsSourceURI, gcsDestinationURI string) error {
	ctx := context.Background()

	client, err := vision.NewImageAnnotatorClient(ctx)
	if err != nil {
		return err
	}

	request := &visionpb.AsyncBatchAnnotateFilesRequest{
		Requests: []*visionpb.AsyncAnnotateFileRequest{
			{
				Features: []*visionpb.Feature{
					{
						Type: visionpb.Feature_DOCUMENT_TEXT_DETECTION,
					},
				},
				InputConfig: &visionpb.InputConfig{
					GcsSource: &visionpb.GcsSource{Uri: gcsSourceURI},
					// Supported MimeTypes are: "application/pdf" and "image/tiff".
					MimeType: "application/pdf",
				},
				OutputConfig: &visionpb.OutputConfig{
					GcsDestination: &visionpb.GcsDestination{Uri: gcsDestinationURI},
					// How many pages should be grouped into each json output file.
					BatchSize: 2,
				},
			},
		},
	}

	operation, err := client.AsyncBatchAnnotateFiles(ctx, request)
	if err != nil {
		return err
	}

	fmt.Fprintf(w, "Waiting for the operation to finish.")

	resp, err := operation.Wait(ctx)
	if err != nil {
		return err
	}

	fmt.Fprintf(w, "%v", resp)

	return nil
}

Java

试用此示例之前，请按照《Vision 快速入门：使用客户端库》中的 Java 设置说明进行操作。如需了解详情，请参阅 Vision Java API 参考文档。

如需向 Vision 进行身份验证，请设置应用默认凭证。如需了解详情，请参阅为本地开发环境设置身份验证。

/**
 * Performs document text OCR with PDF/TIFF as source files on Google Cloud Storage.
 *
 * @param gcsSourcePath The path to the remote file on Google Cloud Storage to detect document
 *     text on.
 * @param gcsDestinationPath The path to the remote file on Google Cloud Storage to store the
 *     results on.
 * @throws Exception on errors while closing the client.
 */
public static void detectDocumentsGcs(String gcsSourcePath, String gcsDestinationPath)
    throws Exception {

  // Initialize client that will be used to send requests. This client only needs to be created
  // once, and can be reused for multiple requests. After completing all of your requests, call
  // the "close" method on the client to safely clean up any remaining background resources.
  try (ImageAnnotatorClient client = ImageAnnotatorClient.create()) {
    List<AsyncAnnotateFileRequest> requests = new ArrayList<>();

    // Set the GCS source path for the remote file.
    GcsSource gcsSource = GcsSource.newBuilder().setUri(gcsSourcePath).build();

    // Create the configuration with the specified MIME (Multipurpose Internet Mail Extensions)
    // types
    InputConfig inputConfig =
        InputConfig.newBuilder()
            .setMimeType(
                "application/pdf") // Supported MimeTypes: "application/pdf", "image/tiff"
            .setGcsSource(gcsSource)
            .build();

    // Set the GCS destination path for where to save the results.
    GcsDestination gcsDestination =
        GcsDestination.newBuilder().setUri(gcsDestinationPath).build();

    // Create the configuration for the System.output with the batch size.
    // The batch size sets how many pages should be grouped into each json System.output file.
    OutputConfig outputConfig =
        OutputConfig.newBuilder().setBatchSize(2).setGcsDestination(gcsDestination).build();

    // Select the Feature required by the vision API
    Feature feature = Feature.newBuilder().setType(Feature.Type.DOCUMENT_TEXT_DETECTION).build();

    // Build the OCR request
    AsyncAnnotateFileRequest request =
        AsyncAnnotateFileRequest.newBuilder()
            .addFeatures(feature)
            .setInputConfig(inputConfig)
            .setOutputConfig(outputConfig)
            .build();

    requests.add(request);

    // Perform the OCR request
    OperationFuture<AsyncBatchAnnotateFilesResponse, OperationMetadata> response =
        client.asyncBatchAnnotateFilesAsync(requests);

    System.out.println("Waiting for the operation to finish.");

    // Wait for the request to finish. (The result is not used, since the API saves the result to
    // the specified location on GCS.)
    List<AsyncAnnotateFileResponse> result =
        response.get(180, TimeUnit.SECONDS).getResponsesList();

    // Once the request has completed and the System.output has been
    // written to GCS, we can list all the System.output files.
    Storage storage = StorageOptions.getDefaultInstance().getService();

    // Get the destination location from the gcsDestinationPath
    Pattern pattern = Pattern.compile("gs://([^/]+)/(.+)");
    Matcher matcher = pattern.matcher(gcsDestinationPath);

    if (matcher.find()) {
      String bucketName = matcher.group(1);
      String prefix = matcher.group(2);

      // Get the list of objects with the given prefix from the GCS bucket
      Bucket bucket = storage.get(bucketName);
      com.google.api.gax.paging.Page<Blob> pageList = bucket.list(BlobListOption.prefix(prefix));

      Blob firstOutputFile = null;

      // List objects with the given prefix.
      System.out.println("Output files:");
      for (Blob blob : pageList.iterateAll()) {
        System.out.println(blob.getName());

        // Process the first System.output file from GCS.
        // Since we specified batch size = 2, the first response contains
        // the first two pages of the input file.
        if (firstOutputFile == null) {
          firstOutputFile = blob;
        }
      }

      // Get the contents of the file and convert the JSON contents to an AnnotateFileResponse
      // object. If the Blob is small read all its content in one request
      // (Note: the file is a .json file)
      // Storage guide: https://cloud.google.com/storage/docs/downloading-objects
      String jsonContents = new String(firstOutputFile.getContent());
      Builder builder = AnnotateFileResponse.newBuilder();
      JsonFormat.parser().merge(jsonContents, builder);

      // Build the AnnotateFileResponse object
      AnnotateFileResponse annotateFileResponse = builder.build();

      // Parse through the object to get the actual response for the first page of the input file.
      AnnotateImageResponse annotateImageResponse = annotateFileResponse.getResponses(0);

      // Here we print the full text from the first page.
      // The response contains more information:
      // annotation/pages/blocks/paragraphs/words/symbols
      // including confidence score and bounding boxes
      System.out.format("%nText: %s%n", annotateImageResponse.getFullTextAnnotation().getText());
    } else {
      System.out.println("No MATCH");
    }
  }
}

Node.js

试用此示例之前，请按照《Vision 快速入门：使用客户端库》中的 Node.js 设置说明进行操作。如需了解详情，请参阅 Vision Node.js API 参考文档。

如需向 Vision 进行身份验证，请设置应用默认凭证。如需了解详情，请参阅为本地开发环境设置身份验证。


// Imports the Google Cloud client libraries
const vision = require('@google-cloud/vision').v1;

// Creates a client
const client = new vision.ImageAnnotatorClient();

/**
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// Bucket where the file resides
// const bucketName = 'my-bucket';
// Path to PDF file within bucket
// const fileName = 'path/to/document.pdf';
// The folder to store the results
// const outputPrefix = 'results'

const gcsSourceUri = `gs://${bucketName}/${fileName}`;
const gcsDestinationUri = `gs://${bucketName}/${outputPrefix}/`;

const inputConfig = {
  // Supported mime_types are: 'application/pdf' and 'image/tiff'
  mimeType: 'application/pdf',
  gcsSource: {
    uri: gcsSourceUri,
  },
};
const outputConfig = {
  gcsDestination: {
    uri: gcsDestinationUri,
  },
};
const features = [{type: 'DOCUMENT_TEXT_DETECTION'}];
const request = {
  requests: [
    {
      inputConfig: inputConfig,
      features: features,
      outputConfig: outputConfig,
    },
  ],
};

const [operation] = await client.asyncBatchAnnotateFiles(request);
const [filesResponse] = await operation.promise();
const destinationUri =
  filesResponse.responses[0].outputConfig.gcsDestination.uri;
console.log('Json saved to: ' + destinationUri);

PHP

试用此示例之前，请按照《Vision 快速入门：使用客户端库》中的 PHP 设置说明进行操作。如需了解详情，请参阅 Vision PHP API 参考文档。

如需向 Vision 进行身份验证，请设置应用默认凭证。如需了解详情，请参阅为本地开发环境设置身份验证。

namespace Google\Cloud\Samples\Vision;

use Google\Cloud\Storage\StorageClient;
use Google\Cloud\Vision\V1\AnnotateFileResponse;
use Google\Cloud\Vision\V1\AsyncAnnotateFileRequest;
use Google\Cloud\Vision\V1\Feature;
use Google\Cloud\Vision\V1\Feature\Type;
use Google\Cloud\Vision\V1\GcsDestination;
use Google\Cloud\Vision\V1\GcsSource;
use Google\Cloud\Vision\V1\ImageAnnotatorClient;
use Google\Cloud\Vision\V1\InputConfig;
use Google\Cloud\Vision\V1\OutputConfig;

/**
 * @param string $path    GCS path to the document, e.g. "gs://path/to/your/document.pdf"
 * @param string $output  GCS path to store the results, e.g. "gs://path/to/store/results/"
 */
function detect_pdf_gcs(string $path, string $output)
{
    # select ocr feature
    $feature = (new Feature())
        ->setType(Type::DOCUMENT_TEXT_DETECTION);

    # set $path (file to OCR) as source
    $gcsSource = (new GcsSource())
        ->setUri($path);
    # supported mime_types are: 'application/pdf' and 'image/tiff'
    $mimeType = 'application/pdf';
    $inputConfig = (new InputConfig())
        ->setGcsSource($gcsSource)
        ->setMimeType($mimeType);

    # set $output as destination
    $gcsDestination = (new GcsDestination())
        ->setUri($output);
    # how many pages should be grouped into each json output file.
    $batchSize = 2;
    $outputConfig = (new OutputConfig())
        ->setGcsDestination($gcsDestination)
        ->setBatchSize($batchSize);

    # prepare request using configs set above
    $request = (new AsyncAnnotateFileRequest())
        ->setFeatures([$feature])
        ->setInputConfig($inputConfig)
        ->setOutputConfig($outputConfig);
    $requests = [$request];

    # make request
    $imageAnnotator = new ImageAnnotatorClient();
    $operation = $imageAnnotator->asyncBatchAnnotateFiles($requests);
    print('Waiting for operation to finish.' . PHP_EOL);
    $operation->pollUntilComplete();

    # once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    preg_match('/^gs:\/\/([a-zA-Z0-9\._\-]+)\/?(\S+)?$/', $output, $match);
    $bucketName = $match[1];
    $prefix = isset($match[2]) ? $match[2] : '';

    $storage = new StorageClient();
    $bucket = $storage->bucket($bucketName);
    $options = ['prefix' => $prefix];
    $objects = $bucket->objects($options);

    # save first object for sample below
    $objects->next();
    $firstObject = $objects->current();

    # list objects with the given prefix.
    print('Output files:' . PHP_EOL);
    foreach ($objects as $object) {
        print($object->name() . PHP_EOL);
    }

    # process the first output file from GCS.
    # since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    $jsonString = $firstObject->downloadAsString();
    $firstBatch = new AnnotateFileResponse();
    $firstBatch->mergeFromJsonString($jsonString);

    # get annotation and print text
    foreach ($firstBatch->getResponses() as $response) {
        $annotation = $response->getFullTextAnnotation();
        print($annotation->getText());
    }

    $imageAnnotator->close();
}

Python

试用此示例之前，请按照《Vision 快速入门：使用客户端库》中的 Python 设置说明进行操作。如需了解详情，请参阅 Vision Python API 参考文档。

如需向 Vision 进行身份验证，请设置应用默认凭证。如需了解详情，请参阅为本地开发环境设置身份验证。

def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """OCR with PDF/TIFF as source files on GCS"""
    import json
    import re
    from google.cloud import vision
    from google.cloud import storage

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = "application/pdf"

    # How many pages should be grouped into each json output file.
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size
    )

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config, output_config=output_config
    )

    operation = client.async_batch_annotate_files(requests=[async_request])

    print("Waiting for the operation to finish.")
    operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    match = re.match(r"gs://([^/]+)/(.+)", gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix, filtering out folders.
    blob_list = [
        blob
        for blob in list(bucket.list_blobs(prefix=prefix))
        if not blob.name.endswith("/")
    ]
    print("Output files:")
    for blob in blob_list:
        print(blob.name)

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_bytes().decode("utf-8")
    response = json.loads(json_string)

    # The actual response for the first page of the input file.
    first_page_response = response["responses"][0]
    annotation = first_page_response["fullTextAnnotation"]

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print("Full text:\n")
    print(annotation["text"])

后续步骤

如需搜索和过滤其他 Google Cloud 产品的代码示例，请参阅Google Cloud 示例浏览器。

检测 Cloud Storage PDF 文件中的文本 使用集合让一切井井有条 根据您的偏好保存内容并对其进行分类。

深入探索

代码示例

Go

Java

Node.js

PHP

Python

后续步骤

检测 Cloud Storage PDF 文件中的文本