OCR tutorial step 2 - Detect text

Demonstrates how to extract text from image files uploaded to Cloud Storage by using the Vision API, detect the text's language by using the Translation API, and queue the text for translation over Pub/Sub.
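
In the deployed tutorial, this step runs whenever a file lands in the image bucket: the upload triggers the function, which extracts the text, detects its language, and publishes one Pub/Sub message per target language. You can trigger it with a command such as gsutil cp menu.jpg gs://YOUR_IMAGE_BUCKET_NAME, where the bucket name is a placeholder for the image bucket you configured earlier in the tutorial.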

Code sample

Go


package ocr

import (
	"context"
	"encoding/json"
	"fmt"
	"log"

	"cloud.google.com/go/pubsub"
	"golang.org/x/text/language"
	visionpb "google.golang.org/genproto/googleapis/cloud/vision/v1"
)

// detectText detects the text in an image using the Cloud Vision API,
// detects its language, and queues a translation request for each target
// language. It relies on package-level clients and configuration defined
// in the tutorial's setup code; see the sketch after this sample.
func detectText(ctx context.Context, bucketName, fileName string) error {
	log.Printf("Looking for text in image %v", fileName)
	maxResults := 1
	image := &visionpb.Image{
		Source: &visionpb.ImageSource{
			GcsImageUri: fmt.Sprintf("gs://%s/%s", bucketName, fileName),
		},
	}
	annotations, err := visionClient.DetectTexts(ctx, image, &visionpb.ImageContext{}, maxResults)
	if err != nil {
		return fmt.Errorf("DetectTexts: %v", err)
	}
	text := ""
	if len(annotations) > 0 {
		text = annotations[0].Description
	}
	if len(annotations) == 0 || len(text) == 0 {
		log.Printf("No text detected in image %q. Returning early.", fileName)
		return nil
	}
	log.Printf("Extracted text %q from image (%d chars).", text, len(text))

	detectResponse, err := translateClient.DetectLanguage(ctx, []string{text})
	if err != nil {
		return fmt.Errorf("DetectLanguage: %v", err)
	}
	if len(detectResponse) == 0 || len(detectResponse[0]) == 0 {
		return fmt.Errorf("DetectLanguage gave empty response")
	}
	srcLang := detectResponse[0][0].Language.String()
	log.Printf("Detected language %q for text %q.", srcLang, text)

	// Submit a message to the bus for each target language
	for _, targetLang := range toLang {
		topicName := translateTopic
		if srcLang == targetLang || srcLang == "und" { // detection returns "und" for undefined language
			topicName = resultTopic
		}
		targetTag, err := language.Parse(targetLang)
		if err != nil {
			return fmt.Errorf("language.Parse: %v", err)
		}
		srcTag, err := language.Parse(srcLang)
		if err != nil {
			return fmt.Errorf("language.Parse: %v", err)
		}
		message, err := json.Marshal(ocrMessage{
			Text:     text,
			FileName: fileName,
			Lang:     targetTag,
			SrcLang:  srcTag,
		})
		if err != nil {
			return fmt.Errorf("json.Marshal: %v", err)
		}
		topic := pubsubClient.Topic(topicName)
		ok, err := topic.Exists(ctx)
		if err != nil {
			return fmt.Errorf("Exists: %v", err)
		}
		if !ok {
			topic, err = pubsubClient.CreateTopic(ctx, topicName)
			if err != nil {
				return fmt.Errorf("CreateTopic: %v", err)
			}
		}
		msg := &pubsub.Message{
			Data: []byte(message),
		}
		if _, err = topic.Publish(ctx, msg).Get(ctx); err != nil {
			return fmt.Errorf("topic.Publish: %v", err)
		}
	}
	return nil
}
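
The Go sample relies on package-level state that the tutorial defines in its setup code: the visionClient, translateClient, and pubsubClient clients, the toLang, translateTopic, and resultTopic configuration values, and the ocrMessage type. Here is a minimal sketch of that setup, with the environment-variable names assumed from the other samples on this page (GCP_PROJECT for the project ID is also an assumption):

package ocr

import (
	"context"
	"fmt"
	"os"
	"strings"

	"cloud.google.com/go/pubsub"
	"cloud.google.com/go/translate"
	vision "cloud.google.com/go/vision/apiv1"
	"golang.org/x/text/language"
)

// ocrMessage is the JSON payload passed between the tutorial's functions
// on Pub/Sub.
type ocrMessage struct {
	Text     string       `json:"text"`
	FileName string       `json:"fileName"`
	Lang     language.Tag `json:"lang"`
	SrcLang  language.Tag `json:"srcLang"`
}

var (
	visionClient    *vision.ImageAnnotatorClient
	translateClient *translate.Client
	pubsubClient    *pubsub.Client

	translateTopic string   // Pub/Sub topic for pending translations
	resultTopic    string   // Pub/Sub topic for finished results
	toLang         []string // target language codes
)

// setup initializes the API clients and reads configuration from the
// environment. Call it once before detectText runs, for example at the
// top of the function's entry point.
func setup(ctx context.Context) error {
	translateTopic = os.Getenv("TRANSLATE_TOPIC")
	resultTopic = os.Getenv("RESULT_TOPIC")
	toLang = strings.Split(os.Getenv("TO_LANG"), ",")

	var err error
	if visionClient, err = vision.NewImageAnnotatorClient(ctx); err != nil {
		return fmt.Errorf("vision.NewImageAnnotatorClient: %v", err)
	}
	if translateClient, err = translate.NewClient(ctx); err != nil {
		return fmt.Errorf("translate.NewClient: %v", err)
	}
	if pubsubClient, err = pubsub.NewClient(ctx, os.Getenv("GCP_PROJECT")); err != nil {
		return fmt.Errorf("pubsub.NewClient: %v", err)
	}
	return nil
}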

Java

// Relies on members defined elsewhere in the tutorial's function class:
// logger, publisher, LOCATION_NAME, TO_LANGS, and OcrTranslateApiMessage.
private void detectText(String bucket, String filename) {
  logger.info("Looking for text in image " + filename);

  List<AnnotateImageRequest> visionRequests = new ArrayList<>();
  String gcsPath = String.format("gs://%s/%s", bucket, filename);

  ImageSource imgSource = ImageSource.newBuilder().setGcsImageUri(gcsPath).build();
  Image img = Image.newBuilder().setSource(imgSource).build();

  Feature textFeature = Feature.newBuilder().setType(Feature.Type.TEXT_DETECTION).build();
  AnnotateImageRequest visionRequest =
      AnnotateImageRequest.newBuilder().addFeatures(textFeature).setImage(img).build();
  visionRequests.add(visionRequest);

  // Detect text in an image using the Cloud Vision API
  AnnotateImageResponse visionResponse;
  try (ImageAnnotatorClient client = ImageAnnotatorClient.create()) {
    visionResponse = client.batchAnnotateImages(visionRequests).getResponses(0);
    if (visionResponse == null) {
      logger.info(String.format("Image %s contains no text", filename));
      return;
    }

    // Check for an API error first, so a failed call is not misreported
    // as an image that contains no text
    if (visionResponse.hasError()) {
      logger.log(
          Level.SEVERE, "Error in vision API call: " + visionResponse.getError().getMessage());
      return;
    }

    if (!visionResponse.hasFullTextAnnotation()) {
      logger.info(String.format("Image %s contains no text", filename));
      return;
    }
  } catch (IOException e) {
    // Log error (since IOException cannot be thrown by a Cloud Function)
    logger.log(Level.SEVERE, "Error detecting text: " + e.getMessage(), e);
    return;
  }

  String text = visionResponse.getFullTextAnnotation().getText();
  logger.info("Extracted text from image: " + text);

  // Detect language using the Cloud Translation API
  DetectLanguageRequest languageRequest =
      DetectLanguageRequest.newBuilder()
          .setParent(LOCATION_NAME)
          .setMimeType("text/plain")
          .setContent(text)
          .build();
  DetectLanguageResponse languageResponse;
  try (TranslationServiceClient client = TranslationServiceClient.create()) {
    languageResponse = client.detectLanguage(languageRequest);
  } catch (IOException e) {
    // Log error (since IOException cannot be thrown by a function)
    logger.log(Level.SEVERE, "Error detecting language: " + e.getMessage(), e);
    return;
  }

  if (languageResponse.getLanguagesCount() == 0) {
    logger.info("No languages were detected for text: " + text);
    return;
  }

  String languageCode = languageResponse.getLanguages(0).getLanguageCode();
  logger.info(String.format("Detected language %s for file %s", languageCode, filename));

  // Send a Pub/Sub translation request for every language we're going to translate to
  for (String targetLanguage : TO_LANGS) {
    logger.info("Sending translation request for language " + targetLanguage);
    OcrTranslateApiMessage message = new OcrTranslateApiMessage(text, filename, targetLanguage);
    ByteString byteStr = ByteString.copyFrom(message.toPubsubData());
    PubsubMessage pubsubApiMessage = PubsubMessage.newBuilder().setData(byteStr).build();
    try {
      publisher.publish(pubsubApiMessage).get();
    } catch (InterruptedException | ExecutionException e) {
      // Log error
      logger.log(Level.SEVERE, "Error publishing translation request: " + e.getMessage(), e);
      return;
    }
  }
}

Node.js

/**
 * Detects the text in an image using the Google Vision API.
 *
 * Assumes that vision and translate are API client instances created at
 * module scope, and that publishResult is the Pub/Sub publishing helper
 * defined elsewhere in this tutorial.
 *
 * @param {string} bucketName Cloud Storage bucket name.
 * @param {string} filename Cloud Storage file name.
 * @returns {Promise}
 */
const detectText = async (bucketName, filename) => {
  console.log(`Looking for text in image ${filename}`);
  const [textDetections] = await vision.textDetection(
    `gs://${bucketName}/${filename}`
  );
  const [annotation] = textDetections.textAnnotations;
  const text = annotation ? annotation.description : '';
  if (!text) {
    // Mirror the other samples: stop early when no text was detected
    console.log(`No text detected in image ${filename}. Returning early.`);
    return;
  }
  console.log('Extracted text from image:', text);

  let [translateDetection] = await translate.detect(text);
  if (Array.isArray(translateDetection)) {
    [translateDetection] = translateDetection;
  }
  console.log(
    `Detected language "${translateDetection.language}" for ${filename}`
  );

  // Submit a message to the bus for each language we're going to translate to
  const TO_LANGS = process.env.TO_LANG.split(',');
  const topicName = process.env.TRANSLATE_TOPIC;

  const tasks = TO_LANGS.map(lang => {
    const messageData = {
      text: text,
      filename: filename,
      lang: lang,
    };

    // Helper function that publishes translation result to a Pub/Sub topic
    // For more information on publishing Pub/Sub messages, see this page:
    //   https://cloud.google.com/pubsub/docs/publisher
    return publishResult(topicName, messageData);
  });

  return Promise.all(tasks);
};

Python

def detect_text(bucket, filename):
    # Module-level objects (vision_client, translate_client, publisher,
    # project_id) and imports (json, os, vision) are defined in the
    # tutorial's setup code, not shown here.
    print("Looking for text in image {}".format(filename))

    futures = []

    image = vision.Image(
        source=vision.ImageSource(gcs_image_uri=f"gs://{bucket}/{filename}")
    )
    text_detection_response = vision_client.text_detection(image=image)
    annotations = text_detection_response.text_annotations
    text = annotations[0].description if annotations else ""
    if not text:
        print("No text detected in image {}. Returning early.".format(filename))
        return
    print("Extracted text {} from image ({} chars).".format(text, len(text)))

    detect_language_response = translate_client.detect_language(text)
    src_lang = detect_language_response["language"]
    print("Detected language {} for text {}.".format(src_lang, text))

    # Submit a message to the bus for each target language
    to_langs = os.environ["TO_LANG"].split(",")
    for target_lang in to_langs:
        topic_name = os.environ["TRANSLATE_TOPIC"]
        if src_lang == target_lang or src_lang == "und":
            topic_name = os.environ["RESULT_TOPIC"]
        message = {
            "text": text,
            "filename": filename,
            "lang": target_lang,
            "src_lang": src_lang,
        }
        message_data = json.dumps(message).encode("utf-8")
        topic_path = publisher.topic_path(project_id, topic_name)
        future = publisher.publish(topic_path, data=message_data)
        futures.append(future)
    for future in futures:
        future.result()

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.