Analyze syntax of a Cloud Storage file

Break up the text from a file stored in Cloud Storage into a series of sentences and tokens (generally, words) and provide linguistic information about those tokens.

Documentation pages that include this code sample

To view the code sample used in context, see the following documentation:

Code sample

Go


func analyzeSyntaxFromGCS(ctx context.Context, gcsURI string) (*languagepb.AnnotateTextResponse, error) {
	return client.AnnotateText(ctx, &languagepb.AnnotateTextRequest{
		Document: &languagepb.Document{
			Source: &languagepb.Document_GcsContentUri{
				GcsContentUri: gcsURI,
			},
			Type: languagepb.Document_PLAIN_TEXT,
		},
		Features: &languagepb.AnnotateTextRequest_Features{
			ExtractSyntax: true,
		},
		EncodingType: languagepb.EncodingType_UTF8,
	})
}

Java

// Instantiate the Language client com.google.cloud.language.v1.LanguageServiceClient
try (LanguageServiceClient language = LanguageServiceClient.create()) {
  Document doc =
      Document.newBuilder().setGcsContentUri(gcsUri).setType(Type.PLAIN_TEXT).build();
  AnalyzeSyntaxRequest request =
      AnalyzeSyntaxRequest.newBuilder()
          .setDocument(doc)
          .setEncodingType(EncodingType.UTF16)
          .build();
  // analyze the syntax in the given text
  AnalyzeSyntaxResponse response = language.analyzeSyntax(request);
  // print the response
  for (Token token : response.getTokensList()) {
    System.out.printf("\tText: %s\n", token.getText().getContent());
    System.out.printf("\tBeginOffset: %d\n", token.getText().getBeginOffset());
    System.out.printf("Lemma: %s\n", token.getLemma());
    System.out.printf("PartOfSpeechTag: %s\n", token.getPartOfSpeech().getTag());
    System.out.printf("\tAspect: %s\n", token.getPartOfSpeech().getAspect());
    System.out.printf("\tCase: %s\n", token.getPartOfSpeech().getCase());
    System.out.printf("\tForm: %s\n", token.getPartOfSpeech().getForm());
    System.out.printf("\tGender: %s\n", token.getPartOfSpeech().getGender());
    System.out.printf("\tMood: %s\n", token.getPartOfSpeech().getMood());
    System.out.printf("\tNumber: %s\n", token.getPartOfSpeech().getNumber());
    System.out.printf("\tPerson: %s\n", token.getPartOfSpeech().getPerson());
    System.out.printf("\tProper: %s\n", token.getPartOfSpeech().getProper());
    System.out.printf("\tReciprocity: %s\n", token.getPartOfSpeech().getReciprocity());
    System.out.printf("\tTense: %s\n", token.getPartOfSpeech().getTense());
    System.out.printf("\tVoice: %s\n", token.getPartOfSpeech().getVoice());
    System.out.println("DependencyEdge");
    System.out.printf("\tHeadTokenIndex: %d\n", token.getDependencyEdge().getHeadTokenIndex());
    System.out.printf("\tLabel: %s\n\n", token.getDependencyEdge().getLabel());
  }

  return response.getTokensList();
}

Node.js

// Imports the Google Cloud client library
const language = require('@google-cloud/language');

// Creates a client
const client = new language.LanguageServiceClient();

/**
 * TODO(developer): Uncomment the following lines to run this code
 */
// const bucketName = 'Your bucket name, e.g. my-bucket';
// const fileName = 'Your file name, e.g. my-file.txt';

// Prepares a document, representing a text file in Cloud Storage
const document = {
  gcsContentUri: `gs://${bucketName}/${fileName}`,
  type: 'PLAIN_TEXT',
};

// Need to specify an encodingType to receive word offsets
const encodingType = 'UTF8';

// Detects the sentiment of the document
const [syntax] = await client.analyzeSyntax({document, encodingType});

console.log('Parts of speech:');
syntax.tokens.forEach(part => {
  console.log(`${part.partOfSpeech.tag}: ${part.text.content}`);
  console.log('Morphology:', part.partOfSpeech);
});

Python

from google.cloud import language_v1

def sample_analyze_syntax(gcs_content_uri):
    """
    Analyzing Syntax in text file stored in Cloud Storage

    Args:
      gcs_content_uri Google Cloud Storage URI where the file content is located.
      e.g. gs://[Your Bucket]/[Path to File]
    """

    client = language_v1.LanguageServiceClient()

    # gcs_content_uri = 'gs://cloud-samples-data/language/syntax-sentence.txt'

    # Available types: PLAIN_TEXT, HTML
    type_ = language_v1.Document.Type.PLAIN_TEXT

    # Optional. If not specified, the language is automatically detected.
    # For list of supported languages:
    # https://cloud.google.com/natural-language/docs/languages
    language = "en"
    document = {"gcs_content_uri": gcs_content_uri, "type_": type_, "language": language}

    # Available values: NONE, UTF8, UTF16, UTF32
    encoding_type = language_v1.EncodingType.UTF8

    response = client.analyze_syntax(request = {'document': document, 'encoding_type': encoding_type})
    # Loop through tokens returned from the API
    for token in response.tokens:
        # Get the text content of this token. Usually a word or punctuation.
        text = token.text
        print(u"Token text: {}".format(text.content))
        print(
            u"Location of this token in overall document: {}".format(text.begin_offset)
        )
        # Get the part of speech information for this token.
        # Parts of spech are as defined in:
        # http://www.lrec-conf.org/proceedings/lrec2012/pdf/274_Paper.pdf
        part_of_speech = token.part_of_speech
        # Get the tag, e.g. NOUN, ADJ for Adjective, et al.
        print(
            u"Part of Speech tag: {}".format(
                language_v1.PartOfSpeech.Tag(part_of_speech.tag).name
            )
        )
        # Get the voice, e.g. ACTIVE or PASSIVE
        print(u"Voice: {}".format(language_v1.PartOfSpeech.Voice(part_of_speech.voice).name))
        # Get the tense, e.g. PAST, FUTURE, PRESENT, et al.
        print(u"Tense: {}".format(language_v1.PartOfSpeech.Tense(part_of_speech.tense).name))
        # See API reference for additional Part of Speech information available
        # Get the lemma of the token. Wikipedia lemma description
        # https://en.wikipedia.org/wiki/Lemma_(morphology)
        print(u"Lemma: {}".format(token.lemma))
        # Get the dependency tree parse information for this token.
        # For more information on dependency labels:
        # http://www.aclweb.org/anthology/P13-2017
        dependency_edge = token.dependency_edge
        print(u"Head token index: {}".format(dependency_edge.head_token_index))
        print(
            u"Label: {}".format(language_v1.DependencyEdge.Label(dependency_edge.label).name)
        )

    # Get the language of the text, which will be the same as
    # the language specified in the request or, if not specified,
    # the automatically-detected language.
    print(u"Language of the text: {}".format(response.language))

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser