Analyze syntax of a Cloud Storage file

Break up the text from a file stored in Cloud Storage into a series of sentences and tokens (generally, words) and provide linguistic information about those tokens.

Documentation pages that include this code sample

To view the code sample used in context, see the following documentation:

Code sample


func analyzeSyntaxFromGCS(ctx context.Context, gcsURI string) (*languagepb.AnnotateTextResponse, error) {
	return client.AnnotateText(ctx, &languagepb.AnnotateTextRequest{
		Document: &languagepb.Document{
			Source: &languagepb.Document_GcsContentUri{
				GcsContentUri: gcsURI,
			Type: languagepb.Document_PLAIN_TEXT,
		Features: &languagepb.AnnotateTextRequest_Features{
			ExtractSyntax: true,
		EncodingType: languagepb.EncodingType_UTF8,


// Instantiate the Language client
try (LanguageServiceClient language = LanguageServiceClient.create()) {
  Document doc =
  AnalyzeSyntaxRequest request =
  // analyze the syntax in the given text
  AnalyzeSyntaxResponse response = language.analyzeSyntax(request);
  // print the response
  for (Token token : response.getTokensList()) {
    System.out.printf("\tText: %s\n", token.getText().getContent());
    System.out.printf("\tBeginOffset: %d\n", token.getText().getBeginOffset());
    System.out.printf("Lemma: %s\n", token.getLemma());
    System.out.printf("PartOfSpeechTag: %s\n", token.getPartOfSpeech().getTag());
    System.out.printf("\tAspect: %s\n", token.getPartOfSpeech().getAspect());
    System.out.printf("\tCase: %s\n", token.getPartOfSpeech().getCase());
    System.out.printf("\tForm: %s\n", token.getPartOfSpeech().getForm());
    System.out.printf("\tGender: %s\n", token.getPartOfSpeech().getGender());
    System.out.printf("\tMood: %s\n", token.getPartOfSpeech().getMood());
    System.out.printf("\tNumber: %s\n", token.getPartOfSpeech().getNumber());
    System.out.printf("\tPerson: %s\n", token.getPartOfSpeech().getPerson());
    System.out.printf("\tProper: %s\n", token.getPartOfSpeech().getProper());
    System.out.printf("\tReciprocity: %s\n", token.getPartOfSpeech().getReciprocity());
    System.out.printf("\tTense: %s\n", token.getPartOfSpeech().getTense());
    System.out.printf("\tVoice: %s\n", token.getPartOfSpeech().getVoice());
    System.out.printf("\tHeadTokenIndex: %d\n", token.getDependencyEdge().getHeadTokenIndex());
    System.out.printf("\tLabel: %s\n\n", token.getDependencyEdge().getLabel());

  return response.getTokensList();


// Imports the Google Cloud client library
const language = require('@google-cloud/language');

// Creates a client
const client = new language.LanguageServiceClient();

 * TODO(developer): Uncomment the following lines to run this code
// const bucketName = 'Your bucket name, e.g. my-bucket';
// const fileName = 'Your file name, e.g. my-file.txt';

// Prepares a document, representing a text file in Cloud Storage
const document = {
  gcsContentUri: `gs://${bucketName}/${fileName}`,
  type: 'PLAIN_TEXT',

// Need to specify an encodingType to receive word offsets
const encodingType = 'UTF8';

// Detects the sentiment of the document
const [syntax] = await client.analyzeSyntax({document, encodingType});

console.log('Parts of speech:');
syntax.tokens.forEach(part => {
  console.log(`${part.partOfSpeech.tag}: ${part.text.content}`);
  console.log('Morphology:', part.partOfSpeech);


use Google\Cloud\Language\V1\Document;
use Google\Cloud\Language\V1\Document\Type;
use Google\Cloud\Language\V1\LanguageServiceClient;
use Google\Cloud\Language\V1\PartOfSpeech\Tag;

/** Uncomment and populate these variables in your code */
// $uri = 'The cloud storage object to analyze (gs://your-bucket-name/your-object-name)';

// Create the Natural Language client
$languageServiceClient = new LanguageServiceClient();

try {
    // Create a new Document, pass GCS URI and set type to PLAIN_TEXT
    $document = (new Document())

    // Call the analyzeEntities function
    $response = $languageServiceClient->analyzeSyntax($document, []);
    $tokens = $response->getTokens();
    // Print out information about each entity
    foreach ($tokens as $token) {
        printf('Token text: %s' . PHP_EOL, $token->getText()->getContent());
        printf('Token part of speech: %s' . PHP_EOL, Tag::name($token->getPartOfSpeech()->getTag()));
} finally {


from import language_v1

def sample_analyze_syntax(gcs_content_uri):
    Analyzing Syntax in text file stored in Cloud Storage

      gcs_content_uri Google Cloud Storage URI where the file content is located.
      e.g. gs://[Your Bucket]/[Path to File]

    client = language_v1.LanguageServiceClient()

    # gcs_content_uri = 'gs://cloud-samples-data/language/syntax-sentence.txt'

    # Available types: PLAIN_TEXT, HTML
    type_ = language_v1.Document.Type.PLAIN_TEXT

    # Optional. If not specified, the language is automatically detected.
    # For list of supported languages:
    language = "en"
    document = {"gcs_content_uri": gcs_content_uri, "type_": type_, "language": language}

    # Available values: NONE, UTF8, UTF16, UTF32
    encoding_type = language_v1.EncodingType.UTF8

    response = client.analyze_syntax(request = {'document': document, 'encoding_type': encoding_type})
    # Loop through tokens returned from the API
    for token in response.tokens:
        # Get the text content of this token. Usually a word or punctuation.
        text = token.text
        print(u"Token text: {}".format(text.content))
            u"Location of this token in overall document: {}".format(text.begin_offset)
        # Get the part of speech information for this token.
        # Part of speech is defined in:
        part_of_speech = token.part_of_speech
        # Get the tag, e.g. NOUN, ADJ for Adjective, et al.
            u"Part of Speech tag: {}".format(
        # Get the voice, e.g. ACTIVE or PASSIVE
        print(u"Voice: {}".format(language_v1.PartOfSpeech.Voice(part_of_speech.voice).name))
        # Get the tense, e.g. PAST, FUTURE, PRESENT, et al.
        print(u"Tense: {}".format(language_v1.PartOfSpeech.Tense(part_of_speech.tense).name))
        # See API reference for additional Part of Speech information available
        # Get the lemma of the token. Wikipedia lemma description
        print(u"Lemma: {}".format(token.lemma))
        # Get the dependency tree parse information for this token.
        # For more information on dependency labels:
        dependency_edge = token.dependency_edge
        print(u"Head token index: {}".format(dependency_edge.head_token_index))
            u"Label: {}".format(language_v1.DependencyEdge.Label(dependency_edge.label).name)

    # Get the language of the text, which will be the same as
    # the language specified in the request or, if not specified,
    # the automatically-detected language.
    print(u"Language of the text: {}".format(response.language))

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser