Décomposez le texte d'un fichier stocké dans Cloud Storage en une série de phrases et de jetons (généralement des mots) et fournissez des informations linguistiques sur ces jetons.
En savoir plus
Pour obtenir une documentation détaillée incluant cet exemple de code, consultez les articles suivants :
Exemple de code
Go
// analyzeSyntaxFromGCS asks the API client to annotate the plain-text
// document located at gcsURI with syntax information only, and hands back
// whatever response and error the AnnotateText call produces.
func analyzeSyntaxFromGCS(ctx context.Context, gcsURI string) (*languagepb.AnnotateTextResponse, error) {
	// The document references the Cloud Storage object by URI rather than
	// carrying the text inline.
	doc := &languagepb.Document{
		Source: &languagepb.Document_GcsContentUri{
			GcsContentUri: gcsURI,
		},
		Type: languagepb.Document_PLAIN_TEXT,
	}

	// Syntax extraction is the only feature switched on for this request.
	features := &languagepb.AnnotateTextRequest_Features{
		ExtractSyntax: true,
	}

	req := &languagepb.AnnotateTextRequest{
		Document:     doc,
		Features:     features,
		EncodingType: languagepb.EncodingType_UTF8,
	}
	return client.AnnotateText(ctx, req)
}
Java
// Instantiate the Language client com.google.cloud.language.v1.LanguageServiceClient
// The try-with-resources statement closes the client when the block exits.
try (LanguageServiceClient language = LanguageServiceClient.create()) {
// Describe the document: a plain-text object referenced by its Cloud Storage URI (gcsUri).
Document doc =
Document.newBuilder().setGcsContentUri(gcsUri).setType(Type.PLAIN_TEXT).build();
// Build the syntax request for that document, asking for the UTF16 encoding type.
AnalyzeSyntaxRequest request =
AnalyzeSyntaxRequest.newBuilder()
.setDocument(doc)
.setEncodingType(EncodingType.UTF16)
.build();
// Analyze the syntax in the given text
AnalyzeSyntaxResponse response = language.analyzeSyntax(request);
// Print the response, one group of lines per token
for (Token token : response.getTokensList()) {
// Token text and its begin offset
System.out.printf("\tText: %s\n", token.getText().getContent());
System.out.printf("\tBeginOffset: %d\n", token.getText().getBeginOffset());
// Lemma and part-of-speech tag (these two labels are printed without a leading tab)
System.out.printf("Lemma: %s\n", token.getLemma());
System.out.printf("PartOfSpeechTag: %s\n", token.getPartOfSpeech().getTag());
// Remaining attributes read from the token's part-of-speech annotation
System.out.printf("\tAspect: %s\n", token.getPartOfSpeech().getAspect());
System.out.printf("\tCase: %s\n", token.getPartOfSpeech().getCase());
System.out.printf("\tForm: %s\n", token.getPartOfSpeech().getForm());
System.out.printf("\tGender: %s\n", token.getPartOfSpeech().getGender());
System.out.printf("\tMood: %s\n", token.getPartOfSpeech().getMood());
System.out.printf("\tNumber: %s\n", token.getPartOfSpeech().getNumber());
System.out.printf("\tPerson: %s\n", token.getPartOfSpeech().getPerson());
System.out.printf("\tProper: %s\n", token.getPartOfSpeech().getProper());
System.out.printf("\tReciprocity: %s\n", token.getPartOfSpeech().getReciprocity());
System.out.printf("\tTense: %s\n", token.getPartOfSpeech().getTense());
System.out.printf("\tVoice: %s\n", token.getPartOfSpeech().getVoice());
// Dependency edge: index of the head token and the edge label
System.out.println("DependencyEdge");
System.out.printf("\tHeadTokenIndex: %d\n", token.getDependencyEdge().getHeadTokenIndex());
System.out.printf("\tLabel: %s\n\n", token.getDependencyEdge().getLabel());
}
// Hand the token list back to the caller of the enclosing method.
return response.getTokensList();
}
Node.js
// Imports the Google Cloud client library
const language = require('@google-cloud/language');
// Instantiates the Natural Language service client
const client = new language.LanguageServiceClient();

/**
 * TODO(developer): Uncomment the following lines to run this code
 */
// const bucketName = 'Your bucket name, e.g. my-bucket';
// const fileName = 'Your file name, e.g. my-file.txt';

// Describes the document to analyze: a plain-text object in Cloud Storage
const document = {
  gcsContentUri: `gs://${bucketName}/${fileName}`,
  type: 'PLAIN_TEXT',
};

// An encodingType has to be supplied in order to receive word offsets
const encodingType = 'UTF8';

// Analyzes the syntax of the document; the result is the first element
// of the array the call resolves to
const [syntax] = await client.analyzeSyntax({document, encodingType});

console.log('Parts of speech:');
// For every token, print its part-of-speech tag with its text, followed by
// the full part-of-speech object
for (const {partOfSpeech, text} of syntax.tokens) {
  console.log(`${partOfSpeech.tag}: ${text.content}`);
  console.log('Morphology:', partOfSpeech);
}
PHP
use Google\Cloud\Language\V1\Document;
use Google\Cloud\Language\V1\Document\Type;
use Google\Cloud\Language\V1\LanguageServiceClient;
use Google\Cloud\Language\V1\PartOfSpeech\Tag;
/**
 * Runs syntax analysis on a plain-text Cloud Storage object and prints the
 * text and part-of-speech tag name of every returned token.
 *
 * @param string $uri The cloud storage object to analyze (gs://your-bucket-name/your-object-name)
 */
function analyze_syntax_from_file(string $uri): void
{
    // Natural Language API client used for the request below
    $client = new LanguageServiceClient();

    // Document that points at the GCS object and is declared as PLAIN_TEXT
    $document = new Document();
    $document->setGcsContentUri($uri);
    $document->setType(Type::PLAIN_TEXT);

    // Call analyzeSyntax (no optional arguments) and take the tokens from the response
    $tokens = $client->analyzeSyntax($document, [])->getTokens();

    // Print out information about each token, separated by a blank line
    foreach ($tokens as $token) {
        $content = $token->getText()->getContent();
        printf('Token text: %s' . PHP_EOL, $content);

        // Tag::name() turns the numeric tag into its enum name
        $tagName = Tag::name($token->getPartOfSpeech()->getTag());
        printf('Token part of speech: %s' . PHP_EOL, $tagName);

        print(PHP_EOL);
    }
}
Python
from google.cloud import language_v1
def sample_analyze_syntax(gcs_content_uri):
    """
    Run syntax analysis on a text file stored in Cloud Storage and print
    details of every token, followed by the language of the text.

    Args:
      gcs_content_uri Google Cloud Storage URI where the file content is located.
      e.g. gs://[Your Bucket]/[Path to File]
    """
    client = language_v1.LanguageServiceClient()

    # gcs_content_uri = 'gs://cloud-samples-data/language/syntax-sentence.txt'

    # Document description passed to the API as a plain dict.
    # - "type_": available types are PLAIN_TEXT and HTML.
    # - "language": optional; if not specified, the language is automatically
    #   detected. Supported languages:
    #   https://cloud.google.com/natural-language/docs/languages
    document = {
        "gcs_content_uri": gcs_content_uri,
        "type_": language_v1.Document.Type.PLAIN_TEXT,
        "language": "en",
    }

    # Encoding type; available values: NONE, UTF8, UTF16, UTF32
    request = {
        "document": document,
        "encoding_type": language_v1.EncodingType.UTF8,
    }
    response = client.analyze_syntax(request=request)

    # Walk over the tokens returned from the API
    for token in response.tokens:
        # Text span of the token (usually a word or punctuation) and its offset
        span = token.text
        print("Token text: {}".format(span.content))
        print(
            "Location of this token in overall document: {}".format(span.begin_offset)
        )

        # Part-of-speech information, as defined in:
        # http://www.lrec-conf.org/proceedings/lrec2012/pdf/274_Paper.pdf
        pos = token.part_of_speech

        # Tag, e.g. NOUN, ADJ for Adjective, et al.
        tag_name = language_v1.PartOfSpeech.Tag(pos.tag).name
        print("Part of Speech tag: {}".format(tag_name))

        # Voice, e.g. ACTIVE or PASSIVE
        voice_name = language_v1.PartOfSpeech.Voice(pos.voice).name
        print("Voice: {}".format(voice_name))

        # Tense, e.g. PAST, FUTURE, PRESENT, et al.
        tense_name = language_v1.PartOfSpeech.Tense(pos.tense).name
        print("Tense: {}".format(tense_name))

        # See API reference for additional Part of Speech information available

        # Lemma of the token. Wikipedia lemma description:
        # https://en.wikipedia.org/wiki/Lemma_(morphology)
        print("Lemma: {}".format(token.lemma))

        # Dependency tree parse information for this token.
        # More on dependency labels: http://www.aclweb.org/anthology/P13-2017
        edge = token.dependency_edge
        print("Head token index: {}".format(edge.head_token_index))
        label_name = language_v1.DependencyEdge.Label(edge.label).name
        print("Label: {}".format(label_name))

    # Language of the text: the same as the language specified in the request
    # or, if not specified, the automatically-detected language.
    print("Language of the text: {}".format(response.language))
Étapes suivantes
Pour rechercher et filtrer des exemples de code pour d'autres produits Google Cloud, consultez l'explorateur d'exemples Google Cloud.