将 Cloud Storage 存储的文件中的文本分解为一系列句子和词法单元(通常是单词),并提供这些词法单元的相关语言信息。
深入探索
如需查看包含此代码示例的详细文档,请参阅以下内容:
代码示例
Go
如需向 Natural Language 进行身份验证,请设置应用默认凭据。 如需了解详情,请参阅为本地开发环境设置身份验证。
// analyzeSyntaxFromGCS asks the Natural Language API for syntax analysis of the
// plain-text object located at gcsURI and returns the AnnotateText response
// (or the error) exactly as the client reports it.
//
// NOTE(review): client is not declared in this snippet; it is expected to be
// provided elsewhere in the package.
func analyzeSyntaxFromGCS(ctx context.Context, gcsURI string) (*languagepb.AnnotateTextResponse, error) {
	// The document references the Cloud Storage object instead of inline content.
	doc := &languagepb.Document{
		Source: &languagepb.Document_GcsContentUri{
			GcsContentUri: gcsURI,
		},
		Type: languagepb.Document_PLAIN_TEXT,
	}

	// Syntax extraction is the only feature enabled on the request.
	features := &languagepb.AnnotateTextRequest_Features{
		ExtractSyntax: true,
	}

	req := &languagepb.AnnotateTextRequest{
		Document:     doc,
		Features:     features,
		EncodingType: languagepb.EncodingType_UTF8,
	}
	return client.AnnotateText(ctx, req)
}
Java
如需向 Natural Language 进行身份验证,请设置应用默认凭据。 如需了解详情,请参阅为本地开发环境设置身份验证。
// Snippet from a method body: the enclosing method declaration (which supplies
// gcsUri and the return type) is not part of this excerpt.
// Create the client (com.google.cloud.language.v1.LanguageServiceClient); the
// try-with-resources statement closes it when the block exits.
try (LanguageServiceClient language = LanguageServiceClient.create()) {
// Describe the Cloud Storage object at gcsUri as a plain-text document.
Document doc =
Document.newBuilder().setGcsContentUri(gcsUri).setType(Type.PLAIN_TEXT).build();
// Build the syntax request, asking for the UTF16 encoding type.
AnalyzeSyntaxRequest request =
AnalyzeSyntaxRequest.newBuilder()
.setDocument(doc)
.setEncodingType(EncodingType.UTF16)
.build();
// Analyze the syntax in the given text
AnalyzeSyntaxResponse response = language.analyzeSyntax(request);
// Print the response: one group of lines per token.
for (Token token : response.getTokensList()) {
// Token text and its begin offset.
System.out.printf("\tText: %s\n", token.getText().getContent());
System.out.printf("\tBeginOffset: %d\n", token.getText().getBeginOffset());
// NOTE(review): the Lemma and PartOfSpeechTag lines are not tab-prefixed,
// unlike the other fields — confirm whether this is intentional.
System.out.printf("Lemma: %s\n", token.getLemma());
System.out.printf("PartOfSpeechTag: %s\n", token.getPartOfSpeech().getTag());
// Remaining part-of-speech attributes, one per line.
System.out.printf("\tAspect: %s\n", token.getPartOfSpeech().getAspect());
System.out.printf("\tCase: %s\n", token.getPartOfSpeech().getCase());
System.out.printf("\tForm: %s\n", token.getPartOfSpeech().getForm());
System.out.printf("\tGender: %s\n", token.getPartOfSpeech().getGender());
System.out.printf("\tMood: %s\n", token.getPartOfSpeech().getMood());
System.out.printf("\tNumber: %s\n", token.getPartOfSpeech().getNumber());
System.out.printf("\tPerson: %s\n", token.getPartOfSpeech().getPerson());
System.out.printf("\tProper: %s\n", token.getPartOfSpeech().getProper());
System.out.printf("\tReciprocity: %s\n", token.getPartOfSpeech().getReciprocity());
System.out.printf("\tTense: %s\n", token.getPartOfSpeech().getTense());
System.out.printf("\tVoice: %s\n", token.getPartOfSpeech().getVoice());
// Dependency edge: index of the head token and the edge label.
System.out.println("DependencyEdge");
System.out.printf("\tHeadTokenIndex: %d\n", token.getDependencyEdge().getHeadTokenIndex());
System.out.printf("\tLabel: %s\n\n", token.getDependencyEdge().getLabel());
}
// Hand the token list back to the caller.
return response.getTokensList();
}
Node.js
如需向 Natural Language 进行身份验证,请设置应用默认凭据。 如需了解详情,请参阅为本地开发环境设置身份验证。
// Load the Cloud Natural Language client library.
const language = require('@google-cloud/language');

// Instantiate the API client.
const client = new language.LanguageServiceClient();

/**
 * TODO(developer): Uncomment the following lines to run this code
 */
// const bucketName = 'Your bucket name, e.g. my-bucket';
// const fileName = 'Your file name, e.g. my-file.txt';

// Describe the plain-text file in Cloud Storage that should be analyzed.
const document = {
  gcsContentUri: `gs://${bucketName}/${fileName}`,
  type: 'PLAIN_TEXT',
};

// An encodingType must be supplied in order to receive word offsets.
const encodingType = 'UTF8';

// Run syntax analysis on the document; the first element of the result is used.
const request = {document, encodingType};
const [syntax] = await client.analyzeSyntax(request);

// Print each token's part-of-speech tag and text, followed by its morphology.
console.log('Parts of speech:');
for (const {partOfSpeech, text} of syntax.tokens) {
  console.log(`${partOfSpeech.tag}: ${text.content}`);
  console.log('Morphology:', partOfSpeech);
}
PHP
如需向 Natural Language 进行身份验证,请设置应用默认凭据。 如需了解详情,请参阅为本地开发环境设置身份验证。
use Google\Cloud\Language\V1\Document;
use Google\Cloud\Language\V1\Document\Type;
use Google\Cloud\Language\V1\LanguageServiceClient;
use Google\Cloud\Language\V1\PartOfSpeech\Tag;
/**
 * Runs syntax analysis on a text file stored in Cloud Storage and prints the
 * text and part-of-speech tag name of every returned token.
 *
 * @param string $uri The cloud storage object to analyze (gs://your-bucket-name/your-object-name)
 */
function analyze_syntax_from_file(string $uri): void
{
    // Natural Language API client.
    $client = new LanguageServiceClient();

    // Plain-text document whose content lives at the given GCS URI.
    $gcsDocument = new Document();
    $gcsDocument->setGcsContentUri($uri)->setType(Type::PLAIN_TEXT);

    // Request syntax analysis for the document.
    $syntaxResponse = $client->analyzeSyntax($gcsDocument, []);

    // Report each token: its text, then the name of its part-of-speech tag.
    foreach ($syntaxResponse->getTokens() as $token) {
        $content = $token->getText()->getContent();
        printf('Token text: %s' . PHP_EOL, $content);

        $tagName = Tag::name($token->getPartOfSpeech()->getTag());
        printf('Token part of speech: %s' . PHP_EOL, $tagName);

        print(PHP_EOL);
    }
}
Python
如需向 Natural Language 进行身份验证,请设置应用默认凭据。 如需了解详情,请参阅为本地开发环境设置身份验证。
from google.cloud import language_v1
def sample_analyze_syntax(gcs_content_uri):
    """
    Analyze the syntax of a text file stored in Cloud Storage.

    Sends the file to the Natural Language API and prints, for every returned
    token, its text, offset, part-of-speech tag, voice, tense, lemma and
    dependency edge, followed by the language reported in the response.

    Args:
      gcs_content_uri: Google Cloud Storage URI where the file content is located,
        e.g. gs://[Your Bucket]/[Path to File]
    """
    client = language_v1.LanguageServiceClient()

    # gcs_content_uri = 'gs://cloud-samples-data/language/syntax-sentence.txt'

    # Available types: PLAIN_TEXT, HTML
    type_ = language_v1.Document.Type.PLAIN_TEXT

    # Optional. If not specified, the language is automatically detected.
    # For list of supported languages:
    # https://cloud.google.com/natural-language/docs/languages
    language = "en"
    document = {
        "gcs_content_uri": gcs_content_uri,
        "type_": type_,
        "language": language,
    }

    # Available values: NONE, UTF8, UTF16, UTF32
    encoding_type = language_v1.EncodingType.UTF8

    response = client.analyze_syntax(
        request={"document": document, "encoding_type": encoding_type}
    )

    # Loop through tokens returned from the API
    for token in response.tokens:
        # Get the text content of this token. Usually a word or punctuation.
        text = token.text
        print(f"Token text: {text.content}")
        print(f"Location of this token in overall document: {text.begin_offset}")

        # Get the part of speech information for this token.
        # Part of speech is defined in:
        # http://www.lrec-conf.org/proceedings/lrec2012/pdf/274_Paper.pdf
        part_of_speech = token.part_of_speech

        # Get the tag, e.g. NOUN, ADJ for Adjective, et al.
        tag_name = language_v1.PartOfSpeech.Tag(part_of_speech.tag).name
        print(f"Part of Speech tag: {tag_name}")

        # Get the voice, e.g. ACTIVE or PASSIVE
        voice_name = language_v1.PartOfSpeech.Voice(part_of_speech.voice).name
        print(f"Voice: {voice_name}")

        # Get the tense, e.g. PAST, FUTURE, PRESENT, et al.
        tense_name = language_v1.PartOfSpeech.Tense(part_of_speech.tense).name
        print(f"Tense: {tense_name}")

        # See API reference for additional Part of Speech information available

        # Get the lemma of the token. Wikipedia lemma description
        # https://en.wikipedia.org/wiki/Lemma_(morphology)
        print(f"Lemma: {token.lemma}")

        # Get the dependency tree parse information for this token.
        # For more information on dependency labels:
        # http://www.aclweb.org/anthology/P13-2017
        dependency_edge = token.dependency_edge
        print(f"Head token index: {dependency_edge.head_token_index}")
        label_name = language_v1.DependencyEdge.Label(dependency_edge.label).name
        print(f"Label: {label_name}")

    # Get the language of the text, which will be the same as
    # the language specified in the request or, if not specified,
    # the automatically-detected language.
    print(f"Language of the text: {response.language}")
后续步骤
如需搜索和过滤其他 Google Cloud 产品的代码示例,请参阅 Google Cloud 示例浏览器。