The following examples demonstrate using the Cloud DLP API to scan a 90% subset of a Cloud Storage bucket for person names. The scan starts from a random location in the dataset and only includes text files, scanning at most 200 bytes of each file.
Explore further
For detailed documentation that includes this code sample, see the following:
Code sample
C#
To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.
To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
using Google.Api.Gax.ResourceNames;
using Google.Cloud.Dlp.V2;
using Google.Cloud.PubSub.V1;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
/// <summary>
/// Sample: runs a DLP inspect job over a sampled subset of Cloud Storage files and
/// waits for the job's completion notification on a Pub/Sub subscription.
/// </summary>
public class InspectStorageWithSampling
{
    /// <summary>
    /// Creates a DLP inspect job for the files matched by <paramref name="gcsUri"/>,
    /// waits for the job's Pub/Sub notification, then fetches the job and prints its
    /// per-info-type finding counts.
    /// </summary>
    /// <param name="projectId">Project used for the job, the topic and the subscription.</param>
    /// <param name="gcsUri">Cloud Storage URL of the file set to inspect.</param>
    /// <param name="topicId">ID of the Pub/Sub topic the job publishes to.</param>
    /// <param name="subId">ID of the Pub/Sub subscription used to receive the notification.</param>
    /// <param name="minLikelihood">Minimum likelihood passed to the inspect configuration.</param>
    /// <param name="infoTypes">Info types to look for; PERSON_NAME is used when null.</param>
    /// <returns>The job as fetched from the service after the notification arrived.</returns>
    public static async Task<DlpJob> InspectAsync(
        string projectId,
        string gcsUri,
        string topicId,
        string subId,
        Likelihood minLikelihood = Likelihood.Possible,
        IEnumerable<InfoType> infoTypes = null)
    {
        // Instantiate the dlp client.
        var dlp = DlpServiceClient.Create();

        // Construct Storage config by specifying the GCS file to be inspected
        // and sample method.
        var storageConfig = new StorageConfig
        {
            CloudStorageOptions = new CloudStorageOptions
            {
                FileSet = new CloudStorageOptions.Types.FileSet
                {
                    Url = gcsUri
                },
                BytesLimitPerFile = 200,
                // NOTE(review): this sample restricts the scan to CSV files; use
                // FileType.TextFile if plain text files should be inspected instead.
                FileTypes = { new FileType[] { FileType.Csv } },
                FilesLimitPercent = 90,
                SampleMethod = CloudStorageOptions.Types.SampleMethod.RandomStart
            }
        };

        // Construct the Inspect Config and specify the type of info the inspection
        // will look for.
        var inspectConfig = new InspectConfig
        {
            InfoTypes =
            {
                infoTypes ?? new InfoType[] { new InfoType { Name = "PERSON_NAME" } }
            },
            IncludeQuote = true,
            MinLikelihood = minLikelihood
        };

        // Construct the pubsub action.
        var actions = new Action[]
        {
            new Action
            {
                PubSub = new Action.Types.PublishToPubSub
                {
                    Topic = $"projects/{projectId}/topics/{topicId}"
                }
            }
        };

        // Construct the inspect job config using above created objects.
        var inspectJob = new InspectJobConfig
        {
            StorageConfig = storageConfig,
            InspectConfig = inspectConfig,
            Actions = { actions }
        };

        // Issue Create Dlp Job Request
        var request = new CreateDlpJobRequest
        {
            InspectJob = inspectJob,
            ParentAsLocationName = new LocationName(projectId, "global"),
        };

        // We keep the name of the job that we just created. The call is awaited so
        // this async method does not block its caller's thread on the RPC.
        var dlpJob = await dlp.CreateDlpJobAsync(request);
        var jobName = dlpJob.Name;

        // Listen to pub/sub for the job
        var subscriptionName = new SubscriptionName(projectId, subId);
        var subscriber = await SubscriberClient.CreateAsync(
            subscriptionName);

        await subscriber.StartAsync((PubsubMessage message, CancellationToken cancel) =>
        {
            // TryGetValue instead of the indexer: a message without the
            // "DlpJobName" attribute must be nacked, not crash the handler.
            string notifiedJobName;
            if (message.Attributes.TryGetValue("DlpJobName", out notifiedJobName)
                && notifiedJobName == jobName)
            {
                // Deliberately not awaited inside the handler; the StartAsync task
                // awaited above is what this method waits on.
                subscriber.StopAsync(cancel);
                return Task.FromResult(SubscriberClient.Reply.Ack);
            }

            return Task.FromResult(SubscriberClient.Reply.Nack);
        });

        // Get the latest state of the job from the service
        var resultJob = await dlp.GetDlpJobAsync(new GetDlpJobRequest
        {
            DlpJobName = DlpJobName.Parse(jobName)
        });

        // Parse the response and process results.
        System.Console.WriteLine($"Job status: {resultJob.State}");
        System.Console.WriteLine($"Job Name: {resultJob.Name}");

        // InspectDetails is a message field and may be unset; skip the stats then.
        var result = resultJob.InspectDetails?.Result;
        if (result != null)
        {
            foreach (var infoType in result.InfoTypeStats)
            {
                System.Console.WriteLine($"Info Type: {infoType.InfoType.Name}");
                System.Console.WriteLine($"Count: {infoType.Count}");
            }
        }

        return resultJob;
    }
}
Go
To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.
To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
import (
"context"
"fmt"
"io"
"time"
dlp "cloud.google.com/go/dlp/apiv2"
"cloud.google.com/go/dlp/apiv2/dlppb"
"cloud.google.com/go/pubsub"
)
// inspectGcsFileWithSampling starts a DLP inspect job over a sampled subset of
// the Cloud Storage files matched by gcsUri, waits up to 10 minutes for the
// job's Pub/Sub notification, and writes the per-infoType counts to w.
func inspectGcsFileWithSampling(w io.Writer, projectID, gcsUri, topicID, subscriptionId string) error {
	// projectID := "your-project-id"
	// gcsUri := "gs://" + "your-bucket-name" + "/path/to/your/file.txt"
	// topicID := "your-pubsub-topic-id"
	// subscriptionId := "your-pubsub-subscription-id"

	ctx := context.Background()

	// Create the DLP client and release its resources when the function returns.
	client, err := dlp.NewClient(ctx)
	if err != nil {
		return err
	}
	defer client.Close()

	// Create the Pub/Sub client used to learn when the inspect job finishes.
	psClient, err := pubsub.NewClient(ctx, projectID)
	if err != nil {
		return err
	}
	defer psClient.Close()

	// Make sure the topic exists, creating it when necessary.
	topicHandle := psClient.Topic(topicID)
	topicExists, err := topicHandle.Exists(ctx)
	if err != nil {
		return err
	}
	if !topicExists {
		if topicHandle, err = psClient.CreateTopic(ctx, topicID); err != nil {
			return err
		}
	}

	// Make sure the subscription exists, creating it on the topic when necessary.
	sub := psClient.Subscription(subscriptionId)
	subExists, err := sub.Exists(ctx)
	if err != nil {
		return err
	}
	if !subExists {
		sub, err = psClient.CreateSubscription(ctx, subscriptionId, pubsub.SubscriptionConfig{Topic: topicHandle})
		if err != nil {
			return err
		}
	}

	// Sampling configuration: text files only, 200-byte limit per file,
	// 90% of the files, starting from a random position.
	samplingOptions := &dlppb.CloudStorageOptions{
		FileSet:           &dlppb.CloudStorageOptions_FileSet{Url: gcsUri},
		BytesLimitPerFile: 200,
		FileTypes:         []dlppb.FileType{dlppb.FileType_TEXT_FILE},
		FilesLimitPercent: 90,
		SampleMethod:      dlppb.CloudStorageOptions_RANDOM_START,
	}

	// What to look for and how to report it.
	// See https://cloud.google.com/dlp/docs/infotypes-reference for complete list of info types
	inspection := &dlppb.InspectConfig{
		InfoTypes:        []*dlppb.InfoType{{Name: "PERSON_NAME"}},
		ExcludeInfoTypes: true,
		IncludeQuote:     true,
		MinLikelihood:    dlppb.Likelihood_POSSIBLE,
	}

	// Action that publishes to the Pub/Sub topic.
	notify := &dlppb.Action{
		Action: &dlppb.Action_PubSub{
			PubSub: &dlppb.Action_PublishToPubSub{
				Topic: "projects/" + projectID + "/topics/" + topicID,
			},
		},
	}

	// Assemble the job request from the pieces above and send it.
	job, err := client.CreateDlpJob(ctx, &dlppb.CreateDlpJobRequest{
		Parent: fmt.Sprintf("projects/%s/locations/global", projectID),
		Job: &dlppb.CreateDlpJobRequest_InspectJob{
			InspectJob: &dlppb.InspectJobConfig{
				StorageConfig: &dlppb.StorageConfig{
					Type: &dlppb.StorageConfig_CloudStorageOptions{
						CloudStorageOptions: samplingOptions,
					},
				},
				InspectConfig: inspection,
				Actions:       []*dlppb.Action{notify},
			},
		},
	})
	if err != nil {
		return err
	}
	fmt.Fprintf(w, "Job Created: %v", job.GetName())

	// Wait for the job's notification, but no longer than 10 minutes. For long
	// jobs, consider a truly asynchronous model such as Cloud Functions.
	waitCtx, stop := context.WithTimeout(ctx, 10*time.Minute)
	defer stop()

	return sub.Receive(waitCtx, func(msgCtx context.Context, msg *pubsub.Message) {
		// Notifications for other jobs are handed back untouched.
		if msg.Attributes["DlpJobName"] != job.GetName() {
			msg.Nack()
			return
		}
		msg.Ack()

		// Our job was announced: stop receiving once this handler is done.
		defer stop()

		finished, err := client.GetDlpJob(msgCtx, &dlppb.GetDlpJobRequest{Name: job.GetName()})
		if err != nil {
			fmt.Fprintf(w, "Error getting completed job: %v\n", err)
			return
		}

		stats := finished.GetInspectDetails().GetResult().GetInfoTypeStats()
		if len(stats) == 0 {
			fmt.Fprintf(w, "No results")
			return
		}
		for _, stat := range stats {
			fmt.Fprintf(w, "\nFound %v instances of infoType %v\n", stat.GetCount(), stat.GetInfoType().GetName())
		}
	})
}
Java
To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.
To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
import com.google.api.core.SettableApiFuture;
import com.google.cloud.dlp.v2.DlpServiceClient;
import com.google.cloud.pubsub.v1.AckReplyConsumer;
import com.google.cloud.pubsub.v1.MessageReceiver;
import com.google.cloud.pubsub.v1.Subscriber;
import com.google.privacy.dlp.v2.Action;
import com.google.privacy.dlp.v2.CloudStorageOptions;
import com.google.privacy.dlp.v2.CloudStorageOptions.FileSet;
import com.google.privacy.dlp.v2.CloudStorageOptions.SampleMethod;
import com.google.privacy.dlp.v2.CreateDlpJobRequest;
import com.google.privacy.dlp.v2.DlpJob;
import com.google.privacy.dlp.v2.FileType;
import com.google.privacy.dlp.v2.GetDlpJobRequest;
import com.google.privacy.dlp.v2.InfoType;
import com.google.privacy.dlp.v2.InfoTypeStats;
import com.google.privacy.dlp.v2.InspectConfig;
import com.google.privacy.dlp.v2.InspectDataSourceDetails;
import com.google.privacy.dlp.v2.InspectJobConfig;
import com.google.privacy.dlp.v2.Likelihood;
import com.google.privacy.dlp.v2.LocationName;
import com.google.privacy.dlp.v2.StorageConfig;
import com.google.pubsub.v1.ProjectSubscriptionName;
import com.google.pubsub.v1.PubsubMessage;
import java.io.IOException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
public class InspectGcsFileWithSampling {
public static void main(String[] args) throws Exception {
// TODO(developer): Replace these variables before running the sample.
String projectId = "your-project-id";
String gcsUri = "gs://" + "your-bucket-name" + "/path/to/your/file.txt";
String topicId = "your-pubsub-topic-id";
String subscriptionId = "your-pubsub-subscription-id";
inspectGcsFileWithSampling(projectId, gcsUri, topicId, subscriptionId);
}
// Inspects a file in a Google Cloud Storage Bucket.
public static void inspectGcsFileWithSampling(
String projectId, String gcsUri, String topicId, String subscriptionId)
throws ExecutionException, InterruptedException, IOException {
// Initialize client that will be used to send requests. This client only needs to be created
// once, and can be reused for multiple requests. After completing all of your requests, call
// the "close" method on the client to safely clean up any remaining background resources.
try (DlpServiceClient dlp = DlpServiceClient.create()) {
// Specify the GCS file to be inspected and sampling configuration
CloudStorageOptions cloudStorageOptions =
CloudStorageOptions.newBuilder()
.setFileSet(FileSet.newBuilder().setUrl(gcsUri))
.setBytesLimitPerFile(200)
.addFileTypes(FileType.TEXT_FILE)
.setFilesLimitPercent(90)
.setSampleMethod(SampleMethod.RANDOM_START)
.build();
StorageConfig storageConfig =
StorageConfig.newBuilder().setCloudStorageOptions(cloudStorageOptions).build();
// Specify the type of info the inspection will look for.
// See https://cloud.google.com/dlp/docs/infotypes-reference for complete list of info types
InfoType infoType = InfoType.newBuilder().setName("PERSON_NAME").build();
// Specify how the content should be inspected.
InspectConfig inspectConfig =
InspectConfig.newBuilder()
.addInfoTypes(infoType)
.setExcludeInfoTypes(true)
.setIncludeQuote(true)
.setMinLikelihood(Likelihood.POSSIBLE)
.build();
// Specify the action that is triggered when the job completes.
String pubSubTopic = String.format("projects/%s/topics/%s", projectId, topicId);
Action.PublishToPubSub publishToPubSub =
Action.PublishToPubSub.newBuilder().setTopic(pubSubTopic).build();
Action action = Action.newBuilder().setPubSub(publishToPubSub).build();
// Configure the long running job we want the service to perform.
InspectJobConfig inspectJobConfig =
InspectJobConfig.newBuilder()
.setStorageConfig(storageConfig)
.setInspectConfig(inspectConfig)
.addActions(action)
.build();
// Create the request for the job configured above.
CreateDlpJobRequest createDlpJobRequest =
CreateDlpJobRequest.newBuilder()
.setParent(LocationName.of(projectId, "global").toString())
.setInspectJob(inspectJobConfig)
.build();
// Use the client to send the request.
final DlpJob dlpJob = dlp.createDlpJob(createDlpJobRequest);
System.out.println("Job created: " + dlpJob.getName());
// Set up a Pub/Sub subscriber to listen on the job completion status
final SettableApiFuture<Boolean> done = SettableApiFuture.create();
ProjectSubscriptionName subscriptionName =
ProjectSubscriptionName.of(projectId, subscriptionId);
MessageReceiver messageHandler =
(PubsubMessage pubsubMessage, AckReplyConsumer ackReplyConsumer) -> {
handleMessage(dlpJob, done, pubsubMessage, ackReplyConsumer);
};
Subscriber subscriber = Subscriber.newBuilder(subscriptionName, messageHandler).build();
subscriber.startAsync();
// Wait for job completion semi-synchronously
// For long jobs, consider using a truly asynchronous execution model such as Cloud Functions
try {
done.get(15, TimeUnit.MINUTES);
} catch (TimeoutException e) {
System.out.println("Job was not completed after 15 minutes.");
return;
} finally {
subscriber.stopAsync();
subscriber.awaitTerminated();
}
// Get the latest state of the job from the service
GetDlpJobRequest request = GetDlpJobRequest.newBuilder().setName(dlpJob.getName()).build();
DlpJob completedJob = dlp.getDlpJob(request);
// Parse the response and process results.
System.out.println("Job status: " + completedJob.getState());
System.out.println("Job name: " + dlpJob.getName());
InspectDataSourceDetails.Result result = completedJob.getInspectDetails().getResult();
System.out.println("Findings: ");
for (InfoTypeStats infoTypeStat : result.getInfoTypeStatsList()) {
System.out.print("\tInfo type: " + infoTypeStat.getInfoType().getName());
System.out.println("\tCount: " + infoTypeStat.getCount());
}
}
}
// handleMessage injects the job and settableFuture into the message reciever interface
private static void handleMessage(
DlpJob job,
SettableApiFuture<Boolean> done,
PubsubMessage pubsubMessage,
AckReplyConsumer ackReplyConsumer) {
String messageAttribute = pubsubMessage.getAttributesMap().get("DlpJobName");
if (job.getName().equals(messageAttribute)) {
done.set(true);
ackReplyConsumer.ack();
} else {
ackReplyConsumer.nack();
}
}
}
Node.js
To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.
To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
// Import the Google Cloud client libraries
const DLP = require('@google-cloud/dlp');
const {PubSub} = require('@google-cloud/pubsub');
// Instantiates the DLP and Pub/Sub clients used by inspectGcsFileSampling()
const dlp = new DLP.DlpServiceClient();
const pubsub = new PubSub();
// The values below are read by inspectGcsFileSampling(); uncomment and set them
// before running the sample.
// The project ID to run the API call under
// const projectId = 'my-project';
// The gcs file path
// const gcsUri = 'gs://your-bucket-name/path/to/your/file.txt';
// Specify the type of info the inspection will look for.
// See https://cloud.google.com/dlp/docs/infotypes-reference for complete list of info types
// const infoTypes = [{ name: 'PERSON_NAME' }];
// The name of the Pub/Sub topic to notify once the job completes
// TODO(developer): create a Pub/Sub topic to use for this
// const topicId = 'MY-PUBSUB-TOPIC'
// The name of the Pub/Sub subscription to use when listening for job
// completion notifications
// TODO(developer): create a Pub/Sub subscription to use for this
// const subscriptionId = 'MY-PUBSUB-SUBSCRIPTION'
// DLP Job max time (in milliseconds): 15 minutes
const DLP_JOB_WAIT_TIME = 15 * 1000 * 60;
/**
 * Creates a DLP inspect job over a sampled subset of the files matched by
 * `gcsUri`, waits (up to DLP_JOB_WAIT_TIME) for the job's Pub/Sub notification,
 * then fetches the job and logs its per-infoType finding counts.
 *
 * Reads `projectId`, `gcsUri`, `infoTypes`, `topicId` and `subscriptionId`
 * from the enclosing scope.
 *
 * @returns {Promise<void>} Rejects with Error('Timeout') when no notification
 *   arrives in time, or with the subscription's error.
 */
async function inspectGcsFileSampling() {
  // Specify the GCS file to be inspected and sampling configuration
  const storageItemConfig = {
    cloudStorageOptions: {
      fileSet: {url: gcsUri},
      bytesLimitPerFile: 200,
      filesLimitPercent: 90,
      fileTypes: [DLP.protos.google.privacy.dlp.v2.FileType.TEXT_FILE],
      sampleMethod:
        DLP.protos.google.privacy.dlp.v2.CloudStorageOptions.SampleMethod
          .RANDOM_START,
    },
  };

  // Specify how the content should be inspected.
  const inspectConfig = {
    infoTypes: infoTypes,
    minLikelihood: DLP.protos.google.privacy.dlp.v2.Likelihood.POSSIBLE,
    includeQuote: true,
    excludeInfoTypes: true,
  };

  // Specify the action that is triggered when the job completes.
  const actions = [
    {
      pubSub: {
        topic: `projects/${projectId}/topics/${topicId}`,
      },
    },
  ];

  // Create the request for the job configured above.
  const request = {
    parent: `projects/${projectId}/locations/global`,
    inspectJob: {
      inspectConfig: inspectConfig,
      storageConfig: storageItemConfig,
      actions: actions,
    },
  };

  // Verify the Pub/Sub topic and listen for job notifications via an
  // existing subscription.
  const [topicResponse] = await pubsub.topic(topicId).get();
  const subscription = topicResponse.subscription(subscriptionId);

  // Use the client to send the request.
  const [jobsResponse] = await dlp.createDlpJob(request);
  const jobName = jobsResponse.name;

  // Watch the Pub/Sub topic until the DLP job finishes
  await new Promise((resolve, reject) => {
    // Detach both listeners and cancel the timer. Every exit path (success,
    // error, timeout) goes through here so no listener outlives the promise.
    // NOTE(review): the Pub/Sub client is expected to stop pulling once the
    // last 'message' listener is removed; confirm for your library version.
    const detach = () => {
      subscription.removeListener('message', messageHandler);
      subscription.removeListener('error', errorHandler);
      clearTimeout(timer);
    };

    // Set up the timeout
    const timer = setTimeout(() => {
      detach();
      reject(new Error('Timeout'));
    }, DLP_JOB_WAIT_TIME);

    const messageHandler = message => {
      if (message.attributes && message.attributes.DlpJobName === jobName) {
        message.ack();
        detach();
        resolve(jobName);
      } else {
        // Notification for a different job: hand it back.
        message.nack();
      }
    };

    const errorHandler = err => {
      detach();
      reject(err);
    };

    subscription.on('message', messageHandler);
    subscription.on('error', errorHandler);
  });

  // Fetch the finished job and report its findings.
  const [job] = await dlp.getDlpJob({name: jobName});
  console.log(`Job ${job.name} status: ${job.state}`);

  const infoTypeStats = job.inspectDetails.result.infoTypeStats;
  if (infoTypeStats.length > 0) {
    infoTypeStats.forEach(infoTypeStat => {
      console.log(
        ` Found ${infoTypeStat.count} instance(s) of infoType ${infoTypeStat.infoType.name}.`
      );
    });
  } else {
    console.log('No findings.');
  }
}
await inspectGcsFileSampling();
PHP
To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.
To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
use Google\Cloud\Dlp\V2\DlpServiceClient;
use Google\Cloud\Dlp\V2\InfoType;
use Google\Cloud\Dlp\V2\InspectConfig;
use Google\Cloud\Dlp\V2\StorageConfig;
use Google\Cloud\Dlp\V2\DlpJob\JobState;
use Google\Cloud\Dlp\V2\Action;
use Google\Cloud\Dlp\V2\Action\PublishToPubSub;
use Google\Cloud\Dlp\V2\BigQueryOptions\SampleMethod;
use Google\Cloud\Dlp\V2\CloudStorageOptions;
use Google\Cloud\Dlp\V2\CloudStorageOptions\FileSet;
use Google\Cloud\Dlp\V2\InspectJobConfig;
use Google\Cloud\PubSub\PubSubClient;
/**
 * Inspect storage with sampling.
 * Uses the Cloud DLP API to scan a 90% subset of the Cloud Storage files matched by
 * $gcsUri for phone numbers, email addresses and credit card numbers. The scan starts
 * from a random location in the dataset and is limited to 200 bytes per file. The
 * function then polls the Pub/Sub subscription (for up to 10 minutes) for the job's
 * notification and prints the finding counts.
 *
 * @param string $callingProjectId  The project ID to run the API call under.
 * @param string $gcsUri            Google Cloud Storage file url.
 * @param string $topicId           The ID of the Pub/Sub topic to notify once the job completes.
 * @param string $subscriptionId    The ID of the Pub/Sub subscription to use when listening for job.
 */
function inspect_gcs_with_sampling(
    // TODO(developer): Replace sample parameters before running the code.
    string $callingProjectId,
    string $gcsUri = 'gs://GOOGLE_STORAGE_BUCKET_NAME/dlp_sample.csv',
    string $topicId = 'dlp-pubsub-topic',
    string $subscriptionId = 'dlp_subcription'
): void {
    // Instantiate a client.
    $dlp = new DlpServiceClient();
    $pubsub = new PubSubClient();
    $topic = $pubsub->topic($topicId);

    // Construct the items to be inspected.
    // The sample method is referenced by its fully-qualified name on purpose: the
    // short name `SampleMethod` imported in this file is the BigQueryOptions enum,
    // while this setter belongs to CloudStorageOptions.
    $cloudStorageOptions = (new CloudStorageOptions())
        ->setFileSet((new FileSet())
            ->setUrl($gcsUri))
        ->setBytesLimitPerFile(200)
        ->setFilesLimitPercent(90)
        ->setSampleMethod(\Google\Cloud\Dlp\V2\CloudStorageOptions\SampleMethod::RANDOM_START);

    $storageConfig = (new StorageConfig())
        ->setCloudStorageOptions($cloudStorageOptions);

    // Specify the type of info the inspection will look for.
    $phoneNumberInfoType = (new InfoType())
        ->setName('PHONE_NUMBER');
    $emailAddressInfoType = (new InfoType())
        ->setName('EMAIL_ADDRESS');
    $cardNumberInfoType = (new InfoType())
        ->setName('CREDIT_CARD_NUMBER');
    $infoTypes = [$phoneNumberInfoType, $emailAddressInfoType, $cardNumberInfoType];

    // Specify how the content should be inspected.
    $inspectConfig = (new InspectConfig())
        ->setInfoTypes($infoTypes)
        ->setIncludeQuote(true);

    // Construct the action to run when job completes.
    $action = (new Action())
        ->setPubSub((new PublishToPubSub())
            ->setTopic($topic->name()));

    // Construct inspect job config to run.
    $inspectJob = (new InspectJobConfig())
        ->setInspectConfig($inspectConfig)
        ->setStorageConfig($storageConfig)
        ->setActions([$action]);

    // Listen for job notifications via an existing topic/subscription.
    $subscription = $topic->subscription($subscriptionId);

    // Submit request.
    $parent = "projects/$callingProjectId/locations/global";
    $job = $dlp->createDlpJob($parent, [
        'inspectJob' => $inspectJob
    ]);

    // Poll Pub/Sub using exponential backoff until job finishes.
    // Consider using an asynchronous execution model such as Cloud Functions.
    $attempt = 1;
    $startTime = time();
    do {
        foreach ($subscription->pull() as $message) {
            if (
                isset($message->attributes()['DlpJobName']) &&
                $message->attributes()['DlpJobName'] === $job->getName()
            ) {
                $subscription->acknowledge($message);
                // Get the updated job. Loop to avoid race condition with DLP API.
                do {
                    $job = $dlp->getDlpJob($job->getName());
                } while ($job->getState() == JobState::RUNNING);
                break 2; // break from parent do while.
            }
        }
        printf('Waiting for job to complete' . PHP_EOL);
        // Exponential backoff with max delay of 60 seconds.
        sleep(min(60, pow(2, ++$attempt)));
    } while (time() - $startTime < 600); // 10 minute timeout.

    // Print finding counts.
    printf('Job %s status: %s' . PHP_EOL, $job->getName(), JobState::name($job->getState()));
    switch ($job->getState()) {
        case JobState::DONE:
            $infoTypeStats = $job->getInspectDetails()->getResult()->getInfoTypeStats();
            if (count($infoTypeStats) === 0) {
                printf('No findings.' . PHP_EOL);
            } else {
                foreach ($infoTypeStats as $infoTypeStat) {
                    printf(
                        ' Found %s instance(s) of infoType %s' . PHP_EOL,
                        $infoTypeStat->getCount(),
                        $infoTypeStat->getInfoType()->getName()
                    );
                }
            }
            break;
        case JobState::FAILED:
            printf('Job %s had errors:' . PHP_EOL, $job->getName());
            $errors = $job->getErrors();
            foreach ($errors as $error) {
                var_dump($error->getDetails());
            }
            break;
        case JobState::PENDING:
            printf('Job has not completed. Consider a longer timeout or an asynchronous execution model' . PHP_EOL);
            break;
        default:
            printf('Unexpected job state. Most likely, the job is either running or has not yet started.');
    }
}
Python
To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.
To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.
import threading
from typing import List
import google.cloud.dlp
import google.cloud.pubsub
def inspect_gcs_with_sampling(
    project: str,
    bucket: str,
    topic_id: str,
    subscription_id: str,
    info_types: List[str] = None,
    file_types: List[str] = None,
    min_likelihood: str = None,
    max_findings: int = None,
    timeout: int = 300,
) -> None:
    """Uses the Data Loss Prevention API to analyze files in GCS by
    limiting the amount of data to be scanned.

    Creates an inspect job over ``gs://{bucket}/*`` with a 200-byte limit per
    file, a 90% file sample and a random start, then waits for the job's
    Pub/Sub notification and prints the per-infoType finding counts.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        bucket: The name of the GCS bucket containing the file, as a string.
        topic_id: The id of the Cloud Pub/Sub topic to which the API will
            broadcast job completion. The topic must already exist.
        subscription_id: The id of the Cloud Pub/Sub subscription to listen on
            while waiting for job completion. The subscription must already
            exist and be subscribed to the topic.
        info_types: A list of strings representing infoTypes to look for.
            Defaults to FIRST_NAME, LAST_NAME and EMAIL_ADDRESS when empty.
        file_types: Type of files in gcs bucket where the inspection would
            happen. Defaults to ["CSV"] when empty.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        timeout: The number of seconds to wait for the job's completion
            notification before giving up.
    """
    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries.
    if not info_types:
        info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"]
    info_types = [{"name": info_type} for info_type in info_types]

    # Specify how the content should be inspected. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "info_types": info_types,
        "exclude_info_types": True,
        "include_quote": True,
        "min_likelihood": min_likelihood,
        "limits": {"max_findings_per_request": max_findings},
    }

    # Setting default file types as CSV files
    if not file_types:
        file_types = ["CSV"]

    # Construct a cloud_storage_options dictionary with the bucket's URL.
    url = f"gs://{bucket}/*"
    storage_config = {
        "cloud_storage_options": {
            "file_set": {"url": url},
            "bytes_limit_per_file": 200,
            "file_types": file_types,
            "files_limit_percent": 90,
            "sample_method": "RANDOM_START",
        }
    }

    # Tell the API where to send a notification when the job is complete.
    topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
    actions = [{"pub_sub": {"topic": topic}}]

    # Construct the inspect_job, which defines the entire inspect content task.
    inspect_job = {
        "inspect_config": inspect_config,
        "storage_config": storage_config,
        "actions": actions,
    }

    # Convert the project id into full resource ids.
    parent = f"projects/{project}/locations/global"

    # Call the API
    operation = dlp.create_dlp_job(
        request={"parent": parent, "inspect_job": inspect_job}
    )
    print(f"Inspection operation started: {operation.name}")

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(project, subscription_id)

    # Set up a callback to acknowledge a message. This closes around an event
    # so that it can signal that it is done and the main thread can continue.
    job_done = threading.Event()

    def callback(message):
        try:
            if message.attributes["DlpJobName"] == operation.name:
                # This is the message we're looking for, so acknowledge it.
                message.ack()

                # Now that the job is done, fetch the results and print them.
                job = dlp.get_dlp_job(request={"name": operation.name})
                print(f"Job name: {job.name}")
                if job.inspect_details.result.info_type_stats:
                    print("Findings:")
                    for finding in job.inspect_details.result.info_type_stats:
                        print(
                            f"Info type: {finding.info_type.name}; Count: {finding.count}"
                        )
                else:
                    print("No findings.")

                # Signal to the main thread that we can exit.
                job_done.set()
            else:
                # This is not the message we're looking for.
                message.drop()
        except Exception as e:
            # Because this is executing in a thread, an exception won't be
            # noted unless we print it manually.
            print(e)
            raise

    # Register the callback and wait on the event. The future is kept so the
    # background pull can be stopped again once we are done waiting.
    streaming_pull_future = subscriber.subscribe(subscription_path, callback=callback)
    try:
        finished = job_done.wait(timeout=timeout)
        if not finished:
            print(
                "No event received before the timeout. Please verify that the "
                "subscription provided is subscribed to the topic provided."
            )
    finally:
        # Stop pulling and release the subscriber whether the notification
        # arrived, the wait timed out, or an error was raised.
        streaming_pull_future.cancel()
        subscriber.close()
What's next
To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.