Inspect BigQuery for sensitive data with sampling

The following examples demonstrate using the Cloud Data Loss Prevention API to scan a 1000-row subset of a BigQuery table. The scan starts from a random row.

Explore further

For detailed documentation that includes this code sample, see the following:

Code sample

C#

To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.

To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.


using Google.Api.Gax.ResourceNames;
using Google.Cloud.Dlp.V2;
using Google.Cloud.PubSub.V1;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using static Google.Cloud.Dlp.V2.InspectConfig.Types;

public class InspectBigQueryWithSampling
{
    public static async Task<DlpJob> InspectAsync(
        string projectId,
        int maxFindings,
        bool includeQuote,
        string topicId,
        string subId,
        Likelihood minLikelihood = Likelihood.Possible,
        IEnumerable<FieldId> identifyingFields = null,
        IEnumerable<InfoType> infoTypes = null)
    {

        // Instantiate the dlp client.
        var dlp = DlpServiceClient.Create();

        // Construct Storage config.
        var storageConfig = new StorageConfig
        {
            BigQueryOptions = new BigQueryOptions
            {
                TableReference = new BigQueryTable
                {
                    ProjectId = "bigquery-public-data",
                    DatasetId = "usa_names",
                    TableId = "usa_1910_current",
                },
                IdentifyingFields =
                {
                    identifyingFields ?? new FieldId[] { new FieldId { Name = "name" } }
                },
                RowsLimit = 100,
                SampleMethod = BigQueryOptions.Types.SampleMethod.RandomStart
            }
        };

        // Construct the inspect config.
        var inspectConfig = new InspectConfig
        {
            InfoTypes = { infoTypes ?? new InfoType[] { new InfoType { Name = "PERSON_NAME" } } },
            Limits = new FindingLimits
            {
                MaxFindingsPerRequest = maxFindings,
            },
            IncludeQuote = includeQuote,
            MinLikelihood = minLikelihood
        };

        // Construct the pubsub action.
        var actions = new Action[]
        {
            new Action
            {
                PubSub = new Action.Types.PublishToPubSub
                {
                    Topic = $"projects/{projectId}/topics/{topicId}"
                }
            }
        };

        // Construct the inspect job config using the actions.
        var inspectJob = new InspectJobConfig
        {
            StorageConfig = storageConfig,
            InspectConfig = inspectConfig,
            Actions = { actions }
        };

        // Issue Create Dlp Job Request.
        var request = new CreateDlpJobRequest
        {
            InspectJob = inspectJob,
            ParentAsLocationName = new LocationName(projectId, "global"),
        };

        // We keep the name of the job that we just created.
        var dlpJob = dlp.CreateDlpJob(request);
        var jobName = dlpJob.Name;

        // Listen to pub/sub for the job.
        var subscriptionName = new SubscriptionName(projectId, subId);
        var subscriber = await SubscriberClient.CreateAsync(
            subscriptionName);

        // SimpleSubscriber runs your message handle function on multiple threads to maximize throughput.
        await subscriber.StartAsync((PubsubMessage message, CancellationToken cancel) =>
        {
            if (message.Attributes["DlpJobName"] == jobName)
            {
                subscriber.StopAsync(cancel);
                return Task.FromResult(SubscriberClient.Reply.Ack);
            }
            else
            {
                return Task.FromResult(SubscriberClient.Reply.Nack);
            }
        });

        // Get the latest state of the job from the service.
        var resultJob = dlp.GetDlpJob(new GetDlpJobRequest
        {
            DlpJobName = DlpJobName.Parse(jobName)
        });

        // Parse the response and process results.
        System.Console.WriteLine($"Job status: {resultJob.State}");
        System.Console.WriteLine($"Job Name: {resultJob.Name}");
        var result = resultJob.InspectDetails.Result;
        foreach (var infoType in result.InfoTypeStats)
        {
            System.Console.WriteLine($"Info Type: {infoType.InfoType.Name}");
            System.Console.WriteLine($"Count: {infoType.Count}");
        }
        return resultJob;
    }
}

Go

To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.

To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

import (
	"context"
	"fmt"
	"io"
	"time"

	dlp "cloud.google.com/go/dlp/apiv2"
	"cloud.google.com/go/dlp/apiv2/dlppb"
	"cloud.google.com/go/pubsub"
)

// inspectBigQueryTableWithSampling inspect bigQueries for sensitive data with sampling
func inspectBigQueryTableWithSampling(w io.Writer, projectID, topicID, subscriptionID string) error {
	// projectId := "your-project-id"
	// topicID := "your-pubsub-topic-id"
	// or provide a topicID name to create one
	// subscriptionID := "your-pubsub-subscription-id"
	// or provide a subscription name to create one

	ctx := context.Background()

	// Initialize a client once and reuse it to send multiple requests. Clients
	// are safe to use across goroutines. When the client is no longer needed,
	// call the Close method to cleanup its resources.
	client, err := dlp.NewClient(ctx)
	if err != nil {
		return err
	}

	// Closing the client safely cleans up background resources.
	defer client.Close()

	// Specify the BigQuery table to be inspected.
	tableReference := &dlppb.BigQueryTable{
		ProjectId: "bigquery-public-data",
		DatasetId: "usa_names",
		TableId:   "usa_1910_current",
	}

	bigQueryOptions := &dlppb.BigQueryOptions{
		TableReference: tableReference,
		RowsLimit:      int64(10000),
		SampleMethod:   dlppb.BigQueryOptions_RANDOM_START,
		IdentifyingFields: []*dlppb.FieldId{
			{Name: "name"},
		},
	}

	// Provide storage config with BigqueryOptions
	storageConfig := &dlppb.StorageConfig{
		Type: &dlppb.StorageConfig_BigQueryOptions{
			BigQueryOptions: bigQueryOptions,
		},
	}

	// Specify the type of info the inspection will look for.
	// See https://cloud.google.com/dlp/docs/infotypes-reference for complete list of info types
	infoTypes := []*dlppb.InfoType{
		{Name: "PERSON_NAME"},
	}

	// Specify how the content should be inspected.
	inspectConfig := &dlppb.InspectConfig{
		InfoTypes:    infoTypes,
		IncludeQuote: true,
	}

	// Create a PubSub Client used to listen for when the inspect job finishes.
	pubsubClient, err := pubsub.NewClient(ctx, projectID)
	if err != nil {
		return err
	}
	defer pubsubClient.Close()

	// Create a PubSub subscription we can use to listen for messages.
	// Create the Topic if it doesn't exist.
	t := pubsubClient.Topic(topicID)
	if exists, err := t.Exists(ctx); err != nil {
		return err
	} else if !exists {
		if t, err = pubsubClient.CreateTopic(ctx, topicID); err != nil {
			return err
		}
	}

	// Create the Subscription if it doesn't exist.
	s := pubsubClient.Subscription(subscriptionID)
	if exists, err := s.Exists(ctx); err != nil {
		return err
	} else if !exists {
		if s, err = pubsubClient.CreateSubscription(ctx, subscriptionID, pubsub.SubscriptionConfig{Topic: t}); err != nil {
			return err
		}
	}

	// topic is the PubSub topic string where messages should be sent.
	topic := fmt.Sprintf("projects/%s/topics/%s", projectID, topicID)

	action := &dlppb.Action{
		Action: &dlppb.Action_PubSub{
			PubSub: &dlppb.Action_PublishToPubSub{
				Topic: topic,
			},
		},
	}

	// Configure the long running job we want the service to perform.
	inspectJobConfig := &dlppb.InspectJobConfig{
		StorageConfig: storageConfig,
		InspectConfig: inspectConfig,
		Actions: []*dlppb.Action{
			action,
		},
	}

	// Create the request for the job configured above.
	req := &dlppb.CreateDlpJobRequest{
		Parent: fmt.Sprintf("projects/%s/locations/global", projectID),
		Job: &dlppb.CreateDlpJobRequest_InspectJob{
			InspectJob: inspectJobConfig,
		},
	}

	// Use the client to send the request.
	j, err := client.CreateDlpJob(ctx, req)
	if err != nil {
		return err
	}
	fmt.Fprintf(w, "Job Created: %v", j.GetName())

	// Wait for the inspect job to finish by waiting for a PubSub message.
	// This only waits for 10 minutes. For long jobs, consider using a truly
	// asynchronous execution model such as Cloud Functions.
	c, cancel := context.WithTimeout(ctx, 10*time.Minute)
	defer cancel()
	err = s.Receive(c, func(ctx context.Context, msg *pubsub.Message) {
		// If this is the wrong job, do not process the result.
		if msg.Attributes["DlpJobName"] != j.GetName() {
			msg.Nack()
			return
		}
		msg.Ack()

		// Stop listening for more messages.
		defer cancel()
	})
	if err != nil {
		return err
	}

	resp, err := client.GetDlpJob(ctx, &dlppb.GetDlpJobRequest{
		Name: j.GetName(),
	})
	if err != nil {
		return err
	}
	r := resp.GetInspectDetails().GetResult().GetInfoTypeStats()
	if len(r) == 0 {
		fmt.Fprintf(w, "No results")
		return err
	}
	for _, s := range r {
		fmt.Fprintf(w, "\nFound %v instances of infoType %v\n", s.GetCount(), s.GetInfoType().GetName())
	}
	return nil

}

Java

To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.

To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.


import com.google.api.core.SettableApiFuture;
import com.google.cloud.dlp.v2.DlpServiceClient;
import com.google.cloud.pubsub.v1.AckReplyConsumer;
import com.google.cloud.pubsub.v1.MessageReceiver;
import com.google.cloud.pubsub.v1.Subscriber;
import com.google.privacy.dlp.v2.Action;
import com.google.privacy.dlp.v2.BigQueryOptions;
import com.google.privacy.dlp.v2.BigQueryOptions.SampleMethod;
import com.google.privacy.dlp.v2.BigQueryTable;
import com.google.privacy.dlp.v2.CreateDlpJobRequest;
import com.google.privacy.dlp.v2.DlpJob;
import com.google.privacy.dlp.v2.FieldId;
import com.google.privacy.dlp.v2.GetDlpJobRequest;
import com.google.privacy.dlp.v2.InfoType;
import com.google.privacy.dlp.v2.InfoTypeStats;
import com.google.privacy.dlp.v2.InspectConfig;
import com.google.privacy.dlp.v2.InspectDataSourceDetails;
import com.google.privacy.dlp.v2.InspectJobConfig;
import com.google.privacy.dlp.v2.LocationName;
import com.google.privacy.dlp.v2.StorageConfig;
import com.google.pubsub.v1.ProjectSubscriptionName;
import com.google.pubsub.v1.PubsubMessage;
import java.io.IOException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public class InspectBigQueryTableWithSampling {

  public static void main(String[] args) throws Exception {
    // TODO(developer): Replace these variables before running the sample.
    String projectId = "your-project-id";
    String topicId = "your-pubsub-topic-id";
    String subscriptionId = "your-pubsub-subscription-id";
    inspectBigQueryTableWithSampling(projectId, topicId, subscriptionId);
  }

  // Inspects a BigQuery Table
  public static void inspectBigQueryTableWithSampling(
      String projectId, String topicId, String subscriptionId)
      throws ExecutionException, InterruptedException, IOException {
    // Initialize client that will be used to send requests. This client only needs to be created
    // once, and can be reused for multiple requests. After completing all of your requests, call
    // the "close" method on the client to safely clean up any remaining background resources.
    try (DlpServiceClient dlp = DlpServiceClient.create()) {
      // Specify the BigQuery table to be inspected.
      BigQueryTable tableReference =
          BigQueryTable.newBuilder()
              .setProjectId("bigquery-public-data")
              .setDatasetId("usa_names")
              .setTableId("usa_1910_current")
              .build();

      BigQueryOptions bigQueryOptions =
          BigQueryOptions.newBuilder()
              .setTableReference(tableReference)
              .setRowsLimit(1000)
              .setSampleMethod(SampleMethod.RANDOM_START)
              .addIdentifyingFields(FieldId.newBuilder().setName("name"))
              .build();

      StorageConfig storageConfig =
          StorageConfig.newBuilder().setBigQueryOptions(bigQueryOptions).build();

      // Specify the type of info the inspection will look for.
      // See https://cloud.google.com/dlp/docs/infotypes-reference for complete list of info types
      InfoType infoType = InfoType.newBuilder().setName("PERSON_NAME").build();

      // Specify how the content should be inspected.
      InspectConfig inspectConfig =
          InspectConfig.newBuilder().addInfoTypes(infoType).setIncludeQuote(true).build();

      // Specify the action that is triggered when the job completes.
      String pubSubTopic = String.format("projects/%s/topics/%s", projectId, topicId);
      Action.PublishToPubSub publishToPubSub =
          Action.PublishToPubSub.newBuilder().setTopic(pubSubTopic).build();
      Action action = Action.newBuilder().setPubSub(publishToPubSub).build();

      // Configure the long running job we want the service to perform.
      InspectJobConfig inspectJobConfig =
          InspectJobConfig.newBuilder()
              .setStorageConfig(storageConfig)
              .setInspectConfig(inspectConfig)
              .addActions(action)
              .build();

      // Create the request for the job configured above.
      CreateDlpJobRequest createDlpJobRequest =
          CreateDlpJobRequest.newBuilder()
              .setParent(LocationName.of(projectId, "global").toString())
              .setInspectJob(inspectJobConfig)
              .build();

      // Use the client to send the request.
      final DlpJob dlpJob = dlp.createDlpJob(createDlpJobRequest);
      System.out.println("Job created: " + dlpJob.getName());

      // Set up a Pub/Sub subscriber to listen on the job completion status
      final SettableApiFuture<Boolean> done = SettableApiFuture.create();

      ProjectSubscriptionName subscriptionName =
          ProjectSubscriptionName.of(projectId, subscriptionId);

      MessageReceiver messageHandler =
          (PubsubMessage pubsubMessage, AckReplyConsumer ackReplyConsumer) -> {
            handleMessage(dlpJob, done, pubsubMessage, ackReplyConsumer);
          };
      Subscriber subscriber = Subscriber.newBuilder(subscriptionName, messageHandler).build();
      subscriber.startAsync();

      // Wait for job completion semi-synchronously
      // For long jobs, consider using a truly asynchronous execution model such as Cloud Functions
      try {
        done.get(15, TimeUnit.MINUTES);
      } catch (TimeoutException e) {
        System.out.println("Job was not completed after 15 minutes.");
        return;
      } finally {
        subscriber.stopAsync();
        subscriber.awaitTerminated();
      }

      // Get the latest state of the job from the service
      GetDlpJobRequest request = GetDlpJobRequest.newBuilder().setName(dlpJob.getName()).build();
      DlpJob completedJob = dlp.getDlpJob(request);

      // Parse the response and process results.
      System.out.println("Job status: " + completedJob.getState());
      System.out.println("Job name: " + dlpJob.getName());
      InspectDataSourceDetails.Result result = completedJob.getInspectDetails().getResult();
      System.out.println("Findings: ");
      for (InfoTypeStats infoTypeStat : result.getInfoTypeStatsList()) {
        System.out.print("\tInfo type: " + infoTypeStat.getInfoType().getName());
        System.out.println("\tCount: " + infoTypeStat.getCount());
      }
    }
  }

  // handleMessage injects the job and settableFuture into the message reciever interface
  private static void handleMessage(
      DlpJob job,
      SettableApiFuture<Boolean> done,
      PubsubMessage pubsubMessage,
      AckReplyConsumer ackReplyConsumer) {
    String messageAttribute = pubsubMessage.getAttributesMap().get("DlpJobName");
    if (job.getName().equals(messageAttribute)) {
      done.set(true);
      ackReplyConsumer.ack();
    } else {
      ackReplyConsumer.nack();
    }
  }
}

Node.js

To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.

To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

// Import the Google Cloud client libraries
const DLP = require('@google-cloud/dlp');
const {PubSub} = require('@google-cloud/pubsub');

// Instantiates clients
const dlp = new DLP.DlpServiceClient();
const pubsub = new PubSub();

// The project ID to run the API call under
// const projectId = 'my-project';

// The project ID the table is stored under
// This may or (for public datasets) may not equal the calling project ID
// const dataProjectId = 'my-project';

// The ID of the dataset to inspect, e.g. 'my_dataset'
// const datasetId = 'my_dataset';

// The ID of the table to inspect, e.g. 'my_table'
// const tableId = 'my_table';

// The name of the Pub/Sub topic to notify once the job completes
// TODO(developer): create a Pub/Sub topic to use for this
// const topicId = 'MY-PUBSUB-TOPIC'

// The name of the Pub/Sub subscription to use when listening for job
// completion notifications
// TODO(developer): create a Pub/Sub subscription to use for this
// const subscriptionId = 'MY-PUBSUB-SUBSCRIPTION'

// DLP Job max time (in milliseconds)
const DLP_JOB_WAIT_TIME = 15 * 1000 * 60;

async function inspectBigqueryWithSampling() {
  // Specify the type of info the inspection will look for.
  // See https://cloud.google.com/dlp/docs/infotypes-reference for complete list of info types
  const infoTypes = [{name: 'PERSON_NAME'}];

  // Specify the BigQuery options required for inspection.
  const storageItem = {
    bigQueryOptions: {
      tableReference: {
        projectId: dataProjectId,
        datasetId: datasetId,
        tableId: tableId,
      },
      rowsLimit: 1000,
      sampleMethod:
        DLP.protos.google.privacy.dlp.v2.BigQueryOptions.SampleMethod
          .RANDOM_START,
      includedFields: [{name: 'name'}],
    },
  };

  // Specify the action that is triggered when the job completes.
  const actions = [
    {
      pubSub: {
        topic: `projects/${projectId}/topics/${topicId}`,
      },
    },
  ];

  // Construct request for creating an inspect job
  const request = {
    parent: `projects/${projectId}/locations/global`,
    inspectJob: {
      inspectConfig: {
        infoTypes: infoTypes,
        includeQuote: true,
      },
      storageConfig: storageItem,
      actions: actions,
    },
  };
  // Use the client to send the request.
  const [topicResponse] = await pubsub.topic(topicId).get();

  // Verify the Pub/Sub topic and listen for job notifications via an
  // existing subscription.
  const subscription = await topicResponse.subscription(subscriptionId);

  const [jobsResponse] = await dlp.createDlpJob(request);
  const jobName = jobsResponse.name;

  // Watch the Pub/Sub topic until the DLP job finishes
  await new Promise((resolve, reject) => {
    // Set up the timeout
    const timer = setTimeout(() => {
      reject(new Error('Timeout'));
    }, DLP_JOB_WAIT_TIME);

    const messageHandler = message => {
      if (message.attributes && message.attributes.DlpJobName === jobName) {
        message.ack();
        subscription.removeListener('message', messageHandler);
        subscription.removeListener('error', errorHandler);
        clearTimeout(timer);
        resolve(jobName);
      } else {
        message.nack();
      }
    };

    const errorHandler = err => {
      subscription.removeListener('message', messageHandler);
      subscription.removeListener('error', errorHandler);
      clearTimeout(timer);
      reject(err);
    };

    subscription.on('message', messageHandler);
    subscription.on('error', errorHandler);
  });
  const [job] = await dlp.getDlpJob({name: jobName});
  console.log(`Job ${job.name} status: ${job.state}`);

  const infoTypeStats = job.inspectDetails.result.infoTypeStats;
  if (infoTypeStats.length > 0) {
    infoTypeStats.forEach(infoTypeStat => {
      console.log(
        `  Found ${infoTypeStat.count} instance(s) of infoType ${infoTypeStat.infoType.name}.`
      );
    });
  } else {
    console.log('No findings.');
  }
}

await inspectBigqueryWithSampling();

PHP

To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.

To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.


use Google\Cloud\Dlp\V2\DlpServiceClient;
use Google\Cloud\Dlp\V2\BigQueryOptions;
use Google\Cloud\Dlp\V2\InfoType;
use Google\Cloud\Dlp\V2\InspectConfig;
use Google\Cloud\Dlp\V2\StorageConfig;
use Google\Cloud\Dlp\V2\BigQueryTable;
use Google\Cloud\Dlp\V2\DlpJob\JobState;
use Google\Cloud\Dlp\V2\Action;
use Google\Cloud\Dlp\V2\Action\PublishToPubSub;
use Google\Cloud\Dlp\V2\BigQueryOptions\SampleMethod;
use Google\Cloud\Dlp\V2\FieldId;
use Google\Cloud\Dlp\V2\InspectJobConfig;
use Google\Cloud\PubSub\PubSubClient;

/**
 * Inspect BigQuery for sensitive data with sampling.
 * The following examples demonstrate using the Cloud Data Loss Prevention
 * API to scan a 1000-row subset of a BigQuery table. The scan starts from
 * a random row.
 *
 * @param string $callingProjectId  The project ID to run the API call under.
 * @param string $topicId           The Pub/Sub topic ID to notify once the job is completed.
 * @param string $subscriptionId    The Pub/Sub subscription ID to use when listening for job.
 * @param string $projectId         The Google Cloud Project ID.
 * @param string $datasetId         The BigQuery Dataset ID.
 * @param string $tableId           The BigQuery Table ID to be inspected.
 */
function inspect_bigquery_with_sampling(
    string $callingProjectId,
    string $topicId,
    string $subscriptionId,
    string $projectId,
    string $datasetId,
    string $tableId
): void {
    // Instantiate a client.
    $dlp = new DlpServiceClient();
    $pubsub = new PubSubClient();
    $topic = $pubsub->topic($topicId);

    // Specify the BigQuery table to be inspected.
    $bigqueryTable = (new BigQueryTable())
        ->setProjectId($projectId)
        ->setDatasetId($datasetId)
        ->setTableId($tableId);

    $bigQueryOptions = (new BigQueryOptions())
        ->setTableReference($bigqueryTable)
        ->setRowsLimit(1000)
        ->setSampleMethod(SampleMethod::RANDOM_START)
        ->setIdentifyingFields([
            (new FieldId())
                ->setName('name')
        ]);

    $storageConfig = (new StorageConfig())
        ->setBigQueryOptions($bigQueryOptions);

    // Specify the type of info the inspection will look for.
    // See https://cloud.google.com/dlp/docs/infotypes-reference for complete list of info types
    $personNameInfoType = (new InfoType())
        ->setName('PERSON_NAME');
    $infoTypes = [$personNameInfoType];

    // Specify how the content should be inspected.
    $inspectConfig = (new InspectConfig())
        ->setInfoTypes($infoTypes)
        ->setIncludeQuote(true);

    // Specify the action that is triggered when the job completes.
    $pubSubAction = (new PublishToPubSub())
        ->setTopic($topic->name());

    $action = (new Action())
        ->setPubSub($pubSubAction);

    // Configure the long running job we want the service to perform.
    $inspectJob = (new InspectJobConfig())
        ->setInspectConfig($inspectConfig)
        ->setStorageConfig($storageConfig)
        ->setActions([$action]);

    // Listen for job notifications via an existing topic/subscription.
    $subscription = $topic->subscription($subscriptionId);

    // Submit request
    $parent = "projects/$callingProjectId/locations/global";
    $job = $dlp->createDlpJob($parent, [
        'inspectJob' => $inspectJob
    ]);

    // Poll Pub/Sub using exponential backoff until job finishes
    // Consider using an asynchronous execution model such as Cloud Functions
    $attempt = 1;
    $startTime = time();
    do {
        foreach ($subscription->pull() as $message) {
            if (
                isset($message->attributes()['DlpJobName']) &&
                $message->attributes()['DlpJobName'] === $job->getName()
            ) {
                $subscription->acknowledge($message);
                // Get the updated job. Loop to avoid race condition with DLP API.
                do {
                    $job = $dlp->getDlpJob($job->getName());
                } while ($job->getState() == JobState::RUNNING);
                break 2; // break from parent do while
            }
        }
        printf('Waiting for job to complete' . PHP_EOL);
        // Exponential backoff with max delay of 60 seconds
        sleep(min(60, pow(2, ++$attempt)));
    } while (time() - $startTime < 600); // 10 minute timeout

    // Print finding counts
    printf('Job %s status: %s' . PHP_EOL, $job->getName(), JobState::name($job->getState()));
    switch ($job->getState()) {
        case JobState::DONE:
            $infoTypeStats = $job->getInspectDetails()->getResult()->getInfoTypeStats();
            if (count($infoTypeStats) === 0) {
                printf('No findings.' . PHP_EOL);
            } else {
                foreach ($infoTypeStats as $infoTypeStat) {
                    printf(
                        '  Found %s instance(s) of infoType %s' . PHP_EOL,
                        $infoTypeStat->getCount(),
                        $infoTypeStat->getInfoType()->getName()
                    );
                }
            }
            break;
        case JobState::FAILED:
            printf('Job %s had errors:' . PHP_EOL, $job->getName());
            $errors = $job->getErrors();
            foreach ($errors as $error) {
                var_dump($error->getDetails());
            }
            break;
        case JobState::PENDING:
            printf('Job has not completed. Consider a longer timeout or an asynchronous execution model' . PHP_EOL);
            break;
        default:
            printf('Unexpected job state. Most likely, the job is either running or has not yet started.');
    }
}

Python

To learn how to install and use the client library for Sensitive Data Protection, see Sensitive Data Protection client libraries.

To authenticate to Sensitive Data Protection, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

import threading

import google.cloud.dlp
import google.cloud.pubsub


def inspect_bigquery_table_with_sampling(
    project: str,
    topic_id: str,
    subscription_id: str,
    min_likelihood: str = None,
    max_findings: str = None,
    timeout: int = 300,
) -> None:
    """Uses the Data Loss Prevention API to analyze BigQuery data by limiting
    the amount of data to be scanned.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        topic_id: The id of the Cloud Pub/Sub topic to which the API will
            broadcast job completion. The topic must already exist.
        subscription_id: The id of the Cloud Pub/Sub subscription to listen on
            while waiting for job completion. The subscription must already
            exist and be subscribed to the topic.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        timeout: The number of seconds to wait for a response from the API.
    """

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Specify how the content should be inspected. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "info_types": [{"name": "PERSON_NAME"}],
        "min_likelihood": min_likelihood,
        "limits": {"max_findings_per_request": max_findings},
        "include_quote": True,
    }

    # Specify the BigQuery table to be inspected.
    # Here we are using public bigquery table.
    table_reference = {
        "project_id": "bigquery-public-data",
        "dataset_id": "usa_names",
        "table_id": "usa_1910_current",
    }

    # Construct a storage_config containing the target BigQuery info.
    storage_config = {
        "big_query_options": {
            "table_reference": table_reference,
            "rows_limit": 1000,
            "sample_method": "RANDOM_START",
            "identifying_fields": [{"name": "name"}],
        }
    }

    # Tell the API where to send a notification when the job is complete.
    topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
    actions = [{"pub_sub": {"topic": topic}}]

    # Construct the inspect_job, which defines the entire inspect content task.
    inspect_job = {
        "inspect_config": inspect_config,
        "storage_config": storage_config,
        "actions": actions,
    }

    # Convert the project id into full resource ids.
    parent = f"projects/{project}/locations/global"

    # Call the API
    operation = dlp.create_dlp_job(
        request={"parent": parent, "inspect_job": inspect_job}
    )
    print(f"Inspection operation started: {operation.name}")

    # Create a Pub/Sub client and find the subscription. The subscription is
    # expected to already be listening to the topic.
    subscriber = google.cloud.pubsub.SubscriberClient()
    subscription_path = subscriber.subscription_path(project, subscription_id)

    # Set up a callback to acknowledge a message. This closes around an event
    # so that it can signal that it is done and the main thread can continue.
    job_done = threading.Event()

    def callback(message: google.cloud.pubsub_v1.subscriber.message.Message) -> None:
        try:
            if message.attributes["DlpJobName"] == operation.name:
                # This is the message we're looking for, so acknowledge it.
                message.ack()

                # Now that the job is done, fetch the results and print them.
                job = dlp.get_dlp_job(request={"name": operation.name})
                print(f"Job name: {job.name}")

                if job.inspect_details.result.info_type_stats:
                    for finding in job.inspect_details.result.info_type_stats:
                        print(
                            f"Info type: {finding.info_type.name}; Count: {finding.count}"
                        )
                else:
                    print("No findings.")

                # Signal to the main thread that we can exit.
                job_done.set()
            else:
                # This is not the message we're looking for.
                message.drop()

        except Exception as e:
            # Because this is executing in a thread, an exception won't be
            # noted unless we print it manually.
            print(e)
            raise

    # Register the callback and wait on the event.
    subscriber.subscribe(subscription_path, callback=callback)
    finished = job_done.wait(timeout=timeout)
    if not finished:
        print(
            "No event received before the timeout. Please verify that the "
            "subscription provided is subscribed to the topic provided."
        )

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.