De-identify DICOM data using keeplist

Remove PII from DICOM data by using a keeplist.

Documentation pages that include this code sample

To view the code sample used in context, see the following documentation:

Code sample

Go

import (
	"context"
	"fmt"
	"io"
	"time"

	healthcare "google.golang.org/api/healthcare/v1"
)

// deidentifyDataset creates a new dataset containing de-identified data from the source dataset.
func deidentifyDataset(w io.Writer, projectID, location, sourceDatasetID, destinationDatasetID string) error {
	ctx := context.Background()

	healthcareService, err := healthcare.NewService(ctx)
	if err != nil {
		return fmt.Errorf("healthcare.NewService: %v", err)
	}

	datasetsService := healthcareService.Projects.Locations.Datasets

	parent := fmt.Sprintf("projects/%s/locations/%s", projectID, location)

	req := &healthcare.DeidentifyDatasetRequest{
		DestinationDataset: fmt.Sprintf("%s/datasets/%s", parent, destinationDatasetID),
		Config: &healthcare.DeidentifyConfig{
			Dicom: &healthcare.DicomConfig{
				KeepList: &healthcare.TagFilterList{
					Tags: []string{
						"PatientID",
					},
				},
			},
		},
	}

	sourceName := fmt.Sprintf("%s/datasets/%s", parent, sourceDatasetID)
	resp, err := datasetsService.Deidentify(sourceName, req).Do()
	if err != nil {
		return fmt.Errorf("Deidentify: %v", err)
	}

	// Wait for the deidentification operation to finish.
	operationService := healthcareService.Projects.Locations.Datasets.Operations
	for {
		op, err := operationService.Get(resp.Name).Do()
		if err != nil {
			return fmt.Errorf("operationService.Get: %v", err)
		}
		if !op.Done {
			time.Sleep(1 * time.Second)
			continue
		}
		if op.Error != nil {
			return fmt.Errorf("deidentify operation error: %v", *op.Error)
		}
		fmt.Fprintf(w, "Created de-identified dataset %s from %s\n", resp.Name, sourceName)
		return nil
	}
}

Java

import com.google.api.client.http.HttpRequestInitializer;
import com.google.api.client.http.javanet.NetHttpTransport;
import com.google.api.client.json.JsonFactory;
import com.google.api.client.json.jackson2.JacksonFactory;
import com.google.api.services.healthcare.v1.CloudHealthcare;
import com.google.api.services.healthcare.v1.CloudHealthcare.Projects.Locations.Datasets;
import com.google.api.services.healthcare.v1.CloudHealthcareScopes;
import com.google.api.services.healthcare.v1.model.DeidentifyConfig;
import com.google.api.services.healthcare.v1.model.DeidentifyDatasetRequest;
import com.google.api.services.healthcare.v1.model.DicomConfig;
import com.google.api.services.healthcare.v1.model.Operation;
import com.google.api.services.healthcare.v1.model.TagFilterList;
import com.google.auth.http.HttpCredentialsAdapter;
import com.google.auth.oauth2.GoogleCredentials;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;

public class DatasetDeIdentify {
  private static final String DATASET_NAME = "projects/%s/locations/%s/datasets/%s";
  private static final JsonFactory JSON_FACTORY = new JacksonFactory();
  private static final NetHttpTransport HTTP_TRANSPORT = new NetHttpTransport();

  public static void datasetDeIdentify(String srcDatasetName, String destDatasetName)
      throws IOException {
    // String srcDatasetName =
    //     String.format(DATASET_NAME, "your-project-id", "your-region-id", "your-src-dataset-id");
    // String destDatasetName =
    //    String.format(DATASET_NAME, "your-project-id", "your-region-id", "your-dest-dataset-id");

    // Initialize the client, which will be used to interact with the service.
    CloudHealthcare client = createClient();

    // Configure what information needs to be De-Identified.
    // For more information on de-identifying using tags, please see the following:
    // https://cloud.google.com/healthcare/docs/how-tos/dicom-deidentify#de-identification_using_tags
    TagFilterList tags = new TagFilterList().setTags(Arrays.asList("PatientID"));
    DicomConfig dicomConfig = new DicomConfig().setKeepList(tags);
    DeidentifyConfig config = new DeidentifyConfig().setDicom(dicomConfig);

    // Create the de-identify request and configure any parameters.
    DeidentifyDatasetRequest deidentifyRequest =
        new DeidentifyDatasetRequest().setDestinationDataset(destDatasetName).setConfig(config);
    Datasets.Deidentify request =
        client.projects().locations().datasets().deidentify(srcDatasetName, deidentifyRequest);

    // Execute the request, wait for the operation to complete, and process the results.
    try {
      Operation operation = request.execute();
      while (operation.getDone() == null || !operation.getDone()) {
        // Update the status of the operation with another request.
        Thread.sleep(500); // Pause for 500ms between requests.
        operation =
            client
                .projects()
                .locations()
                .datasets()
                .operations()
                .get(operation.getName())
                .execute();
      }
      System.out.println(
          "De-identified Dataset created. Response content: " + operation.getResponse());
    } catch (Exception ex) {
      System.out.printf("Error during request execution: %s", ex.toString());
      ex.printStackTrace(System.out);
    }
  }

  private static CloudHealthcare createClient() throws IOException {
    // Use Application Default Credentials (ADC) to authenticate the requests
    // For more information see https://cloud.google.com/docs/authentication/production
    GoogleCredentials credential =
        GoogleCredentials.getApplicationDefault()
            .createScoped(Collections.singleton(CloudHealthcareScopes.CLOUD_PLATFORM));

    // Create a HttpRequestInitializer, which will provide a baseline configuration to all requests.
    HttpRequestInitializer requestInitializer =
        request -> {
          new HttpCredentialsAdapter(credential).initialize(request);
          request.setConnectTimeout(60000); // 1 minute connect timeout
          request.setReadTimeout(60000); // 1 minute read timeout
        };

    // Build the client for interacting with the service.
    return new CloudHealthcare.Builder(HTTP_TRANSPORT, JSON_FACTORY, requestInitializer)
        .setApplicationName("your-application-name")
        .build();
  }
}

Node.js

const {google} = require('googleapis');
const healthcare = google.healthcare('v1');

const deidentifyDataset = async () => {
  const auth = await google.auth.getClient({
    scopes: ['https://www.googleapis.com/auth/cloud-platform'],
  });
  google.options({auth});

  // TODO(developer): uncomment these lines before running the sample
  // const cloudRegion = 'us-central1';
  // const projectId = 'adjective-noun-123';
  // const sourceDatasetId = 'my-source-dataset';
  // const destinationDatasetId = 'my-destination-dataset';
  // const keeplistTags = 'PatientID'
  const sourceDataset = `projects/${projectId}/locations/${cloudRegion}/datasets/${sourceDatasetId}`;
  const destinationDataset = `projects/${projectId}/locations/${cloudRegion}/datasets/${destinationDatasetId}`;
  const request = {
    sourceDataset: sourceDataset,
    destinationDataset: destinationDataset,
    resource: {
      config: {
        dicom: {
          keepList: {
            tags: [keeplistTags],
          },
        },
      },
    },
  };

  await healthcare.projects.locations.datasets.deidentify(request);
  console.log(
    `De-identified data written from dataset ${sourceDatasetId} to dataset ${destinationDatasetId}`
  );
};

deidentifyDataset();

Python

def deidentify_dataset(
        project_id,
        cloud_region,
        dataset_id,
        destination_dataset_id,
        keeplist_tags):
    """Creates a new dataset containing de-identified data
    from the source dataset.
    """
    client = get_client()
    source_dataset = 'projects/{}/locations/{}/datasets/{}'.format(
        project_id, cloud_region, dataset_id)
    destination_dataset = 'projects/{}/locations/{}/datasets/{}'.format(
        project_id, cloud_region, destination_dataset_id)

    body = {
        'destinationDataset': destination_dataset,
        'config': {
            'dicom': {
                'keepList': {
                    'tags': [
                        'Columns',
                        'NumberOfFrames',
                        'PixelRepresentation',
                        'MediaStorageSOPClassUID',
                        'MediaStorageSOPInstanceUID',
                        'Rows',
                        'SamplesPerPixel',
                        'BitsAllocated',
                        'HighBit',
                        'PhotometricInterpretation',
                        'BitsStored',
                        'PatientID',
                        'TransferSyntaxUID',
                        'SOPInstanceUID',
                        'StudyInstanceUID',
                        'SeriesInstanceUID',
                        'PixelData'
                    ]
                }
            }
        }
    }

    request = client.projects().locations().datasets().deidentify(
        sourceDataset=source_dataset, body=body)

    response = request.execute()
    print(
        'Data in dataset {} de-identified.'
        'De-identified data written to {}'.format(
            dataset_id,
            destination_dataset_id))
    return response

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser