구조화된 텍스트에서 민감한 정보 검사

Cloud Data Loss Prevention(DLP)는 CSV 또는 JSON과 같은 구조화된 콘텐츠에서 민감한 정보를 감지하고 분류할 수 있습니다. 콘텐츠를 테이블로 검사하거나 익명화하면 테이블의 구조와 열이 Cloud DLP에 추가 단서를 제공하므로 일부 사용 사례에서 더 나은 결과를 얻을 수 있습니다.

테이블 검사

아래 코드 샘플은 데이터 테이블에서 민감한 콘텐츠를 검사하는 방법을 보여줍니다. 테이블은 다양한 유형을 지원합니다.

프로토콜

JSON과 Cloud DLP API 사용법에 대한 자세한 내용은 JSON 빠른 시작을 참조하세요.

JSON 입력:

POST https://dlp.googleapis.com/v2/projects/[PROJECT_ID]/content:inspect?key={YOUR_API_KEY}

{
  "item":{
    "table":{
      "headers": [{"name":"name"}, {"name":"phone"}],
      "rows": [{
        "values":[
          {"string_value": "John Doe"},
          {"string_value": "(206) 555-0123"}
        ]}
      ]
    }
  },
  "inspectConfig":{
    "infoTypes":[
      {
        "name":"PHONE_NUMBER"
      }
    ],
    "includeQuote":true
  }
}

JSON 출력:

{
  "result": {
    "findings": [
     {
      "quote": "(206) 555-0123",
      "infoType": {
       "name": "PHONE_NUMBER"
      },
      "likelihood": "VERY_LIKELY",
      "location": {
         "byteRange": {
          "end": "14"
         },
         "codepointRange": {
          "end": "14"
         },
         "contentLocations": [
          {
           "recordLocation": {
              "fieldId": {
               "name": "phone"
              },
              "tableLocation": {
              }
           }
          }
         ]
      },
      "createTime": "2019-03-08T23:55:10.980Z"
     }
    ]
  }
}

자바


import com.google.cloud.dlp.v2.DlpServiceClient;
import com.google.privacy.dlp.v2.ByteContentItem;
import com.google.privacy.dlp.v2.ByteContentItem.BytesType;
import com.google.privacy.dlp.v2.ContentItem;
import com.google.privacy.dlp.v2.FieldId;
import com.google.privacy.dlp.v2.Finding;
import com.google.privacy.dlp.v2.InfoType;
import com.google.privacy.dlp.v2.InspectConfig;
import com.google.privacy.dlp.v2.InspectContentRequest;
import com.google.privacy.dlp.v2.InspectContentResponse;
import com.google.privacy.dlp.v2.Likelihood;
import com.google.privacy.dlp.v2.LocationName;
import com.google.privacy.dlp.v2.Table;
import com.google.privacy.dlp.v2.Table.Row;
import com.google.privacy.dlp.v2.Value;
import com.google.protobuf.ByteString;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Sample that inspects a structured table for sensitive data using the Cloud DLP API.
 *
 * <p>{@link #main(String[])} builds a one-row table with a {@code name} and a {@code phone}
 * column and passes it to {@link #inspectTable(String, Table)}.
 */
public class InspectTable {

  public static void main(String[] args) throws Exception {
    // TODO(developer): Replace these variables before running the sample.
    String projectId = "your-project-id";
    Table tableToInspect = Table.newBuilder()
        .addHeaders(FieldId.newBuilder().setName("name").build())
        .addHeaders(FieldId.newBuilder().setName("phone").build())
        .addRows(Row.newBuilder()
            .addValues(Value.newBuilder().setStringValue("John Doe").build())
            .addValues(Value.newBuilder().setStringValue("(206) 555-0123").build()))
        .build();

    inspectTable(projectId, tableToInspect);
  }

  /**
   * Inspects the provided table for {@code PHONE_NUMBER} findings and prints them.
   *
   * <p>The number of findings is printed first, followed by the quote, info type and likelihood
   * of each finding. Any exception raised while creating the client or sending the request is
   * caught and reported on standard output rather than propagated to the caller.
   *
   * @param projectId the Google Cloud project id used to build the request parent
   * @param tableToInspect the table whose cells are sent for inspection
   */
  public static void inspectTable(String projectId, Table tableToInspect) {
    // Initialize client that will be used to send requests. This client only needs to be created
    // once, and can be reused for multiple requests. The try-with-resources statement closes the
    // client once the request has completed.
    try (DlpServiceClient dlp = DlpServiceClient.create()) {
      // Specify the table to be inspected.
      ContentItem item = ContentItem.newBuilder().setTable(tableToInspect).build();

      // Specify the type of info the inspection will look for.
      // See https://cloud.google.com/dlp/docs/infotypes-reference for complete list of info types
      InfoType infoType = InfoType.newBuilder().setName("PHONE_NUMBER").build();

      // Construct the configuration for the Inspect request. Quotes are requested so that the
      // matched text can be printed below.
      InspectConfig config =
          InspectConfig.newBuilder()
              .addInfoTypes(infoType)
              .setIncludeQuote(true)
              .build();

      // Construct the Inspect request to be sent by the client.
      InspectContentRequest request =
          InspectContentRequest.newBuilder()
              .setParent(LocationName.of(projectId, "global").toString())
              .setItem(item)
              .setInspectConfig(config)
              .build();

      // Use the client to send the API request.
      InspectContentResponse response = dlp.inspectContent(request);

      // Parse the response and process results
      System.out.println("Findings: " + response.getResult().getFindingsCount());
      for (Finding f : response.getResult().getFindingsList()) {
        System.out.println("\tQuote: " + f.getQuote());
        System.out.println("\tInfo type: " + f.getInfoType().getName());
        System.out.println("\tLikelihood: " + f.getLikelihood());
      }
    } catch (Exception e) {
      // Sample boundary: report the failure instead of propagating it. The message names this
      // method so the output points at the right sample.
      System.out.println("Error during inspectTable: \n" + e.toString());
    }
  }
}

Python



def inspect_table(
    project,
    data,
    info_types,
    custom_dictionaries=None,
    custom_regexes=None,
    min_likelihood=None,
    max_findings=None,
    include_quote=True,
):
    """Uses the Data Loss Prevention API to analyze a table for protected data.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        data: The table to inspect, either as a dict with a "header" list and
            a "rows" list of lists, or as a JSON string encoding such a dict.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        custom_dictionaries: Optional list of comma-separated word lists; each
            entry becomes a custom info type named CUSTOM_DICTIONARY_<index>.
        custom_regexes: Optional list of regex patterns; each entry becomes a
            custom info type named CUSTOM_REGEX_<index>.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.

    Returns:
        None; the response from the API is printed to the terminal.

    Example:
        data = {
            "header":[
                "email",
                "phone number"
            ],
            "rows":[
                [
                    "robertfrost@xyz.com",
                    "4232342345"
                ],
                [
                    "johndoe@pqr.com",
                    "4253458383"
                ]
            ]
        }

        >> $ python inspect_content.py table '{"header": ["email", "phone number"],
        "rows": [["robertfrost@xyz.com", "4232342345"],
        ["johndoe@pqr.com", "4253458383"]]}'
        >>  Quote: robertfrost@xyz.com
            Info type: EMAIL_ADDRESS
            Likelihood: 4
            Quote: johndoe@pqr.com
            Info type: EMAIL_ADDRESS
            Likelihood: 4
    """

    import json

    # Import the client library. Import the module that is actually used below
    # instead of relying on it being loaded as a side effect of another import.
    import google.cloud.dlp_v2

    # Accept the table as a JSON string as well as an already-parsed dict;
    # indexing a raw string with "header" would otherwise raise TypeError.
    if isinstance(data, (str, bytes, bytearray)):
        data = json.loads(data)

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [{"name": info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [
        {
            "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
            "dictionary": {"word_list": {"words": custom_dict.split(",")}},
        }
        for i, custom_dict in enumerate(custom_dictionaries)
    ]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [
        {
            "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
            "regex": {"pattern": custom_regex},
        }
        for i, custom_regex in enumerate(custom_regexes)
    ]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        "info_types": info_types,
        "custom_info_types": custom_info_types,
        "min_likelihood": min_likelihood,
        "include_quote": include_quote,
        "limits": {"max_findings_per_request": max_findings},
    }

    # Construct the `table`. For more details on the table schema, please see
    # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
    headers = [{"name": val} for val in data["header"]]
    rows = [
        {"values": [{"string_value": cell_val} for cell_val in row]}
        for row in data["rows"]
    ]
    item = {"table": {"headers": headers, "rows": rows}}

    # Convert the project id into a full resource id.
    parent = f"projects/{project}"

    # Call the API.
    response = dlp.inspect_content(
        request={"parent": parent, "inspect_config": inspect_config, "item": item}
    )

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            # The quote is only printed when present and non-empty; a finding
            # object without a `quote` attribute is tolerated.
            try:
                if finding.quote:
                    print("Quote: {}".format(finding.quote))
            except AttributeError:
                pass
            print("Info type: {}".format(finding.info_type.name))
            print("Likelihood: {}".format(finding.likelihood))
    else:
        print("No findings.")

텍스트와 구조화된 텍스트 비교

텍스트를 구조화하면 컨텍스트를 제공할 수 있습니다. 앞의 예시와 동일한 요청을 'John Doe, (206) 555-0123'과 같은 단순 문자열로 검사하면 결과의 정확도가 떨어집니다. Cloud DLP가 해당 숫자의 용도를 판단할 컨텍스트 단서가 더 적기 때문입니다. 가장 정확한 스캔 결과를 얻으려면 가능한 경우 문자열을 테이블 객체로 파싱하는 것이 좋습니다.