Managing datasets

A dataset contains representative samples of the type of entities you want to identify and annotate, labeled with the labels you want your custom model to use. The dataset serves as the input for training a model.

The main steps for building a dataset are:

  1. Create a dataset.
  2. Import data items into the dataset.
  3. Identify the entities.

In many cases, steps 2 and 3 are combined: you import data items whose entities are already annotated.

A project can have multiple datasets, each used to train a separate model. You can get a list of the available datasets and can delete datasets you no longer need.
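
For readers who prefer to see the flow end to end, the following condensed Python sketch combines the steps using the v1beta1 client methods shown later on this page; the project ID, region, and CSV path are illustrative.

    from google.cloud import automl_v1beta1 as automl

    client = automl.AutoMlClient()
    project_location = client.location_path("my-project", "us-central1")

    # Step 1: create an empty dataset.
    dataset = client.create_dataset(
        project_location,
        {"display_name": "test_dataset", "text_extraction_dataset_metadata": {}},
    )

    # Steps 2 and 3 combined: import items whose entities are already
    # annotated, via a CSV file in Cloud Storage that points to JSONL files.
    input_config = {"gcs_source": {"input_uris": ["gs://my-bucket/import/dataset.csv"]}}
    client.import_data(dataset.name, input_config).result()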

Creating a dataset

The first step in creating a custom model is to create an empty dataset that will eventually hold the training data for the model. The newly created dataset doesn't contain any data until you import items into it.

Web UI

  1. Open the AutoML Natural Language Entity Extraction UI and select the Get started link in the AutoML Entity Extraction box.

    The Datasets page shows the status of previously created datasets for the current project.

    Dataset list page

    To add a dataset for a different project, select the project from the drop-down list in the upper right of the title bar.

  2. Click the New Dataset button in the title bar.

  3. On the Create dataset page, enter a name for the dataset, then click Create dataset.

Command-line

In the command below, replace project-id with the ID for your project.

curl \
  -X POST \
  -H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \
  -H "Content-Type: application/json" \
  https://automl.googleapis.com/v1beta1/projects/project-id/locations/us-central1/datasets \
  -d '{
    "displayName": "test_dataset",
    "textExtractionDatasetMetadata": {
    }
  }'

You should see output similar to the following:

{
  name: "projects/000000000000/locations/us-central1/datasets/TEN5582774688079151104"
  display_name: "test_dataset"
  create_time {
    seconds: 1539886451
    nanos: 757650000
  }
  text_extraction_dataset_metadata {
  }
}

Note the name and ID of the new dataset; you will need them for other operations, such as importing items into your dataset and training a model. The dataset name has the format projects/{project-id}/locations/us-central1/datasets/{dataset-id}; the dataset ID is the element that appears after datasets/ in the "name" value of the response.
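
For example, here is a minimal Python sketch that extracts the dataset ID from the "name" value; the name is copied from the sample response above.

    # Parse the dataset ID out of the full dataset name.
    dataset_name = "projects/000000000000/locations/us-central1/datasets/TEN5582774688079151104"
    dataset_id = dataset_name.split("/")[-1]
    print(dataset_id)  # TEN5582774688079151104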

Java

import com.google.cloud.automl.v1beta1.AutoMlClient;
import com.google.cloud.automl.v1beta1.Dataset;
import com.google.cloud.automl.v1beta1.LocationName;
import com.google.cloud.automl.v1beta1.TextExtractionDatasetMetadata;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;

class CreateDataset {

  // Create an empty dataset
  static void createDataset(String projectId, String computeRegion, String datasetName)
      throws IOException {
    // String projectId = "YOUR_PROJECT_ID";
    // String computeRegion = "us-central1";
    // String datasetName = "YOUR_DATASET_DISPLAY_NAME";

    // Instantiates a client
    try (AutoMlClient client = AutoMlClient.create()) {

      // A resource that represents Google Cloud Platform location.
      LocationName projectLocation = LocationName.of(projectId, computeRegion);

      // Specify the text extraction dataset metadata for the dataset.
      TextExtractionDatasetMetadata textExtractionDatasetMetadata =
          TextExtractionDatasetMetadata.newBuilder().build();

      // Set dataset name and dataset metadata.
      Dataset myDataset =
          Dataset.newBuilder()
              .setDisplayName(datasetName)
              .setTextExtractionDatasetMetadata(textExtractionDatasetMetadata)
              .build();

      // Create a dataset with the dataset metadata in the region.
      Dataset dataset = client.createDataset(projectLocation, myDataset);

      // Display the dataset information.
      System.out.println(String.format("Dataset name: %s", dataset.getName()));
      System.out.println(
          String.format(
              "Dataset Id: %s",
              dataset.getName().split("/")[dataset.getName().split("/").length - 1]));
      System.out.println(String.format("Dataset display name: %s", dataset.getDisplayName()));
      System.out.println("Text extraction dataset metadata:");
      System.out.print(String.format("\t%s", dataset.getTextExtractionDatasetMetadata()));
      System.out.println(String.format("Dataset example count: %d", dataset.getExampleCount()));
      DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
      String createTime =
          dateFormat.format(new java.util.Date(dataset.getCreateTime().getSeconds() * 1000));
      System.out.println(String.format("Dataset create time: %s", createTime));
    }
  }
}

Node.js

const automl = require(`@google-cloud/automl`);
const util = require(`util`);
const client = new automl.v1beta1.AutoMlClient();

/**
 * Demonstrates using the AutoML client to create a dataset
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const projectId = '[PROJECT_ID]' e.g., "my-gcloud-project";
// const computeRegion = '[REGION_NAME]' e.g., "us-central1";
// const datasetName = '[DATASET_NAME]' e.g., "myDataset";

// A resource that represents Google Cloud Platform location.
const projectLocation = client.locationPath(projectId, computeRegion);

// Set dataset name and metadata.
const myDataset = {
  displayName: datasetName,
  textExtractionDatasetMetadata: {},
};

// Create a dataset with the dataset metadata in the region.
client
  .createDataset({parent: projectLocation, dataset: myDataset})
  .then(responses => {
    const dataset = responses[0];

    // Display the dataset information.
    console.log(`Dataset name: ${dataset.name}`);
    console.log(`Dataset Id: ${dataset.name.split(`/`).pop()}`);
    console.log(`Dataset display name: ${dataset.displayName}`);
    console.log(`Dataset example count: ${dataset.exampleCount}`);
    console.log(
      `Text extraction dataset metadata: ${util.inspect(
        dataset.textExtractionDatasetMetadata,
        false,
        null
      )}`
    );
  })
  .catch(err => {
    console.error(err);
  });

Python

    # TODO(developer): Uncomment and set the following variables
    # project_id = '[PROJECT_ID]'
    # compute_region = '[COMPUTE_REGION]'
    # dataset_name = '[DATASET_NAME]'

    from datetime import datetime
    from google.cloud import automl_v1beta1 as automl

    client = automl.AutoMlClient()

    # A resource that represents Google Cloud Platform location.
    project_location = client.location_path(project_id, compute_region)

    # Set dataset name and metadata.
    my_dataset = {
        "display_name": dataset_name,
        "text_extraction_dataset_metadata": {}
    }

    # Create a dataset with the dataset metadata in the region.
    dataset = client.create_dataset(project_location, my_dataset)

    # Display the dataset information.
    print("Dataset name: {}".format(dataset.name))
    print("Dataset id: {}".format(dataset.name.split("/")[-1]))
    print("Dataset display name: {}".format(dataset.display_name))
    print("Dataset example count: {}".format(dataset.example_count))
    print("Dataset create time: {}".format(datetime.fromtimestamp(dataset.create_time.seconds).strftime("%Y-%m-%dT%H:%M:%SZ")))

Importing items into a dataset

After you have created a dataset, you can import training items from a CSV file stored in a Google Cloud Storage bucket. For details on preparing your data and creating a CSV file for import, see Preparing your training data.
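
As a sketch of how you might assemble the import file: assuming the CSV layout described in Preparing your training data (one row per JSONL file, with an optional data split), the following Python snippet builds such a CSV and uploads it to Cloud Storage; the bucket, paths, and split values are illustrative.

    # A minimal sketch; bucket, paths, and split values are illustrative.
    from google.cloud import storage

    csv_content = (
        "TRAIN,gs://my-bucket/annotated-items-1.jsonl\n"
        "TEST,gs://my-bucket/annotated-items-2.jsonl\n"
    )

    storage_client = storage.Client()
    bucket = storage_client.bucket("my-bucket")
    bucket.blob("import/dataset.csv").upload_from_string(
        csv_content, content_type="text/csv"
    )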

Web UI

You must create a dataset before you can import items into it (see Creating a dataset). The steps below import items into an existing dataset. If you import additional items, you must train a new model based on the expanded dataset.

  1. Open the AutoML Natural Language Entity Extraction UI, select the Get started link in the AutoML Entity Extraction box, and select the dataset from the Datasets page.

  2. On the Import page, enter the location of the training data to import.

    In the Select a CSV file on Cloud Storage text box, enter the path for the CSV file containing your training data. (The gs:// prefix is added automatically.) Alternatively, you can click Browse and navigate to the CSV file.

    The training data consists of JSONL files containing sample text items annotated to identify the entities you want the model to learn to extract. To import the training data into the dataset, you use a CSV file that points to the JSONL files; see Preparing your training data for information about the format.

  3. Click Import.

    You're returned to the Datasets page; your dataset shows an in-progress animation while your documents are being imported. Importing takes approximately 10 to 20 minutes per 1,000 documents, though actual times vary.

    If the service returns a 405 error, reduce the number of documents you're uploading at once. You'll need to refresh the page before trying again.

Command-line

In the command below, replace project-id and dataset-id with your IDs, and replace csv-file-URI with the path to the .csv file in your Google Cloud Storage bucket.

curl \
  -X POST \
  -H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \
  -H "Content-Type: application/json" \
  https://automl.googleapis.com/v1beta1/projects/project-id/locations/us-central1/datasets/dataset-id:importData \
  -d '{
  "inputConfig": {
    "gcsSource": {
      "inputUris": ["csv-file-URI"]
    }
  }
}'

You should see output similar to the following. You can use the operation ID to get the status of the task. For an example, see Getting the status of an operation.

{
  "name": "projects/000000000000/locations/us-central1/operations/1979469554520650937",
  "metadata": {
    "@type": "type.googleapis.com/google.cloud.automl.v1beta1.OperationMetadata",
    "createTime": "2018-04-27T01:28:36.128120Z",
    "updateTime": "2018-04-27T01:28:36.128150Z",
    "cancellable": true
  }
}

Java

import com.google.cloud.automl.v1beta1.AutoMlClient;
import com.google.cloud.automl.v1beta1.DatasetName;
import com.google.cloud.automl.v1beta1.GcsSource;
import com.google.cloud.automl.v1beta1.InputConfig;
import com.google.protobuf.Empty;
import java.io.IOException;
import java.util.concurrent.ExecutionException;

class ImportData {

  // Import data from Google Cloud Storage into a dataset
  static void importData(String projectId, String computeRegion, String datasetId, String[] gcsUris)
      throws InterruptedException, ExecutionException, IOException {
    // String projectId = "YOUR_PROJECT_ID";
    // String computeRegion = "us-central1";
    // String datasetId = "YOUR_DATASET_ID";
    // String[] gcsUris = {"PATH_TO_YOUR_DATA_FILE1", "PATH_TO_YOUR_DATA_FILE2"};

    // Instantiates a client
    try (AutoMlClient client = AutoMlClient.create()) {

      // Get the complete path of the dataset.
      DatasetName datasetFullId = DatasetName.of(projectId, computeRegion, datasetId);

      GcsSource.Builder gcsSource = GcsSource.newBuilder();

      // Get multiple training data files to be imported from gcsSource.
      for (String inputUri : gcsUris) {
        gcsSource.addInputUris(inputUri);
      }

      // Import data from the input URI
      InputConfig inputConfig = InputConfig.newBuilder().setGcsSource(gcsSource).build();
      System.out.println("Processing import...");

      Empty response = client.importDataAsync(datasetFullId, inputConfig).get();
      System.out.println(String.format("Dataset imported. %s", response));
    }
  }
}

Node.js

const automl = require(`@google-cloud/automl`);
const client = new automl.v1beta1.AutoMlClient();

/**
 * Demonstrates using the AutoML client to import labeled items.
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const projectId = '[PROJECT_ID]' e.g., "my-gcloud-project";
// const computeRegion = '[REGION_NAME]' e.g., "us-central1";
// const datasetId = '[DATASET_ID]' e.g., "TEN8051890775971069952";
// const gcsPath = '[GCS_PATH]' e.g., "gs://<bucket-name>/<csv file>";
// comma-separated .csv paths in AutoML Natural Language Entity CSV format

// Get the full path of the dataset.
const datasetFullId = client.datasetPath(projectId, computeRegion, datasetId);

// Get the multiple Google Cloud Storage URIs.
const inputUris = gcsPath.split(`,`);
const inputConfig = {
  gcsSource: {
    inputUris: inputUris,
  },
};

// Import the data from the input URI.
client
  .importData({name: datasetFullId, inputConfig: inputConfig})
  .then(responses => {
    const operation = responses[0];
    console.log(`Processing import...`);
    return operation.promise();
  })
  .then(responses => {
    // The final result of the operation.
    const operationDetails = responses[2];

    // Get the data import details.
    console.log('Data import details:');
    console.log(`\tOperation details:`);
    console.log(`\t\tName: ${operationDetails.name}`);
    console.log(`\t\tDone: ${operationDetails.done}`);
  })
  .catch(err => {
    console.error(err);
  });

Python

    # TODO(developer): Uncomment and set the following variables
    # project_id = '[PROJECT_ID]'
    # compute_region = '[COMPUTE_REGION]'
    # dataset_id = '[DATASET_ID]'
    # path = 'gs://path/to/file.csv'

    from google.cloud import automl_v1beta1 as automl

    client = automl.AutoMlClient()

    # Get the full path of the dataset.
    dataset_full_id = client.dataset_path(
        project_id, compute_region, dataset_id
    )

    # Get the multiple Google Cloud Storage URIs.
    input_uris = path.split(",")
    input_config = {"gcs_source": {"input_uris": input_uris}}

    # Import the dataset from the input URI.
    response = client.import_data(dataset_full_id, input_config)

    print("Processing import...")
    # synchronous check of operation status.
    print("Data imported. {}".format(response.result()))

Identifying the entities

To train your custom model, you must provide representative samples of the type of entities you want AutoML Natural Language Entity Extraction to identify in text, annotated with the labels you want the model to use. Each annotation includes a label and a span of text. You can identify entities in your training data in three ways:

  • Annotate the JSONL files
  • Add annotations in the AutoML Natural Language Entity Extraction UI
  • Request labeling from human labelers using the AI Platform Data Labeling Service

You can also combine the first two options: upload labeled JSONL files, then modify them in the UI.

The AutoML API does not include methods for annotating training data.

For details about annotating your JSONL files, see Preparing your training data.
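
As a rough illustration, the sketch below writes a single pre-annotated training item as JSONL; the exact field layout is defined in Preparing your training data, and the label, text, and character offsets here are illustrative.

    import json

    # One annotated item; the offsets mark the span "ibuprofen" in the content.
    item = {
        "text_snippet": {"content": "The patient was prescribed ibuprofen for pain."},
        "annotations": [
            {
                "display_name": "Medicine",
                "text_extraction": {
                    "text_segment": {"start_offset": 27, "end_offset": 36}
                },
            }
        ],
    }

    with open("annotated-items-1.jsonl", "w") as f:
        f.write(json.dumps(item) + "\n")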

To annotate in the AutoML Natural Language Entity Extraction UI, select the dataset from the dataset listing page to see its details. The display name of the selected dataset appears in the title bar, and the page lists the individual items in the dataset along with the annotations in them. The navigation bar along the left summarizes the labels and the number of times each label appears. You can also filter the item list by label.

Annotation list

To add or delete annotations within a text item, double-click the item you want to update. The Edit page shows the complete text of the selected item, with all previous annotations highlighted.

Entity editor

To add a new annotation, highlight the text that represents the entity, select the label from the Add entity dialog box, and click Save.

Add annotation

To remove an annotation, locate the text in the list of labels on the right and click the garbage can icon next to it.

Listing datasets

A project can include numerous datasets. This section describes how to retrieve a list of the available datasets for a project.

Web UI

To see a list of the available datasets using the AutoML Natural Language Entity Extraction UI, click the Datasets link at the top of the left navigation menu.

Dataset list page

To see the datasets for a different project, select the project from the drop-down list in the upper right of the title bar.

Command-line

In the command below, replace project-id with the ID for your project.

curl \
  -H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \
  -H "Content-Type: application/json" \
  https://automl.googleapis.com/v1beta1/projects/project-id/locations/us-central1/datasets

You should see output similar to the following:

{
  "datasets": [
    {
      "name": "projects/000000000000/locations/us-central1/datasets/356587829854924648",
      "displayName": "test_dataset",
      "createTime": "2018-04-26T18:02:59.825060Z",
      "textExtractionDatasetMetadata": {
      }
    },
    {
      "name": "projects/000000000000/locations/us-central1/datasets/3104518874390609379",
      "displayName": "test",
      "createTime": "2017-12-16T01:10:38.328280Z",
      "textExtractionDatasetMetadata": {
      }
      }
    }
  ]
}

Java

import com.google.cloud.automl.v1beta1.AutoMlClient;
import com.google.cloud.automl.v1beta1.Dataset;
import com.google.cloud.automl.v1beta1.ListDatasetsRequest;
import com.google.cloud.automl.v1beta1.LocationName;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;

class ListDatasets {

  // List all datasets for a given project based on the filter expression
  static void listDatasets(String projectId, String computeRegion, String filter)
      throws IOException {
    // String projectId = "YOUR_PROJECT_ID";
    // String computeRegion = "us-central1";
    // String filter = "YOUR_FILTER_EXPRESSION";

    // Instantiates a client
    try (AutoMlClient client = AutoMlClient.create()) {

      // A resource that represents Google Cloud Platform location.
      LocationName projectLocation = LocationName.of(projectId, computeRegion);

      // Build the List datasets request
      ListDatasetsRequest request =
          ListDatasetsRequest.newBuilder()
              .setParent(projectLocation.toString())
              .setFilter(filter)
              .build();

      // List all the datasets available in the region by applying filter.
      System.out.println("List of datasets:");
      for (Dataset dataset : client.listDatasets(request).iterateAll()) {
        // Display the dataset information.
        System.out.println(String.format("\nDataset name: %s", dataset.getName()));
        System.out.println(
            String.format(
                "Dataset Id: %s",
                dataset.getName().split("/")[dataset.getName().split("/").length - 1]));
        System.out.println(String.format("Dataset display name: %s", dataset.getDisplayName()));
        System.out.println("Text extraction dataset metadata:");
        System.out.print(String.format("\t%s", dataset.getTextExtractionDatasetMetadata()));
        System.out.println(String.format("Dataset example count: %d", dataset.getExampleCount()));
        DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ");
        String createTime =
            dateFormat.format(new java.util.Date(dataset.getCreateTime().getSeconds() * 1000));
        System.out.println(String.format("Dataset create time: %s", createTime));
      }
    }
  }
}

Node.js

const automl = require(`@google-cloud/automl`);
const util = require(`util`);
const client = new automl.v1beta1.AutoMlClient();

/**
 * Demonstrates using the AutoML client to list all datasets.
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const projectId = '[PROJECT_ID]' e.g., "my-gcloud-project";
// const computeRegion = '[REGION_NAME]' e.g., "us-central1";
// const filter_ = '[FILTER_EXPRESSIONS]'
// e.g., "textExtractionDatasetMetadata:*";

// A resource that represents Google Cloud Platform location.
const projectLocation = client.locationPath(projectId, computeRegion);

// List all the datasets available in the region by applying filter.
client
  .listDatasets({parent: projectLocation, filter: filter_})
  .then(responses => {
    const datasets = responses[0];

    // Display the dataset information.
    console.log(`List of datasets:`);
    for (let i = 0; i < datasets.length; i++) {
      console.log(`\nDataset name: ${datasets[i].name}`);
      console.log(`Dataset Id: ${datasets[i].name.split(`/`).pop()}`);
      console.log(`Dataset display name: ${datasets[i].displayName}`);
      console.log(`Dataset example count: ${datasets[i].exampleCount}`);
      console.log(
        `Text extraction dataset metadata: ${util.inspect(
          datasets[i].textExtractionDatasetMetadata,
          false,
          null
        )}`
      );
    }
  })
  .catch(err => {
    console.error(err);
  });

Python

    # TODO(developer): Uncomment and set the following variables
    # project_id = '[PROJECT_ID]'
    # compute_region = '[COMPUTE_REGION]'
    # filter_ = 'filter expression here'

    from datetime import datetime
    from google.cloud import automl_v1beta1 as automl

    client = automl.AutoMlClient()

    # A resource that represents Google Cloud Platform location.
    project_location = client.location_path(project_id, compute_region)

    # List all the datasets available in the region by applying filter.
    response = client.list_datasets(project_location, filter_)

    print("List of datasets:")
    for dataset in response:
        # Display the dataset information.
        print("Dataset name: {}".format(dataset.name))
        print("Dataset id: {}".format(dataset.name.split("/")[-1]))
        print("Dataset display name: {}".format(dataset.display_name))
        print("Dataset example count: {}".format(dataset.example_count))
        print("Dataset create time: {}".format(datetime.fromtimestamp(dataset.create_time.seconds).strftime("%Y-%m-%dT%H:%M:%SZ")))

Deleting a dataset

Web UI

  1. In the AutoML Natural Language Entity Extraction UI, click the Datasets link at the top of the left navigation menu to display the list of available datasets.

    Dataset list page

  2. Click the three-dot menu at the far right of the row you want to delete and select Delete dataset.

  3. Click Delete in the confirmation dialog box.

Command-line

Replace dataset-name with the full name of your dataset, as shown in the response when you created it. The full name has the format: projects/{project-id}/locations/us-central1/datasets/{dataset-id}

curl -X DELETE \
  -H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \
  -H "Content-Type: application/json" https://automl.googleapis.com/v1beta1/dataset-name

You should see output similar to the following:

{
  "name": "projects/000000000000/locations/us-central1/operations/3512013641657611176",
  "metadata": {
    "@type": "type.googleapis.com/google.cloud.automl.v1beta1.OperationMetadata",
    "createTime": "2018-05-04T01:45:16.735340Z",
    "updateTime": "2018-05-04T01:45:16.735360Z",
    "cancellable": true
  }
}

Java

import com.google.cloud.automl.v1beta1.AutoMlClient;
import com.google.cloud.automl.v1beta1.DatasetName;
import com.google.protobuf.Empty;
import java.io.IOException;
import java.util.concurrent.ExecutionException;

class DeleteDataset {

  // Delete a dataset
  static void deleteDataset(String projectId, String computeRegion, String datasetId)
      throws InterruptedException, ExecutionException, IOException {
    // String projectId = "YOUR_PROJECT_ID";
    // String computeRegion = "us-central1";
    // String datasetId = "YOUR_DATASET_ID";

    // Instantiates a client
    try (AutoMlClient client = AutoMlClient.create()) {

      // Get the complete path of the dataset.
      DatasetName datasetFullId = DatasetName.of(projectId, computeRegion, datasetId);

      // Delete a dataset.
      Empty response = client.deleteDatasetAsync(datasetFullId).get();
      System.out.println(String.format("Dataset deleted. %s", response));
    }
  }
}

Node.js

const automl = require(`@google-cloud/automl`);
const client = new automl.v1beta1.AutoMlClient();

/**
 * Demonstrates using the AutoML client to delete a dataset.
 * TODO(developer): Uncomment the following lines before running the sample.
 */
// const projectId = '[PROJECT_ID]' e.g., "my-gcloud-project";
// const computeRegion = '[REGION_NAME]' e.g., "us-central1";
// const datasetId = '[DATASET_ID]' e.g., "TEN8051890775971069952";

// Get the full path of the dataset.
const datasetFullId = client.datasetPath(projectId, computeRegion, datasetId);

// Delete a dataset.
client
  .deleteDataset({name: datasetFullId})
  .then(responses => {
    const operation = responses[0];
    return operation.promise();
  })
  .then(responses => {
    // The final result of the operation.
    const operationDetails = responses[2];

    // Get the Dataset delete details.
    console.log('Dataset delete details:');
    console.log(`\tOperation details:`);
    console.log(`\t\tName: ${operationDetails.name}`);
    console.log(`\t\tDone: ${operationDetails.done}`);
  })
  .catch(err => {
    console.error(err);
  });

Python

    # TODO(developer): Uncomment and set the following variables
    # project_id = '[PROJECT_ID]'
    # compute_region = '[COMPUTE_REGION]'
    # dataset_id = '[DATASET_ID]'

    from google.cloud import automl_v1beta1 as automl

    client = automl.AutoMlClient()

    # Get the full path of the dataset.
    dataset_full_id = client.dataset_path(
        project_id, compute_region, dataset_id
    )

    # Delete a dataset.
    response = client.delete_dataset(dataset_full_id)

    # synchronous check of operation status.
    print("Dataset deleted. {}".format(response.result()))
