Exporting labeled data

When the labeling operation is complete, you can export the annotated dataset to your Google Cloud Storage bucket by calling ExportData.

ExportData supports returning a .csv file containing one row for each annotation or data item. The first field of each row indicates the ML usage category of that row, which defaults to UNASSIGNED. ExportData also supports a jsonl file in which each line represents one example, consisting of a data item and all of its annotations. Below are examples for each type.

  • Image classification

    • csv line:

      UNASSIGNED,image_url,label_1,label_2,...

    • json line:

      {
      "name":"projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id",
      "imagePayload":{
      "mimeType":"IMAGE_PNG",
      "imageUri":"gs://sample_bucket/image.png"
      },
      "annotations":[
      {
       "name":"projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id/annotations/annotation_id",
       "annotationValue":{
          "imageClassificationAnnotation":{
             "annotationSpec":{
                "displayName":"tulip"
             }
          }
       }
      }
      ]
      }

  • Image bounding box

    • csv line: The four points are top-left, top-right, bottom-right, bottom-left. The second and fourth points are optional. Each point is represented by x,y. Each line will contain one bounding box. Multiple boxes of an image will be in multiple lines.

      UNASSIGNED,image_url,label,0.1,0.1,,,0.3,0.3,,

    • json line: if a coordinate in normalizedVertices is not set, that field is 0 by default. This also applies to any coordinate based annotations.

    {
    "name":"projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id",
    "imagePayload":{
      "mimeType":"IMAGE_PNG",
      "imageUri":"gs://sample_bucket/image.png"
    },
    "annotations":[
      {
         "name":"projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id/annotations/annotation_id",
         "annotationValue":{
           "image_bounding_poly_annotation": {
            "annotationSpec": {
              "displayName": "tulip"
            },
            "normalizedBoundingPoly": {
              "normalizedVertices": [ {
                "x": 0.1,
                "y": 0.2
              }, {
                "x": 0.9,
                "y": 0.9
              } ]
            }
         }
      }
    }
    ]
    }
  • Image bounding polygon, oriented bounding box and polyline

    • csv line: Each point in the closed polygon/polyline is represented by the x,y point, separated by two empty csv columns. The last pair connects back to the first pair for polygon while there is no closed cycle for polyline. Each line represents one polygon/polyline.

      UNASSIGNED,image_url,label,0.1,0.1,,,0.3,0.3,,,0.6,0.6,,...

    • json line:

      {
      "name":"projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id",
      "imagePayload":{
      "mimeType":"IMAGE_PNG",
      "imageUri":"gs://sample_bucket/image.png"
      },
      "annotations":[
      {
       "name":"projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id/annotations/annotation_id",
       "annotationValue":{
         "image_bounding_poly_annotation": {
          "annotationSpec": {
            "displayName": "tulip"
          },
          "normalizedBoundingPoly": {
            "normalizedVertices": [ {
              "x": 0.1,
              "y": 0.1
            }, {
              "x": 0.1,
              "y": 0.2
            }, {
              "x": 0.2,
              "y": 0.3
            }  ]
          }
       }
      }
      }
      ]
      }

  • Image segmentation

    For image segmentation, only jsonl output is provided.

    • json line: The imageBytes field in imageSegmentationAnnotation represents the segmentation mask for that image. The color for each label (that is, each dog and cat) is shown in the annotationColors field.
      {
      "name":"projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id",
      "imagePayload":{
      "mimeType":"IMAGE_PNG",
      "imageUri":"gs://sample_bucket/image.png"
      },
      "annotations":[
      {
       "name":"projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id/annotations/annotation_id",
       "annotationValue":{
         "imageSegmentationAnnotation": {
            "annotationColors": [ {
              "key": "rgb(0,0,255)",
              "value": {
                "display_name": "dog"
              }
            }, {
              "key": "rgb(0,255,0)",
              "value": {
                "display_name": "cat"
              }
            } ],
            "mimeType": "IMAGE_JPEG",
            "imageBytes": "/9j/4AAQSkZJRgABAQAAAQABAAD/2"
       }
      }
      }
      ]
      }
  • Video classification

    • csv line:

      UNASSIGNED,video_url,label,segment_start_time,segment_end_time

    • json line:

      {
      "name": "projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id",
      "videoPayload": {
      "mimeType": "VIDEO_MP4",
      "resolution": {
        "width": 720,
        "height": 360
      },
      "frameRate": 24
      },
      "annotations": [ {
      "name": "projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id/annotations/annotation_id",
      "annotationSource": 3,
      "annotationValue": {
        "videoClassificationAnnotation": {
          "timeSegment": {
            "startTimeOffset": {
              "seconds": 10
            },
            "endTimeOffset": {
              "seconds": 20
            }
          },
          "annotationSpec": {
            "displayName": "dog"
          }
        }
      }
      } ]
      }

  • Video object detection

    • csv line: The four points are top-left, top-right, bottom-right, bottom-left. The second and fourth points are optional. Each point is represented by x,y. Each line will contain one bounding box.

      UNASSIGNED,video_url,label,timestamp,0.1,0.1,,,0.3,0.3,,

    • json line:

      {
      "name": "projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id",
      "videoPayload": {
      "mimeType": "VIDEO_MP4",
      "resolution": {
        "width": 720,
        "height": 360
      },
      "frameRate": 24
      },
      "annotations": [ {
      "name": "projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id/annotations/annotation_id",
      "annotationSource": 3,
      "annotationValue": {
        "videoObjectTrackingAnnotation": {
      "annotationSpec": {
        "displayName": "tulip"
      },
      "timeSegment": {
        "startTimeOffset": {
          "seconds": 10
        },
        "endTimeOffset": {
          "seconds": 10
        }
      },
      "objectTrackingFrames": [ {
        "normalizedBoundingPoly": {
          "normalizedVertices": [ {
            "x": 0.2,
            "y": 0.3
          }, {
            "x": 0.9,
            "y": 0.5
          } ]
        }
      }, {
        "normalizedBoundingPoly": {
          "normalizedVertices": [ {
            "x": 0.3,
            "y": 0.3
          }, {
            "x": 0.5,
            "y": 0.7
          } ]
        }
      } ]
      }
      }
      }]}

  • Video object tracking

    • csv line: The four points are top-left, top-right, bottom-right, bottom-left. The second and fourth points are optional. Each point is represented by x,y. Each line will contain one bounding box. Each object in the video is represented by a unique instance_id.

      UNASSIGNED,video_url,label,instance_id,timestamp,0.1,0.1,,,0.3,0.3,,

    • json line:

      {
      "name": "projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id",
      "videoPayload": {
      "mimeType": "VIDEO_MP4",
      "resolution": {
        "width": 720,
        "height": 360
      },
      "frameRate": 24
      },
      "annotations": [ {
      "name": "projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id/annotations/annotation_id",
      "annotationSource": 3,
      "annotationValue": {
        "videoObjectTrackingAnnotation": {
      "annotationSpec": {
        "displayName": "tulip"
      },
      "timeSegment": {
        "startTimeOffset": {
          "seconds": 10
        },
        "endTimeOffset": {
          "seconds": 20
        }
      },
      "objectTrackingFrames": [ {
        "normalizedBoundingPoly": {
          "normalizedVertices": [ {
            "x": 0.2,
            "y": 0.3
          }, {
            "x": 0.9,
            "y": 0.5
          } ]
        },
        "timeOffset": {
          "nanos": 1000000
        }
      }, {
        "normalizedBoundingPoly": {
          "normalizedVertices": [ {
            "x": 0.3,
            "y": 0.3
          }, {
            "x": 0.5,
            "y": 0.7
          } ]
        },
        "timeOffset": {
          "nanos": 84000000
        }
      } ]
      }
      }
      }]}

  • Video event

    • csv line: Each line contains one event label together with the start time and end time of the video segment that the event covers.

      UNASSIGNED,video_url,label,segment_start_time,segment_end_time

    • json line:

      {
      "name": "projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id",
      "videoPayload": {
      "mimeType": "VIDEO_MP4",
      "resolution": {
        "width": 720,
        "height": 360
      },
      "frameRate": 24
      },
      "annotations": [ {
      "name": "projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id/annotations/annotation_id",
      "annotationValue": {
        "videoEventAnnotation": {
          "annotationSpec": {
            "displayName": "Callie"
          },
          "timeSegment": {
            "startTimeOffset": {
              "seconds": 123
            },
            "endTimeOffset": {
              "seconds": 150
            }
          }
        }
      }
      } ]
      }

  • Text classification

    • csv line:

      UNASSIGNED,text_url,label_1

    • json line:

      {
      "name": "projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id",
      "textPayload": {
        "textContent": "dummy_text_content",
        "textUri": "gs://test_bucket/file.txt",
        "wordCount": 1
      },
      "annotations": [ {
        "name": "projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id/annotations/fake_annotation_id",
        "annotationValue": {
          "textClassificationAnnotation": {
            "annotationSpec": {
              "displayName": "news"
            }
          }
        }
      } ]
      }

  • Text sentiment

    • csv line:

      UNASSIGNED,text_url,label_1,sentiment

    • json line:

      {
      "name": "projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id",
      "textPayload": {
        "textContent": "dummy_text_content",
        "textUri": "gs://test_bucket/file.txt",
        "wordCount": 1
      },
      "annotations": [ {
        "name": "projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id/annotations/fake_annotation_id",
        "annotationValue": {
          "textClassificationAnnotation": {
            "annotationSpec": {
              "displayName": "news"
            }
          }
        },
        "annotationSentiment": 1
      } ]
      }

  • Text entity extraction

    For text entity extraction, only jsonl output is provided.

    • json line:
      {
      "name": "projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id",
      "textPayload": {
        "textContent": "dummy_text_content",
        "textUri": "gs://test_bucket/file.txt",
        "wordCount": 1
      },
      "annotations": [ {
        "name": "projects/project_id/datasets/dataset_id/annotatedDatasets/annotated_dataset_id/examples/example_id/annotations/fake_annotation_id",
        "annotationValue": {
          "textEntityExtractionAnnotation": {
            "annotationSpec": {
              "displayName": "equations"
            },
            "textSegment": {
              "startOffset": 10,
              "endOffset": 20
            }
          }
        }
      } ]
      }

ExportData is a long-running operation. The API returns an operation ID, which you can later pass to GetOperation to check the status of the export.

Web UI

Follow these steps to export the labeled data by using the Data Labeling Service UI.

  1. Open the Data Labeling Service UI in the Google Cloud Console.

    The Datasets page shows the status of previously created datasets for the current project.

  2. Click the dataset name of the dataset you want to export. This takes you to the Dataset detail page.

  3. In the Labeled datasets section, click EXPORT in the Export status column.

  4. In the Export labeled dataset dialog, enter the Cloud Storage path to use for the output file, and select the file format that you want.

  5. Click EXPORT.

    The Dataset detail page shows an in-progress status while your data is being exported. Once it is completed, you can find the export file at the Cloud Storage path that you specified.

Command-line

Set the following environment variables:

  1. PROJECT_ID variable to your Google Cloud project ID.
  2. DATASET_ID variable to the ID of your dataset, from the response when you created the dataset. The ID appears at the end of the full dataset name:

    projects/project-id/locations/us-central1/datasets/dataset-id
  3. ANNOTATED_DATASET_ID variable to the ID of your annotated dataset resource name. The resource name is in the following format:

    projects/project-id/locations/us-central1/datasets/dataset-id/annotatedDatasets/annotated-dataset-id
  4. STORAGE_URI variable to the URI of the Cloud Storage bucket where you want the results stored.

For all annotation requests except image segmentation, the curl request looks similar to the following:

# The JSON body is double-quoted (with inner quotes escaped) so that the shell
# expands ${ANNOTATED_DATASET_ID} and ${STORAGE_URI}; inside single quotes they
# would be sent to the API as literal text.
curl -X POST \
   -H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \
   -H "Content-Type: application/json" \
   "https://datalabeling.googleapis.com/v1beta1/projects/${PROJECT_ID}/datasets/${DATASET_ID}:exportData" \
   -d "{
     \"annotatedDataset\": \"${ANNOTATED_DATASET_ID}\",
     \"outputConfig\": {
       \"gcsDestination\": {
           \"output_uri\": \"${STORAGE_URI}\",
           \"mimeType\": \"text/csv\"
       }
     }
   }"

To export image segmentation data, the curl request looks like the following:

# The JSON body is double-quoted (with inner quotes escaped) so that the shell
# expands ${ANNOTATED_DATASET_ID} and ${STORAGE_URI}; inside single quotes they
# would be sent to the API as literal text.
curl -X POST \
   -H "Authorization: Bearer $(gcloud auth application-default print-access-token)" \
   -H "Content-Type: application/json" \
   "https://datalabeling.googleapis.com/v1beta1/projects/${PROJECT_ID}/datasets/${DATASET_ID}:exportData" \
   -d "{
     \"annotatedDataset\": \"${ANNOTATED_DATASET_ID}\",
     \"outputConfig\": {
       \"gcsFolderDestination\": {
         \"output_folder_uri\": \"${STORAGE_URI}\"
       }
     }
   }"

You should see output similar to the following:

{
  "name": "projects/data-labeling-codelab/operations/5c73dd6b_0000_2b34_a920_883d24fa2064",
  "metadata": {
    "@type": "type.googleapis.com/google.cloud.data-labeling.v1beta1.ExportDataOperationResponse",
    "dataset": "projects/data-labeling-codelab/datasets/5c73db3d_0000_23e0_a25b_94eb2c119c4c"
  }
}

Python

Before you can run this code example, you must install the Python Client Libraries.

def export_data(dataset_resource_name, annotated_dataset_resource_name,
                export_gcs_uri):
    """Exports an annotated dataset to Cloud Storage as a CSV file.

    Starts the export, waits for the long-running operation to finish, and
    prints the dataset and output URI reported in the operation result.

    Args:
        dataset_resource_name: Resource name of the dataset to export.
        annotated_dataset_resource_name: Resource name of the annotated
            dataset whose annotations are exported.
        export_gcs_uri: Cloud Storage URI that receives the exported file.
    """
    from google.cloud import datalabeling_v1beta1 as datalabeling
    client = datalabeling.DataLabelingServiceClient()

    gcs_destination = datalabeling.types.GcsDestination(
        output_uri=export_gcs_uri, mime_type='text/csv')

    output_config = datalabeling.types.OutputConfig(
        gcs_destination=gcs_destination)

    # export_data returns a long-running operation, not the final response.
    operation = client.export_data(
        dataset_resource_name,
        annotated_dataset_resource_name,
        output_config
    )

    # result() blocks until the export finishes; fetch it once and reuse it.
    result = operation.result()

    print('Dataset ID: {}\n'.format(result.dataset))
    print('Output config:')
    print('\tGcs destination:')
    print('\t\tOutput URI: {}\n'.format(
        result.output_config.gcs_destination.output_uri))

Java

Before you can run this code example, you must install the Java Client Libraries.
import com.google.api.gax.longrunning.OperationFuture;
import com.google.cloud.datalabeling.v1beta1.DataLabelingServiceClient;
import com.google.cloud.datalabeling.v1beta1.DataLabelingServiceSettings;
import com.google.cloud.datalabeling.v1beta1.ExportDataOperationMetadata;
import com.google.cloud.datalabeling.v1beta1.ExportDataOperationResponse;
import com.google.cloud.datalabeling.v1beta1.ExportDataRequest;
import com.google.cloud.datalabeling.v1beta1.GcsDestination;
import com.google.cloud.datalabeling.v1beta1.LabelStats;
import com.google.cloud.datalabeling.v1beta1.OutputConfig;
import java.io.IOException;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ExecutionException;

class ExportData {

  /**
   * Exports an annotated dataset to Cloud Storage as CSV and prints the export statistics.
   *
   * <p>Waits for the export operation to complete, then prints the exported item count followed
   * by the example count of every label in the returned label statistics. Failures raised while
   * creating the client or waiting for the operation are printed to standard error rather than
   * propagated.
   *
   * @param datasetName resource name of the dataset to export from
   * @param annotatedDatasetName resource name of the annotated dataset whose annotations are
   *     exported
   * @param gcsOutputUri Cloud Storage URI the export is written to
   * @throws IOException if the client settings cannot be built
   */
  static void exportData(String datasetName, String annotatedDatasetName, String gcsOutputUri)
      throws IOException {
    // String datasetName = DataLabelingServiceClient.formatDatasetName(
    //     "YOUR_PROJECT_ID", "YOUR_DATASET_UUID");
    // String annotatedDatasetName = DataLabelingServiceClient.formatAnnotatedDatasetName(
    //     "YOUR_PROJECT_ID",
    //     "YOUR_DATASET_UUID",
    //     "YOUR_ANNOTATED_DATASET_UUID");
    // String gcsOutputUri = "gs://YOUR_BUCKET_ID/export_path";

    DataLabelingServiceSettings settings = DataLabelingServiceSettings
        .newBuilder()
        .build();
    try (DataLabelingServiceClient dataLabelingServiceClient =
             DataLabelingServiceClient.create(settings)) {
      GcsDestination gcsDestination = GcsDestination.newBuilder()
          .setOutputUri(gcsOutputUri)
          .setMimeType("text/csv")
          .build();

      OutputConfig outputConfig = OutputConfig.newBuilder()
          .setGcsDestination(gcsDestination)
          .build();

      ExportDataRequest exportDataRequest = ExportDataRequest.newBuilder()
          .setName(datasetName)
          .setOutputConfig(outputConfig)
          .setAnnotatedDataset(annotatedDatasetName)
          .build();

      OperationFuture<ExportDataOperationResponse, ExportDataOperationMetadata> operation =
          dataLabelingServiceClient.exportDataAsync(exportDataRequest);

      // get() waits for the long-running export to finish before returning the response.
      ExportDataOperationResponse response = operation.get();

      System.out.format("Exported item count: %d\n", response.getExportCount());
      LabelStats labelStats = response.getLabelStats();
      Set<Entry<String, Long>> entries = labelStats.getExampleCountMap().entrySet();
      for (Entry<String, Long> entry : entries) {
        System.out.format("\tLabel: %s\n", entry.getKey());
        System.out.format("\tCount: %d\n\n", entry.getValue());
      }
    } catch (InterruptedException e) {
      // Restore the interrupt flag so code further up the stack can still observe it.
      Thread.currentThread().interrupt();
      e.printStackTrace();
    } catch (IOException | ExecutionException e) {
      e.printStackTrace();
    }
  }
}