Clustered table

Load data from a CSV file on Cloud Storage to a clustered table.

Documentation pages that include this code sample

To view the code sample used in context, see the following documentation:

Code Sample

Go

Before trying this sample, follow the Go setup instructions in the BigQuery Quickstart Using Client Libraries. For more information, see the BigQuery Go API reference documentation.

import (
	"context"
	"fmt"

	"cloud.google.com/go/bigquery"
)

// importClusteredTable demonstrates creating a table from a load job and defining partitioning and clustering
// properties.
func importClusteredTable(projectID, destDatasetID, destTableID string) error {
	// projectID := "my-project-id"
	// datasetID := "mydataset"
	// tableID := "mytable"
	ctx := context.Background()
	client, err := bigquery.NewClient(ctx, projectID)
	if err != nil {
		return fmt.Errorf("bigquery.NewClient: %v", err)
	}
	defer client.Close()

	gcsRef := bigquery.NewGCSReference("gs://cloud-samples-data/bigquery/sample-transactions/transactions.csv")
	gcsRef.SkipLeadingRows = 1
	gcsRef.Schema = bigquery.Schema{
		{Name: "timestamp", Type: bigquery.TimestampFieldType},
		{Name: "origin", Type: bigquery.StringFieldType},
		{Name: "destination", Type: bigquery.StringFieldType},
		{Name: "amount", Type: bigquery.NumericFieldType},
	}
	loader := client.Dataset(destDatasetID).Table(destTableID).LoaderFrom(gcsRef)
	loader.TimePartitioning = &bigquery.TimePartitioning{
		Field: "timestamp",
	}
	loader.Clustering = &bigquery.Clustering{
		Fields: []string{"origin", "destination"},
	}
	loader.WriteDisposition = bigquery.WriteEmpty

	job, err := loader.Run(ctx)
	if err != nil {
		return err
	}
	status, err := job.Wait(ctx)
	if err != nil {
		return err
	}

	if status.Err() != nil {
		return fmt.Errorf("job completed with error: %v", status.Err())
	}
	return nil
}

Java

Before trying this sample, follow the Java setup instructions in the BigQuery Quickstart Using Client Libraries. For more information, see the BigQuery Java API reference documentation.

import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryException;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.cloud.bigquery.Clustering;
import com.google.cloud.bigquery.Field;
import com.google.cloud.bigquery.FormatOptions;
import com.google.cloud.bigquery.Job;
import com.google.cloud.bigquery.JobInfo;
import com.google.cloud.bigquery.LoadJobConfiguration;
import com.google.cloud.bigquery.Schema;
import com.google.cloud.bigquery.StandardSQLTypeName;
import com.google.cloud.bigquery.TableId;
import com.google.cloud.bigquery.TimePartitioning;
import com.google.common.collect.ImmutableList;
import java.util.List;

// Sample to load clustered table.
public class LoadTableClustered {

  public static void main(String[] args) {
    // TODO(developer): Replace these variables before running the sample.
    String datasetName = "MY_DATASET_NAME";
    String tableName = "MY_TABLE_NAME";
    String sourceUri = "/path/to/file.csv";
    Schema schema =
        Schema.of(
            Field.of("name", StandardSQLTypeName.STRING),
            Field.of("post_abbr", StandardSQLTypeName.STRING),
            Field.of("date", StandardSQLTypeName.DATE));
    loadTableClustered(
        datasetName, tableName, sourceUri, schema, ImmutableList.of("name", "post_abbr"));
  }

  public static void loadTableClustered(
      String datasetName,
      String tableName,
      String sourceUri,
      Schema schema,
      List<String> clusteringFields) {
    try {
      // Initialize client that will be used to send requests. This client only needs to be created
      // once, and can be reused for multiple requests.
      BigQuery bigquery = BigQueryOptions.getDefaultInstance().getService();

      TableId tableId = TableId.of(datasetName, tableName);

      TimePartitioning partitioning = TimePartitioning.of(TimePartitioning.Type.DAY);
      // Clustering fields will be consisted of fields mentioned in the schema.
      // BigQuery supports clustering for both partitioned and non-partitioned tables.
      Clustering clustering = Clustering.newBuilder().setFields(clusteringFields).build();

      LoadJobConfiguration loadJobConfig =
          LoadJobConfiguration.builder(tableId, sourceUri)
              .setFormatOptions(FormatOptions.csv())
              .setSchema(schema)
              .setTimePartitioning(partitioning)
              .setClustering(clustering)
              .build();

      Job loadJob = bigquery.create(JobInfo.newBuilder(loadJobConfig).build());

      // Load data from a GCS parquet file into the table
      // Blocks until this load table job completes its execution, either failing or succeeding.
      Job job = loadJob.waitFor();

      // Check for errors
      if (job.isDone() && job.getStatus().getError() == null) {
        System.out.println("Data successfully loaded into clustered table during load job");
      } else {
        System.out.println(
            "BigQuery was unable to load into the table due to an error:"
                + job.getStatus().getError());
      }
    } catch (BigQueryException | InterruptedException e) {
      System.out.println("Data not loaded into clustered table during load job \n" + e.toString());
    }
  }
}

Python

Before trying this sample, follow the Python setup instructions in the BigQuery Quickstart Using Client Libraries. For more information, see the BigQuery Python API reference documentation.

from google.cloud import bigquery

# Construct a BigQuery client object.
client = bigquery.Client()

# TODO(developer): Set table_id to the ID of the table to create.
# table_id = "your-project.your_dataset.your_table_name"

job_config = bigquery.LoadJobConfig(
    skip_leading_rows=1,
    source_format=bigquery.SourceFormat.CSV,
    schema=[
        bigquery.SchemaField("timestamp", bigquery.SqlTypeNames.TIMESTAMP),
        bigquery.SchemaField("origin", bigquery.SqlTypeNames.STRING),
        bigquery.SchemaField("destination", bigquery.SqlTypeNames.STRING),
        bigquery.SchemaField("amount", bigquery.SqlTypeNames.NUMERIC),
    ],
    time_partitioning=bigquery.TimePartitioning(field="timestamp"),
    clustering_fields=["origin", "destination"],
)

job = client.load_table_from_uri(
    ["gs://cloud-samples-data/bigquery/sample-transactions/transactions.csv"],
    table_id,
    job_config=job_config,
)

job.result()  # Waits for the job to complete.

table = client.get_table(table_id)  # Make an API request.
print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id
    )
)