Load data from a CSV file on Cloud Storage into a clustered table.
Code sample
Go
Before trying this sample, follow the Go setup instructions in the BigQuery quickstart using client libraries. For more information, see the BigQuery Go API reference documentation.
To authenticate to BigQuery, set up Application Default Credentials. For more information, see Set up authentication for client libraries.
import (
	"context"
	"fmt"

	"cloud.google.com/go/bigquery"
)

// importClusteredTable demonstrates creating a table from a load job and defining partitioning and clustering
// properties.
func importClusteredTable(projectID, destDatasetID, destTableID string) error {
	// projectID := "my-project-id"
	// destDatasetID := "mydataset"
	// destTableID := "mytable"
	ctx := context.Background()
	client, err := bigquery.NewClient(ctx, projectID)
	if err != nil {
		return fmt.Errorf("bigquery.NewClient: %w", err)
	}
	defer client.Close()

	gcsRef := bigquery.NewGCSReference("gs://cloud-samples-data/bigquery/sample-transactions/transactions.csv")
	gcsRef.SkipLeadingRows = 1
	gcsRef.Schema = bigquery.Schema{
		{Name: "timestamp", Type: bigquery.TimestampFieldType},
		{Name: "origin", Type: bigquery.StringFieldType},
		{Name: "destination", Type: bigquery.StringFieldType},
		{Name: "amount", Type: bigquery.NumericFieldType},
	}
	loader := client.Dataset(destDatasetID).Table(destTableID).LoaderFrom(gcsRef)
	// Partition the destination table by the "timestamp" column.
	loader.TimePartitioning = &bigquery.TimePartitioning{
		Field: "timestamp",
	}
	// Cluster rows within each partition by origin, then destination.
	loader.Clustering = &bigquery.Clustering{
		Fields: []string{"origin", "destination"},
	}
	// WriteEmpty makes the job fail if the destination table already contains data.
	loader.WriteDisposition = bigquery.WriteEmpty

	job, err := loader.Run(ctx)
	if err != nil {
		return err
	}
	status, err := job.Wait(ctx)
	if err != nil {
		return err
	}
	if status.Err() != nil {
		return fmt.Errorf("job completed with error: %w", status.Err())
	}
	return nil
}
Java
Before trying this sample, follow the Java setup instructions in the BigQuery quickstart using client libraries. For more information, see the BigQuery Java API reference documentation.
To authenticate to BigQuery, set up Application Default Credentials. For more information, see Set up authentication for client libraries.
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryException;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.cloud.bigquery.Clustering;
import com.google.cloud.bigquery.Field;
import com.google.cloud.bigquery.FormatOptions;
import com.google.cloud.bigquery.Job;
import com.google.cloud.bigquery.JobInfo;
import com.google.cloud.bigquery.LoadJobConfiguration;
import com.google.cloud.bigquery.Schema;
import com.google.cloud.bigquery.StandardSQLTypeName;
import com.google.cloud.bigquery.TableId;
import com.google.cloud.bigquery.TimePartitioning;
import com.google.common.collect.ImmutableList;
import java.util.List;

// Sample to load a clustered table.
public class LoadTableClustered {

  public static void main(String[] args) {
    // TODO(developer): Replace these variables before running the sample.
    String datasetName = "MY_DATASET_NAME";
    String tableName = "MY_TABLE_NAME";
    String sourceUri = "/path/to/file.csv";
    Schema schema =
        Schema.of(
            Field.of("name", StandardSQLTypeName.STRING),
            Field.of("post_abbr", StandardSQLTypeName.STRING),
            Field.of("date", StandardSQLTypeName.DATE));
    loadTableClustered(
        datasetName, tableName, sourceUri, schema, ImmutableList.of("name", "post_abbr"));
  }

  public static void loadTableClustered(
      String datasetName,
      String tableName,
      String sourceUri,
      Schema schema,
      List<String> clusteringFields) {
    try {
      // Initialize client that will be used to send requests. This client only needs to be created
      // once, and can be reused for multiple requests.
      BigQuery bigquery = BigQueryOptions.getDefaultInstance().getService();

      TableId tableId = TableId.of(datasetName, tableName);

      TimePartitioning partitioning = TimePartitioning.of(TimePartitioning.Type.DAY);

      // Clustering fields must be top-level columns from the table schema.
      // BigQuery supports clustering for both partitioned and non-partitioned tables.
      Clustering clustering = Clustering.newBuilder().setFields(clusteringFields).build();

      LoadJobConfiguration loadJobConfig =
          LoadJobConfiguration.builder(tableId, sourceUri)
              .setFormatOptions(FormatOptions.csv())
              .setSchema(schema)
              .setTimePartitioning(partitioning)
              .setClustering(clustering)
              .build();

      Job loadJob = bigquery.create(JobInfo.newBuilder(loadJobConfig).build());

      // Load data from the source CSV file into the table.
      // Blocks until this load table job completes its execution, either failing or succeeding.
      Job job = loadJob.waitFor();
      // Check for errors
      if (job.isDone() && job.getStatus().getError() == null) {
        System.out.println("Data successfully loaded into clustered table during load job");
      } else {
        System.out.println(
            "BigQuery was unable to load into the table due to an error:"
                + job.getStatus().getError());
      }
    } catch (BigQueryException | InterruptedException e) {
      System.out.println("Data not loaded into clustered table during load job \n" + e.toString());
    }
  }
}
Node.js
Before trying this sample, follow the Node.js setup instructions in the BigQuery quickstart using client libraries. For more information, see the BigQuery Node.js API reference documentation.
To authenticate to BigQuery, set up Application Default Credentials. For more information, see Set up authentication for client libraries.
// Import the Google Cloud client library
const {BigQuery} = require('@google-cloud/bigquery');
const {Storage} = require('@google-cloud/storage');

// Instantiate clients
const bigquery = new BigQuery();
const storage = new Storage();

/**
 * This sample loads the CSV file at
 * https://storage.googleapis.com/cloud-samples-data/bigquery/sample-transactions/transactions.csv
 *
 * TODO(developer): Replace the following lines with the path to your file.
 */
const bucketName = 'cloud-samples-data';
const filename = 'bigquery/sample-transactions/transactions.csv';

async function loadTableClustered() {
  // Loads a new clustered table named "my_table" in "my_dataset".

  /**
   * TODO(developer): Uncomment the following lines before running the sample.
   */
  // const datasetId = "my_dataset";
  // const tableId = "my_table";

  const metadata = {
    sourceFormat: 'CSV',
    skipLeadingRows: 1,
    schema: {
      fields: [
        {name: 'timestamp', type: 'TIMESTAMP'},
        {name: 'origin', type: 'STRING'},
        {name: 'destination', type: 'STRING'},
        {name: 'amount', type: 'NUMERIC'},
      ],
    },
    clustering: {
      fields: ['origin', 'destination'],
    },
  };

  // Load data from a Google Cloud Storage file into the table
  const [job] = await bigquery
    .dataset(datasetId)
    .table(tableId)
    .load(storage.bucket(bucketName).file(filename), metadata);

  // load() waits for the job to finish
  console.log(`Job ${job.id} completed.`);
}
loadTableClustered();
Python
Before trying this sample, follow the Python setup instructions in the BigQuery quickstart using client libraries. For more information, see the BigQuery Python API reference documentation.
To authenticate to BigQuery, set up Application Default Credentials. For more information, see Set up authentication for client libraries.
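If you want to confirm which Application Default Credentials the client libraries will pick up, you can resolve them directly with the google-auth package. This is an optional minimal sketch, not part of the official sample; it assumes google-auth is installed (it is a dependency of google-cloud-bigquery).
import google.auth

# Resolve Application Default Credentials the same way the client libraries do.
# (Optional check; not part of the sample below.)
credentials, project = google.auth.default()
print("Resolved ADC for project:", project)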
from google.cloud import bigquery

# Construct a BigQuery client object.
client = bigquery.Client()

# TODO(developer): Set table_id to the ID of the table to create.
# table_id = "your-project.your_dataset.your_table_name"

job_config = bigquery.LoadJobConfig(
    skip_leading_rows=1,
    source_format=bigquery.SourceFormat.CSV,
    schema=[
        bigquery.SchemaField("timestamp", bigquery.SqlTypeNames.TIMESTAMP),
        bigquery.SchemaField("origin", bigquery.SqlTypeNames.STRING),
        bigquery.SchemaField("destination", bigquery.SqlTypeNames.STRING),
        bigquery.SchemaField("amount", bigquery.SqlTypeNames.NUMERIC),
    ],
    time_partitioning=bigquery.TimePartitioning(field="timestamp"),
    clustering_fields=["origin", "destination"],
)

job = client.load_table_from_uri(
    ["gs://cloud-samples-data/bigquery/sample-transactions/transactions.csv"],
    table_id,
    job_config=job_config,
)

job.result()  # Waits for the job to complete.

table = client.get_table(table_id)  # Make an API request.
print(
    "Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_id
    )
)
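After the load job finishes, you can verify that the partitioning and clustering settings were applied by reading the table metadata back. This is a minimal sketch, assuming `client` and `table_id` from the sample above; `time_partitioning` and `clustering_fields` are standard properties of the returned Table object.
# Verify partitioning and clustering on the loaded table
# (assumes `client` and `table_id` from the sample above).
table = client.get_table(table_id)
print("Partitioned on field: {}".format(table.time_partitioning.field))
print("Clustered by: {}".format(table.clustering_fields))
For the sample data, this should report the timestamp partitioning field and the ['origin', 'destination'] clustering fields configured in the load job.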
What's next
To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.