End-to-end workflow.

Explore further

For detailed documentation that includes this code sample, see the following:

Code sample


Before trying this sample, follow the Go setup instructions in the Dataproc quickstart using client libraries. For more information, see the Dataproc Go API reference documentation.

// This quickstart shows how you can use the Dataproc Client library to create a
// Dataproc cluster, submit a PySpark job to the cluster, wait for the job to finish
// and finally delete the cluster.
// Usage:
//     go build
//     ./quickstart --project_id <PROJECT_ID> --region <REGION> \
//         --cluster_name <CLUSTER_NAME> --job_file_path <GCS_JOB_FILE_PATH>
package main

import (
	"context"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"regexp"

	dataproc "cloud.google.com/go/dataproc/apiv1"
	"cloud.google.com/go/storage"
	"google.golang.org/api/option"
	dataprocpb "google.golang.org/genproto/googleapis/cloud/dataproc/v1"
)

func main() {
	var projectID, clusterName, region, jobFilePath string
	flag.StringVar(&projectID, "project_id", "", "Cloud Project ID, used for creating resources.")
	flag.StringVar(&region, "region", "", "Region that resources should be created in.")
	flag.StringVar(&clusterName, "cluster_name", "", "Name of Cloud Dataproc cluster to create.")
	flag.StringVar(&jobFilePath, "job_file_path", "", "Path to job file in GCS.")

	ctx := context.Background()

	// Create the cluster client.
	endpoint := fmt.Sprintf("%s-dataproc.googleapis.com:443", region)
	clusterClient, err := dataproc.NewClusterControllerClient(ctx, option.WithEndpoint(endpoint))
	if err != nil {
		log.Fatalf("error creating the cluster client: %s\n", err)

	// Create the cluster config.
	createReq := &dataprocpb.CreateClusterRequest{
		ProjectId: projectID,
		Region:    region,
		Cluster: &dataprocpb.Cluster{
			ProjectId:   projectID,
			ClusterName: clusterName,
			Config: &dataprocpb.ClusterConfig{
				MasterConfig: &dataprocpb.InstanceGroupConfig{
					NumInstances:   1,
					MachineTypeUri: "n1-standard-2",
				WorkerConfig: &dataprocpb.InstanceGroupConfig{
					NumInstances:   2,
					MachineTypeUri: "n1-standard-2",

	// Create the cluster.
	createOp, err := clusterClient.CreateCluster(ctx, createReq)
	if err != nil {
		log.Fatalf("error submitting the cluster creation request: %v\n", err)

	createResp, err := createOp.Wait(ctx)
	if err != nil {
		log.Fatalf("error creating the cluster: %v\n", err)

	// Defer cluster deletion.
	defer func() {
		dReq := &dataprocpb.DeleteClusterRequest{
			ProjectId:   projectID,
			Region:      region,
			ClusterName: clusterName,
		deleteOp, err := clusterClient.DeleteCluster(ctx, dReq)
		if err != nil {
			fmt.Printf("error deleting cluster %q: %v\n", clusterName, err)
		fmt.Printf("Cluster %q successfully deleted\n", clusterName)

	// Output a success message.
	fmt.Printf("Cluster created successfully: %q\n", createResp.ClusterName)

	// Create the job client.
	jobClient, err := dataproc.NewJobControllerClient(ctx, option.WithEndpoint(endpoint))

	// Create the job config.
	submitJobReq := &dataprocpb.SubmitJobRequest{
		ProjectId: projectID,
		Region:    region,
		Job: &dataprocpb.Job{
			Placement: &dataprocpb.JobPlacement{
				ClusterName: clusterName,
			TypeJob: &dataprocpb.Job_PysparkJob{
				PysparkJob: &dataprocpb.PySparkJob{
					MainPythonFileUri: jobFilePath,

	submitJobOp, err := jobClient.SubmitJobAsOperation(ctx, submitJobReq)
	if err != nil {
		fmt.Printf("error with request to submitting job: %v\n", err)

	submitJobResp, err := submitJobOp.Wait(ctx)
	if err != nil {
		fmt.Printf("error submitting job: %v\n", err)

	re := regexp.MustCompile("gs://(.+?)/(.+)")
	matches := re.FindStringSubmatch(submitJobResp.DriverOutputResourceUri)

	if len(matches) < 3 {
		fmt.Printf("regex error: %s\n", submitJobResp.DriverOutputResourceUri)

	// Dataproc job outget gets saved to a GCS bucket allocated to it.
	storageClient, err := storage.NewClient(ctx)
	if err != nil {
		fmt.Printf("error creating storage client: %v\n", err)

	obj := fmt.Sprintf("%s.000000000", matches[2])
	reader, err := storageClient.Bucket(matches[1]).Object(obj).NewReader(ctx)
	if err != nil {
		fmt.Printf("error reading job output: %v\n", err)

	defer reader.Close()

	body, err := ioutil.ReadAll(reader)
	if err != nil {
		fmt.Printf("could not read output from Dataproc Job: %v\n", err)

	fmt.Printf("Job finished successfully: %s", body)


Before trying this sample, follow the Java setup instructions in the Dataproc quickstart using client libraries. For more information, see the Dataproc Java API reference documentation.

/* This quickstart sample walks a user through creating a Cloud Dataproc
 * cluster, submitting a PySpark job from Google Cloud Storage to the
 * cluster, reading the output of the job and deleting the cluster, all
 * using the Java client library.
 * Usage:
 *     mvn clean package -DskipTests
 *     mvn exec:java -Dexec.args="<PROJECT_ID> <REGION> <CLUSTER_NAME> <GCS_JOB_FILE_PATH>"
 *     You can also set these arguments in the main function instead of providing them via the CLI.
 */

import com.google.api.gax.longrunning.OperationFuture;
import com.google.cloud.dataproc.v1.Cluster;
import com.google.cloud.dataproc.v1.ClusterConfig;
import com.google.cloud.dataproc.v1.ClusterControllerClient;
import com.google.cloud.dataproc.v1.ClusterControllerSettings;
import com.google.cloud.dataproc.v1.ClusterOperationMetadata;
import com.google.cloud.dataproc.v1.InstanceGroupConfig;
import com.google.cloud.dataproc.v1.Job;
import com.google.cloud.dataproc.v1.JobControllerClient;
import com.google.cloud.dataproc.v1.JobControllerSettings;
import com.google.cloud.dataproc.v1.JobMetadata;
import com.google.cloud.dataproc.v1.JobPlacement;
import com.google.cloud.dataproc.v1.PySparkJob;
import com.google.cloud.storage.Blob;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageOptions;
import com.google.protobuf.Empty;
import java.io.IOException;
import java.util.concurrent.ExecutionException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Quickstart {

  public static void quickstart(
      String projectId, String region, String clusterName, String jobFilePath)
      throws IOException, InterruptedException {
    String myEndpoint = String.format("%s-dataproc.googleapis.com:443", region);

    // Configure the settings for the cluster controller client.
    ClusterControllerSettings clusterControllerSettings =

    // Configure the settings for the job controller client.
    JobControllerSettings jobControllerSettings =

    // Create both a cluster controller client and job controller client with the
    // configured settings. The client only needs to be created once and can be reused for
    // multiple requests. Using a try-with-resources closes the client, but this can also be done
    // manually with the .close() method.
    try (ClusterControllerClient clusterControllerClient =
        JobControllerClient jobControllerClient =
            JobControllerClient.create(jobControllerSettings)) {
      // Configure the settings for our cluster.
      InstanceGroupConfig masterConfig =
      InstanceGroupConfig workerConfig =
      ClusterConfig clusterConfig =
      // Create the cluster object with the desired cluster config.
      Cluster cluster =

      // Create the Cloud Dataproc cluster.
      OperationFuture<Cluster, ClusterOperationMetadata> createClusterAsyncRequest =
          clusterControllerClient.createClusterAsync(projectId, region, cluster);
      Cluster clusterResponse = createClusterAsyncRequest.get();
          String.format("Cluster created successfully: %s", clusterResponse.getClusterName()));

      // Configure the settings for our job.
      JobPlacement jobPlacement = JobPlacement.newBuilder().setClusterName(clusterName).build();
      PySparkJob pySparkJob = PySparkJob.newBuilder().setMainPythonFileUri(jobFilePath).build();
      Job job = Job.newBuilder().setPlacement(jobPlacement).setPysparkJob(pySparkJob).build();

      // Submit an asynchronous request to execute the job.
      OperationFuture<Job, JobMetadata> submitJobAsOperationAsyncRequest =
          jobControllerClient.submitJobAsOperationAsync(projectId, region, job);
      Job jobResponse = submitJobAsOperationAsyncRequest.get();

      // Print output from Google Cloud Storage.
      Matcher matches =

      Storage storage = StorageOptions.getDefaultInstance().getService();
      Blob blob = storage.get(matches.group(1), String.format("%s.000000000", matches.group(2)));

          String.format("Job finished successfully: %s", new String(blob.getContent())));

      // Delete the cluster.
      OperationFuture<Empty, ClusterOperationMetadata> deleteClusterAsyncRequest =
          clusterControllerClient.deleteClusterAsync(projectId, region, clusterName);
      System.out.println(String.format("Cluster \"%s\" successfully deleted.", clusterName));

    } catch (ExecutionException e) {
      System.err.println(String.format("quickstart: %s ", e.getMessage()));

  public static void main(String... args) throws IOException, InterruptedException {
    if (args.length != 4) {
          "Insufficient number of parameters provided. Please make sure a "
              + "PROJECT_ID, REGION, CLUSTER_NAME and JOB_FILE_PATH are provided, in this order.");

    String projectId = args[0]; // project-id of project to create the cluster in
    String region = args[1]; // region to create the cluster
    String clusterName = args[2]; // name of the cluster
    String jobFilePath = args[3]; // location in GCS of the PySpark job

    quickstart(projectId, region, clusterName, jobFilePath);


Before trying this sample, follow the Node.js setup instructions in the Dataproc quickstart using client libraries. For more information, see the Dataproc Node.js API reference documentation.

// This quickstart sample walks a user through creating a Dataproc
// cluster, submitting a PySpark job from Google Cloud Storage to the
// cluster, reading the output of the job and deleting the cluster, all
// using the Node.js client library.

'use strict';

/**
 * Creates a Dataproc cluster, runs a PySpark job on it, prints the job's
 * driver output from Cloud Storage, and deletes the cluster.
 *
 * @param {string} projectId Project in which the cluster is created.
 * @param {string} region Dataproc region; also selects the regional endpoint.
 * @param {string} clusterName Name of the cluster to create and delete.
 * @param {string} jobFilePath Cloud Storage URI of the PySpark main file.
 */
function main(projectId, region, clusterName, jobFilePath) {
  const dataproc = require('@google-cloud/dataproc');
  const {Storage} = require('@google-cloud/storage');

  // Create a cluster client with the endpoint set to the desired cluster region
  const clusterClient = new dataproc.v1.ClusterControllerClient({
    apiEndpoint: `${region}-dataproc.googleapis.com`,
    projectId: projectId,
  });

  // Create a job client with the endpoint set to the desired cluster region
  const jobClient = new dataproc.v1.JobControllerClient({
    apiEndpoint: `${region}-dataproc.googleapis.com`,
    projectId: projectId,
  });

  async function quickstart() {
    // Create the cluster config
    const cluster = {
      projectId: projectId,
      region: region,
      cluster: {
        clusterName: clusterName,
        config: {
          masterConfig: {
            numInstances: 1,
            machineTypeUri: 'n1-standard-2',
          },
          workerConfig: {
            numInstances: 2,
            machineTypeUri: 'n1-standard-2',
          },
        },
      },
    };

    // Create the cluster
    const [operation] = await clusterClient.createCluster(cluster);
    const [response] = await operation.promise();

    // Output a success message
    console.log(`Cluster created successfully: ${response.clusterName}`);

    try {
      const job = {
        projectId: projectId,
        region: region,
        job: {
          placement: {
            clusterName: clusterName,
          },
          pysparkJob: {
            mainPythonFileUri: jobFilePath,
          },
        },
      };

      const [jobOperation] = await jobClient.submitJobAsOperation(job);
      const [jobResponse] = await jobOperation.promise();

      // matches[1] is the bucket, matches[2] the driver output object prefix.
      const matches =
        jobResponse.driverOutputResourceUri.match('gs://(.*?)/(.*)');
      if (!matches) {
        throw new Error(
          `Unexpected driver output URI: ${jobResponse.driverOutputResourceUri}`
        );
      }

      const storage = new Storage();

      // download() resolves to a one-element array holding the contents.
      const [output] = await storage
        .bucket(matches[1])
        .file(`${matches[2]}.000000000`)
        .download();

      // Output a success message.
      console.log(`Job finished successfully: ${output}`);
    } finally {
      // Delete the cluster once the job has terminated, even if it failed.
      const deleteClusterReq = {
        projectId: projectId,
        region: region,
        clusterName: clusterName,
      };

      const [deleteOperation] = await clusterClient.deleteCluster(
        deleteClusterReq
      );
      await deleteOperation.promise();

      // Output a success message
      console.log(`Cluster ${clusterName} successfully deleted.`);
    }
  }

  // Surface failures instead of leaving an unhandled promise rejection.
  quickstart().catch(err => {
    console.error(err);
    process.exitCode = 1;
  });
}

// Command-line arguments after the node binary and script path.
const args = process.argv.slice(2);

if (args.length !== 4) {
  // Report the usage error and do not start the workflow with missing values.
  console.log(
    'Insufficient number of parameters provided. Please make sure a ' +
      'PROJECT_ID, REGION, CLUSTER_NAME and JOB_FILE_PATH are provided, in this order.'
  );
  process.exitCode = 1;
} else {
  main(...args);
}


Before trying this sample, follow the Python setup instructions in the Dataproc quickstart using client libraries. For more information, see the Dataproc Python API reference documentation.

"""
Command-line program to create a Dataproc cluster,
run a PySpark job located in Cloud Storage on the cluster,
then delete the cluster after the job completes.

Usage:
    python submit_job_to_cluster --project_id <PROJECT_ID> --region <REGION> \
        --cluster_name <CLUSTER_NAME> --gcs_bucket <GCS_BUCKET> \
        [--pyspark_file <PYSPARK_FILE>]
"""

import argparse
import os
import re

from google.cloud import dataproc_v1
from google.cloud import storage

DEFAULT_FILENAME = "pyspark_sort.py"
waiting_callback = False

def get_pyspark_file(pyspark_file=None):
    """Opens the PySpark job file for binary reading.

    Args:
        pyspark_file: Optional path to a PySpark script. When falsy, the
            DEFAULT_FILENAME script located next to this module is used.

    Returns:
        A ``(file_object, base_filename)`` tuple. The file is opened in
        ``"rb"`` mode and the caller is responsible for closing it.
    """
    if pyspark_file:
        return open(pyspark_file, "rb"), os.path.basename(pyspark_file)

    # No path given: fall back to the default script in this file's directory.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    return open(os.path.join(current_dir, DEFAULT_FILENAME), "rb"), DEFAULT_FILENAME

def get_region_from_zone(zone):
    """Derives a region name from a zone name.

    The zone is split on ``-`` and the last component is dropped, so
    ``"us-central1-a"`` becomes ``"us-central1"``. A string without any dash
    yields an empty string.

    Args:
        zone: Zone name as a string.

    Returns:
        The zone with its final dash-separated component removed.

    Raises:
        ValueError: If ``zone`` cannot be split (for example, it is not a
            string).
    """
    try:
        region_as_list = zone.split("-")[:-1]
        return "-".join(region_as_list)
    except (AttributeError, IndexError, ValueError):
        raise ValueError("Invalid zone provided, please check your input.")

def upload_pyspark_file(project, bucket_name, filename, spark_file):
    """Uploads the PySpark file to the configured input bucket.

    Args:
        project: Project used to create the Cloud Storage client.
        bucket_name: Name of the existing bucket to upload into.
        filename: Object name to give the uploaded file.
        spark_file: Open binary file object whose contents are uploaded.
    """
    print("Uploading pyspark file to Cloud Storage.")
    client = storage.Client(project=project)
    bucket = client.get_bucket(bucket_name)
    blob = bucket.blob(filename)
    # Creating the blob handle alone sends nothing; this performs the upload.
    blob.upload_from_file(spark_file)

def download_output(project, cluster_id, output_bucket, job_id):
    """Downloads the job's driver output from Cloud Storage.

    Args:
        project: Project used to create the Cloud Storage client.
        cluster_id: Cluster identifier used in the output object path.
        output_bucket: Name of the bucket holding the job output.
        job_id: Job identifier used in the output object path.

    Returns:
        The contents of the ``driveroutput.000000000`` object, as returned by
        ``download_as_string()``.
    """
    print("Downloading output file.")
    client = storage.Client(project=project)
    bucket = client.get_bucket(output_bucket)
    output_blob = "google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000".format(
        cluster_id, job_id
    )
    return bucket.blob(output_blob).download_as_string()

def quickstart(project_id, region, cluster_name, gcs_bucket, pyspark_file):
    """Creates a Dataproc cluster, runs a PySpark job on it, then deletes it.

    Args:
        project_id: Project in which the cluster is created.
        region: Dataproc region; also used to build the regional endpoint.
        cluster_name: Name of the cluster to create and delete.
        gcs_bucket: Bucket the PySpark file is uploaded to before submission.
        pyspark_file: Path to the PySpark script, or None for the default.

    Raises:
        ValueError: If the job's driver output URI is not a ``gs://`` URI.
    """
    api_endpoint = "{}-dataproc.googleapis.com:443".format(region)

    # Create the cluster client.
    cluster_client = dataproc_v1.ClusterControllerClient(
        client_options={"api_endpoint": api_endpoint}
    )

    # Create the cluster config.
    cluster = {
        "project_id": project_id,
        "cluster_name": cluster_name,
        "config": {
            "master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-2"},
            "worker_config": {"num_instances": 2, "machine_type_uri": "n1-standard-2"},
        },
    }

    # Create the cluster.
    operation = cluster_client.create_cluster(
        request={"project_id": project_id, "region": region, "cluster": cluster}
    )
    result = operation.result()

    print("Cluster created successfully: {}".format(result.cluster_name))

    try:
        spark_file, spark_filename = get_pyspark_file(pyspark_file)
        try:
            upload_pyspark_file(project_id, gcs_bucket, spark_filename, spark_file)
        finally:
            # The handle is only needed for the upload.
            spark_file.close()

        # Create the job client.
        job_client = dataproc_v1.JobControllerClient(
            client_options={"api_endpoint": api_endpoint}
        )

        # Create the job config.
        job = {
            "placement": {"cluster_name": cluster_name},
            "pyspark_job": {
                "main_python_file_uri": "gs://{}/{}".format(gcs_bucket, spark_filename)
            },
        }

        operation = job_client.submit_job_as_operation(
            request={"project_id": project_id, "region": region, "job": job}
        )
        response = operation.result()

        # Dataproc job output is saved to the Cloud Storage bucket
        # allocated to the job. Use regex to obtain the bucket and blob info.
        matches = re.match("gs://(.*?)/(.*)", response.driver_output_resource_uri)
        if matches is None:
            raise ValueError(
                "Unexpected driver output URI: {}".format(
                    response.driver_output_resource_uri
                )
            )

        output = (
            storage.Client()
            .get_bucket(matches.group(1))
            .blob(f"{matches.group(2)}.000000000")
            .download_as_bytes()
            .decode("utf-8")
        )

        print(f"Job finished successfully: {output}\r\n")
    finally:
        # Delete the cluster once the job has terminated, including on failure,
        # and wait for the deletion to complete before reporting it.
        operation = cluster_client.delete_cluster(
            request={
                "project_id": project_id,
                "region": region,
                "cluster_name": cluster_name,
            }
        )
        operation.result()

        print("Cluster {} successfully deleted.".format(cluster_name))

# Script entry point: parse the command-line flags and run the workflow.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--project_id",
        type=str,
        required=True,
        help="Project to use for creating resources.",
    )
    parser.add_argument(
        "--region",
        type=str,
        required=True,
        help="Region where the resources should live.",
    )
    parser.add_argument(
        "--cluster_name",
        type=str,
        required=True,
        help="Name to use for creating a cluster.",
    )

    parser.add_argument(
        "--gcs_bucket", help="Bucket to upload Pyspark file to", required=True
    )

    # Optional: quickstart receives None when this flag is omitted.
    parser.add_argument(
        "--pyspark_file", help="Pyspark filename. Defaults to pyspark_sort.py"
    )

    args = parser.parse_args()
    quickstart(args.project_id, args.region, args.cluster_name, args.gcs_bucket, args.pyspark_file)

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.