Submit a Hadoop FS job
Stay organized with collections
Save and categorize content based on your preferences.
Submits a Hadoop FS job to a Dataproc cluster.
Code sample
Except as otherwise noted, the content of this page is licensed under the Creative Commons Attribution 4.0 License, and code samples are licensed under the Apache 2.0 License. For details, see the Google Developers Site Policies. Java is a registered trademark of Oracle and/or its affiliates.
[[["Easy to understand","easyToUnderstand","thumb-up"],["Solved my problem","solvedMyProblem","thumb-up"],["Other","otherUp","thumb-up"]],[["Hard to understand","hardToUnderstand","thumb-down"],["Incorrect information or sample code","incorrectInformationOrSampleCode","thumb-down"],["Missing the information/samples I need","missingTheInformationSamplesINeed","thumb-down"],["Other","otherDown","thumb-down"]],[],[[["\u003cp\u003eThis code sample demonstrates how to submit a Hadoop FS job to a Google Cloud Dataproc cluster using the Java client library.\u003c/p\u003e\n"],["\u003cp\u003eThe process involves configuring job settings, including cluster placement and the Hadoop FS query to be executed.\u003c/p\u003e\n"],["\u003cp\u003eThe code utilizes the \u003ccode\u003eJobControllerClient\u003c/code\u003e to submit the job asynchronously and retrieve its results, including output from Google Cloud Storage.\u003c/p\u003e\n"],["\u003cp\u003eThe example includes error handling to catch and report issues if the job fails to complete successfully, and provides links for further reference.\u003c/p\u003e\n"],["\u003cp\u003eThe code requires the user to replace example strings for project ID, region, cluster name, and hadoop FS query with the actual values before running.\u003c/p\u003e\n"]]],[],null,["Submits a Hadoop FS job to a Dataproc cluster.\n\nCode sample \n\nJava\n\n\nBefore trying this sample, follow the Java setup instructions in the\n[Dataproc quickstart using\nclient libraries](/dataproc/docs/quickstarts/quickstart-lib).\n\n\nFor more information, see the\n[Dataproc Java API\nreference documentation](/java/docs/reference/google-cloud-dataproc/latest/overview).\n\n\nTo authenticate to Dataproc, set up Application Default Credentials.\nFor more information, see\n\n[Set up authentication for a local development environment](/docs/authentication/set-up-adc-local-dev-environment).\n\n\n import 
com.google.api.gax.longrunning.OperationFuture;\n import com.google.cloud.dataproc.v1.HadoopJob;\n import com.google.cloud.dataproc.v1.Job;\n import com.google.cloud.dataproc.v1.JobControllerClient;\n import com.google.cloud.dataproc.v1.JobControllerSettings;\n import com.google.cloud.dataproc.v1.JobMetadata;\n import com.google.cloud.dataproc.v1.JobPlacement;\n import com.google.cloud.storage.Blob;\n import com.google.cloud.storage.Storage;\n import com.google.cloud.storage.StorageOptions;\n import java.io.IOException;\n import java.util.ArrayList;\n import java.util.Arrays;\n import java.util.concurrent.ExecutionException;\n import java.util.regex.Matcher;\n import java.util.regex.Pattern;\n\n public class SubmitHadoopFsJob {\n\n public static ArrayList\u003cString\u003e stringToList(String s) {\n return new ArrayList\u003c\u003e(Arrays.asList(s.split(\" \")));\n }\n\n public static void submitHadoopFsJob() throws IOException, 
InterruptedException {\n // TODO(developer): Replace these variables before running the sample.\n String projectId = \"your-project-id\";\n String region = \"your-project-region\";\n String clusterName = \"your-cluster-name\";\n String hadoopFsQuery = \"your-hadoop-fs-query\";\n submitHadoopFsJob(projectId, region, clusterName, hadoopFsQuery);\n }\n\n public static void submitHadoopFsJob(\n String projectId, String region, String clusterName, String hadoopFsQuery)\n throws IOException, InterruptedException {\n String myEndpoint = String.format(\"%s-dataproc.googleapis.com:443\", region);\n\n // Configure the settings for the job controller client.\n JobControllerSettings jobControllerSettings =\n JobControllerSettings.newBuilder().setEndpoint(myEndpoint).build();\n\n // Create a job controller client with the configured settings. Using a try-with-resources\n // closes the client,\n // but this can also be done manually with the .close() method.\n try (JobControllerClient jobControllerClient =\n JobControllerClient.create(jobControllerSettings)) {\n\n // Configure cluster placement for the job.\n JobPlacement jobPlacement = JobPlacement.newBuilder().setClusterName(clusterName).build();\n\n // Configure Hadoop job settings. 
The HadoopFS query is set here.\n HadoopJob hadoopJob =\n HadoopJob.newBuilder()\n .setMainClass(\"org.apache.hadoop.fs.FsShell\")\n .addAllArgs(stringToList(hadoopFsQuery))\n .build();\n\n Job job = Job.newBuilder().setPlacement(jobPlacement).setHadoopJob(hadoopJob).build();\n\n // Submit an asynchronous request to execute the job.\n OperationFuture\u003cJob, JobMetadata\u003e submitJobAsOperationAsyncRequest =\n jobControllerClient.submitJobAsOperationAsync(projectId, region, job);\n\n Job response = submitJobAsOperationAsyncRequest.get();\n\n // Print output from Google Cloud Storage.\n Matcher matches =\n Pattern.compile(\"gs://(.*?)/(.*)\").matcher(response.getDriverOutputResourceUri());\n matches.matches();\n\n Storage storage = 
StorageOptions.getDefaultInstance().getService();\n Blob blob = storage.get(matches.group(1), String.format(\"%s.000000000\", matches.group(2)));\n\n System.out.println(\n String.format(\"Job finished successfully: %s\", new String(blob.getContent())));\n\n } catch (ExecutionException e) {\n // If the job does not complete successfully, print the error message.\n System.err.println(String.format(\"submitHadoopFSJob: %s \", e.getMessage()));\n }\n }\n }\n\nWhat's next\n\n\nTo search and filter code samples for other Google Cloud products, see the\n[Google Cloud sample browser](/docs/samples?product=dataproc)."]]