DataprocWorkflowTemplate

Property Value
Google Cloud Service Name Dataproc
Google Cloud Service Documentation /dataproc/docs/
Google Cloud REST Resource Name v1.projects.locations.workflowTemplates
Google Cloud REST Resource Documentation /dataproc/docs/reference/rest/v1/projects.locations.workflowTemplates
Config Connector Resource Short Names gcpdataprocworkflowtemplate, gcpdataprocworkflowtemplates, dataprocworkflowtemplate
Config Connector Service Name dataproc.googleapis.com
Config Connector Resource Fully Qualified Name dataprocworkflowtemplates.dataproc.cnrm.cloud.google.com
Can Be Referenced by IAMPolicy/IAMPolicyMember No

Custom Resource Definition Properties

Spec

Schema

dagTimeout: string
jobs:
- hadoopJob:
    archiveUris:
    - string
    args:
    - string
    fileUris:
    - string
    jarFileUris:
    - string
    loggingConfig:
      driverLogLevels:
        string: string
    mainClass: string
    mainJarFileUri: string
    properties:
      string: string
  hiveJob:
    continueOnFailure: boolean
    jarFileUris:
    - string
    properties:
      string: string
    queryFileUri: string
    queryList:
      queries:
      - string
    scriptVariables:
      string: string
  labels:
    string: string
  pigJob:
    continueOnFailure: boolean
    jarFileUris:
    - string
    loggingConfig:
      driverLogLevels:
        string: string
    properties:
      string: string
    queryFileUri: string
    queryList:
      queries:
      - string
    scriptVariables:
      string: string
  prerequisiteStepIds:
  - string
  prestoJob:
    clientTags:
    - string
    continueOnFailure: boolean
    loggingConfig:
      driverLogLevels:
        string: string
    outputFormat: string
    properties:
      string: string
    queryFileUri: string
    queryList:
      queries:
      - string
  pysparkJob:
    archiveUris:
    - string
    args:
    - string
    fileUris:
    - string
    jarFileUris:
    - string
    loggingConfig:
      driverLogLevels:
        string: string
    mainPythonFileUri: string
    properties:
      string: string
    pythonFileUris:
    - string
  scheduling:
    maxFailuresPerHour: integer
    maxFailuresTotal: integer
  sparkJob:
    archiveUris:
    - string
    args:
    - string
    fileUris:
    - string
    jarFileUris:
    - string
    loggingConfig:
      driverLogLevels:
        string: string
    mainClass: string
    mainJarFileUri: string
    properties:
      string: string
  sparkRJob:
    archiveUris:
    - string
    args:
    - string
    fileUris:
    - string
    loggingConfig:
      driverLogLevels:
        string: string
    mainRFileUri: string
    properties:
      string: string
  sparkSqlJob:
    jarFileUris:
    - string
    loggingConfig:
      driverLogLevels:
        string: string
    properties:
      string: string
    queryFileUri: string
    queryList:
      queries:
      - string
    scriptVariables:
      string: string
  stepId: string
location: string
parameters:
- description: string
  fields:
  - string
  name: string
  validation:
    regex:
      regexes:
      - string
    values:
      values:
      - string
placement:
  clusterSelector:
    clusterLabels:
      string: string
    zone: string
  managedCluster:
    clusterName: string
    config:
      autoscalingConfig:
        policyRef:
          external: string
          name: string
          namespace: string
      encryptionConfig:
        gcePdKmsKeyRef:
          external: string
          name: string
          namespace: string
      endpointConfig:
        enableHttpPortAccess: boolean
      gceClusterConfig:
        internalIPOnly: boolean
        metadata:
          string: string
        networkRef:
          external: string
          name: string
          namespace: string
        nodeGroupAffinity:
          nodeGroupRef:
            external: string
            name: string
            namespace: string
        privateIPv6GoogleAccess: string
        reservationAffinity:
          consumeReservationType: string
          key: string
          values:
          - string
        serviceAccountRef:
          external: string
          name: string
          namespace: string
        serviceAccountScopes:
        - string
        subnetworkRef:
          external: string
          name: string
          namespace: string
        tags:
        - string
        zone: string
      initializationActions:
      - executableFile: string
        executionTimeout: string
      lifecycleConfig:
        autoDeleteTime: string
        autoDeleteTtl: string
        idleDeleteTtl: string
      masterConfig:
        accelerators:
        - acceleratorCount: integer
          acceleratorType: string
        diskConfig:
          bootDiskSizeGb: integer
          bootDiskType: string
          numLocalSsds: integer
        imageRef:
          external: string
          name: string
          namespace: string
        machineType: string
        minCpuPlatform: string
        numInstances: integer
        preemptibility: string
      secondaryWorkerConfig:
        accelerators:
        - acceleratorCount: integer
          acceleratorType: string
        diskConfig:
          bootDiskSizeGb: integer
          bootDiskType: string
          numLocalSsds: integer
        imageRef:
          external: string
          name: string
          namespace: string
        machineType: string
        minCpuPlatform: string
        numInstances: integer
        preemptibility: string
      securityConfig:
        kerberosConfig:
          crossRealmTrustAdminServer: string
          crossRealmTrustKdc: string
          crossRealmTrustRealm: string
          crossRealmTrustSharedPassword: string
          enableKerberos: boolean
          kdcDbKey: string
          keyPassword: string
          keystore: string
          keystorePassword: string
          kmsKeyRef:
            external: string
            name: string
            namespace: string
          realm: string
          rootPrincipalPassword: string
          tgtLifetimeHours: integer
          truststore: string
          truststorePassword: string
      softwareConfig:
        imageVersion: string
        optionalComponents:
        - string
        properties:
          string: string
      stagingBucketRef:
        external: string
        name: string
        namespace: string
      tempBucketRef:
        external: string
        name: string
        namespace: string
      workerConfig:
        accelerators:
        - acceleratorCount: integer
          acceleratorType: string
        diskConfig:
          bootDiskSizeGb: integer
          bootDiskType: string
          numLocalSsds: integer
        imageRef:
          external: string
          name: string
          namespace: string
        machineType: string
        minCpuPlatform: string
        numInstances: integer
        preemptibility: string
    labels:
      string: string
projectRef:
  external: string
  name: string
  namespace: string
resourceID: string
Fields

dagTimeout

Optional

string

Immutable. Optional. Timeout duration for the DAG of jobs, expressed in seconds (see [JSON representation of duration](https://developers.google.com/protocol-buffers/docs/proto3#json)). The timeout duration must be from 10 minutes ("600s") to 24 hours ("86400s"). The timer begins when the first job is submitted. If the workflow is running at the end of the timeout period, any remaining jobs are cancelled, the workflow is ended, and if the workflow was running on a [managed cluster](/dataproc/docs/concepts/workflows/using-workflows#configuring_or_selecting_a_cluster), the cluster is deleted.

jobs

Required

list (object)

Immutable. Required. The Directed Acyclic Graph of Jobs to submit.

jobs[]

Required

object

jobs[].hadoopJob

Optional

object

Immutable. Optional. Job is a Hadoop job.

jobs[].hadoopJob.archiveUris

Optional

list (string)

Immutable. Optional. HCFS URIs of archives to be extracted in the working directory of Hadoop drivers and tasks. Supported file types: .jar, .tar, .tar.gz, .tgz, or .zip.

jobs[].hadoopJob.archiveUris[]

Optional

string

jobs[].hadoopJob.args

Optional

list (string)

Immutable. Optional. The arguments to pass to the driver. Do not include arguments, such as `-libjars` or `-Dfoo=bar`, that can be set as job properties, since a collision may occur that causes an incorrect job submission.

jobs[].hadoopJob.args[]

Optional

string

jobs[].hadoopJob.fileUris

Optional

list (string)

Immutable. Optional. HCFS (Hadoop Compatible Filesystem) URIs of files to be copied to the working directory of Hadoop drivers and distributed tasks. Useful for naively parallel tasks.

jobs[].hadoopJob.fileUris[]

Optional

string

jobs[].hadoopJob.jarFileUris

Optional

list (string)

Immutable. Optional. Jar file URIs to add to the CLASSPATHs of the Hadoop driver and tasks.

jobs[].hadoopJob.jarFileUris[]

Optional

string

jobs[].hadoopJob.loggingConfig

Optional

object

Immutable. Optional. The runtime log config for job execution.

jobs[].hadoopJob.loggingConfig.driverLogLevels

Optional

map (key: string, value: string)

Immutable. The per-package log levels for the driver. This may include "root" package name to configure rootLogger. Examples: 'com.google = FATAL', 'root = INFO', 'org.apache = DEBUG'

jobs[].hadoopJob.mainClass

Optional

string

Immutable. The name of the driver's main class. The jar file containing the class must be in the default CLASSPATH or specified in `jar_file_uris`.

jobs[].hadoopJob.mainJarFileUri

Optional

string

Immutable. The HCFS URI of the jar file containing the main class. Examples: 'gs://foo-bucket/analytics-binaries/extract-useful-metrics-mr.jar', 'hdfs:/tmp/test-samples/custom-wordcount.jar', 'file:///home/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar'.

jobs[].hadoopJob.properties

Optional

map (key: string, value: string)

Immutable. Optional. A mapping of property names to values, used to configure Hadoop. Properties that conflict with values set by the Dataproc API may be overwritten. Can include properties set in /etc/hadoop/conf/*-site and classes in user code.

jobs[].hiveJob

Optional

object

Immutable. Optional. Job is a Hive job.

jobs[].hiveJob.continueOnFailure

Optional

boolean

Immutable. Optional. Whether to continue executing queries if a query fails. The default value is `false`. Setting to `true` can be useful when executing independent parallel queries.

jobs[].hiveJob.jarFileUris

Optional

list (string)

Immutable. Optional. HCFS URIs of jar files to add to the CLASSPATH of the Hive server and Hadoop MapReduce (MR) tasks. Can contain Hive SerDes and UDFs.

jobs[].hiveJob.jarFileUris[]

Optional

string

jobs[].hiveJob.properties

Optional

map (key: string, value: string)

Immutable. Optional. A mapping of property names and values, used to configure Hive. Properties that conflict with values set by the Dataproc API may be overwritten. Can include properties set in /etc/hadoop/conf/*-site.xml, /etc/hive/conf/hive-site.xml, and classes in user code.

jobs[].hiveJob.queryFileUri

Optional

string

Immutable. The HCFS URI of the script that contains Hive queries.

jobs[].hiveJob.queryList

Optional

object

Immutable. A list of queries.

jobs[].hiveJob.queryList.queries

Required*

list (string)

Immutable. Required. The queries to execute. You do not need to end a query expression with a semicolon. Multiple queries can be specified in one string by separating each with a semicolon. Here is an example of a Dataproc API snippet that uses a QueryList to specify a HiveJob: "hiveJob": { "queryList": { "queries": [ "query1", "query2", "query3;query4" ] } }

jobs[].hiveJob.queryList.queries[]

Required*

string

jobs[].hiveJob.scriptVariables

Optional

map (key: string, value: string)

Immutable. Optional. Mapping of query variable names to values (equivalent to the Hive command: `SET name="value";`).

jobs[].labels

Optional

map (key: string, value: string)

Immutable. Optional. The labels to associate with this job. Label keys must be between 1 and 63 characters long, and must conform to the following regular expression: `[\p{Ll}\p{Lo}][\p{Ll}\p{Lo}\p{N}_-]{0,62}`. Label values must be between 1 and 63 characters long, and must conform to the following regular expression: `[\p{Ll}\p{Lo}\p{N}_-]{0,63}`. No more than 32 labels can be associated with a given job.

jobs[].pigJob

Optional

object

Immutable. Optional. Job is a Pig job.

jobs[].pigJob.continueOnFailure

Optional

boolean

Immutable. Optional. Whether to continue executing queries if a query fails. The default value is `false`. Setting to `true` can be useful when executing independent parallel queries.

jobs[].pigJob.jarFileUris

Optional

list (string)

Immutable. Optional. HCFS URIs of jar files to add to the CLASSPATH of the Pig Client and Hadoop MapReduce (MR) tasks. Can contain Pig UDFs.

jobs[].pigJob.jarFileUris[]

Optional

string

jobs[].pigJob.loggingConfig

Optional

object

Immutable. Optional. The runtime log config for job execution.

jobs[].pigJob.loggingConfig.driverLogLevels

Optional

map (key: string, value: string)

Immutable. The per-package log levels for the driver. This may include "root" package name to configure rootLogger. Examples: 'com.google = FATAL', 'root = INFO', 'org.apache = DEBUG'

jobs[].pigJob.properties

Optional

map (key: string, value: string)

Immutable. Optional. A mapping of property names to values, used to configure Pig. Properties that conflict with values set by the Dataproc API may be overwritten. Can include properties set in /etc/hadoop/conf/*-site.xml, /etc/pig/conf/pig.properties, and classes in user code.

jobs[].pigJob.queryFileUri

Optional

string

Immutable. The HCFS URI of the script that contains the Pig queries.

jobs[].pigJob.queryList

Optional

object

Immutable. A list of queries.

jobs[].pigJob.queryList.queries

Required*

list (string)

Immutable. Required. The queries to execute. You do not need to end a query expression with a semicolon. Multiple queries can be specified in one string by separating each with a semicolon. Here is an example of a Dataproc API snippet that uses a QueryList to specify a HiveJob: "hiveJob": { "queryList": { "queries": [ "query1", "query2", "query3;query4" ] } }

jobs[].pigJob.queryList.queries[]

Required*

string

jobs[].pigJob.scriptVariables

Optional

map (key: string, value: string)

Immutable. Optional. Mapping of query variable names to values (equivalent to the Pig command: `name=[value]`).

jobs[].prerequisiteStepIds

Optional

list (string)

Immutable. Optional. The optional list of prerequisite job step_ids. If not specified, the job will start at the beginning of the workflow.

jobs[].prerequisiteStepIds[]

Optional

string

jobs[].prestoJob

Optional

object

Immutable. Optional. Job is a Presto job.

jobs[].prestoJob.clientTags

Optional

list (string)

Immutable. Optional. Presto client tags to attach to this query.

jobs[].prestoJob.clientTags[]

Optional

string

jobs[].prestoJob.continueOnFailure

Optional

boolean

Immutable. Optional. Whether to continue executing queries if a query fails. The default value is `false`. Setting to `true` can be useful when executing independent parallel queries.

jobs[].prestoJob.loggingConfig

Optional

object

Immutable. Optional. The runtime log config for job execution.

jobs[].prestoJob.loggingConfig.driverLogLevels

Optional

map (key: string, value: string)

Immutable. The per-package log levels for the driver. This may include "root" package name to configure rootLogger. Examples: 'com.google = FATAL', 'root = INFO', 'org.apache = DEBUG'

jobs[].prestoJob.outputFormat

Optional

string

Immutable. Optional. The format in which query output will be displayed. See the Presto documentation for supported output formats.

jobs[].prestoJob.properties

Optional

map (key: string, value: string)

Immutable. Optional. A mapping of property names to values. Used to set Presto [session properties](https://prestodb.io/docs/current/sql/set-session.html). Equivalent to using the `--session` flag in the Presto CLI.

jobs[].prestoJob.queryFileUri

Optional

string

Immutable. The HCFS URI of the script that contains SQL queries.

jobs[].prestoJob.queryList

Optional

object

Immutable. A list of queries.

jobs[].prestoJob.queryList.queries

Required*

list (string)

Immutable. Required. The queries to execute. You do not need to end a query expression with a semicolon. Multiple queries can be specified in one string by separating each with a semicolon. Here is an example of a Dataproc API snippet that uses a QueryList to specify a HiveJob: "hiveJob": { "queryList": { "queries": [ "query1", "query2", "query3;query4" ] } }

jobs[].prestoJob.queryList.queries[]

Required*

string

jobs[].pysparkJob

Optional

object

Immutable. Optional. Job is a PySpark job.

jobs[].pysparkJob.archiveUris

Optional

list (string)

Immutable. Optional. HCFS URIs of archives to be extracted into the working directory of each executor. Supported file types: .jar, .tar, .tar.gz, .tgz, and .zip.

jobs[].pysparkJob.archiveUris[]

Optional

string

jobs[].pysparkJob.args

Optional

list (string)

Immutable. Optional. The arguments to pass to the driver. Do not include arguments, such as `--conf`, that can be set as job properties, since a collision may occur that causes an incorrect job submission.

jobs[].pysparkJob.args[]

Optional

string

jobs[].pysparkJob.fileUris

Optional

list (string)

Immutable. Optional. HCFS URIs of files to be placed in the working directory of each executor. Useful for naively parallel tasks.

jobs[].pysparkJob.fileUris[]

Optional

string

jobs[].pysparkJob.jarFileUris

Optional

list (string)

Immutable. Optional. HCFS URIs of jar files to add to the CLASSPATHs of the Python driver and tasks.

jobs[].pysparkJob.jarFileUris[]

Optional

string

jobs[].pysparkJob.loggingConfig

Optional

object

Immutable. Optional. The runtime log config for job execution.

jobs[].pysparkJob.loggingConfig.driverLogLevels

Optional

map (key: string, value: string)

Immutable. The per-package log levels for the driver. This may include "root" package name to configure rootLogger. Examples: 'com.google = FATAL', 'root = INFO', 'org.apache = DEBUG'

jobs[].pysparkJob.mainPythonFileUri

Required*

string

Immutable. Required. The HCFS URI of the main Python file to use as the driver. Must be a .py file.

jobs[].pysparkJob.properties

Optional

map (key: string, value: string)

Immutable. Optional. A mapping of property names to values, used to configure PySpark. Properties that conflict with values set by the Dataproc API may be overwritten. Can include properties set in /etc/spark/conf/spark-defaults.conf and classes in user code.

jobs[].pysparkJob.pythonFileUris

Optional

list (string)

Immutable. Optional. HCFS file URIs of Python files to pass to the PySpark framework. Supported file types: .py, .egg, and .zip.

jobs[].pysparkJob.pythonFileUris[]

Optional

string

jobs[].scheduling

Optional

object

Immutable. Optional. Job scheduling configuration.

jobs[].scheduling.maxFailuresPerHour

Optional

integer

Immutable. Optional. Maximum number of times per hour a driver may be restarted as a result of the driver exiting with a non-zero code before the job is reported failed. A job may be reported as thrashing if the driver exits with a non-zero code 4 times within a 10-minute window. Maximum value is 10.

jobs[].scheduling.maxFailuresTotal

Optional

integer

Immutable. Optional. Maximum number of times in total a driver may be restarted as a result of driver exiting with non-zero code before job is reported failed. Maximum value is 240.

jobs[].sparkJob

Optional

object

Immutable. Optional. Job is a Spark job.

jobs[].sparkJob.archiveUris

Optional

list (string)

Immutable. Optional. HCFS URIs of archives to be extracted into the working directory of each executor. Supported file types: .jar, .tar, .tar.gz, .tgz, and .zip.

jobs[].sparkJob.archiveUris[]

Optional

string

jobs[].sparkJob.args

Optional

list (string)

Immutable. Optional. The arguments to pass to the driver. Do not include arguments, such as `--conf`, that can be set as job properties, since a collision may occur that causes an incorrect job submission.

jobs[].sparkJob.args[]

Optional

string

jobs[].sparkJob.fileUris

Optional

list (string)

Immutable. Optional. HCFS URIs of files to be placed in the working directory of each executor. Useful for naively parallel tasks.

jobs[].sparkJob.fileUris[]

Optional

string

jobs[].sparkJob.jarFileUris

Optional

list (string)

Immutable. Optional. HCFS URIs of jar files to add to the CLASSPATHs of the Spark driver and tasks.

jobs[].sparkJob.jarFileUris[]

Optional

string

jobs[].sparkJob.loggingConfig

Optional

object

Immutable. Optional. The runtime log config for job execution.

jobs[].sparkJob.loggingConfig.driverLogLevels

Optional

map (key: string, value: string)

Immutable. The per-package log levels for the driver. This may include "root" package name to configure rootLogger. Examples: 'com.google = FATAL', 'root = INFO', 'org.apache = DEBUG'

jobs[].sparkJob.mainClass

Optional

string

Immutable. The name of the driver's main class. The jar file that contains the class must be in the default CLASSPATH or specified in `jar_file_uris`.

jobs[].sparkJob.mainJarFileUri

Optional

string

Immutable. The HCFS URI of the jar file that contains the main class.

jobs[].sparkJob.properties

Optional

map (key: string, value: string)

Immutable. Optional. A mapping of property names to values, used to configure Spark. Properties that conflict with values set by the Dataproc API may be overwritten. Can include properties set in /etc/spark/conf/spark-defaults.conf and classes in user code.

jobs[].sparkRJob

Optional

object

Immutable. Optional. Job is a SparkR job.

jobs[].sparkRJob.archiveUris

Optional

list (string)

Immutable. Optional. HCFS URIs of archives to be extracted into the working directory of each executor. Supported file types: .jar, .tar, .tar.gz, .tgz, and .zip.

jobs[].sparkRJob.archiveUris[]

Optional

string

jobs[].sparkRJob.args

Optional

list (string)

Immutable. Optional. The arguments to pass to the driver. Do not include arguments, such as `--conf`, that can be set as job properties, since a collision may occur that causes an incorrect job submission.

jobs[].sparkRJob.args[]

Optional

string

jobs[].sparkRJob.fileUris

Optional

list (string)

Immutable. Optional. HCFS URIs of files to be placed in the working directory of each executor. Useful for naively parallel tasks.

jobs[].sparkRJob.fileUris[]

Optional

string

jobs[].sparkRJob.loggingConfig

Optional

object

Immutable. Optional. The runtime log config for job execution.

jobs[].sparkRJob.loggingConfig.driverLogLevels

Optional

map (key: string, value: string)

Immutable. The per-package log levels for the driver. This may include "root" package name to configure rootLogger. Examples: 'com.google = FATAL', 'root = INFO', 'org.apache = DEBUG'

jobs[].sparkRJob.mainRFileUri

Required*

string

Immutable. Required. The HCFS URI of the main R file to use as the driver. Must be a .R file.

jobs[].sparkRJob.properties

Optional

map (key: string, value: string)

Immutable. Optional. A mapping of property names to values, used to configure SparkR. Properties that conflict with values set by the Dataproc API may be overwritten. Can include properties set in /etc/spark/conf/spark-defaults.conf and classes in user code.

jobs[].sparkSqlJob

Optional

object

Immutable. Optional. Job is a SparkSql job.

jobs[].sparkSqlJob.jarFileUris

Optional

list (string)

Immutable. Optional. HCFS URIs of jar files to be added to the Spark CLASSPATH.

jobs[].sparkSqlJob.jarFileUris[]

Optional

string

jobs[].sparkSqlJob.loggingConfig

Optional

object

Immutable. Optional. The runtime log config for job execution.

jobs[].sparkSqlJob.loggingConfig.driverLogLevels

Optional

map (key: string, value: string)

Immutable. The per-package log levels for the driver. This may include "root" package name to configure rootLogger. Examples: 'com.google = FATAL', 'root = INFO', 'org.apache = DEBUG'

jobs[].sparkSqlJob.properties

Optional

map (key: string, value: string)

Immutable. Optional. A mapping of property names to values, used to configure Spark SQL's SparkConf. Properties that conflict with values set by the Dataproc API may be overwritten.

jobs[].sparkSqlJob.queryFileUri

Optional

string

Immutable. The HCFS URI of the script that contains SQL queries.

jobs[].sparkSqlJob.queryList

Optional

object

Immutable. A list of queries.

jobs[].sparkSqlJob.queryList.queries

Required*

list (string)

Immutable. Required. The queries to execute. You do not need to end a query expression with a semicolon. Multiple queries can be specified in one string by separating each with a semicolon. Here is an example of a Dataproc API snippet that uses a QueryList to specify a HiveJob: "hiveJob": { "queryList": { "queries": [ "query1", "query2", "query3;query4" ] } }

jobs[].sparkSqlJob.queryList.queries[]

Required*

string

jobs[].sparkSqlJob.scriptVariables

Optional

map (key: string, value: string)

Immutable. Optional. Mapping of query variable names to values (equivalent to the Spark SQL command: `SET name="value";`).

jobs[].stepId

Required

string

Immutable. Required. The step id. The id must be unique among all jobs within the template. The step id is used as a prefix for the job id, as the job's `goog-dataproc-workflow-step-id` label, and in the prerequisiteStepIds field of other steps. The id must contain only letters (a-z, A-Z), numbers (0-9), underscores (_), and hyphens (-). Cannot begin or end with an underscore or hyphen. Must consist of between 3 and 50 characters.

location

Required

string

Immutable. The location for the resource.

parameters

Optional

list (object)

Immutable. Optional. Template parameters whose values are substituted into the template. Values for parameters must be provided when the template is instantiated.

parameters[]

Optional

object

parameters[].description

Optional

string

Immutable. Optional. Brief description of the parameter. Must not exceed 1024 characters.

parameters[].fields

Required*

list (string)

Immutable. Required. Paths to all fields that the parameter replaces. A field is allowed to appear in at most one parameter's list of field paths. A field path is similar in syntax to a google.protobuf.FieldMask. For example, a field path that references the zone field of a workflow template's cluster selector would be specified as `placement.clusterSelector.zone`. Also, field paths can reference fields using the following syntax: (1) Values in maps can be referenced by key: `labels['key']`, `placement.clusterSelector.clusterLabels['key']`, `placement.managedCluster.labels['key']`, `jobs['step-id'].labels['key']`. (2) Jobs in the jobs list can be referenced by step-id: `jobs['step-id'].hadoopJob.mainJarFileUri`, `jobs['step-id'].hiveJob.queryFileUri`, `jobs['step-id'].pySparkJob.mainPythonFileUri`, `jobs['step-id'].hadoopJob.jarFileUris[0]`, `jobs['step-id'].hadoopJob.archiveUris[0]`, `jobs['step-id'].hadoopJob.fileUris[0]`, `jobs['step-id'].pySparkJob.pythonFileUris[0]`. (3) Items in repeated fields can be referenced by a zero-based index: `jobs['step-id'].sparkJob.args[0]`. (4) Other examples: `jobs['step-id'].hadoopJob.properties['key']`, `jobs['step-id'].hadoopJob.args[0]`, `jobs['step-id'].hiveJob.scriptVariables['key']`, `jobs['step-id'].hadoopJob.mainJarFileUri`, `placement.clusterSelector.zone`. It may not be possible to parameterize maps and repeated fields in their entirety since only individual map values and individual items in repeated fields can be referenced. For example, the following field paths are invalid: `placement.clusterSelector.clusterLabels`, `jobs['step-id'].sparkJob.args`.

parameters[].fields[]

Required*

string

parameters[].name

Required*

string

Immutable. Required. Parameter name. The parameter name is used as the key, and paired with the parameter value, which are passed to the template when the template is instantiated. The name must contain only capital letters (A-Z), numbers (0-9), and underscores (_), and must not start with a number. The maximum length is 40 characters.

parameters[].validation

Optional

object

Immutable. Optional. Validation rules to be applied to this parameter's value.

parameters[].validation.regex

Optional

object

Immutable. Validation based on regular expressions.

parameters[].validation.regex.regexes

Required*

list (string)

Immutable. Required. RE2 regular expressions used to validate the parameter's value. The value must match the regex in its entirety (substring matches are not sufficient).

parameters[].validation.regex.regexes[]

Required*

string

parameters[].validation.values

Optional

object

Immutable. Validation based on a list of allowed values.

parameters[].validation.values.values

Required*

list (string)

Immutable. Required. List of allowed values for the parameter.

parameters[].validation.values.values[]

Required*

string

placement

Required

object

Immutable. Required. WorkflowTemplate scheduling information.

placement.clusterSelector

Optional

object

Immutable. Optional. A selector that chooses the target cluster for jobs based on metadata. The selector is evaluated at the time each job is submitted.

placement.clusterSelector.clusterLabels

Required*

map (key: string, value: string)

Immutable. Required. The cluster labels. Cluster must have all labels to match.

placement.clusterSelector.zone

Optional

string

Immutable. Optional. The zone where the workflow process executes. This parameter does not affect the selection of the cluster. If unspecified, the zone of the first cluster matching the selector is used.

placement.managedCluster

Optional

object

Immutable. A cluster that is managed by the workflow.

placement.managedCluster.clusterName

Required*

string

Immutable. Required. The cluster name prefix. A unique cluster name will be formed by appending a random suffix. The name must contain only lower-case letters (a-z), numbers (0-9), and hyphens (-). Must begin with a letter. Cannot begin or end with hyphen. Must consist of between 2 and 35 characters.

placement.managedCluster.config

Required*

object

Immutable. Required. The cluster configuration.

placement.managedCluster.config.autoscalingConfig

Optional

object

Immutable. Optional. Autoscaling config for the policy associated with the cluster. Cluster does not autoscale if this field is unset.

placement.managedCluster.config.autoscalingConfig.policyRef

Optional

object

Immutable.

placement.managedCluster.config.autoscalingConfig.policyRef.external

Optional

string

Optional. The autoscaling policy used by the cluster. Only resource names including projectid and location (region) are valid. Examples: * `https://www.googleapis.com/compute/v1/projects/[project_id]/locations/[dataproc_region]/autoscalingPolicies/[policy_id]` * `projects/[project_id]/locations/[dataproc_region]/autoscalingPolicies/[policy_id]` Note that the policy must be in the same project and Dataproc region. Allowed value: The Google Cloud resource name of a `DataprocAutoscalingPolicy` resource (format: `projects/{{project}}/locations/{{location}}/autoscalingPolicies/{{name}}`).

placement.managedCluster.config.autoscalingConfig.policyRef.name

Optional

string

Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names

placement.managedCluster.config.autoscalingConfig.policyRef.namespace

Optional

string

Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/

placement.managedCluster.config.encryptionConfig

Optional

object

Immutable. Optional. Encryption settings for the cluster.

placement.managedCluster.config.encryptionConfig.gcePdKmsKeyRef

Optional

object

Immutable.

placement.managedCluster.config.encryptionConfig.gcePdKmsKeyRef.external

Optional

string

Optional. The Cloud KMS key name to use for PD disk encryption for all instances in the cluster. Allowed value: The `selfLink` field of a `KMSCryptoKey` resource.

placement.managedCluster.config.encryptionConfig.gcePdKmsKeyRef.name

Optional

string

Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names

placement.managedCluster.config.encryptionConfig.gcePdKmsKeyRef.namespace

Optional

string

Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/

placement.managedCluster.config.endpointConfig

Optional

object

Immutable. Optional. Port/endpoint configuration for this cluster.

placement.managedCluster.config.endpointConfig.enableHttpPortAccess

Optional

boolean

Immutable. Optional. If true, enable HTTP access to specific ports on the cluster from external sources. Defaults to false.

placement.managedCluster.config.gceClusterConfig

Optional

object

Immutable. Optional. The shared Compute Engine config settings for all instances in a cluster.

placement.managedCluster.config.gceClusterConfig.internalIPOnly

Optional

boolean

Immutable. Optional. If true, all instances in the cluster will only have internal IP addresses. By default, clusters are not restricted to internal IP addresses, and will have ephemeral external IP addresses assigned to each instance. This `internal_ip_only` restriction can only be enabled for subnetwork enabled networks, and all off-cluster dependencies must be configured to be accessible without external IP addresses.

placement.managedCluster.config.gceClusterConfig.metadata

Optional

map (key: string, value: string)

Immutable. The Compute Engine metadata entries to add to all instances (see [Project and instance metadata](https://cloud.google.com/compute/docs/storing-retrieving-metadata#project_and_instance_metadata)).

placement.managedCluster.config.gceClusterConfig.networkRef

Optional

object

Immutable.

placement.managedCluster.config.gceClusterConfig.networkRef.external

Optional

string

Optional. The Compute Engine network to be used for machine communications. Cannot be specified with subnetwork_uri. If neither `network_uri` nor `subnetwork_uri` is specified, the "default" network of the project is used, if it exists. Cannot be a "Custom Subnet Network" (see [Using Subnetworks](https://cloud.google.com/compute/docs/subnetworks) for more information). A full URL, partial URI, or short name are valid. Examples: * `https://www.googleapis.com/compute/v1/projects/[project_id]/global/networks/default` * `projects/[project_id]/global/networks/default` * `default` Allowed value: The `selfLink` field of a `ComputeNetwork` resource.

placement.managedCluster.config.gceClusterConfig.networkRef.name

Optional

string

Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names

placement.managedCluster.config.gceClusterConfig.networkRef.namespace

Optional

string

Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/

placement.managedCluster.config.gceClusterConfig.nodeGroupAffinity

Optional

object

Immutable. Optional. Node Group Affinity for sole-tenant clusters.

placement.managedCluster.config.gceClusterConfig.nodeGroupAffinity.nodeGroupRef

Required*

object

Immutable.

placement.managedCluster.config.gceClusterConfig.nodeGroupAffinity.nodeGroupRef.external

Optional

string

Required. The URI of a sole-tenant [node group resource](https://cloud.google.com/compute/docs/reference/rest/v1/nodeGroups) that the cluster will be created on. A full URL, partial URI, or node group name are valid. Examples: * `https://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-central1-a/nodeGroups/node-group-1` * `projects/[project_id]/zones/us-central1-a/nodeGroups/node-group-1` * `node-group-1` Allowed value: The `selfLink` field of a `ComputeNodeGroup` resource.

placement.managedCluster.config.gceClusterConfig.nodeGroupAffinity.nodeGroupRef.name

Optional

string

Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names

placement.managedCluster.config.gceClusterConfig.nodeGroupAffinity.nodeGroupRef.namespace

Optional

string

Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/

placement.managedCluster.config.gceClusterConfig.privateIPv6GoogleAccess

Optional

string

Immutable. Optional. The type of IPv6 access for a cluster. Possible values: PRIVATE_IPV6_GOOGLE_ACCESS_UNSPECIFIED, INHERIT_FROM_SUBNETWORK, OUTBOUND, BIDIRECTIONAL

placement.managedCluster.config.gceClusterConfig.reservationAffinity

Optional

object

Immutable. Optional. Reservation Affinity for consuming Zonal reservation.

placement.managedCluster.config.gceClusterConfig.reservationAffinity.consumeReservationType

Optional

string

Immutable. Optional. Type of reservation to consume. Possible values: TYPE_UNSPECIFIED, NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION

placement.managedCluster.config.gceClusterConfig.reservationAffinity.key

Optional

string

Immutable. Optional. Corresponds to the label key of reservation resource.

placement.managedCluster.config.gceClusterConfig.reservationAffinity.values

Optional

list (string)

Immutable. Optional. Corresponds to the label values of reservation resource.

placement.managedCluster.config.gceClusterConfig.reservationAffinity.values[]

Optional

string

placement.managedCluster.config.gceClusterConfig.serviceAccountRef

Optional

object

Immutable.

placement.managedCluster.config.gceClusterConfig.serviceAccountRef.external

Optional

string

Optional. The [Dataproc service account](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/service-accounts#service_accounts_in_dataproc) (also see [VM Data Plane identity](https://cloud.google.com/dataproc/docs/concepts/iam/dataproc-principals#vm_service_account_data_plane_identity)) used by Dataproc cluster VM instances to access Google Cloud Platform services. If not specified, the [Compute Engine default service account](https://cloud.google.com/compute/docs/access/service-accounts#default_service_account) is used. Allowed value: The `email` field of an `IAMServiceAccount` resource.

placement.managedCluster.config.gceClusterConfig.serviceAccountRef.name

Optional

string

Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names

placement.managedCluster.config.gceClusterConfig.serviceAccountRef.namespace

Optional

string

Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/

placement.managedCluster.config.gceClusterConfig.serviceAccountScopes

Optional

list (string)

Immutable. Optional. The URIs of service account scopes to be included in Compute Engine instances. The following base set of scopes is always included: * https://www.googleapis.com/auth/cloud.useraccounts.readonly * https://www.googleapis.com/auth/devstorage.read_write * https://www.googleapis.com/auth/logging.write If no scopes are specified, the following defaults are also provided: * https://www.googleapis.com/auth/bigquery * https://www.googleapis.com/auth/bigtable.admin.table * https://www.googleapis.com/auth/bigtable.data * https://www.googleapis.com/auth/devstorage.full_control

placement.managedCluster.config.gceClusterConfig.serviceAccountScopes[]

Optional

string

placement.managedCluster.config.gceClusterConfig.subnetworkRef

Optional

object

Immutable.

placement.managedCluster.config.gceClusterConfig.subnetworkRef.external

Optional

string

Optional. The Compute Engine subnetwork to be used for machine communications. Cannot be specified with network_uri. A full URL, partial URI, or short name are valid. Examples: * `https://www.googleapis.com/compute/v1/projects/[project_id]/regions/us-east1/subnetworks/sub0` * `projects/[project_id]/regions/us-east1/subnetworks/sub0` * `sub0` Allowed value: The `selfLink` field of a `ComputeSubnetwork` resource.

placement.managedCluster.config.gceClusterConfig.subnetworkRef.name

Optional

string

Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names

placement.managedCluster.config.gceClusterConfig.subnetworkRef.namespace

Optional

string

Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/

placement.managedCluster.config.gceClusterConfig.tags

Optional

list (string)

Immutable. The Compute Engine tags to add to all instances (see [Tagging instances](https://cloud.google.com/compute/docs/label-or-tag-resources#tags)).

placement.managedCluster.config.gceClusterConfig.tags[]

Optional

string

placement.managedCluster.config.gceClusterConfig.zone

Optional

string

Immutable. Optional. The zone where the Compute Engine cluster will be located. On a create request, it is required in the "global" region. If omitted in a non-global Dataproc region, the service will pick a zone in the corresponding Compute Engine region. On a get request, zone will always be present. A full URL, partial URI, or short name are valid. Examples: * `https://www.googleapis.com/compute/v1/projects/[project_id]/zones/[zone]` * `projects/[project_id]/zones/[zone]` * `us-central1-f`

placement.managedCluster.config.initializationActions

Optional

list (object)

Immutable. Optional. Commands to execute on each node after config is completed. By default, executables are run on master and all worker nodes. You can test a node's `role` metadata to run an executable on a master or worker node, as shown below using `curl` (you can also use `wget`): `ROLE=$(curl -H Metadata-Flavor:Google http://metadata/computeMetadata/v1/instance/attributes/dataproc-role); if [[ "${ROLE}" == 'Master' ]]; then ... master specific actions ...; else ... worker specific actions ...; fi`

placement.managedCluster.config.initializationActions[]

Optional

object

placement.managedCluster.config.initializationActions[].executableFile

Optional

string

Immutable. Required. Cloud Storage URI of executable file.

placement.managedCluster.config.initializationActions[].executionTimeout

Optional

string

Immutable. Optional. Amount of time executable has to complete. Default is 10 minutes (see JSON representation of [Duration](https://developers.google.com/protocol-buffers/docs/proto3#json)). Cluster creation fails with an explanatory error message (the name of the executable that caused the error and the exceeded timeout period) if the executable is not completed at end of the timeout period.

placement.managedCluster.config.lifecycleConfig

Optional

object

Immutable. Optional. Lifecycle setting for the cluster.

placement.managedCluster.config.lifecycleConfig.autoDeleteTime

Optional

string

Immutable. Optional. The time when cluster will be auto-deleted (see JSON representation of [Timestamp](https://developers.google.com/protocol-buffers/docs/proto3#json)).

placement.managedCluster.config.lifecycleConfig.autoDeleteTtl

Optional

string

Immutable. Optional. The lifetime duration of cluster. The cluster will be auto-deleted at the end of this period. Minimum value is 10 minutes; maximum value is 14 days (see JSON representation of [Duration](https://developers.google.com/protocol-buffers/docs/proto3#json)).

placement.managedCluster.config.lifecycleConfig.idleDeleteTtl

Optional

string

Immutable. Optional. The duration to keep the cluster alive while idling (when no jobs are running). Passing this threshold will cause the cluster to be deleted. Minimum value is 5 minutes; maximum value is 14 days (see JSON representation of [Duration](https://developers.google.com/protocol-buffers/docs/proto3#json)).

placement.managedCluster.config.masterConfig

Optional

object

Immutable. Optional. The Compute Engine config settings for the master instance in a cluster.

placement.managedCluster.config.masterConfig.accelerators

Optional

list (object)

Immutable. Optional. The Compute Engine accelerator configuration for these instances.

placement.managedCluster.config.masterConfig.accelerators[]

Optional

object

placement.managedCluster.config.masterConfig.accelerators[].acceleratorCount

Optional

integer

Immutable. The number of the accelerator cards of this type exposed to this instance.

placement.managedCluster.config.masterConfig.accelerators[].acceleratorType

Optional

string

Immutable. Full URL, partial URI, or short name of the accelerator type resource to expose to this instance. See [Compute Engine AcceleratorTypes](https://cloud.google.com/compute/docs/reference/beta/acceleratorTypes). Examples: * `https://www.googleapis.com/compute/beta/projects/[project_id]/zones/us-east1-a/acceleratorTypes/nvidia-tesla-k80` * `projects/[project_id]/zones/us-east1-a/acceleratorTypes/nvidia-tesla-k80` * `nvidia-tesla-k80` **Auto Zone Exception**: If you are using the Dataproc [Auto Zone Placement](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/auto-zone#using_auto_zone_placement) feature, you must use the short name of the accelerator type resource, for example, `nvidia-tesla-k80`.

placement.managedCluster.config.masterConfig.diskConfig

Optional

object

Immutable. Optional. Disk option config settings.

placement.managedCluster.config.masterConfig.diskConfig.bootDiskSizeGb

Optional

integer

Immutable. Optional. Size in GB of the boot disk (default is 500GB).

placement.managedCluster.config.masterConfig.diskConfig.bootDiskType

Optional

string

Immutable. Optional. Type of the boot disk (default is "pd-standard"). Valid values: "pd-balanced" (Persistent Disk Balanced Solid State Drive), "pd-ssd" (Persistent Disk Solid State Drive), or "pd-standard" (Persistent Disk Hard Disk Drive). See [Disk types](https://cloud.google.com/compute/docs/disks#disk-types).

placement.managedCluster.config.masterConfig.diskConfig.numLocalSsds

Optional

integer

Immutable. Optional. Number of attached SSDs, from 0 to 4 (default is 0). If SSDs are not attached, the boot disk is used to store runtime logs and [HDFS](https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html) data. If one or more SSDs are attached, this runtime bulk data is spread across them, and the boot disk contains only basic config and installed binaries.

placement.managedCluster.config.masterConfig.imageRef

Optional

object

Immutable.

placement.managedCluster.config.masterConfig.imageRef.external

Optional

string

Optional. The Compute Engine image resource used for cluster instances. The URI can represent an image or image family. Image examples: * `https://www.googleapis.com/compute/beta/projects/[project_id]/global/images/[image-id]` * `projects/[project_id]/global/images/[image-id]` * `image-id` Image family examples. Dataproc will use the most recent image from the family: * `https://www.googleapis.com/compute/beta/projects/[project_id]/global/images/family/[custom-image-family-name]` * `projects/[project_id]/global/images/family/[custom-image-family-name]` If the URI is unspecified, it will be inferred from `SoftwareConfig.image_version` or the system default. Allowed value: The `selfLink` field of a `ComputeImage` resource.

placement.managedCluster.config.masterConfig.imageRef.name

Optional

string

Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names

placement.managedCluster.config.masterConfig.imageRef.namespace

Optional

string

Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/

placement.managedCluster.config.masterConfig.machineType

Optional

string

Immutable. Optional. The Compute Engine machine type used for cluster instances. A full URL, partial URI, or short name are valid. Examples: * `https://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2` * `projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2` * `n1-standard-2` **Auto Zone Exception**: If you are using the Dataproc [Auto Zone Placement](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/auto-zone#using_auto_zone_placement) feature, you must use the short name of the machine type resource, for example, `n1-standard-2`.

placement.managedCluster.config.masterConfig.minCpuPlatform

Optional

string

Immutable. Optional. Specifies the minimum CPU platform for the Instance Group. See [Dataproc -> Minimum CPU Platform](https://cloud.google.com/dataproc/docs/concepts/compute/dataproc-min-cpu).

placement.managedCluster.config.masterConfig.numInstances

Optional

integer

Immutable. Optional. The number of VM instances in the instance group. For [HA cluster](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/high-availability) `masterConfig` groups, **must be set to 3**. For standard cluster `masterConfig` groups, **must be set to 1**.

placement.managedCluster.config.masterConfig.preemptibility

Optional

string

Immutable. Optional. Specifies the preemptibility of the instance group. The default value for master and worker groups is `NON_PREEMPTIBLE`. This default cannot be changed. The default value for secondary instances is `PREEMPTIBLE`. Possible values: PREEMPTIBILITY_UNSPECIFIED, NON_PREEMPTIBLE, PREEMPTIBLE

placement.managedCluster.config.secondaryWorkerConfig

Optional

object

Immutable. Optional. The Compute Engine config settings for additional worker instances in a cluster.

placement.managedCluster.config.secondaryWorkerConfig.accelerators

Optional

list (object)

Immutable. Optional. The Compute Engine accelerator configuration for these instances.

placement.managedCluster.config.secondaryWorkerConfig.accelerators[]

Optional

object

placement.managedCluster.config.secondaryWorkerConfig.accelerators[].acceleratorCount

Optional

integer

Immutable. The number of the accelerator cards of this type exposed to this instance.

placement.managedCluster.config.secondaryWorkerConfig.accelerators[].acceleratorType

Optional

string

Immutable. Full URL, partial URI, or short name of the accelerator type resource to expose to this instance. See [Compute Engine AcceleratorTypes](https://cloud.google.com/compute/docs/reference/beta/acceleratorTypes). Examples: * `https://www.googleapis.com/compute/beta/projects/[project_id]/zones/us-east1-a/acceleratorTypes/nvidia-tesla-k80` * `projects/[project_id]/zones/us-east1-a/acceleratorTypes/nvidia-tesla-k80` * `nvidia-tesla-k80` **Auto Zone Exception**: If you are using the Dataproc [Auto Zone Placement](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/auto-zone#using_auto_zone_placement) feature, you must use the short name of the accelerator type resource, for example, `nvidia-tesla-k80`.

placement.managedCluster.config.secondaryWorkerConfig.diskConfig

Optional

object

Immutable. Optional. Disk option config settings.

placement.managedCluster.config.secondaryWorkerConfig.diskConfig.bootDiskSizeGb

Optional

integer

Immutable. Optional. Size in GB of the boot disk (default is 500GB).

placement.managedCluster.config.secondaryWorkerConfig.diskConfig.bootDiskType

Optional

string

Immutable. Optional. Type of the boot disk (default is "pd-standard"). Valid values: "pd-balanced" (Persistent Disk Balanced Solid State Drive), "pd-ssd" (Persistent Disk Solid State Drive), or "pd-standard" (Persistent Disk Hard Disk Drive). See [Disk types](https://cloud.google.com/compute/docs/disks#disk-types).

placement.managedCluster.config.secondaryWorkerConfig.diskConfig.numLocalSsds

Optional

integer

Immutable. Optional. Number of attached SSDs, from 0 to 4 (default is 0). If SSDs are not attached, the boot disk is used to store runtime logs and [HDFS](https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html) data. If one or more SSDs are attached, this runtime bulk data is spread across them, and the boot disk contains only basic config and installed binaries.

placement.managedCluster.config.secondaryWorkerConfig.imageRef

Optional

object

Immutable.

placement.managedCluster.config.secondaryWorkerConfig.imageRef.external

Optional

string

Optional. The Compute Engine image resource used for cluster instances. The URI can represent an image or image family. Image examples: * `https://www.googleapis.com/compute/beta/projects/[project_id]/global/images/[image-id]` * `projects/[project_id]/global/images/[image-id]` * `image-id` Image family examples. Dataproc will use the most recent image from the family: * `https://www.googleapis.com/compute/beta/projects/[project_id]/global/images/family/[custom-image-family-name]` * `projects/[project_id]/global/images/family/[custom-image-family-name]` If the URI is unspecified, it will be inferred from `SoftwareConfig.image_version` or the system default. Allowed value: The `selfLink` field of a `ComputeImage` resource.

placement.managedCluster.config.secondaryWorkerConfig.imageRef.name

Optional

string

Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names

placement.managedCluster.config.secondaryWorkerConfig.imageRef.namespace

Optional

string

Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/

placement.managedCluster.config.secondaryWorkerConfig.machineType

Optional

string

Immutable. Optional. The Compute Engine machine type used for cluster instances. A full URL, partial URI, or short name are valid. Examples: * `https://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2` * `projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2` * `n1-standard-2` **Auto Zone Exception**: If you are using the Dataproc [Auto Zone Placement](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/auto-zone#using_auto_zone_placement) feature, you must use the short name of the machine type resource, for example, `n1-standard-2`.

placement.managedCluster.config.secondaryWorkerConfig.minCpuPlatform

Optional

string

Immutable. Optional. Specifies the minimum CPU platform for the Instance Group. See [Dataproc -> Minimum CPU Platform](https://cloud.google.com/dataproc/docs/concepts/compute/dataproc-min-cpu).

placement.managedCluster.config.secondaryWorkerConfig.numInstances

Optional

integer

Immutable. Optional. The number of VM instances in the instance group. For [HA cluster](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/high-availability) `masterConfig` groups, **must be set to 3**. For standard cluster `masterConfig` groups, **must be set to 1**.

placement.managedCluster.config.secondaryWorkerConfig.preemptibility

Optional

string

Immutable. Optional. Specifies the preemptibility of the instance group. The default value for master and worker groups is `NON_PREEMPTIBLE`. This default cannot be changed. The default value for secondary instances is `PREEMPTIBLE`. Possible values: PREEMPTIBILITY_UNSPECIFIED, NON_PREEMPTIBLE, PREEMPTIBLE

placement.managedCluster.config.securityConfig

Optional

object

Immutable. Optional. Security settings for the cluster.

placement.managedCluster.config.securityConfig.kerberosConfig

Optional

object

Immutable. Optional. Kerberos related configuration.

placement.managedCluster.config.securityConfig.kerberosConfig.crossRealmTrustAdminServer

Optional

string

Immutable. Optional. The admin server (IP or hostname) for the remote trusted realm in a cross realm trust relationship.

placement.managedCluster.config.securityConfig.kerberosConfig.crossRealmTrustKdc

Optional

string

Immutable. Optional. The KDC (IP or hostname) for the remote trusted realm in a cross realm trust relationship.

placement.managedCluster.config.securityConfig.kerberosConfig.crossRealmTrustRealm

Optional

string

Immutable. Optional. The remote realm the Dataproc on-cluster KDC will trust, should the user enable cross realm trust.

placement.managedCluster.config.securityConfig.kerberosConfig.crossRealmTrustSharedPassword

Optional

string

Immutable. Optional. The Cloud Storage URI of a KMS encrypted file containing the shared password between the on-cluster Kerberos realm and the remote trusted realm, in a cross realm trust relationship.

placement.managedCluster.config.securityConfig.kerberosConfig.enableKerberos

Optional

boolean

Immutable. Optional. Flag to indicate whether to Kerberize the cluster (default: false). Set this field to true to enable Kerberos on a cluster.

placement.managedCluster.config.securityConfig.kerberosConfig.kdcDbKey

Optional

string

Immutable. Optional. The Cloud Storage URI of a KMS encrypted file containing the master key of the KDC database.

placement.managedCluster.config.securityConfig.kerberosConfig.keyPassword

Optional

string

Immutable. Optional. The Cloud Storage URI of a KMS encrypted file containing the password to the user provided key. For the self-signed certificate, this password is generated by Dataproc.

placement.managedCluster.config.securityConfig.kerberosConfig.keystore

Optional

string

Immutable. Optional. The Cloud Storage URI of the keystore file used for SSL encryption. If not provided, Dataproc will provide a self-signed certificate.

placement.managedCluster.config.securityConfig.kerberosConfig.keystorePassword

Optional

string

Immutable. Optional. The Cloud Storage URI of a KMS encrypted file containing the password to the user provided keystore. For the self-signed certificate, this password is generated by Dataproc.

placement.managedCluster.config.securityConfig.kerberosConfig.kmsKeyRef

Optional

object

Immutable.

placement.managedCluster.config.securityConfig.kerberosConfig.kmsKeyRef.external

Optional

string

Optional. The URI of the KMS key used to encrypt various sensitive files. Allowed value: The `selfLink` field of a `KMSCryptoKey` resource.

placement.managedCluster.config.securityConfig.kerberosConfig.kmsKeyRef.name

Optional

string

Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names

placement.managedCluster.config.securityConfig.kerberosConfig.kmsKeyRef.namespace

Optional

string

Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/

placement.managedCluster.config.securityConfig.kerberosConfig.realm

Optional

string

Immutable. Optional. The name of the on-cluster Kerberos realm. If not specified, the uppercased domain of hostnames will be the realm.

placement.managedCluster.config.securityConfig.kerberosConfig.rootPrincipalPassword

Optional

string

Immutable. Optional. The Cloud Storage URI of a KMS encrypted file containing the root principal password.

placement.managedCluster.config.securityConfig.kerberosConfig.tgtLifetimeHours

Optional

integer

Immutable. Optional. The lifetime of the ticket granting ticket, in hours. If not specified, or user specifies 0, then default value 10 will be used.

placement.managedCluster.config.securityConfig.kerberosConfig.truststore

Optional

string

Immutable. Optional. The Cloud Storage URI of the truststore file used for SSL encryption. If not provided, Dataproc will provide a self-signed certificate.

placement.managedCluster.config.securityConfig.kerberosConfig.truststorePassword

Optional

string

Immutable. Optional. The Cloud Storage URI of a KMS encrypted file containing the password to the user provided truststore. For the self-signed certificate, this password is generated by Dataproc.

placement.managedCluster.config.softwareConfig

Optional

object

Immutable. Optional. The config settings for software inside the cluster.

placement.managedCluster.config.softwareConfig.imageVersion

Optional

string

Immutable. Optional. The version of software inside the cluster. It must be one of the supported [Dataproc Versions](https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-versions#supported_dataproc_versions), such as "1.2" (including a subminor version, such as "1.2.29"), or the ["preview" version](https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-versions#other_versions). If unspecified, it defaults to the latest Debian version.

placement.managedCluster.config.softwareConfig.optionalComponents

Optional

list (string)

Immutable. Optional. The set of components to activate on the cluster.

placement.managedCluster.config.softwareConfig.optionalComponents[]

Optional

string

placement.managedCluster.config.softwareConfig.properties

Optional

map (key: string, value: string)

Immutable. Optional. The properties to set on daemon config files. Property keys are specified in `prefix:property` format, for example `core:hadoop.tmp.dir`. The following are supported prefixes and their mappings: * capacity-scheduler: `capacity-scheduler.xml` * core: `core-site.xml` * distcp: `distcp-default.xml` * hdfs: `hdfs-site.xml` * hive: `hive-site.xml` * mapred: `mapred-site.xml` * pig: `pig.properties` * spark: `spark-defaults.conf` * yarn: `yarn-site.xml` For more information, see [Cluster properties](https://cloud.google.com/dataproc/docs/concepts/cluster-properties).

placement.managedCluster.config.stagingBucketRef

Optional

object

Immutable.

placement.managedCluster.config.stagingBucketRef.external

Optional

string

Optional. A Cloud Storage bucket used to stage job dependencies, config files, and job driver console output. If you do not specify a staging bucket, Cloud Dataproc will determine a Cloud Storage location (US, ASIA, or EU) for your cluster's staging bucket according to the Compute Engine zone where your cluster is deployed, and then create and manage this project-level, per-location bucket (see [Dataproc staging bucket](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/staging-bucket)). **This field requires a Cloud Storage bucket name, not a URI to a Cloud Storage bucket.** Allowed value: The Google Cloud resource name of a `StorageBucket` resource (format: `{{name}}`).

placement.managedCluster.config.stagingBucketRef.name

Optional

string

Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names

placement.managedCluster.config.stagingBucketRef.namespace

Optional

string

Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/

placement.managedCluster.config.tempBucketRef

Optional

object

Immutable.

placement.managedCluster.config.tempBucketRef.external

Optional

string

Optional. A Cloud Storage bucket used to store ephemeral cluster and jobs data, such as Spark and MapReduce history files. If you do not specify a temp bucket, Dataproc will determine a Cloud Storage location (US, ASIA, or EU) for your cluster's temp bucket according to the Compute Engine zone where your cluster is deployed, and then create and manage this project-level, per-location bucket. The default bucket has a TTL of 90 days, but you can use any TTL (or none) if you specify a bucket. **This field requires a Cloud Storage bucket name, not a URI to a Cloud Storage bucket.** Allowed value: The Google Cloud resource name of a `StorageBucket` resource (format: `{{name}}`).

placement.managedCluster.config.tempBucketRef.name

Optional

string

Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names

placement.managedCluster.config.tempBucketRef.namespace

Optional

string

Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/

placement.managedCluster.config.workerConfig

Optional

object

Immutable. Optional. The Compute Engine config settings for worker instances in a cluster.

placement.managedCluster.config.workerConfig.accelerators

Optional

list (object)

Immutable. Optional. The Compute Engine accelerator configuration for these instances.

placement.managedCluster.config.workerConfig.accelerators[]

Optional

object

placement.managedCluster.config.workerConfig.accelerators[].acceleratorCount

Optional

integer

Immutable. The number of the accelerator cards of this type exposed to this instance.

placement.managedCluster.config.workerConfig.accelerators[].acceleratorType

Optional

string

Immutable. Full URL, partial URI, or short name of the accelerator type resource to expose to this instance. See [Compute Engine AcceleratorTypes](https://cloud.google.com/compute/docs/reference/beta/acceleratorTypes). Examples: * `https://www.googleapis.com/compute/beta/projects/[project_id]/zones/us-east1-a/acceleratorTypes/nvidia-tesla-k80` * `projects/[project_id]/zones/us-east1-a/acceleratorTypes/nvidia-tesla-k80` * `nvidia-tesla-k80` **Auto Zone Exception**: If you are using the Dataproc [Auto Zone Placement](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/auto-zone#using_auto_zone_placement) feature, you must use the short name of the accelerator type resource, for example, `nvidia-tesla-k80`.

placement.managedCluster.config.workerConfig.diskConfig

Optional

object

Immutable. Optional. Disk option config settings.

placement.managedCluster.config.workerConfig.diskConfig.bootDiskSizeGb

Optional

integer

Immutable. Optional. Size in GB of the boot disk (default is 500GB).

placement.managedCluster.config.workerConfig.diskConfig.bootDiskType

Optional

string

Immutable. Optional. Type of the boot disk (default is "pd-standard"). Valid values: "pd-balanced" (Persistent Disk Balanced Solid State Drive), "pd-ssd" (Persistent Disk Solid State Drive), or "pd-standard" (Persistent Disk Hard Disk Drive). See [Disk types](https://cloud.google.com/compute/docs/disks#disk-types).

placement.managedCluster.config.workerConfig.diskConfig.numLocalSsds

Optional

integer

Immutable. Optional. Number of attached SSDs, from 0 to 4 (default is 0). If SSDs are not attached, the boot disk is used to store runtime logs and [HDFS](https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html) data. If one or more SSDs are attached, this runtime bulk data is spread across them, and the boot disk contains only basic config and installed binaries.

placement.managedCluster.config.workerConfig.imageRef

Optional

object

Immutable.

placement.managedCluster.config.workerConfig.imageRef.external

Optional

string

Optional. The Compute Engine image resource used for cluster instances. The URI can represent an image or image family. Image examples: * `https://www.googleapis.com/compute/beta/projects/[project_id]/global/images/[image-id]` * `projects/[project_id]/global/images/[image-id]` * `image-id` Image family examples. Dataproc will use the most recent image from the family: * `https://www.googleapis.com/compute/beta/projects/[project_id]/global/images/family/[custom-image-family-name]` * `projects/[project_id]/global/images/family/[custom-image-family-name]` If the URI is unspecified, it will be inferred from `SoftwareConfig.image_version` or the system default. Allowed value: The `selfLink` field of a `ComputeImage` resource.

placement.managedCluster.config.workerConfig.imageRef.name

Optional

string

Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names

placement.managedCluster.config.workerConfig.imageRef.namespace

Optional

string

Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/

placement.managedCluster.config.workerConfig.machineType

Optional

string

Immutable. Optional. The Compute Engine machine type used for cluster instances. A full URL, partial URI, or short name are valid. Examples: * `https://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2` * `projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2` * `n1-standard-2` **Auto Zone Exception**: If you are using the Dataproc [Auto Zone Placement](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/auto-zone#using_auto_zone_placement) feature, you must use the short name of the machine type resource, for example, `n1-standard-2`.

placement.managedCluster.config.workerConfig.minCpuPlatform

Optional

string

Immutable. Optional. Specifies the minimum CPU platform for the Instance Group. See [Dataproc -> Minimum CPU Platform](https://cloud.google.com/dataproc/docs/concepts/compute/dataproc-min-cpu).

placement.managedCluster.config.workerConfig.numInstances

Optional

integer

Immutable. Optional. The number of VM instances in the instance group. For [HA cluster](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/high-availability) `master_config` groups, **must be set to 3**. For standard cluster `master_config` groups, **must be set to 1**.

placement.managedCluster.config.workerConfig.preemptibility

Optional

string

Immutable. Optional. Specifies the preemptibility of the instance group. The default value for master and worker groups is `NON_PREEMPTIBLE`. This default cannot be changed. The default value for secondary instances is `PREEMPTIBLE`. Possible values: PREEMPTIBILITY_UNSPECIFIED, NON_PREEMPTIBLE, PREEMPTIBLE

placement.managedCluster.labels

Optional

map (key: string, value: string)

Immutable. Optional. The labels to associate with this cluster. Label keys must be between 1 and 63 characters long, and must conform to the following PCRE regular expression: `\p{Ll}\p{Lo}{0,62}`. Label values must be between 1 and 63 characters long, and must conform to the following PCRE regular expression: `[\p{Ll}\p{Lo}\p{N}_-]{0,63}`. No more than 32 labels can be associated with a given cluster.

projectRef

Optional

object

Immutable. The Project that this resource belongs to.

projectRef.external

Optional

string

The project for the resource. Allowed value: The Google Cloud resource name of a `Project` resource (format: `projects/{{name}}`).

projectRef.name

Optional

string

Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names

projectRef.namespace

Optional

string

Namespace of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/

resourceID

Optional

string

Immutable. Optional. The name of the resource. Used for creation and acquisition. When unset, the value of `metadata.name` is used as the default.

* Field is required when parent field is specified

Status

Schema

# Status schema: each leaf value names the field's type (string, integer,
# boolean) rather than a literal value.
conditions:
- lastTransitionTime: string
  message: string
  reason: string
  status: string
  type: string
createTime: string
observedGeneration: integer
placement:
  managedCluster:
    config:
      endpointConfig:
        httpPorts:
          # Map entry: string keys to string values.
          string: string
      lifecycleConfig:
        idleStartTime: string
      masterConfig:
        instanceNames:
        - string
        isPreemptible: boolean
        managedGroupConfig:
          instanceGroupManagerName: string
          instanceTemplateName: string
      secondaryWorkerConfig:
        instanceNames:
        - string
        isPreemptible: boolean
        managedGroupConfig:
          instanceGroupManagerName: string
          instanceTemplateName: string
      workerConfig:
        instanceNames:
        - string
        isPreemptible: boolean
        managedGroupConfig:
          instanceGroupManagerName: string
          instanceTemplateName: string
updateTime: string
version: integer
Fields
conditions

list (object)

Conditions represent the latest available observation of the resource's current state.

conditions[]

object

conditions[].lastTransitionTime

string

Last time the condition transitioned from one status to another.

conditions[].message

string

Human-readable message indicating details about last transition.

conditions[].reason

string

Unique, one-word, CamelCase reason for the condition's last transition.

conditions[].status

string

Status is the status of the condition. Can be True, False, Unknown.

conditions[].type

string

Type is the type of the condition.

createTime

string

Output only. The time the template was created.

observedGeneration

integer

ObservedGeneration is the generation of the resource that was most recently observed by the Config Connector controller. If this is equal to metadata.generation, then that means that the current reported status reflects the most recent desired state of the resource.

placement

object

placement.managedCluster

object

placement.managedCluster.config

object

placement.managedCluster.config.endpointConfig

object

placement.managedCluster.config.endpointConfig.httpPorts

map (key: string, value: string)

Output only. The map of port descriptions to URLs. Will only be populated if enable_http_port_access is true.

placement.managedCluster.config.lifecycleConfig

object

placement.managedCluster.config.lifecycleConfig.idleStartTime

string

Output only. The time when the cluster became idle (most recent job finished) and became eligible for deletion due to idleness (see JSON representation of [Timestamp](https://developers.google.com/protocol-buffers/docs/proto3#json)).

placement.managedCluster.config.masterConfig

object

placement.managedCluster.config.masterConfig.instanceNames

list (string)

Output only. The list of instance names. Dataproc derives the names from `cluster_name`, `num_instances`, and the instance group.

placement.managedCluster.config.masterConfig.instanceNames[]

string

placement.managedCluster.config.masterConfig.isPreemptible

boolean

Output only. Specifies that this instance group contains preemptible instances.

placement.managedCluster.config.masterConfig.managedGroupConfig

object

Output only. The config for Compute Engine Instance Group Manager that manages this group. This is only used for preemptible instance groups.

placement.managedCluster.config.masterConfig.managedGroupConfig.instanceGroupManagerName

string

Output only. The name of the Instance Group Manager for this group.

placement.managedCluster.config.masterConfig.managedGroupConfig.instanceTemplateName

string

Output only. The name of the Instance Template used for the Managed Instance Group.

placement.managedCluster.config.secondaryWorkerConfig

object

placement.managedCluster.config.secondaryWorkerConfig.instanceNames

list (string)

Output only. The list of instance names. Dataproc derives the names from `cluster_name`, `num_instances`, and the instance group.

placement.managedCluster.config.secondaryWorkerConfig.instanceNames[]

string

placement.managedCluster.config.secondaryWorkerConfig.isPreemptible

boolean

Output only. Specifies that this instance group contains preemptible instances.

placement.managedCluster.config.secondaryWorkerConfig.managedGroupConfig

object

Output only. The config for Compute Engine Instance Group Manager that manages this group. This is only used for preemptible instance groups.

placement.managedCluster.config.secondaryWorkerConfig.managedGroupConfig.instanceGroupManagerName

string

Output only. The name of the Instance Group Manager for this group.

placement.managedCluster.config.secondaryWorkerConfig.managedGroupConfig.instanceTemplateName

string

Output only. The name of the Instance Template used for the Managed Instance Group.

placement.managedCluster.config.workerConfig

object

placement.managedCluster.config.workerConfig.instanceNames

list (string)

Output only. The list of instance names. Dataproc derives the names from `cluster_name`, `num_instances`, and the instance group.

placement.managedCluster.config.workerConfig.instanceNames[]

string

placement.managedCluster.config.workerConfig.isPreemptible

boolean

Output only. Specifies that this instance group contains preemptible instances.

placement.managedCluster.config.workerConfig.managedGroupConfig

object

Output only. The config for Compute Engine Instance Group Manager that manages this group. This is only used for preemptible instance groups.

placement.managedCluster.config.workerConfig.managedGroupConfig.instanceGroupManagerName

string

Output only. The name of the Instance Group Manager for this group.

placement.managedCluster.config.workerConfig.managedGroupConfig.instanceTemplateName

string

Output only. The name of the Instance Template used for the Managed Instance Group.

updateTime

string

Output only. The time the template was last updated.

version

integer

Output only. The current version of this workflow template.

Sample YAML(s)

Typical Use Case

# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Sample workflow template: a managed cluster plus two job steps, where
# "otherJob" lists "someJob" as a prerequisite step.
apiVersion: dataproc.cnrm.cloud.google.com/v1beta1
kind: DataprocWorkflowTemplate
metadata:
  name: dataprocworkflowtemplate-sample
  labels:
    label-one: 'value-one'
spec:
  location: 'us-central1'
  jobs:
  - stepId: 'someJob'
    sparkJob:
      mainClass: 'SomeClass'
  - stepId: 'otherJob'
    prerequisiteStepIds:
    - 'someJob'
    prestoJob:
      queryFileUri: 'someUri'
  placement:
    managedCluster:
      clusterName: 'test-cluster'
      config:
        softwareConfig:
          imageVersion: '2.0.39-debian10'
        gceClusterConfig:
          tags:
          - 'foo'
          - 'bar'
        autoscalingConfig:
          # Points at the DataprocAutoscalingPolicy named
          # dataprocworkflowtemplate-dep.
          policyRef:
            name: dataprocworkflowtemplate-dep
        masterConfig:
          machineType: 'n2-standard-8'
          numInstances: 1
          diskConfig:
            bootDiskType: 'pd-standard'
            bootDiskSizeGb: 30
        workerConfig:
          machineType: 'n2-standard-8'
          numInstances: 2
          diskConfig:
            bootDiskSizeGb: 30
            numLocalSsds: 1
---
# Autoscaling policy that the sample workflow template references through
# autoscalingConfig.policyRef.
apiVersion: dataproc.cnrm.cloud.google.com/v1beta1
kind: DataprocAutoscalingPolicy
metadata:
  name: dataprocworkflowtemplate-dep
spec:
  location: 'us-central1'
  basicAlgorithm:
    yarnConfig:
      scaleUpFactor: 1
      scaleDownFactor: 0.5
      gracefulDecommissionTimeout: '30s'
  workerConfig:
    maxInstances: 5
  secondaryWorkerConfig:
    maxInstances: 2