Set the initial worker count for the RDE beam pipeline at 24 (#1572)

* Set the initial worker count for the RDE beam pipeline at 24

This will likely speed up the pipeline by skipping the initially slow
process of spinning up worker instances.
This commit is contained in:
Lai Jiang 2022-03-27 22:51:58 -04:00 committed by GitHub
parent 7135219151
commit fc0e461c44
5 changed files with 24 additions and 16 deletions

View file

@ -576,6 +576,19 @@ public final class RegistryConfig {
return config.beam.highPerformanceMachineType;
}
/**
 * Returns the initial number of workers used for a Beam pipeline. Autoscaling can still be in
 * effect and may adjust the worker count after startup; this value only sets the starting size.
 *
 * @see <a
 *     href="https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#horizontal-autoscaling">
 *     Horizontal Autoscaling</a>
 */
@Provides
@Config("initialWorkerCount")
public static int provideInitialWorkerCount(RegistryConfigSettings config) {
return config.beam.initialWorkerCount;
}
/**
* Returns the default job region to run Apache Beam (Cloud Dataflow) jobs in.
*

View file

@ -137,6 +137,7 @@ public class RegistryConfigSettings {
/** Configuration options for Apache Beam (Cloud Dataflow) pipelines. */
public static class Beam {
// Default GCP region in which Beam (Cloud Dataflow) jobs run.
public String defaultJobRegion;
// GCE machine type used for resource-intensive pipelines (e.g. "n2-standard-4").
public String highPerformanceMachineType;
// Initial number of workers requested at pipeline start; autoscaling may still
// adjust the count afterwards.
public int initialWorkerCount;
// GCS bucket URL where staged pipeline templates live.
public String stagingBucketUrl;
}

View file

@ -442,6 +442,11 @@ beam:
# core count per machine may be preferable in order to preserve IP addresses.
# See: https://cloud.google.com/compute/quotas#cpu_quota
highPerformanceMachineType: n2-standard-4
# The initial number of workers requested. This can help speed up the pipeline,
# which otherwise would take some time to spin up the necessary number of
# workers. Autoscaling is still in effect and can reduce the number of workers
# when they are not in use.
initialWorkerCount: 24
stagingBucketUrl: gcs-bucket-with-staged-templates
keyring:

View file

@ -261,6 +261,10 @@ public final class RdeStagingAction implements Runnable {
@Config("highPerformanceMachineType")
// GCE machine type for the Dataflow workers running this pipeline.
String machineType;
@Inject
@Config("initialWorkerCount")
// Initial number of Dataflow workers; passed to the launch parameters so the
// pipeline starts at this size instead of scaling up from a cold start.
int numWorkers;
@Inject @Config("transactionCooldown") Duration transactionCooldown;
@Inject @Config("beamStagingBucketUrl") String stagingBucketUrl;
@Inject @Config("rdeBucket") String rdeBucket;
@ -341,6 +345,7 @@ public final class RdeStagingAction implements Runnable {
.encode(stagingKeyBytes))
.put("registryEnvironment", RegistryEnvironment.get().name())
.put("workerMachineType", machineType)
.put("numWorkers", String.valueOf(numWorkers))
.put(
"jpaTransactionManagerType",
JpaTransactionManagerType.READ_ONLY_REPLICA.toString())

View file

@ -49,22 +49,6 @@
"regexes": [
"[A-Za-z0-9\\-_]+"
]
},
{
"name": "workerMachineType",
"label": "The GCE machine type for the dataflow job workers.",
"helpText": "See https://cloud.google.com/dataflow/quotas#compute-engine-quotas for available machine types.",
"regexes": [
"[a-z0-9\\-]+"
]
},
{
"name": "usePublicIps",
"label": "Whether the GCE workers are assigned public IPs",
"helpText": "Public IPs have an associated cost and there's a quota per region on the total number of public IPs assigned at a given time. If the service only needs to access GCP APIs, it's better to not use public IP, but one needs to configure the network accordingly. See https://cloud.google.com/dataflow/docs/guides/routes-firewall.",
"regexes": [
"true|false"
]
}
]
}