Use shared jar to stage BEAM pipeline if possible (#1008)

* Use shared jar to stage BEAM pipeline if possible

Allow multiple BEAM pipelines with the same classes and dependencies to
share one Uber jar.

Added metadata for BulkDeleteDatastorePipeline.

Updated shell and Cloud Build scripts to stage all pipelines in one
step.
Weimin Yu 2021-03-16 13:19:30 -04:00 committed by GitHub
parent 24db87a4cf
commit 7c3d0dd1a9
6 changed files with 88 additions and 49 deletions
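Background on the approach, for readers of the diff below: a Dataflow flex template names its Java entry point when the template is staged, so the shared jar itself needs no Main-Class (hence the empty main-class argument to createUberJar below) and any number of pipelines can ship in one artifact. A minimal sketch of the idea with two illustrative entry points; the package and class names here are hypothetical, not the repository's:

package example.beam;  // hypothetical package, for illustration only

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;

/** First entry point; its flex template would name example.beam.FirstPipeline as the main class. */
class FirstPipeline {
  public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();
    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply("FirstSource", Create.of("a", "b"));
    pipeline.run();
  }
}

/** Second entry point packaged in the same jar; its template names example.beam.SecondPipeline. */
class SecondPipeline {
  public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();
    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply("SecondSource", Create.of(1, 2, 3));
    pipeline.run();
  }
}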


@@ -751,7 +751,8 @@ project.tasks.create('initSqlPipeline', JavaExec) {
 // nom_build :core:bulkDeleteDatastore --args="--project=domain-registry-crash \
 // --region=us-central1 --runner=DataflowRunner --kindsToDelete=*"
 createToolTask(
-    'bulkDeleteDatastore', 'google.registry.beam.datastore.BulkDeletePipeline')
+    'bulkDeleteDatastore',
+    'google.registry.beam.datastore.BulkDeleteDatastorePipeline')
 project.tasks.create('generateSqlSchema', JavaExec) {
   classpath = sourceSets.nonprod.runtimeClasspath
@@ -782,10 +783,13 @@ generateGoldenImages.finalizedBy(findGoldenImages)
 createUberJar('nomulus', 'nomulus', 'google.registry.tools.RegistryTool')
+// Build the Uber jar shared by all flex-template based BEAM pipelines.
+// This packages more code and dependency than necessary. However, without
+// restructuring the source tree it is difficult to generate leaner jars.
 createUberJar(
-    'init_sql_pipeline',
-    'init_sql_pipeline',
-    'google.registry.beam.initsql.InitSqlPipeline')
+    'beam_pipeline_common',
+    'beam_pipeline_common',
+    '')
 // A jar with classes and resources from main sourceSet, excluding internal
 // data. See comments on configurations.nomulus_test above for details.


@@ -78,7 +78,7 @@ import org.apache.beam.sdk.values.TupleTagList;
  * types in the Datastore using the {@code --numOfKindsHint} argument. If the default value for this
  * parameter is too low, performance will suffer.
  */
-public class BulkDeletePipeline {
+public class BulkDeleteDatastorePipeline {
   private static final FluentLogger logger = FluentLogger.forEnclosingClass();
   // This tool is not for use in our critical projects.
@@ -89,7 +89,7 @@ public class BulkDeletePipeline {
   private final Pipeline pipeline;
-  BulkDeletePipeline(BulkDeletePipelineOptions options) {
+  BulkDeleteDatastorePipeline(BulkDeletePipelineOptions options) {
     this.options = options;
     pipeline = Pipeline.create(options);
   }
@@ -303,7 +303,7 @@ public class BulkDeletePipeline {
   public static void main(String[] args) {
     BulkDeletePipelineOptions options =
         PipelineOptionsFactory.fromArgs(args).withValidation().as(BulkDeletePipelineOptions.class);
-    BulkDeletePipeline pipeline = new BulkDeletePipeline(options);
+    BulkDeleteDatastorePipeline pipeline = new BulkDeleteDatastorePipeline(options);
     pipeline.run();
     System.exit(0);
   }
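Because main stays public and static, the renamed pipeline can still be launched directly, mirroring the nom_build example in the Gradle hunk above. A purely illustrative wrapper with placeholder flag values:

import google.registry.beam.datastore.BulkDeleteDatastorePipeline;

/** Illustrative launcher only; the project and kinds below are placeholders, not real settings. */
public class BulkDeleteExampleLauncher {
  public static void main(String[] args) {
    BulkDeleteDatastorePipeline.main(
        new String[] {
          "--project=my-test-project",   // hypothetical GCP project
          "--region=us-central1",
          "--runner=DataflowRunner",
          "--kindsToDelete=*"            // '*' requests deletion of all kinds
        });
  }
}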


@@ -0,0 +1,20 @@
+{
+  "name": "Bulk Delete Cloud Datastore",
+  "description": "An Apache Beam batch pipeline that deletes Cloud Datastore in bulk. This is easier to use than the GCP-provided template.",
+  "parameters": [
+    {
+      "name": "kindsToDelete",
+      "label": "The data KINDs to delete.",
+      "helpText": "The Datastore KINDs to be deleted. The format may be: the list of kinds to be deleted as a comma-separated string; or '*', which causes all kinds to be deleted."
+    },
+    {
+      "name": "getNumOfKindsHint",
+      "label": "An estimate of the number of KINDs to be deleted.",
+      "helpText": "An estimate of the number of KINDs to be deleted. This is recommended if --kindsToDelete is '*' and the default value is too low.",
+      "is_optional": true,
+      "regexes": [
+        "^[1-9][0-9]*$"
+      ]
+    }
+  ]
+}
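Each entry in the parameters list above is passed to the launched job as a pipeline option. A minimal sketch of how such options are conventionally declared in Beam; the real BulkDeletePipelineOptions interface lives next to the pipeline class and may differ in names, defaults, and validation:

import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.Validation;

/** Hypothetical options interface mirroring the template parameters above. */
public interface ExampleBulkDeleteOptions extends PipelineOptions {

  @Description("The Datastore KINDs to be deleted, as a comma-separated list or '*'.")
  @Validation.Required
  String getKindsToDelete();

  void setKindsToDelete(String value);

  @Description("An estimate of the number of KINDs to be deleted.")
  @Default.Integer(30)  // illustrative default, not the pipeline's actual value
  int getNumOfKindsHint();

  void setNumOfKindsHint(int value);
}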


@@ -14,9 +14,9 @@
 package google.registry.beam.datastore;
-import static google.registry.beam.datastore.BulkDeletePipeline.discoverEntityKinds;
-import static google.registry.beam.datastore.BulkDeletePipeline.getDeletionTags;
-import static google.registry.beam.datastore.BulkDeletePipeline.getOneDeletionTag;
+import static google.registry.beam.datastore.BulkDeleteDatastorePipeline.discoverEntityKinds;
+import static google.registry.beam.datastore.BulkDeleteDatastorePipeline.getDeletionTags;
+import static google.registry.beam.datastore.BulkDeleteDatastorePipeline.getOneDeletionTag;
 import com.google.common.base.Verify;
 import com.google.common.collect.ImmutableMap;
@@ -25,8 +25,8 @@ import com.google.datastore.v1.Entity;
 import com.google.datastore.v1.Key;
 import com.google.datastore.v1.Key.PathElement;
 import google.registry.beam.TestPipelineExtension;
-import google.registry.beam.datastore.BulkDeletePipeline.GenerateQueries;
-import google.registry.beam.datastore.BulkDeletePipeline.SplitEntities;
+import google.registry.beam.datastore.BulkDeleteDatastorePipeline.GenerateQueries;
+import google.registry.beam.datastore.BulkDeleteDatastorePipeline.SplitEntities;
 import java.io.Serializable;
 import java.util.Map;
 import org.apache.beam.sdk.testing.PAssert;
@@ -44,8 +44,8 @@ import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.condition.EnabledIfSystemProperty;
 import org.junit.jupiter.api.extension.RegisterExtension;
-/** Unit tests for {@link BulkDeletePipeline}. */
-class BulkDeletePipelineTest implements Serializable {
+/** Unit tests for {@link BulkDeleteDatastorePipeline}. */
+class BulkDeleteDatastorePipelineTest implements Serializable {
   @RegisterExtension
   final transient TestPipelineExtension testPipeline =
@@ -67,7 +67,7 @@ class BulkDeletePipelineTest implements Serializable {
     TupleTagList tags = getDeletionTags(2);
     PCollection<String> kinds = testPipeline.apply("InjectKinds", Create.of("A", "B"));
     PCollection<KV<String, TupleTag<Entity>>> kindToTagMapping =
-        BulkDeletePipeline.mapKindsToDeletionTags(kinds, tags);
+        BulkDeleteDatastorePipeline.mapKindsToDeletionTags(kinds, tags);
     PAssert.thatMap(kindToTagMapping)
         .isEqualTo(
             ImmutableMap.of(
@@ -81,7 +81,7 @@ class BulkDeletePipelineTest implements Serializable {
     TupleTagList tags = getDeletionTags(3);
     PCollection<String> kinds = testPipeline.apply("InjectKinds", Create.of("A", "B"));
     PCollection<KV<String, TupleTag<Entity>>> kindToTagMapping =
-        BulkDeletePipeline.mapKindsToDeletionTags(kinds, tags);
+        BulkDeleteDatastorePipeline.mapKindsToDeletionTags(kinds, tags);
     PAssert.thatMap(kindToTagMapping)
         .isEqualTo(
             ImmutableMap.of(
@@ -95,7 +95,7 @@ class BulkDeletePipelineTest implements Serializable {
     TupleTagList tags = getDeletionTags(2);
     PCollection<String> kinds = testPipeline.apply("InjectKinds", Create.of("A", "B", "C"));
     PCollection<KV<String, TupleTag<Entity>>> kindToTagMapping =
-        BulkDeletePipeline.mapKindsToDeletionTags(kinds, tags);
+        BulkDeleteDatastorePipeline.mapKindsToDeletionTags(kinds, tags);
     PAssert.thatMap(kindToTagMapping)
         .isEqualTo(
             ImmutableMap.of(
@@ -110,7 +110,7 @@ class BulkDeletePipelineTest implements Serializable {
     TupleTagList tags = getDeletionTags(2);
     PCollection<String> kinds = testPipeline.apply("InjectKinds", Create.of("A", "B"));
     PCollectionView<Map<String, TupleTag<Entity>>> kindToTagMapping =
-        BulkDeletePipeline.mapKindsToDeletionTags(kinds, tags).apply(View.asMap());
+        BulkDeleteDatastorePipeline.mapKindsToDeletionTags(kinds, tags).apply(View.asMap());
     Entity entityA = createTestEntity("A", 1);
     Entity entityB = createTestEntity("B", 2);
     PCollection<Entity> entities =