diff --git a/core/build.gradle b/core/build.gradle
index a699c0318..d53dc0b2d 100644
--- a/core/build.gradle
+++ b/core/build.gradle
@@ -172,6 +172,8 @@ dependencies {
compile deps['com.beust:jcommander']
compile deps['com.google.api:gax']
+ compile deps['com.google.api.grpc:proto-google-cloud-datastore-v1']
+ compile deps['com.google.api.grpc:proto-google-common-protos']
compile deps['com.google.api.grpc:proto-google-cloud-secretmanager-v1']
compile deps['com.google.api-client:google-api-client']
compile deps['com.google.api-client:google-api-client-appengine']
@@ -196,6 +198,8 @@ dependencies {
compile deps['com.google.appengine:appengine-remote-api']
compile deps['com.google.auth:google-auth-library-credentials']
compile deps['com.google.auth:google-auth-library-oauth2-http']
+ compile deps['com.google.cloud.bigdataoss:util']
+ compile deps['com.google.cloud.datastore:datastore-v1-proto-client']
compile deps['com.google.cloud.sql:jdbc-socket-factory-core']
runtimeOnly deps['com.google.cloud.sql:postgres-socket-factory']
compile deps['com.google.cloud:google-cloud-secretmanager']
@@ -736,6 +740,13 @@ project.tasks.create('initSqlPipeline', JavaExec) {
}
}
+// Caller must provide projectId, GCP region, runner, and the kinds to delete
+// (comma-separated kind names or '*' for all). E.g.:
+// nom_build :core:bulkDeleteDatastore --args="--project=domain-registry-crash \
+// --region=us-central1 --runner=DataflowRunner --kindsToDelete=*"
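+// (Note: createToolTask is assumed to be a helper defined elsewhere in this build file that
+// registers a JavaExec-style task running the given main class.)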
+createToolTask(
+ 'bulkDeleteDatastore', 'google.registry.beam.datastore.BulkDeletePipeline')
+
project.tasks.create('generateSqlSchema', JavaExec) {
classpath = sourceSets.nonprod.runtimeClasspath
main = 'google.registry.tools.DevTool'
diff --git a/core/src/main/java/google/registry/beam/datastore/BulkDeletePipeline.java b/core/src/main/java/google/registry/beam/datastore/BulkDeletePipeline.java
new file mode 100644
index 000000000..096daf2f9
--- /dev/null
+++ b/core/src/main/java/google/registry/beam/datastore/BulkDeletePipeline.java
@@ -0,0 +1,330 @@
+// Copyright 2020 The Nomulus Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package google.registry.beam.datastore;
+
+import static com.google.common.base.Preconditions.checkState;
+import static org.apache.beam.sdk.values.TypeDescriptors.kvs;
+import static org.apache.beam.sdk.values.TypeDescriptors.strings;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Splitter;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.ImmutableSortedSet;
+import com.google.common.flogger.FluentLogger;
+import com.google.datastore.v1.Entity;
+import java.util.Iterator;
+import java.util.Map;
+import org.apache.beam.sdk.Pipeline;
+import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
+import org.apache.beam.sdk.io.gcp.datastore.DatastoreIO;
+import org.apache.beam.sdk.options.Default;
+import org.apache.beam.sdk.options.Description;
+import org.apache.beam.sdk.options.PipelineOptionsFactory;
+import org.apache.beam.sdk.options.Validation;
+import org.apache.beam.sdk.transforms.Create;
+import org.apache.beam.sdk.transforms.DoFn;
+import org.apache.beam.sdk.transforms.GroupByKey;
+import org.apache.beam.sdk.transforms.MapElements;
+import org.apache.beam.sdk.transforms.PTransform;
+import org.apache.beam.sdk.transforms.ParDo;
+import org.apache.beam.sdk.transforms.Reshuffle;
+import org.apache.beam.sdk.transforms.View;
+import org.apache.beam.sdk.values.KV;
+import org.apache.beam.sdk.values.PBegin;
+import org.apache.beam.sdk.values.PCollection;
+import org.apache.beam.sdk.values.PCollectionTuple;
+import org.apache.beam.sdk.values.PCollectionView;
+import org.apache.beam.sdk.values.TupleTag;
+import org.apache.beam.sdk.values.TupleTagList;
+
+/**
+ * A BEAM pipeline that deletes Datastore entities in bulk.
+ *
+ * <p>This pipeline provides an alternative to the GCP builtin template that performs
+ * the same task. It solves the following performance and usability problems in the builtin
+ * template:
+ *
+ * <ul>
+ *   <li>When deleting all data (by using the {@code select __key__} or {@code select *} queries),
+ *       the builtin template cannot parallelize the query, therefore has to query with a single
+ *       worker.
+ *   <li>When deleting all data, the builtin template also attempts to delete Datastore internal
+ *       tables, which would cause permission-denied errors; these in turn may cause the pipeline
+ *       to abort before all data has been deleted.
+ *   <li>With the builtin template, it is possible to delete multiple entity types in one pipeline
+ *       only if the user can come up with a single literal query that covers all of them. This is
+ *       not the case with most Nomulus entity types.
+ * </ul>
+ *
+ * <p>A user of this pipeline must specify the types of entities to delete using the {@code
+ * --kindsToDelete} command line argument. To delete specific entity types, give a comma-separated
+ * string of their kind names; to delete all data, give {@code "*"}.
+ *
+ * <p>When deleting all data, it is recommended that the user specify the number of user entity
+ * types in the Datastore using the {@code --numOfKindsHint} argument. If the default value for this
+ * parameter is too low, performance will suffer.
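+ *
+ * <p>For illustration, a hypothetical invocation through the Gradle task defined in {@code
+ * core/build.gradle} (the project, region, and kind names below are placeholders, not real
+ * deployments):
+ *
+ * <pre>
+ * nom_build :core:bulkDeleteDatastore --args="--project=my-test-project \
+ *     --region=us-central1 --runner=DataflowRunner \
+ *     --kindsToDelete=ContactResource,HostResource"
+ * </pre>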
+ */
+public class BulkDeletePipeline {
+ private static final FluentLogger logger = FluentLogger.forEnclosingClass();
+
+ // This tool is not for use in our critical projects.
+ private static final ImmutableSet<String> FORBIDDEN_PROJECTS =
+ ImmutableSet.of("domain-registry", "domain-registry-sandbox");
+
+ private final BulkDeletePipelineOptions options;
+
+ private final Pipeline pipeline;
+
+ BulkDeletePipeline(BulkDeletePipelineOptions options) {
+ this.options = options;
+ pipeline = Pipeline.create(options);
+ }
+
+ public void run() {
+ setupPipeline();
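+ // Note: Pipeline.run() is asynchronous; it returns a PipelineResult without waiting for the
+ // job to complete. A caller that needs to block could call waitUntilFinish() on the result.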
+ pipeline.run();
+ }
+
+ @SuppressWarnings("deprecation") // org.apache.beam.sdk.transforms.Reshuffle
+ private void setupPipeline() {
+ checkState(
+ !FORBIDDEN_PROJECTS.contains(options.getProject()),
+ "Bulk delete is forbidden in %s",
+ options.getProject());
+
+ // Pre-allocated tags to label entities by kind. In the case of delete-all, we must use a guess.
+ TupleTagList deletionTags;
+ PCollection<String> kindsToDelete;
+
+ if (options.getKindsToDelete().equals("*")) {
+ deletionTags = getDeletionTags(options.getNumOfKindsHint());
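+ // (Assumption: getDeletionTags, presumably defined later in this file, pre-creates the
+ // requested number of TupleTag<Entity> instances; the tags must exist when the pipeline
+ // graph is constructed, before the actual kinds are known.)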
+ kindsToDelete =
+ pipeline.apply("DiscoverEntityKinds", discoverEntityKinds(options.getProject()));
+ } else {
+ ImmutableList<String> kindsToDeleteParam = parseKindsToDelete(options);
+ checkState(
+ !kindsToDeleteParam.contains("*"),
+ "The --kindsToDelete argument should not contain both '*' and other kinds.");
+ deletionTags = getDeletionTags(kindsToDeleteParam.size());
+ kindsToDelete = pipeline.apply("UseProvidedKinds", Create.of(kindsToDeleteParam));
+ }
+
+ // Map each kind to a tag. The "SplitByKind" stage below will group entities by kind using
+ // this mapping. In practice, this has been effective at avoiding entity group contentions.
+ PCollectionView<Map<String, TupleTag<Entity>>>