Add mapreduce to export domain lists to GCS

The ExportDomainListsAction mapreduce has a cron entry that runs it twice per day. It exports one flat text file per real (non-test) TLD to the "{project-id}-domain-lists" bucket in Google Cloud Storage, overwriting the existing ones in place. Each file is a newline-delimited list of active (non-deleted) domains in that TLD.
-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=116767987
This commit is contained in:
mcilwain 2016-03-09 08:48:06 -08:00 committed by Ben McIlwain
parent d6815fb55a
commit dd633c9e72
12 changed files with 317 additions and 5 deletions

View file

@ -82,6 +82,12 @@ public final class ConfigModule {
return config.getCommitLogDatastoreRetention();
}
/**
 * Provides the name of the Google Cloud Storage bucket for exported domain lists.
 *
 * <p>The value is whatever {@link RegistryConfig#getDomainListsBucket} returns; it is injectable
 * under the {@code @Config("domainListsGcsBucket")} qualifier.
 */
@Provides
@Config("domainListsGcsBucket")
public static String provideDomainListsGcsBucket(RegistryConfig config) {
return config.getDomainListsBucket();
}
/**
* Maximum number of commit logs to delete per transaction.
*

View file

@ -44,6 +44,13 @@ public interface RegistryConfig {
*/
public String getSnapshotsBucket();
/**
* Returns the Google Cloud Storage bucket for storing exported domain lists.
*
* @see com.google.domain.registry.export.ExportDomainListsAction
*/
public String getDomainListsBucket();
/**
* Returns the BigQuery dataset for storing directly imported datastore snapshots.
*

View file

@ -52,6 +52,11 @@ public class TestRegistryConfig implements RegistryConfig {
return getProjectId() + "-snapshots";
}
/** Returns the domain lists bucket name: the project id followed by {@code "-domain-lists"}. */
@Override
public String getDomainListsBucket() {
return getProjectId() + "-domain-lists";
}
@Override
public String getSnapshotsDataset() {
return "snapshots";

View file

@ -364,6 +364,15 @@
<url-pattern>/_dr/task/syncGroupMembers</url-pattern>
</servlet-mapping>
<servlet>
<servlet-name>export-domain-lists</servlet-name>
<servlet-class>com.google.domain.registry.module.backend.BackendServlet</servlet-class>
</servlet>
<servlet-mapping>
<servlet-name>export-domain-lists</servlet-name>
<url-pattern>/_dr/task/exportDomainLists</url-pattern>
</servlet-mapping>
<!-- Mapreduce to delete the specified contact resource if it is not referenced by any domains. -->
<servlet>
<description>

View file

@ -124,6 +124,15 @@
<target>backend</target>
</cron>
<cron>
<url><![CDATA[/_dr/task/exportDomainLists]]></url>
<description>
This job exports lists of all active domain names to Google Cloud Storage.
</description>
<schedule>every 12 hours synchronized</schedule>
<target>backend</target>
</cron>
<cron>
<url><![CDATA[/_dr/cron/fanout?queue=export-snapshot&endpoint=/_dr/task/exportSnapshot&runInEmpty]]></url>
<description>

View file

@ -23,16 +23,16 @@ java_library(
"//java/com/google/common/net",
"//java/com/google/domain/registry/bigquery",
"//java/com/google/domain/registry/config",
"//java/com/google/domain/registry/flows",
"//java/com/google/domain/registry/gcs",
"//java/com/google/domain/registry/groups",
"//java/com/google/domain/registry/mapreduce",
"//java/com/google/domain/registry/model",
"//java/com/google/domain/registry/request",
"//java/com/google/domain/registry/security:servlets",
"//java/com/google/domain/registry/storage/drive",
"//java/com/google/domain/registry/util",
"//third_party/java/appengine:appengine-api",
"//third_party/java/appengine_gcs_client",
"//third_party/java/appengine_mapreduce2:appengine_mapreduce",
"//third_party/java/dagger",
"//third_party/java/joda_time",
"//third_party/java/json_simple",

View file

@ -0,0 +1,135 @@
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.domain.registry.export;
import static com.google.appengine.tools.cloudstorage.GcsServiceFactory.createGcsService;
import static com.google.domain.registry.mapreduce.EppResourceInputs.createEntityInput;
import static com.google.domain.registry.model.EppResourceUtils.isActive;
import static com.google.domain.registry.model.registry.Registries.getTldsOfType;
import static com.google.domain.registry.util.PipelineUtils.createJobPath;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.joda.time.DateTimeZone.UTC;
import com.google.appengine.tools.cloudstorage.GcsFilename;
import com.google.appengine.tools.cloudstorage.RetryParams;
import com.google.appengine.tools.mapreduce.Mapper;
import com.google.appengine.tools.mapreduce.Reducer;
import com.google.appengine.tools.mapreduce.ReducerInput;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.domain.registry.config.ConfigModule.Config;
import com.google.domain.registry.gcs.GcsUtils;
import com.google.domain.registry.mapreduce.MapreduceAction;
import com.google.domain.registry.mapreduce.MapreduceRunner;
import com.google.domain.registry.model.domain.DomainResource;
import com.google.domain.registry.model.registry.Registry.TldType;
import com.google.domain.registry.request.Action;
import com.google.domain.registry.request.Response;
import com.google.domain.registry.util.FormattingLogger;
import org.joda.time.DateTime;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import javax.inject.Inject;
/**
 * A mapreduce that exports the list of active domains on all real TLDs to Google Cloud Storage.
 *
 * <p>Each TLD's active domain names are exported as a newline-delimited flat text file with the
 * name TLD.txt into the domain-lists bucket. Note that this overwrites the files in place.
 */
@Action(path = "/_dr/task/exportDomainLists")
public class ExportDomainListsAction implements MapreduceAction {

  private static final FormattingLogger logger = FormattingLogger.getLoggerForCallerClass();

  @Inject MapreduceRunner mrRunner;
  @Inject Response response;
  @Inject @Config("domainListsGcsBucket") String gcsBucket;
  @Inject @Config("gcsBufferSize") int gcsBufferSize;
  @Inject ExportDomainListsAction() {}

  /**
   * Starts the export mapreduce over all {@link DomainResource} entities on the "backend" module
   * and sends a JavaScript redirect to the path that {@code createJobPath} builds for the job.
   *
   * <p>The set of real TLDs and the export time are captured once here and handed to the mapper,
   * so every mapper invocation filters against the same snapshot of both.
   */
  @Override
  public void run() {
    ImmutableSet<String> realTlds = getTldsOfType(TldType.REAL);
    logger.infofmt("Exporting domain lists for tlds %s", realTlds);
    response.sendJavaScriptRedirect(createJobPath(mrRunner
        .setJobName("Export domain lists")
        .setModuleName("backend")
        .runMapreduce(
            new ExportDomainListsMapper(DateTime.now(UTC), realTlds),
            new ExportDomainListsReducer(gcsBucket, gcsBufferSize),
            ImmutableList.of(createEntityInput(DomainResource.class)))));
  }

  /**
   * Mapper that emits a (TLD, fully qualified domain name) pair for each domain that is on one of
   * the given real TLDs and is active at the export time.
   */
  static class ExportDomainListsMapper extends Mapper<DomainResource, String, String> {

    private static final long serialVersionUID = -7312206212434039854L;

    /** Point in time at which a domain must be active to be included in the export. */
    private final DateTime exportTime;

    /** TLDs whose domains are exported; domains on any other TLD are skipped. */
    private final ImmutableSet<String> realTlds;

    ExportDomainListsMapper(DateTime exportTime, ImmutableSet<String> realTlds) {
      this.exportTime = exportTime;
      this.realTlds = realTlds;
    }

    /** Emits the domain keyed by its TLD and bumps that TLD's counter, if the domain qualifies. */
    @Override
    public void map(DomainResource domain) {
      if (realTlds.contains(domain.getTld()) && isActive(domain, exportTime)) {
        emit(domain.getTld(), domain.getFullyQualifiedDomainName());
        getContext().incrementCounter(String.format("domains in tld %s", domain.getTld()));
      }
    }
  }

  /**
   * Reducer that writes all of the domain names emitted for a single TLD into the file
   * {@code <tld>.txt} in the configured Google Cloud Storage bucket, one name per line.
   */
  static class ExportDomainListsReducer extends Reducer<String, String, Void> {

    private static final long serialVersionUID = 7035260977259119087L;

    private final String gcsBucket;
    private final int gcsBufferSize;

    public ExportDomainListsReducer(String gcsBucket, int gcsBufferSize) {
      this.gcsBucket = gcsBucket;
      this.gcsBufferSize = gcsBufferSize;
    }

    /**
     * Writes the domain names for {@code tld} to GCS as UTF-8 text.
     *
     * @param tld the TLD whose domain list is being written; also determines the file name
     * @param fqdns the fully qualified domain names emitted by the mapper for this TLD
     * @throws RuntimeException wrapping the {@link IOException} if the file cannot be opened,
     *     written, or closed
     */
    @Override
    public void reduce(String tld, ReducerInput<String> fqdns) {
      GcsFilename filename = new GcsFilename(gcsBucket, tld + ".txt");
      GcsUtils cloudStorage =
          new GcsUtils(createGcsService(RetryParams.getDefaultInstance()), gcsBufferSize);
      try (OutputStream gcsOutput = cloudStorage.openOutputStream(filename);
          Writer osWriter = new OutputStreamWriter(gcsOutput, UTF_8);
          PrintWriter writer = new PrintWriter(osWriter)) {
        long count;
        for (count = 0; fqdns.hasNext(); count++) {
          writer.println(fqdns.next());
        }
        // PrintWriter never throws on I/O failure; it only records an internal error flag.
        // checkError() flushes the stream and reports that flag, so a failed write is surfaced
        // here instead of being counted and logged as a successful export of a truncated file.
        if (writer.checkError()) {
          throw new IOException(
              String.format(
                  "Failed writing domain list %s.txt to GCS bucket %s", tld, gcsBucket));
        }
        getContext().incrementCounter("tld domain lists written out");
        logger.infofmt("Wrote out %d domains for tld %s.", count, tld);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }
}

View file

@ -52,7 +52,7 @@ import javax.xml.bind.annotation.XmlTransient;
public abstract class DomainBase extends EppResource {
/**
* Fully qualified domain name, which serves as the foreign key for this domain.
* Fully qualified domain name (puny-coded), which serves as the foreign key for this domain.
* <p>
* This is only unique in the sense that for any given lifetime specified as the time range from
* (creationTime, deletionTime) there can only be one domain in the datastore with this name.

View file

@ -28,6 +28,7 @@ import com.google.domain.registry.dns.ReadDnsQueueAction;
import com.google.domain.registry.dns.RefreshDns;
import com.google.domain.registry.dns.WriteDnsTask;
import com.google.domain.registry.export.BigqueryPollJobAction;
import com.google.domain.registry.export.ExportDomainListsAction;
import com.google.domain.registry.export.ExportRequestModule;
import com.google.domain.registry.export.ExportReservedTermsTask;
import com.google.domain.registry.export.SyncGroupMembersTask;
@ -81,6 +82,7 @@ interface BackendRequestComponent {
DeleteOldCommitLogsAction deleteOldCommitLogsAction();
DnsRefreshForHostRenameAction dnsRefreshForHostRenameAction();
ExportCommitLogDiffAction exportCommitLogDiffAction();
ExportDomainListsAction exportDomainListsAction();
ExportReservedTermsTask exportReservedTermsTask();
NordnUploadAction nordnUploadAction();
NordnVerifyAction nordnVerifyAction();