Allow over 1000 dns-updates to be handled at once

The task-queue API only allows reading 1000 tasks at a time, hence the original reason for this limit. We get over that limit by reading (and processing) items from the queue in a loop - 1000 at a time.

This is important because the 1000 dns-updates are shared among all TLDs,
meaning that a TLD with >1000 waiting updates can affect the update latency of
other TLDs.

In addition, partially fixes the bug where if there are more than 1000 updates to paused
/ non-existing TLDs, we completely block all updated to all TLDs.

By partially fixed, I mean "if we have around 1000 updates to paused TLDs, we will read them every time ReadDnsUpdates is called, ignore then, and only then get to the actual updates we want to process".

This works for a number of 1000 updates waiting - but if paused TLDs have tens or hundreds of thousands of updates waiting - this might still choke up other TLDs (not to mention we keep reading / updating 10s or 100s of thousands of tasks in the queue, that's... bad.)

A more thorough fix will come in a future CL, as it requires a more thorough change in the code.

Note that the queue lease command supports a maximum of 10 QPS. Any more than
that - and we get errors / empty results. Hence we limit our QPS to 9 to be on
the safe side.

-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=185218684
This commit is contained in:
guyben 2018-02-09 17:56:29 -08:00 committed by jianglai
parent ce5baafc4a
commit bba975a991
7 changed files with 389 additions and 78 deletions

View file

@ -34,25 +34,44 @@ import com.google.apphosting.api.DeadlineExceededException;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.net.InternetDomainName;
import com.google.common.util.concurrent.RateLimiter;
import google.registry.dns.DnsConstants.TargetType;
import google.registry.model.registry.Registries;
import google.registry.util.FormattingLogger;
import google.registry.util.NonFinalForTesting;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import javax.inject.Inject;
import javax.inject.Named;
import org.joda.time.Duration;
/** Methods for manipulating the queue used for DNS write tasks. */
/**
* Methods for manipulating the queue used for DNS write tasks.
*
* <p>This includes a {@link RateLimiter} to limit the {@link Queue#leaseTasks} call rate to 9 QPS,
* to stay under the 10 QPS limit for this function.
*
* <p>Note that overlapping calls to {@link ReadDnsQueueAction} (the only place where
* {@link DnsQueue#leaseTasks} is used) will have different rate limiters, so they could exceed the
* allowed rate. This should be rare though - because {@link DnsQueue#leaseTasks} is only used in
* {@link ReadDnsQueueAction}, which is run as a cron job with running time shorter than the cron
* repeat time - meaning there should never be two instances running at once.
*
* @see google.registry.config.RegistryConfig#provideReadDnsQueueRuntime
*/
public class DnsQueue {
private static final FormattingLogger logger = FormattingLogger.getLoggerForCallerClass();
private final Queue queue;
// Queue.leaseTasks is limited to 10 requests per second as per
// https://cloud.google.com/appengine/docs/standard/java/javadoc/com/google/appengine/api/taskqueue/Queue.html
// "If you generate more than 10 LeaseTasks requests per second, only the first 10 requests will
// return results. The others will return no results."
private static final RateLimiter rateLimiter = RateLimiter.create(9);
@Inject
public DnsQueue(@Named(DNS_PULL_QUEUE_NAME) Queue queue) {
this.queue = queue;
@ -112,9 +131,21 @@ public class DnsQueue {
return addToQueue(TargetType.ZONE, fullyQualifiedZoneName, fullyQualifiedZoneName);
}
/**
* Returns the maximum number of tasks that can be leased with {@link #leaseTasks}.
*
* <p>If this many tasks are returned, then there might be more tasks still waiting in the queue.
*
* <p>If less than this number of tasks are returned, then there are no more items in the queue.
*/
public long getLeaseTasksBatchSize() {
return leaseTasksBatchSize;
}
/** Returns handles for a batch of tasks, leased for the specified duration. */
public List<TaskHandle> leaseTasks(Duration leaseDuration) {
try {
rateLimiter.acquire();
int numTasks = queue.fetchStatistics().getNumTasks();
logger.logfmt(
(numTasks >= leaseTasksBatchSize) ? Level.WARNING : Level.INFO,
@ -128,17 +159,6 @@ public class DnsQueue {
}
}
/** Reduce the task lease time to zero, making it immediately available to be leased again. */
public void dropTaskLease(TaskHandle task) {
try {
queue.modifyTaskLease(task, 0, TimeUnit.SECONDS);
} catch (IllegalStateException e) {
logger.warningfmt(e, "Failed dropping expired lease: %s", task.getName());
} catch (TransientFailureException | DeadlineExceededException e) {
logger.severe(e, "Failed dropping task leases too fast");
}
}
/** Delete the task, removing it from the queue permanently. */
public void deleteTask(TaskHandle task) {
try {