Add metrics for async batch operation processing

We want to know how long it's actually taking to process asynchronous
contact/host deletions and DNS refreshes on host renames. This adds
instrumentation. Five metrics are recorded as follows:

* An incrementable metric for each async task processed (split out by
  type of task and result).
* Two event metrics for processing time between when a task is enqueued
  and when it is processed -- tracked separately for contact/host
  deletion and DNS refresh on host rename.
* Two event metrics for batch size every time the two mapreduces are
  run (this is usually 0). Tracked separately for contact/host deletion
  and DNS refresh on host rename.

-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=157001310
This commit is contained in:
mcilwain 2017-05-24 10:00:14 -07:00 committed by Ben McIlwain
parent 1adeb57fea
commit bb67841884
14 changed files with 671 additions and 154 deletions

View file

@ -29,6 +29,7 @@ import google.registry.util.Retrier;
import java.util.concurrent.Callable;
import javax.inject.Inject;
import javax.inject.Named;
import org.joda.time.DateTime;
import org.joda.time.Duration;
/** Helper class to enqueue tasks for handling asynchronous operations in flows. */
@ -41,6 +42,7 @@ public final class AsyncFlowEnqueuer {
public static final String PARAM_SERVER_TRANSACTION_ID = "serverTransactionId";
public static final String PARAM_IS_SUPERUSER = "isSuperuser";
public static final String PARAM_HOST_KEY = "hostKey";
public static final String PARAM_REQUESTED_TIME = "requestedTime";
/** The task queue names used by async flows. */
public static final String QUEUE_ASYNC_DELETE = "async-delete-pull";
@ -48,32 +50,31 @@ public final class AsyncFlowEnqueuer {
private static final FormattingLogger logger = FormattingLogger.getLoggerForCallerClass();
@VisibleForTesting
@Inject
@Config("asyncDeleteFlowMapreduceDelay")
public Duration asyncDeleteDelay;
private final Duration asyncDeleteDelay;
private final Queue asyncDeletePullQueue;
private final Queue asyncDnsRefreshPullQueue;
private final Retrier retrier;
@VisibleForTesting
@Inject
@Named("async-delete-pull")
public Queue asyncDeletePullQueue;
@VisibleForTesting
@Inject
@Named(QUEUE_ASYNC_HOST_RENAME)
public Queue asyncDnsRefreshPullQueue;
@VisibleForTesting
@Inject
public Retrier retrier;
@VisibleForTesting
@Inject
public AsyncFlowEnqueuer() {}
public AsyncFlowEnqueuer(
@Named(QUEUE_ASYNC_DELETE) Queue asyncDeletePullQueue,
@Named(QUEUE_ASYNC_HOST_RENAME) Queue asyncDnsRefreshPullQueue,
@Config("asyncDeleteFlowMapreduceDelay") Duration asyncDeleteDelay,
Retrier retrier) {
this.asyncDeletePullQueue = asyncDeletePullQueue;
this.asyncDnsRefreshPullQueue = asyncDnsRefreshPullQueue;
this.asyncDeleteDelay = asyncDeleteDelay;
this.retrier = retrier;
}
/** Enqueues a task to asynchronously delete a contact or host, by key. */
public void enqueueAsyncDelete(
EppResource resourceToDelete, String requestingClientId, Trid trid, boolean isSuperuser) {
EppResource resourceToDelete,
DateTime now,
String requestingClientId,
Trid trid,
boolean isSuperuser) {
Key<EppResource> resourceKey = Key.create(resourceToDelete);
logger.infofmt(
"Enqueuing async deletion of %s on behalf of registrar %s.",
@ -85,17 +86,20 @@ public final class AsyncFlowEnqueuer {
.param(PARAM_REQUESTING_CLIENT_ID, requestingClientId)
.param(PARAM_CLIENT_TRANSACTION_ID, trid.getClientTransactionId())
.param(PARAM_SERVER_TRANSACTION_ID, trid.getServerTransactionId())
.param(PARAM_IS_SUPERUSER, Boolean.toString(isSuperuser));
.param(PARAM_IS_SUPERUSER, Boolean.toString(isSuperuser))
.param(PARAM_REQUESTED_TIME, now.toString());
addTaskToQueueWithRetry(asyncDeletePullQueue, task);
}
/** Enqueues a task to asynchronously refresh DNS for a renamed host. */
public void enqueueAsyncDnsRefresh(HostResource host) {
public void enqueueAsyncDnsRefresh(HostResource host, DateTime now) {
Key<HostResource> hostKey = Key.create(host);
logger.infofmt("Enqueuing async DNS refresh for renamed host %s.", hostKey);
addTaskToQueueWithRetry(
asyncDnsRefreshPullQueue,
TaskOptions.Builder.withMethod(Method.PULL).param(PARAM_HOST_KEY, hostKey.getString()));
TaskOptions.Builder.withMethod(Method.PULL)
.param(PARAM_HOST_KEY, hostKey.getString())
.param(PARAM_REQUESTED_TIME, now.toString()));
}
/**

View file

@ -0,0 +1,161 @@
// Copyright 2017 The Nomulus Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package google.registry.flows.async;
import static com.google.appengine.api.taskqueue.QueueConstants.maxLeaseCount;
import static google.registry.flows.async.AsyncFlowMetrics.OperationType.CONTACT_AND_HOST_DELETE;
import static google.registry.flows.async.AsyncFlowMetrics.OperationType.DNS_REFRESH;
import static google.registry.monitoring.metrics.EventMetric.DEFAULT_FITTER;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableSet;
import google.registry.monitoring.metrics.DistributionFitter;
import google.registry.monitoring.metrics.EventMetric;
import google.registry.monitoring.metrics.FibonacciFitter;
import google.registry.monitoring.metrics.IncrementableMetric;
import google.registry.monitoring.metrics.LabelDescriptor;
import google.registry.monitoring.metrics.MetricRegistryImpl;
import google.registry.util.Clock;
import google.registry.util.NonFinalForTesting;
import javax.inject.Inject;
import org.joda.time.DateTime;
import org.joda.time.Duration;
/**
* Instrumentation for async flows (contact/host deletion and DNS refreshes).
*
* @see AsyncFlowEnqueuer
*/
public class AsyncFlowMetrics {
private final Clock clock;
@Inject
public AsyncFlowMetrics(Clock clock) {
this.clock = clock;
}
/**
* A Fibonacci fitter used for bucketing the batch count.
*
* <p>We use a Fibonacci filter because it provides better resolution at the low end than an
* exponential fitter, which is important because most batch sizes are likely to be very low,
* despite going up to 1,000 on the high end. Also, the precision is better, as batch size is
* inherently an integer, whereas an exponential fitter with an exponent base less than 2 would
* have unintuitive boundaries.
*/
private static final DistributionFitter FITTER_BATCH_SIZE =
FibonacciFitter.create(maxLeaseCount());
private static final ImmutableSet<LabelDescriptor> LABEL_DESCRIPTORS =
ImmutableSet.of(
LabelDescriptor.create("operation_type", "The type of async flow operation."),
LabelDescriptor.create("result", "The result of the async flow operation."));
@NonFinalForTesting
@VisibleForTesting
static IncrementableMetric asyncFlowOperationCounts =
MetricRegistryImpl.getDefault()
.newIncrementableMetric(
"/async_flows/operations",
"Count of Async Flow Operations",
"count",
LABEL_DESCRIPTORS);
@NonFinalForTesting
@VisibleForTesting
static EventMetric asyncFlowOperationProcessingTime =
MetricRegistryImpl.getDefault()
.newEventMetric(
"/async_flows/processing_time",
"Async Flow Processing Time",
"milliseconds",
LABEL_DESCRIPTORS,
DEFAULT_FITTER);
@NonFinalForTesting
@VisibleForTesting
static EventMetric asyncFlowBatchSize =
MetricRegistryImpl.getDefault()
.newEventMetric(
"/async_flows/batch_size",
"Async Operation Batch Size",
"batch size",
ImmutableSet.of(
LabelDescriptor.create("operation_type", "The type of async flow operation.")),
FITTER_BATCH_SIZE);
/** The type of asynchronous operation. */
public enum OperationType {
CONTACT_DELETE("contactDelete"),
HOST_DELETE("hostDelete"),
CONTACT_AND_HOST_DELETE("contactAndHostDelete"),
DNS_REFRESH("dnsRefresh");
private final String metricLabelValue;
private OperationType(String metricLabelValue) {
this.metricLabelValue = metricLabelValue;
}
String getMetricLabelValue() {
return metricLabelValue;
}
}
/** The result of an asynchronous operation. */
public enum OperationResult {
/** The operation processed correctly and the result was success. */
SUCCESS("success"),
/** The operation processed correctly and the result was failure. */
FAILURE("failure"),
/** The operation did not process correctly due to some unexpected error. */
ERROR("error"),
/** The operation was skipped because the request is now stale. */
STALE("stale");
private final String metricLabelValue;
private OperationResult(String metricLabelValue) {
this.metricLabelValue = metricLabelValue;
}
String getMetricLabelValue() {
return metricLabelValue;
}
}
public void recordAsyncFlowResult(
OperationType operationType, OperationResult operationResult, DateTime whenEnqueued) {
asyncFlowOperationCounts.increment(
operationType.getMetricLabelValue(), operationResult.getMetricLabelValue());
asyncFlowOperationProcessingTime.record(
new Duration(whenEnqueued, clock.nowUtc()).getMillis(),
operationType.getMetricLabelValue(),
operationResult.getMetricLabelValue());
}
public void recordContactHostDeletionBatchSize(long batchSize) {
asyncFlowBatchSize.record(
Double.valueOf(batchSize), CONTACT_AND_HOST_DELETE.getMetricLabelValue());
}
public void recordDnsRefreshBatchSize(long batchSize) {
asyncFlowBatchSize.record(Double.valueOf(batchSize), DNS_REFRESH.getMetricLabelValue());
}
}