Add metric for lock contention

Also - remove logging from TransactNew, to prevent double logging on transient
failures (TransactNew retries on failure)

-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=172500772
This commit is contained in:
guyben 2017-10-17 13:03:14 -07:00 committed by jianglai
parent 6f9b039e72
commit 77ee3e3544
3 changed files with 165 additions and 47 deletions

View file

@ -18,6 +18,8 @@ import static com.google.common.base.Preconditions.checkArgument;
import static google.registry.model.ofy.ObjectifyService.ofy;
import static google.registry.util.DateTimeUtils.isAtOrAfter;
import com.google.auto.value.AutoValue;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Strings;
import com.googlecode.objectify.VoidWork;
import com.googlecode.objectify.Work;
@ -49,6 +51,12 @@ public class Lock extends ImmutableObject {
private static final FormattingLogger logger = FormattingLogger.getLoggerForCallerClass();
/** Disposition of locking, for monitoring. */
enum LockState { IN_USE, FREE, TIMED_OUT, OWNER_DIED }
@VisibleForTesting
static LockMetrics lockMetrics = new LockMetrics();
/** The name of the locked resource. */
@Id
String lockId;
@ -90,19 +98,82 @@ public class Lock extends ImmutableObject {
return String.format("%s-%s", tld, resourceName);
}
@AutoValue
abstract static class AcquireResult {
public abstract DateTime transactionTime();
public abstract @Nullable Lock existingLock();
public abstract @Nullable Lock newLock();
public abstract LockState lockState();
public static AcquireResult create(
DateTime transactionTime,
@Nullable Lock existingLock,
@Nullable Lock newLock,
LockState lockState) {
return new AutoValue_Lock_AcquireResult(transactionTime, existingLock, newLock, lockState);
}
}
private static void logAcquireResult(AcquireResult acquireResult) {
try {
Lock lock = acquireResult.existingLock();
DateTime now = acquireResult.transactionTime();
switch (acquireResult.lockState()) {
case IN_USE:
logger.infofmt(
"Existing lock by request %s is still valid now %s (until %s) lock: %s",
lock.requestLogId,
now,
lock.expirationTime,
lock.lockId);
break;
case TIMED_OUT:
logger.infofmt(
"Existing lock by request %s is timed out now %s (was valid until %s) lock: %s",
lock.requestLogId,
now,
lock.expirationTime,
lock.lockId);
break;
case OWNER_DIED:
logger.infofmt(
"Existing lock is valid now %s (until %s), but owner (%s) isn't running lock: %s",
now,
lock.expirationTime,
lock.requestLogId,
lock.lockId);
break;
case FREE:
// There was no existing lock
break;
}
Lock newLock = acquireResult.newLock();
if (acquireResult.newLock() != null) {
logger.infofmt(
"acquire succeeded %s lock: %s",
newLock,
newLock.lockId);
}
} catch (Throwable e) {
// We might get here if there is a NullPointerException for example, if AcquireResult wasn't
// constructed correctly. Simply log it for debugging but continue as if nothing happened
logger.warningfmt(e, "Error while logging AcquireResult %s. Continuing.", acquireResult);
}
}
/** Try to acquire a lock. Returns absent if it can't be acquired. */
public static Optional<Lock> acquire(
final String resourceName,
@Nullable final String tld,
final Duration leaseLength,
final RequestStatusChecker requestStatusChecker) {
String lockId = makeLockId(resourceName, tld);
// It's important to use transactNew rather than transact, because a Lock can be used to control
// access to resources like GCS that can't be transactionally rolled back. Therefore, the lock
// must be definitively acquired before it is used, even when called inside another transaction.
return Optional.ofNullable(ofy().transactNew(new Work<Lock>() {
AcquireResult acquireResult = ofy().transactNew(new Work<AcquireResult>() {
@Override
public Lock run() {
String lockId = makeLockId(resourceName, tld);
public AcquireResult run() {
DateTime now = ofy().getTransactionTime();
// Checking if an unexpired lock still exists - if so, the lock can't be acquired.
@ -111,29 +182,18 @@ public class Lock extends ImmutableObject {
logger.infofmt(
"Loaded existing lock: %s for request: %s", lock.lockId, lock.requestLogId);
}
// TODO(b/63982642): remove check on requestLogId being null once migration is done
// Until then we assume missing requestLogId means the app is still running (since we have
// no information to the contrary)
if (lock != null
&& !isAtOrAfter(now, lock.expirationTime)
&& (lock.requestLogId == null || requestStatusChecker.isRunning(lock.requestLogId))) {
logger.infofmt(
"Existing lock by request %s is still valid now %s (until %s) lock: %s",
lock.requestLogId,
now,
lock.expirationTime,
lockId);
return null;
LockState lockState;
if (lock == null) {
lockState = LockState.FREE;
} else if (isAtOrAfter(now, lock.expirationTime)) {
lockState = LockState.TIMED_OUT;
} else if (!requestStatusChecker.isRunning(lock.requestLogId)) {
lockState = LockState.OWNER_DIED;
} else {
lockState = LockState.IN_USE;
return AcquireResult.create(now, lock, null, lockState);
}
if (lock != null) {
logger.infofmt(
"Existing lock by request %s is timed out now %s (was valid until %s) lock: %s",
lock.requestLogId,
now,
lock.expirationTime,
lockId);
}
Lock newLock = create(
resourceName,
tld,
@ -142,12 +202,12 @@ public class Lock extends ImmutableObject {
// Locks are not parented under an EntityGroupRoot (so as to avoid write contention) and
// don't need to be backed up.
ofy().saveWithoutBackup().entity(newLock);
logger.infofmt(
"acquire succeeded %s lock: %s",
newLock,
lockId);
return newLock;
}}));
return AcquireResult.create(now, lock, newLock, lockState);
}});
logAcquireResult(acquireResult);
lockMetrics.record(resourceName, tld, acquireResult.lockState());
return Optional.ofNullable(acquireResult.newLock());
}
/** Release the lock. */

View file

@ -0,0 +1,45 @@
// Copyright 2017 The Nomulus Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package google.registry.model.server;
import com.google.common.collect.ImmutableSet;
import google.registry.model.server.Lock.LockState;
import google.registry.monitoring.metrics.IncrementableMetric;
import google.registry.monitoring.metrics.LabelDescriptor;
import google.registry.monitoring.metrics.MetricRegistryImpl;
import javax.annotation.Nullable;
/** Metrics for lock contention. */
class LockMetrics {
private static final ImmutableSet<LabelDescriptor> LABEL_DESCRIPTORS =
ImmutableSet.of(
LabelDescriptor.create("tld", "TLD"),
LabelDescriptor.create("resource", "resource name"),
LabelDescriptor.create(
"state", "The existing lock state (before attempting to acquire)."));
private static final IncrementableMetric lockRequestsMetric =
MetricRegistryImpl.getDefault()
.newIncrementableMetric(
"/lock/acquire_lock_requests",
"Count of lock acquisition attempts",
"count",
LABEL_DESCRIPTORS);
void record(String resourceName, @Nullable String tld, LockState state) {
lockRequestsMetric.increment(String.valueOf(tld), resourceName, state.name());
}
}

View file

@ -14,12 +14,18 @@
package google.registry.model.server;
import static com.google.common.truth.Truth.assertThat;
import static com.google.common.truth.Truth8.assertThat;
import static google.registry.model.server.Lock.LockState.FREE;
import static google.registry.model.server.Lock.LockState.IN_USE;
import static google.registry.model.server.Lock.LockState.OWNER_DIED;
import static google.registry.model.server.Lock.LockState.TIMED_OUT;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.verifyNoMoreInteractions;
import static org.mockito.Mockito.when;
import google.registry.model.ofy.Ofy;
import google.registry.model.server.Lock.LockState;
import google.registry.testing.AppEngineRule;
import google.registry.testing.ExceptionRule;
import google.registry.testing.FakeClock;
@ -53,63 +59,70 @@ public class LockTest {
@Rule
public final ExceptionRule thrown = new ExceptionRule();
private Optional<Lock> acquire(String tld, Duration leaseLength) {
return Lock.acquire(RESOURCE_NAME, tld, leaseLength, requestStatusChecker);
private Optional<Lock> acquire(String tld, Duration leaseLength, LockState expectedLockState) {
Lock.lockMetrics = mock(LockMetrics.class);
Optional<Lock> lock = Lock.acquire(RESOURCE_NAME, tld, leaseLength, requestStatusChecker);
verify(Lock.lockMetrics).record(RESOURCE_NAME, tld, expectedLockState);
verifyNoMoreInteractions(Lock.lockMetrics);
Lock.lockMetrics = null;
return lock;
}
@Before public void setUp() {
Lock.lockMetrics = null;
when(requestStatusChecker.getLogId()).thenReturn("current-request-id");
when(requestStatusChecker.isRunning("current-request-id")).thenReturn(true);
}
@Test
public void testReleasedExplicitly() throws Exception {
Optional<Lock> lock = acquire("", ONE_DAY);
Optional<Lock> lock = acquire("", ONE_DAY, FREE);
assertThat(lock).isPresent();
// We can't get it again at the same time.
assertThat(acquire("", ONE_DAY)).isEmpty();
assertThat(acquire("", ONE_DAY, IN_USE)).isEmpty();
// But if we release it, it's available.
lock.get().release();
assertThat(acquire("", ONE_DAY)).isPresent();
assertThat(acquire("", ONE_DAY, FREE)).isPresent();
}
@Test
public void testReleasedAfterTimeout() throws Exception {
FakeClock clock = new FakeClock();
inject.setStaticField(Ofy.class, "clock", clock);
assertThat(acquire("", TWO_MILLIS)).isPresent();
assertThat(acquire("", TWO_MILLIS, FREE)).isPresent();
// We can't get it again at the same time.
assertThat(acquire("", TWO_MILLIS)).isEmpty();
assertThat(acquire("", TWO_MILLIS, IN_USE)).isEmpty();
// A second later we still can't get the lock.
clock.advanceOneMilli();
assertThat(acquire("", TWO_MILLIS)).isEmpty();
assertThat(acquire("", TWO_MILLIS, IN_USE)).isEmpty();
// But two seconds later we can get it.
clock.advanceOneMilli();
assertThat(acquire("", TWO_MILLIS)).isPresent();
assertThat(acquire("", TWO_MILLIS, TIMED_OUT)).isPresent();
}
@Test
public void testReleasedAfterRequestFinish() throws Exception {
assertThat(acquire("", ONE_DAY)).isPresent();
assertThat(acquire("", ONE_DAY, FREE)).isPresent();
// We can't get it again while request is active
assertThat(acquire("", ONE_DAY)).isEmpty();
assertThat(acquire("", ONE_DAY, IN_USE)).isEmpty();
// But if request is finished, we can get it.
when(requestStatusChecker.isRunning("current-request-id")).thenReturn(false);
assertThat(acquire("", ONE_DAY)).isPresent();
assertThat(acquire("", ONE_DAY, OWNER_DIED)).isPresent();
}
@Test
public void testTldsAreIndependent() throws Exception {
Optional<Lock> lockA = acquire("a", ONE_DAY);
Optional<Lock> lockA = acquire("a", ONE_DAY, FREE);
assertThat(lockA).isPresent();
// For a different tld we can still get a lock with the same name.
Optional<Lock> lockB = acquire("b", ONE_DAY);
Optional<Lock> lockB = acquire("b", ONE_DAY, FREE);
assertThat(lockB).isPresent();
// We can't get lockB again at the same time.
assertThat(acquire("b", ONE_DAY)).isEmpty();
assertThat(acquire("b", ONE_DAY, IN_USE)).isEmpty();
// Releasing lockA has no effect on lockB (even though we are still using the "b" tld).
lockA.get().release();
assertThat(acquire("b", ONE_DAY)).isEmpty();
assertThat(acquire("b", ONE_DAY, IN_USE)).isEmpty();
}
@Test