Retry Datastore errors in CommitLogManifestReader.next()

When trying to run the MapReduce for DeleteOldCommitLogsAction, we run into a
lot of DatastoreTimeoutException during CommitLogManifestReader.next.

This causes the entire shard to fail. Since we have a lot of keys (tens of
millions), this is almost guaranteed to happen, dooming the entire MapReduce.

Here is an attempt to recover from the Timeout Exception by saving the state
before the read, then on failure restoring that state and trying again.

-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=165172222
This commit is contained in:
guyben 2017-08-14 06:43:39 -07:00 committed by Ben McIlwain
parent 8b0b54e997
commit 00f2662f33
3 changed files with 172 additions and 9 deletions

View file

@ -39,6 +39,29 @@ public class Retrier implements Serializable {
private final Sleeper sleeper;
private final int attempts;
/** Holds functions to call whenever the code being retried fails. */
public static interface FailureReporter {
/**
* Called after a retriable failure happened.
*
* <p>Not called after the final failure, nor if the Throwable thrown isn't "a retriable error".
*
* <p>Not called at all if the retrier succeeded on its first attempt.
*/
public void beforeRetry(Throwable thrown, int failures, int maxAttempts);
/**
* Called after a a non-retriable error.
*
* <p>Called either after the final failure, or if the Throwable thrown isn't "a retriable
* error". The retrier throws right after calling this function.
*
* <p>Not called at all if the retrier succeeds.
*/
public void afterFinalFailure(Throwable thrown, int failures);
}
@Inject
public Retrier(Sleeper sleeper, @Named("transientFailureRetries") int transientFailureRetries) {
this.sleeper = sleeper;
@ -57,17 +80,21 @@ public class Retrier implements Serializable {
*
* @return <V> the value returned by the {@link Callable}.
*/
private <V> V callWithRetry(Callable<V> callable, Predicate<Throwable> isRetryable) {
private <V> V callWithRetry(
Callable<V> callable,
FailureReporter failureReporter,
Predicate<Throwable> isRetryable) {
int failures = 0;
while (true) {
try {
return callable.call();
} catch (Throwable e) {
if (++failures == attempts || !isRetryable.apply(e)) {
failureReporter.afterFinalFailure(e, failures);
throwIfUnchecked(e);
throw new RuntimeException(e);
}
logger.info(e, "Retrying transient error, attempt " + failures);
failureReporter.beforeRetry(e, failures, attempts);
try {
// Wait 100ms on the first attempt, doubling on each subsequent attempt.
sleeper.sleep(Duration.millis(pow(2, failures) * 100));
@ -82,6 +109,41 @@ public class Retrier implements Serializable {
}
}
private static final FailureReporter LOGGING_FAILURE_REPORTER = new FailureReporter() {
@Override
public void beforeRetry(Throwable thrown, int failures, int maxAttempts) {
logger.infofmt(thrown, "Retrying transient error, attempt %d", failures);
}
@Override
public void afterFinalFailure(Throwable thrown, int failures) {}
};
/**
* Retries a unit of work in the face of transient errors.
*
* <p>Retrying is done a fixed number of times, with exponential backoff, if the exception that is
* thrown is on a whitelist of retryable errors. If the error is not on the whitelist, or if the
* thread is interrupted, or if the allowable number of attempts has been exhausted, the original
* exception is propagated through to the caller. Checked exceptions are wrapped in a
* RuntimeException, while unchecked exceptions are propagated as-is.
*
* <p>Uses a default FailureReporter that logs before each retry.
*
* @return <V> the value returned by the {@link Callable}.
*/
@SafeVarargs
public final <V> V callWithRetry(
Callable<V> callable,
Class<? extends Throwable> retryableError,
Class<? extends Throwable>... moreRetryableErrors) {
return callWithRetry(
callable,
LOGGING_FAILURE_REPORTER,
retryableError,
moreRetryableErrors);
}
/**
* Retries a unit of work in the face of transient errors.
*
@ -96,11 +158,12 @@ public class Retrier implements Serializable {
@SafeVarargs
public final <V> V callWithRetry(
Callable<V> callable,
FailureReporter failureReporter,
Class<? extends Throwable> retryableError,
Class<? extends Throwable>... moreRetryableErrors) {
final Set<Class<?>> retryables =
new ImmutableSet.Builder<Class<?>>().add(retryableError).add(moreRetryableErrors).build();
return callWithRetry(callable, new Predicate<Throwable>() {
return callWithRetry(callable, failureReporter, new Predicate<Throwable>() {
@Override
public boolean apply(Throwable e) {
return any(retryables, supertypeOf(e.getClass()));