mirror of
https://github.com/google/nomulus.git
synced 2025-05-12 22:38:16 +02:00
Retry Datastore errors in CommitLogManifestReader.next()
When trying to run the MapReduce for DeleteOldCommitLogsAction, we run into a lot of DatastoreTimeoutException during CommitLogManifestReader.next. This causes the entire shard to fail. Since we have a lot of keys (tens of millions), this is almost guaranteed to happen, dooming the entire MapReduce. Here is an attempt to recover from the Timeout Exception by saving the state before the read, then on failure restoring that state and trying again. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=165172222
This commit is contained in:
parent
8b0b54e997
commit
00f2662f33
3 changed files with 172 additions and 9 deletions
|
@ -39,6 +39,29 @@ public class Retrier implements Serializable {
|
|||
private final Sleeper sleeper;
|
||||
private final int attempts;
|
||||
|
||||
/** Holds functions to call whenever the code being retried fails. */
|
||||
public static interface FailureReporter {
|
||||
|
||||
/**
|
||||
* Called after a retriable failure happened.
|
||||
*
|
||||
* <p>Not called after the final failure, nor if the Throwable thrown isn't "a retriable error".
|
||||
*
|
||||
* <p>Not called at all if the retrier succeeded on its first attempt.
|
||||
*/
|
||||
public void beforeRetry(Throwable thrown, int failures, int maxAttempts);
|
||||
|
||||
/**
|
||||
* Called after a a non-retriable error.
|
||||
*
|
||||
* <p>Called either after the final failure, or if the Throwable thrown isn't "a retriable
|
||||
* error". The retrier throws right after calling this function.
|
||||
*
|
||||
* <p>Not called at all if the retrier succeeds.
|
||||
*/
|
||||
public void afterFinalFailure(Throwable thrown, int failures);
|
||||
}
|
||||
|
||||
@Inject
|
||||
public Retrier(Sleeper sleeper, @Named("transientFailureRetries") int transientFailureRetries) {
|
||||
this.sleeper = sleeper;
|
||||
|
@ -57,17 +80,21 @@ public class Retrier implements Serializable {
|
|||
*
|
||||
* @return <V> the value returned by the {@link Callable}.
|
||||
*/
|
||||
private <V> V callWithRetry(Callable<V> callable, Predicate<Throwable> isRetryable) {
|
||||
private <V> V callWithRetry(
|
||||
Callable<V> callable,
|
||||
FailureReporter failureReporter,
|
||||
Predicate<Throwable> isRetryable) {
|
||||
int failures = 0;
|
||||
while (true) {
|
||||
try {
|
||||
return callable.call();
|
||||
} catch (Throwable e) {
|
||||
if (++failures == attempts || !isRetryable.apply(e)) {
|
||||
failureReporter.afterFinalFailure(e, failures);
|
||||
throwIfUnchecked(e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
logger.info(e, "Retrying transient error, attempt " + failures);
|
||||
failureReporter.beforeRetry(e, failures, attempts);
|
||||
try {
|
||||
// Wait 100ms on the first attempt, doubling on each subsequent attempt.
|
||||
sleeper.sleep(Duration.millis(pow(2, failures) * 100));
|
||||
|
@ -82,6 +109,41 @@ public class Retrier implements Serializable {
|
|||
}
|
||||
}
|
||||
|
||||
private static final FailureReporter LOGGING_FAILURE_REPORTER = new FailureReporter() {
|
||||
@Override
|
||||
public void beforeRetry(Throwable thrown, int failures, int maxAttempts) {
|
||||
logger.infofmt(thrown, "Retrying transient error, attempt %d", failures);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void afterFinalFailure(Throwable thrown, int failures) {}
|
||||
};
|
||||
|
||||
/**
|
||||
* Retries a unit of work in the face of transient errors.
|
||||
*
|
||||
* <p>Retrying is done a fixed number of times, with exponential backoff, if the exception that is
|
||||
* thrown is on a whitelist of retryable errors. If the error is not on the whitelist, or if the
|
||||
* thread is interrupted, or if the allowable number of attempts has been exhausted, the original
|
||||
* exception is propagated through to the caller. Checked exceptions are wrapped in a
|
||||
* RuntimeException, while unchecked exceptions are propagated as-is.
|
||||
*
|
||||
* <p>Uses a default FailureReporter that logs before each retry.
|
||||
*
|
||||
* @return <V> the value returned by the {@link Callable}.
|
||||
*/
|
||||
@SafeVarargs
|
||||
public final <V> V callWithRetry(
|
||||
Callable<V> callable,
|
||||
Class<? extends Throwable> retryableError,
|
||||
Class<? extends Throwable>... moreRetryableErrors) {
|
||||
return callWithRetry(
|
||||
callable,
|
||||
LOGGING_FAILURE_REPORTER,
|
||||
retryableError,
|
||||
moreRetryableErrors);
|
||||
}
|
||||
|
||||
/**
|
||||
* Retries a unit of work in the face of transient errors.
|
||||
*
|
||||
|
@ -96,11 +158,12 @@ public class Retrier implements Serializable {
|
|||
@SafeVarargs
|
||||
public final <V> V callWithRetry(
|
||||
Callable<V> callable,
|
||||
FailureReporter failureReporter,
|
||||
Class<? extends Throwable> retryableError,
|
||||
Class<? extends Throwable>... moreRetryableErrors) {
|
||||
final Set<Class<?>> retryables =
|
||||
new ImmutableSet.Builder<Class<?>>().add(retryableError).add(moreRetryableErrors).build();
|
||||
return callWithRetry(callable, new Predicate<Throwable>() {
|
||||
return callWithRetry(callable, failureReporter, new Predicate<Throwable>() {
|
||||
@Override
|
||||
public boolean apply(Throwable e) {
|
||||
return any(retryables, supertypeOf(e.getClass()));
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue