mirror of
https://github.com/google/nomulus.git
synced 2025-05-13 16:07:15 +02:00
Retry Datastore errors in CommitLogManifestReader.next()
When trying to run the MapReduce for DeleteOldCommitLogsAction, we run into a lot of DatastoreTimeoutExceptions during CommitLogManifestReader.next(). Each one causes the entire shard to fail. Since we have a lot of keys (tens of millions), this is almost guaranteed to happen, dooming the entire MapReduce. Here is an attempt to recover from a DatastoreTimeoutException by saving the state (the query cursor) before the read, then on failure restoring that state and trying again. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=165172222
This commit is contained in:
parent
8b0b54e997
commit
00f2662f33
3 changed files with 172 additions and 9 deletions
|
@ -14,9 +14,11 @@
|
|||
|
||||
package google.registry.mapreduce.inputs;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkNotNull;
|
||||
import static google.registry.model.ofy.ObjectifyService.ofy;
|
||||
|
||||
import com.google.appengine.api.datastore.Cursor;
|
||||
import com.google.appengine.api.datastore.DatastoreTimeoutException;
|
||||
import com.google.appengine.api.datastore.QueryResultIterator;
|
||||
import com.google.appengine.tools.mapreduce.InputReader;
|
||||
import com.google.common.base.Optional;
|
||||
|
@ -24,7 +26,11 @@ import com.googlecode.objectify.Key;
|
|||
import com.googlecode.objectify.cmd.Query;
|
||||
import google.registry.model.ofy.CommitLogBucket;
|
||||
import google.registry.model.ofy.CommitLogManifest;
|
||||
import google.registry.util.FormattingLogger;
|
||||
import google.registry.util.Retrier;
|
||||
import google.registry.util.SystemSleeper;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.concurrent.Callable;
|
||||
import org.joda.time.DateTime;
|
||||
|
||||
/** {@link InputReader} that maps over {@link CommitLogManifest}. */
|
||||
|
@ -32,6 +38,8 @@ class CommitLogManifestReader extends InputReader<Key<CommitLogManifest>> {
|
|||
|
||||
private static final long serialVersionUID = 5117046535590539778L;
|
||||
|
||||
static final FormattingLogger logger = FormattingLogger.getLoggerForCallerClass();
|
||||
|
||||
/**
|
||||
* Memory estimation for this reader.
|
||||
*
|
||||
|
@ -40,6 +48,8 @@ class CommitLogManifestReader extends InputReader<Key<CommitLogManifest>> {
|
|||
*/
|
||||
private static final long MEMORY_ESTIMATE = 100 * 1024;
|
||||
|
||||
private static final Retrier retrier = new Retrier(new SystemSleeper(), 3);
|
||||
|
||||
private final Key<CommitLogBucket> bucketKey;
|
||||
|
||||
/**
|
||||
|
@ -125,8 +135,31 @@ class CommitLogManifestReader extends InputReader<Key<CommitLogManifest>> {
|
|||
@Override
|
||||
public Key<CommitLogManifest> next() {
|
||||
loaded++;
|
||||
final Cursor currentCursor = queryIterator.getCursor();
|
||||
try {
|
||||
return queryIterator.next();
|
||||
return retrier.callWithRetry(
|
||||
new Callable<Key<CommitLogManifest>>() {
|
||||
@Override
|
||||
public Key<CommitLogManifest> call() {
|
||||
return queryIterator.next();
|
||||
}
|
||||
},
|
||||
new Retrier.FailureReporter() {
|
||||
@Override
|
||||
public void beforeRetry(Throwable thrown, int failures, int maxAttempts) {
|
||||
checkNotNull(currentCursor, "Can't retry because cursor is null. Giving up.");
|
||||
queryIterator = query().startAt(currentCursor).keys().iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void afterFinalFailure(Throwable thrown, int failures) {
|
||||
logger.severefmt(
|
||||
"Max retry attempts reached trying to read item %d/%d. Giving up.",
|
||||
loaded,
|
||||
total);
|
||||
}
|
||||
},
|
||||
DatastoreTimeoutException.class);
|
||||
} finally {
|
||||
ofy().clearSessionCache(); // Try not to leak memory.
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue