google-nomulus/java/google/registry/mapreduce/MapreduceRunner.java
jianglai 70b13596e4 Migrate to Flogger (green)
This is a 'green' Flogger migration CL. Green CLs are intended to be as
safe as possible and should be easy to review and submit.

No changes should be necessary to the code itself prior to submission,
but small changes to BUILD files may be required.

Changes within files are completely independent of each other, so this CL
can be safely split up for review using tools such as Rosie.

For more information, see []
Base CL: 197826149

-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=198560170
2018-05-30 12:18:54 -04:00

281 lines
11 KiB
Java

// Copyright 2017 The Nomulus Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package google.registry.mapreduce;
import static com.google.appengine.tools.pipeline.PipelineServiceFactory.newPipelineService;
import static com.google.common.base.Preconditions.checkNotNull;
import static google.registry.util.PreconditionsUtils.checkArgumentNotNull;
import com.google.appengine.tools.mapreduce.Input;
import com.google.appengine.tools.mapreduce.MapJob;
import com.google.appengine.tools.mapreduce.MapReduceJob;
import com.google.appengine.tools.mapreduce.MapReduceSettings;
import com.google.appengine.tools.mapreduce.MapReduceSpecification;
import com.google.appengine.tools.mapreduce.MapSettings;
import com.google.appengine.tools.mapreduce.MapSpecification;
import com.google.appengine.tools.mapreduce.Mapper;
import com.google.appengine.tools.mapreduce.Marshallers;
import com.google.appengine.tools.mapreduce.Output;
import com.google.appengine.tools.mapreduce.Reducer;
import com.google.appengine.tools.mapreduce.outputs.NoOutput;
import com.google.appengine.tools.pipeline.Job0;
import com.google.appengine.tools.pipeline.JobSetting;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.flogger.FluentLogger;
import google.registry.mapreduce.inputs.ConcatenatingInput;
import google.registry.request.Parameter;
import google.registry.util.PipelineUtils;
import java.io.Serializable;
import java.util.Optional;
import javax.inject.Inject;
import org.joda.time.Duration;
/**
 * Runner for map-only or full map and reduce mapreduces.
 *
 * <p>We use hardcoded serialization marshallers for moving data between steps, so all types used as
 * keys or values must implement {@link Serializable}.
 *
 * <p>Instances are configured through the fluent {@code set*} methods; a job name and a module
 * name must both be set before any job is created or run.
 */
public class MapreduceRunner {

  private static final FluentLogger logger = FluentLogger.forEnclosingClass();

  public static final String PARAM_DRY_RUN = "dryRun";
  public static final String PARAM_MAP_SHARDS = "mapShards";
  public static final String PARAM_REDUCE_SHARDS = "reduceShards";

  private static final String BASE_URL = "/_dr/mapreduce/";
  private static final String QUEUE_NAME = "mapreduce";

  private final Optional<Integer> httpParamMapShards;
  private final Optional<Integer> httpParamReduceShards;

  // Default to 3 minutes since many slices will contain Datastore queries that time out at 4:30.
  private Duration sliceDuration = Duration.standardMinutes(3);
  private String jobName;
  private String moduleName;

  // Defaults for number of mappers/reducers if not specified in HTTP params. The max allowable
  // count for both (which is specified in the App Engine mapreduce framework) is 1000. We use 100
  // mapper shards because there's a bottleneck in the App Engine mapreduce framework caused by
  // updating the mapreduce status on a single Datastore entity (which only supports so many writes
  // per second). The existing mapreduces don't actually do that much work for TLDs that aren't
  // .com-sized, so the shards finish so quickly that contention becomes a problem. This number can
  // always be tuned up for large registry systems with on the order of hundreds of thousands of
  // entities on up.
  // The default reducer shard count is one because most mapreduces use it to collate and output
  // results. The ones that actually perform a substantial amount of work in a reduce step use a
  // higher non-default number of reducer shards.
  private int defaultMapShards = 100;
  private int defaultReduceShards = 1;

  /**
   * @param mapShards number of map shards; if omitted, uses {@link #defaultMapShards}
   * @param reduceShards number of reduce shards; if omitted, uses {@link #defaultReduceShards}
   */
  @Inject
  @VisibleForTesting
  public MapreduceRunner(
      @Parameter(PARAM_MAP_SHARDS) Optional<Integer> mapShards,
      @Parameter(PARAM_REDUCE_SHARDS) Optional<Integer> reduceShards) {
    this.httpParamMapShards = mapShards;
    this.httpParamReduceShards = reduceShards;
  }

  /** Set the max time to run a slice before serializing; defaults to 3 minutes. */
  public MapreduceRunner setSliceDuration(Duration sliceDuration) {
    this.sliceDuration = checkArgumentNotNull(sliceDuration, "sliceDuration");
    return this;
  }

  /** Set the human readable job name for display purposes. */
  public MapreduceRunner setJobName(String jobName) {
    this.jobName = checkArgumentNotNull(jobName, "jobName");
    return this;
  }

  /** Set the module to run in. */
  public MapreduceRunner setModuleName(String moduleName) {
    this.moduleName = checkArgumentNotNull(moduleName, "moduleName");
    return this;
  }

  /** Set the default number of mappers, if not overridden by the http param. */
  public MapreduceRunner setDefaultMapShards(int defaultMapShards) {
    this.defaultMapShards = defaultMapShards;
    return this;
  }

  /** Set the default number of reducers, if not overridden by the http param. */
  public MapreduceRunner setDefaultReduceShards(int defaultReduceShards) {
    this.defaultReduceShards = defaultReduceShards;
    return this;
  }

  /**
   * Create a map-only mapreduce to be run as part of a pipeline.
   *
   * <p>The job name and module name must already have been set on this runner.
   *
   * @see #runMapOnly for creating and running an independent map-only mapreduce
   *
   * @param mapper instance of a mapper class
   * @param output destination that receives the records emitted by the mapper
   * @param inputs input sources for the mapper
   * @param <I> mapper input type
   * @param <O> individual output record type sent to the {@link Output}
   * @param <R> overall output result type
   * @return the configured, not yet started, map-only job
   */
  public <I, O, R> MapJob<I, O, R> createMapOnlyJob(
      Mapper<I, Void, O> mapper,
      Output<O, R> output,
      Iterable<? extends Input<? extends I>> inputs) {
    checkCommonRequiredFields(inputs, mapper);
    return new MapJob<>(
        new MapSpecification.Builder<I, O, R>()
            .setJobName(jobName)
            // The HTTP parameter, when present, wins over the configured default shard count.
            .setInput(new ConcatenatingInput<>(inputs, httpParamMapShards.orElse(defaultMapShards)))
            .setMapper(mapper)
            .setOutput(output)
            .build(),
        new MapSettings.Builder()
            .setWorkerQueueName(QUEUE_NAME)
            .setBaseUrl(BASE_URL)
            .setModule(moduleName)
            .setMillisPerSlice(getMillisPerSlice())
            .build());
  }

  /**
   * Kick off a map-only mapreduce.
   *
   * <p>For simplicity, the mapreduce is hard-coded with {@link NoOutput}, on the assumption that
   * all work will be accomplished via side effects during the map phase.
   *
   * @see #createMapOnlyJob for creating and running a map-only mapreduce as part of a pipeline
   *
   * @param mapper instance of a mapper class
   * @param inputs input sources for the mapper
   * @param <I> mapper input type
   * @return the job id
   */
  public <I> String runMapOnly(
      Mapper<I, Void, Void> mapper,
      Iterable<? extends Input<? extends I>> inputs) {
    return runAsPipeline(createMapOnlyJob(mapper, new NoOutput<Void, Void>(), inputs));
  }

  /**
   * Create a mapreduce job to be run as part of a pipeline.
   *
   * <p>The job name and module name must already have been set on this runner.
   *
   * @see #runMapreduce for creating and running an independent mapreduce
   *
   * @param mapper instance of a mapper class
   * @param reducer instance of a reducer class
   * @param inputs input sources for the mapper
   * @param output destination that receives the records emitted by the reducer
   * @param <I> mapper input type
   * @param <K> emitted key type
   * @param <V> emitted value type
   * @param <O> individual output record type sent to the {@link Output}
   * @param <R> overall output result type
   * @return the configured, not yet started, mapreduce job
   */
  public final <I, K extends Serializable, V extends Serializable, O, R> MapReduceJob<I, K, V, O, R>
      createMapreduceJob(
          Mapper<I, K, V> mapper,
          Reducer<K, V, O> reducer,
          Iterable<? extends Input<? extends I>> inputs,
          Output<O, R> output) {
    checkCommonRequiredFields(inputs, mapper);
    checkArgumentNotNull(reducer, "reducer");
    return new MapReduceJob<>(
        new MapReduceSpecification.Builder<I, K, V, O, R>()
            .setJobName(jobName)
            // The HTTP parameters, when present, win over the configured default shard counts.
            .setInput(new ConcatenatingInput<>(inputs, httpParamMapShards.orElse(defaultMapShards)))
            .setMapper(mapper)
            .setReducer(reducer)
            .setOutput(output)
            .setKeyMarshaller(Marshallers.getSerializationMarshaller())
            .setValueMarshaller(Marshallers.getSerializationMarshaller())
            .setNumReducers(httpParamReduceShards.orElse(defaultReduceShards))
            .build(),
        new MapReduceSettings.Builder()
            .setWorkerQueueName(QUEUE_NAME)
            .setBaseUrl(BASE_URL)
            .setModule(moduleName)
            .setMillisPerSlice(getMillisPerSlice())
            .build());
  }

  /**
   * Kick off a mapreduce job.
   *
   * <p>For simplicity, the mapreduce is hard-coded with {@link NoOutput}, on the assumption that
   * all work will be accomplished via side effects during the map or reduce phases.
   *
   * @see #createMapreduceJob for creating and running a mapreduce as part of a pipeline
   *
   * @param mapper instance of a mapper class
   * @param reducer instance of a reducer class
   * @param inputs input sources for the mapper
   * @param <I> mapper input type
   * @param <K> emitted key type
   * @param <V> emitted value type
   * @return the job id
   */
  public final <I, K extends Serializable, V extends Serializable> String runMapreduce(
      Mapper<I, K, V> mapper,
      Reducer<K, V, Void> reducer,
      Iterable<? extends Input<? extends I>> inputs) {
    return runMapreduce(mapper, reducer, inputs, new NoOutput<Void, Void>());
  }

  /**
   * Kick off a mapreduce job with specified Output handler.
   *
   * @see #createMapreduceJob for creating and running a mapreduce as part of a pipeline
   *
   * @param mapper instance of a mapper class
   * @param reducer instance of a reducer class
   * @param inputs input sources for the mapper
   * @param output destination that receives the records emitted by the reducer
   * @param <I> mapper input type
   * @param <K> emitted key type
   * @param <V> emitted value type
   * @param <O> emitted output type
   * @param <R> return value of output
   * @return the job id
   */
  public final <I, K extends Serializable, V extends Serializable, O, R> String runMapreduce(
      Mapper<I, K, V> mapper,
      Reducer<K, V, O> reducer,
      Iterable<? extends Input<? extends I>> inputs,
      Output<O, R> output) {
    return runAsPipeline(createMapreduceJob(mapper, reducer, inputs, output));
  }

  /**
   * Returns the configured slice duration in milliseconds, as the {@code int} the mapreduce
   * settings builders take.
   *
   * @throws ArithmeticException if the duration in milliseconds does not fit in an {@code int}
   */
  private int getMillisPerSlice() {
    // A plain (int) cast would silently wrap durations longer than Integer.MAX_VALUE milliseconds
    // (roughly 24.8 days) into a wrong, possibly negative, value; fail loudly instead.
    return Math.toIntExact(sliceDuration.getMillis());
  }

  /**
   * Verifies the state and arguments that every job needs: the job name and module name must
   * have been set on this runner, and the inputs and mapper must be non-null.
   */
  private void checkCommonRequiredFields(Iterable<?> inputs, Mapper<?, ?, ?> mapper) {
    checkNotNull(jobName, "jobName");
    checkNotNull(moduleName, "moduleName");
    checkArgumentNotNull(inputs, "inputs");
    checkArgumentNotNull(mapper, "mapper");
  }

  /**
   * Starts the given job as a new pipeline on the configured module and the mapreduce queue, logs
   * where it was started, and returns the id of the started pipeline.
   */
  private String runAsPipeline(Job0<?> job) {
    String jobId =
        newPipelineService()
            .startNewPipeline(
                job, new JobSetting.OnModule(moduleName), new JobSetting.OnQueue(QUEUE_NAME));
    logger.atInfo().log(
        "Started '%s' %s job: %s",
        jobName, job instanceof MapJob ? "map" : "mapreduce", PipelineUtils.createJobPath(jobId));
    return jobId;
  }
}