// Copyright 2017 The Nomulus Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package google.registry.mapreduce;

import static com.google.appengine.api.search.checkers.Preconditions.checkNotNull;
import static com.google.appengine.tools.pipeline.PipelineServiceFactory.newPipelineService;
import static google.registry.util.PreconditionsUtils.checkArgumentNotNull;

import com.google.appengine.tools.mapreduce.Input;
import com.google.appengine.tools.mapreduce.MapJob;
import com.google.appengine.tools.mapreduce.MapReduceJob;
import com.google.appengine.tools.mapreduce.MapReduceSettings;
import com.google.appengine.tools.mapreduce.MapReduceSpecification;
import com.google.appengine.tools.mapreduce.MapSettings;
import com.google.appengine.tools.mapreduce.MapSpecification;
import com.google.appengine.tools.mapreduce.Mapper;
import com.google.appengine.tools.mapreduce.Marshallers;
import com.google.appengine.tools.mapreduce.Output;
import com.google.appengine.tools.mapreduce.Reducer;
import com.google.appengine.tools.mapreduce.outputs.NoOutput;
import com.google.appengine.tools.pipeline.Job0;
import com.google.appengine.tools.pipeline.JobSetting;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import google.registry.mapreduce.inputs.ConcatenatingInput;
import google.registry.request.Parameter;
import google.registry.util.FormattingLogger;
import google.registry.util.PipelineUtils;
import java.io.Serializable;
import javax.inject.Inject;
import org.joda.time.Duration;

/**
* Runner for map-only or full map and reduce mapreduces.
 *
 * <p>We use hardcoded serialization marshallers for moving data between steps, so all types used
 * as keys or values must implement {@link Serializable}.
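 *
 * <p>Typical usage, as an illustrative sketch only ({@code MyMapper}, {@code MyReducer}, and
 * {@code input} are hypothetical; the runner itself is normally obtained via injection):
 *
 * <pre>{@code
 * String jobId = mapreduceRunner
 *     .setJobName("Re-save all widget entities")
 *     .setModuleName("backend")
 *     .runMapreduce(new MyMapper(), new MyReducer(), ImmutableList.of(input));
 * }</pre>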
*/
public class MapreduceRunner {

  private static final FormattingLogger logger = FormattingLogger.getLoggerForCallerClass();

  public static final String PARAM_DRY_RUN = "dryRun";
  public static final String PARAM_MAP_SHARDS = "mapShards";
  public static final String PARAM_REDUCE_SHARDS = "reduceShards";

  private static final String BASE_URL = "/_dr/mapreduce/";
  private static final String QUEUE_NAME = "mapreduce";

  private final Optional<Integer> httpParamMapShards;
  private final Optional<Integer> httpParamReduceShards;

// Default to 3 minutes since many slices will contain Datastore queries that time out at 4:30.
  private Duration sliceDuration = Duration.standardMinutes(3);

  private String jobName;
  private String moduleName;

// Defaults for number of mappers/reducers if not specified in HTTP params. The max allowable
// count for both (which is specified in the App Engine mapreduce framework) is 1000. We use 100
// mapper shards because there's a bottleneck in the App Engine mapreduce framework caused by
// updating the mapreduce status on a single Datastore entity (which only supports so many writes
// per second). The existing mapreduces don't actually do that much work for TLDs that aren't
// .com-sized, so the shards finish so quickly that contention becomes a problem. This number can
// always be tuned up for large registry systems with on the order of hundreds of thousands of
// entities on up.
// The default reducer shard count is one because most mapreduces use it to collate and output
// results. The ones that actually perform a substantial amount of work in a reduce step use a
// higher non-default number of reducer shards.
  private int defaultMapShards = 100;
  private int defaultReduceShards = 1;

  /**
   * @param mapShards number of map shards; if omitted, the {@link Input} objects will choose
   * @param reduceShards number of reduce shards; if omitted, uses {@link #defaultReduceShards}
   */
  @Inject
  @VisibleForTesting
  public MapreduceRunner(
      @Parameter(PARAM_MAP_SHARDS) Optional<Integer> mapShards,
      @Parameter(PARAM_REDUCE_SHARDS) Optional<Integer> reduceShards) {
    this.httpParamMapShards = mapShards;
    this.httpParamReduceShards = reduceShards;
  }

  /** Set the max time to run a slice before serializing; defaults to 3 minutes. */
  public MapreduceRunner setSliceDuration(Duration sliceDuration) {
    this.sliceDuration = checkArgumentNotNull(sliceDuration, "sliceDuration");
    return this;
  }

  /** Set the human-readable job name for display purposes. */
  public MapreduceRunner setJobName(String jobName) {
    this.jobName = checkArgumentNotNull(jobName, "jobName");
    return this;
  }

  /** Set the module to run in. */
  public MapreduceRunner setModuleName(String moduleName) {
    this.moduleName = checkArgumentNotNull(moduleName, "moduleName");
    return this;
  }

  /** Set the default number of mappers, if not overridden by the HTTP param. */
  public MapreduceRunner setDefaultMapShards(int defaultMapShards) {
    this.defaultMapShards = defaultMapShards;
    return this;
  }

  /** Set the default number of reducers, if not overridden by the HTTP param. */
  public MapreduceRunner setDefaultReduceShards(int defaultReduceShards) {
    this.defaultReduceShards = defaultReduceShards;
    return this;
  }

  /**
   * Create a map-only mapreduce to be run as part of a pipeline.
   *
   * @see #runMapOnly for creating and running an independent map-only mapreduce
   *
   * @param mapper instance of a mapper class
   * @param output {@link Output} to send the mapped results to
   * @param inputs input sources for the mapper
   * @param <I> mapper input type
   * @param <O> individual output record type sent to the {@link Output}
   * @param <R> overall output result type
*/
  public <I, O, R> MapJob<I, O, R> createMapOnlyJob(
      Mapper<I, Void, O> mapper,
      Output<O, R> output,
      Iterable<? extends Input<? extends I>> inputs) {
    checkCommonRequiredFields(inputs, mapper);
    return new MapJob<>(
        new MapSpecification.Builder<I, O, R>()
            .setJobName(jobName)
            .setInput(new ConcatenatingInput<>(inputs, httpParamMapShards.or(defaultMapShards)))
            .setMapper(mapper)
            .setOutput(output)
            .build(),
        new MapSettings.Builder()
            .setWorkerQueueName(QUEUE_NAME)
            .setBaseUrl(BASE_URL)
            .setModule(moduleName)
            .setMillisPerSlice((int) sliceDuration.getMillis())
            .build());
  }

  /**
   * Kick off a map-only mapreduce.
   *
   * <p>For simplicity, the mapreduce is hard-coded with {@link NoOutput}, on the assumption that
   * all work will be accomplished via side effects during the map phase.
   *
   * @see #createMapOnlyJob for creating and running a map-only mapreduce as part of a pipeline
   *
   * @param mapper instance of a mapper class
   * @param inputs input sources for the mapper
   * @param <I> mapper input type
   * @return the job id
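   *
   * <p>For example (a sketch; {@code MyMapper} and {@code input} are hypothetical):
   *
   * <pre>{@code
   * String jobId = mapreduceRunner
   *     .setJobName("Touch all entities")
   *     .setModuleName("backend")
   *     .runMapOnly(new MyMapper(), ImmutableList.of(input));
   * }</pre>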
*/
  public <I> String runMapOnly(
      Mapper<I, Void, Void> mapper,
      Iterable<? extends Input<? extends I>> inputs) {
    return runAsPipeline(createMapOnlyJob(mapper, new NoOutput<Void, Void>(), inputs));
  }

  /**
   * Create a mapreduce job to be run as part of a pipeline.
   *
   * @see #runMapreduce for creating and running an independent mapreduce
   *
   * @param mapper instance of a mapper class
   * @param reducer instance of a reducer class
   * @param inputs input sources for the mapper
   * @param output {@link Output} to send the reduced results to
   * @param <I> mapper input type
   * @param <K> emitted key type
   * @param <V> emitted value type
   * @param <O> individual output record type sent to the {@link Output}
   * @param <R> overall output result type
*/
  public final <I, K extends Serializable, V extends Serializable, O, R>
      MapReduceJob<I, K, V, O, R> createMapreduceJob(
          Mapper<I, K, V> mapper,
          Reducer<K, V, O> reducer,
          Iterable<? extends Input<? extends I>> inputs,
          Output<O, R> output) {
    checkCommonRequiredFields(inputs, mapper);
    checkArgumentNotNull(reducer, "reducer");
    return new MapReduceJob<>(
        new MapReduceSpecification.Builder<I, K, V, O, R>()
            .setJobName(jobName)
            .setInput(new ConcatenatingInput<>(inputs, httpParamMapShards.or(defaultMapShards)))
            .setMapper(mapper)
            .setReducer(reducer)
            .setOutput(output)
            .setKeyMarshaller(Marshallers.<K>getSerializationMarshaller())
            .setValueMarshaller(Marshallers.<V>getSerializationMarshaller())
            .setNumReducers(httpParamReduceShards.or(defaultReduceShards))
            .build(),
        new MapReduceSettings.Builder()
            .setWorkerQueueName(QUEUE_NAME)
            .setBaseUrl(BASE_URL)
            .setModule(moduleName)
            .setMillisPerSlice((int) sliceDuration.getMillis())
            .build());
  }

  /**
   * Kick off a mapreduce job.
   *
   * <p>For simplicity, the mapreduce is hard-coded with {@link NoOutput}, on the assumption that
   * all work will be accomplished via side effects during the map or reduce phases.
   *
   * @see #createMapreduceJob for creating and running a mapreduce as part of a pipeline
   *
   * @param mapper instance of a mapper class
   * @param reducer instance of a reducer class
   * @param inputs input sources for the mapper
   * @param <I> mapper input type
   * @param <K> emitted key type
   * @param <V> emitted value type
   * @return the job id
*/
  public final <I, K extends Serializable, V extends Serializable> String runMapreduce(
      Mapper<I, K, V> mapper,
      Reducer<K, V, Void> reducer,
      Iterable<? extends Input<? extends I>> inputs) {
    return runMapreduce(mapper, reducer, inputs, new NoOutput<Void, Void>());
  }

  /**
   * Kick off a mapreduce job with a specified {@link Output} handler.
   *
   * @see #createMapreduceJob for creating and running a mapreduce as part of a pipeline
   *
   * @param mapper instance of a mapper class
   * @param reducer instance of a reducer class
   * @param inputs input sources for the mapper
   * @param output {@link Output} to send the reduced results to
   * @param <I> mapper input type
   * @param <K> emitted key type
   * @param <V> emitted value type
   * @param <O> emitted output type
   * @param <R> return value of output
   * @return the job id
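   *
   * <p>Sketch with a concrete output ({@code MyMapper} and {@code MyReducer} are hypothetical and
   * assumed to emit {@code String} keys and values; {@link
   * com.google.appengine.tools.mapreduce.outputs.InMemoryOutput} collects the reducer output):
   *
   * <pre>{@code
   * String jobId = mapreduceRunner
   *     .setJobName("Tally entities")
   *     .setModuleName("backend")
   *     .runMapreduce(
   *         new MyMapper(),
   *         new MyReducer(),
   *         ImmutableList.of(input),
   *         new InMemoryOutput<String>());
   * }</pre>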
*/
  public final <I, K extends Serializable, V extends Serializable, O, R> String runMapreduce(
      Mapper<I, K, V> mapper,
      Reducer<K, V, O> reducer,
      Iterable<? extends Input<? extends I>> inputs,
      Output<O, R> output) {
    return runAsPipeline(createMapreduceJob(mapper, reducer, inputs, output));
  }

  private void checkCommonRequiredFields(
      Iterable<? extends Input<?>> inputs, Mapper<?, ?, ?> mapper) {
    checkNotNull(jobName, "jobName");
    checkNotNull(moduleName, "moduleName");
    checkArgumentNotNull(inputs, "inputs");
    checkArgumentNotNull(mapper, "mapper");
  }

  private String runAsPipeline(Job0<?> job) {
String jobId = newPipelineService().startNewPipeline(
job,
new JobSetting.OnModule(moduleName),
new JobSetting.OnQueue(QUEUE_NAME));
logger.infofmt(
"Started '%s' %s job: %s",
jobName,
job instanceof MapJob ? "map" : "mapreduce",
PipelineUtils.createJobPath(jobId));
return jobId;
}
}