mirror of
https://github.com/google/nomulus.git
synced 2025-04-30 12:07:51 +02:00
273 lines
10 KiB
Java
273 lines
10 KiB
Java
// Copyright 2016 The Domain Registry Authors. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package google.registry.mapreduce;
|
|
|
|
import static com.google.appengine.api.search.checkers.Preconditions.checkNotNull;
|
|
import static com.google.appengine.tools.pipeline.PipelineServiceFactory.newPipelineService;
|
|
import static google.registry.util.PreconditionsUtils.checkArgumentNotNull;
|
|
|
|
import com.google.appengine.tools.mapreduce.Input;
|
|
import com.google.appengine.tools.mapreduce.MapJob;
|
|
import com.google.appengine.tools.mapreduce.MapReduceJob;
|
|
import com.google.appengine.tools.mapreduce.MapReduceSettings;
|
|
import com.google.appengine.tools.mapreduce.MapReduceSpecification;
|
|
import com.google.appengine.tools.mapreduce.MapSettings;
|
|
import com.google.appengine.tools.mapreduce.MapSpecification;
|
|
import com.google.appengine.tools.mapreduce.Mapper;
|
|
import com.google.appengine.tools.mapreduce.Marshallers;
|
|
import com.google.appengine.tools.mapreduce.Output;
|
|
import com.google.appengine.tools.mapreduce.Reducer;
|
|
import com.google.appengine.tools.mapreduce.outputs.NoOutput;
|
|
import com.google.appengine.tools.pipeline.Job0;
|
|
import com.google.appengine.tools.pipeline.JobSetting;
|
|
import com.google.common.annotations.VisibleForTesting;
|
|
import com.google.common.base.Optional;
|
|
import google.registry.mapreduce.inputs.ConcatenatingInput;
|
|
import google.registry.request.Parameter;
|
|
import google.registry.util.FormattingLogger;
|
|
import google.registry.util.PipelineUtils;
|
|
import java.io.Serializable;
|
|
import javax.inject.Inject;
|
|
import org.joda.time.Duration;
|
|
|
|
/**
|
|
* Runner for map-only or full map and reduce mapreduces.
|
|
*
|
|
* <p>We use hardcoded serialization marshallers for moving data between steps, so all types used as
|
|
* keys or values must implement {@link Serializable}.
|
|
*/
|
|
public class MapreduceRunner {
|
|
|
|
private static final FormattingLogger logger = FormattingLogger.getLoggerForCallerClass();
|
|
|
|
public static final String PARAM_DRY_RUN = "dryRun";
|
|
public static final String PARAM_MAP_SHARDS = "mapShards";
|
|
public static final String PARAM_REDUCE_SHARDS = "reduceShards";
|
|
|
|
private static final String BASE_URL = "/_dr/mapreduce/";
|
|
private static final String QUEUE_NAME = "mapreduce";
|
|
|
|
private final Optional<Integer> httpParamMapShards;
|
|
private final Optional<Integer> httpParamReduceShards;
|
|
|
|
// Default to 3 minutes since many slices will contain datastore queries that time out at 4:30.
|
|
private Duration sliceDuration = Duration.standardMinutes(3);
|
|
private String jobName;
|
|
private String moduleName;
|
|
|
|
// Defaults for number of mappers/reducers if not specified in HTTP params.
|
|
private int defaultMapShards = Integer.MAX_VALUE;
|
|
private int defaultReduceShards = 1;
|
|
|
|
/**
|
|
* @param mapShards number of map shards; if omitted, the {@link Input} objects will choose
|
|
* @param reduceShards number of reduce shards; if omitted, uses {@link #defaultReduceShards}
|
|
*/
|
|
@Inject
|
|
@VisibleForTesting
|
|
public MapreduceRunner(
|
|
@Parameter(PARAM_MAP_SHARDS) Optional<Integer> mapShards,
|
|
@Parameter(PARAM_REDUCE_SHARDS) Optional<Integer> reduceShards) {
|
|
this.httpParamMapShards = mapShards;
|
|
this.httpParamReduceShards = reduceShards;
|
|
}
|
|
|
|
/** Set the max time to run a slice before serializing; defaults to 3 minutes. */
|
|
public MapreduceRunner setSliceDuration(Duration sliceDuration) {
|
|
this.sliceDuration = checkArgumentNotNull(sliceDuration, "sliceDuration");
|
|
return this;
|
|
}
|
|
|
|
/** Set the human readable job name for display purposes. */
|
|
public MapreduceRunner setJobName(String jobName) {
|
|
this.jobName = checkArgumentNotNull(jobName, "jobName");
|
|
return this;
|
|
}
|
|
|
|
/** Set the module to run in. */
|
|
public MapreduceRunner setModuleName(String moduleName) {
|
|
this.moduleName = checkArgumentNotNull(moduleName, "moduleName");
|
|
return this;
|
|
}
|
|
|
|
/** Set the default number of mappers, if not overriden by the http param. */
|
|
public MapreduceRunner setDefaultMapShards(int defaultMapShards) {
|
|
this.defaultMapShards = defaultMapShards;
|
|
return this;
|
|
}
|
|
|
|
/** Set the default number of reducers, if not overriden by the http param. */
|
|
public MapreduceRunner setDefaultReduceShards(int defaultReduceShards) {
|
|
this.defaultReduceShards = defaultReduceShards;
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Create a map-only mapreduce to be run as part of a pipeline.
|
|
*
|
|
* @see #runMapOnly for creating and running an independent map-only mapreduce
|
|
*
|
|
* @param mapper instance of a mapper class
|
|
* @param inputs input sources for the mapper
|
|
* @param <I> mapper input type
|
|
* @param <O> individual output record type sent to the {@link Output}
|
|
* @param <R> overall output result type
|
|
*/
|
|
public <I, O, R> MapJob<I, O, R> createMapOnlyJob(
|
|
Mapper<I, Void, O> mapper,
|
|
Output<O, R> output,
|
|
Iterable<? extends Input<? extends I>> inputs) {
|
|
checkCommonRequiredFields(inputs, mapper);
|
|
return new MapJob<>(
|
|
new MapSpecification.Builder<I, O, R>()
|
|
.setJobName(jobName)
|
|
.setInput(new ConcatenatingInput<>(inputs, httpParamMapShards.or(defaultMapShards)))
|
|
.setMapper(mapper)
|
|
.setOutput(output)
|
|
.build(),
|
|
new MapSettings.Builder()
|
|
.setWorkerQueueName(QUEUE_NAME)
|
|
.setBaseUrl(BASE_URL)
|
|
.setModule(moduleName)
|
|
.setMillisPerSlice((int) sliceDuration.getMillis())
|
|
.build());
|
|
}
|
|
|
|
/**
|
|
* Kick off a map-only mapreduce.
|
|
*
|
|
* <p>For simplicity, the mapreduce is hard-coded with {@link NoOutput}, on the assumption that
|
|
* all work will be accomplished via side effects during the map phase.
|
|
*
|
|
* @see #createMapOnlyJob for creating and running a map-only mapreduce as part of a pipeline
|
|
*
|
|
* @param mapper instance of a mapper class
|
|
* @param inputs input sources for the mapper
|
|
* @param <I> mapper input type
|
|
* @return the job id
|
|
*/
|
|
public <I> String runMapOnly(
|
|
Mapper<I, Void, Void> mapper,
|
|
Iterable<? extends Input<? extends I>> inputs) {
|
|
return runAsPipeline(createMapOnlyJob(mapper, new NoOutput<Void, Void>(), inputs));
|
|
}
|
|
|
|
/**
|
|
* Create a mapreduce job to be run as part of a pipeline.
|
|
*
|
|
* @see #runMapreduce for creating and running an independent mapreduce
|
|
*
|
|
* @param mapper instance of a mapper class
|
|
* @param reducer instance of a reducer class
|
|
* @param inputs input sources for the mapper
|
|
* @param <I> mapper input type
|
|
* @param <K> emitted key type
|
|
* @param <V> emitted value type
|
|
* @param <O> individual output record type sent to the {@link Output}
|
|
* @param <R> overall output result type
|
|
*/
|
|
public final <I, K extends Serializable, V extends Serializable, O, R> MapReduceJob<I, K, V, O, R>
|
|
createMapreduceJob(
|
|
Mapper<I, K, V> mapper,
|
|
Reducer<K, V, O> reducer,
|
|
Iterable<? extends Input<? extends I>> inputs,
|
|
Output<O, R> output) {
|
|
checkCommonRequiredFields(inputs, mapper);
|
|
checkArgumentNotNull(reducer, "reducer");
|
|
return new MapReduceJob<>(
|
|
new MapReduceSpecification.Builder<I, K, V, O, R>()
|
|
.setJobName(jobName)
|
|
.setInput(new ConcatenatingInput<>(inputs, httpParamMapShards.or(defaultMapShards)))
|
|
.setMapper(mapper)
|
|
.setReducer(reducer)
|
|
.setOutput(output)
|
|
.setKeyMarshaller(Marshallers.<K>getSerializationMarshaller())
|
|
.setValueMarshaller(Marshallers.<V>getSerializationMarshaller())
|
|
.setNumReducers(httpParamReduceShards.or(defaultReduceShards))
|
|
.build(),
|
|
new MapReduceSettings.Builder()
|
|
.setWorkerQueueName(QUEUE_NAME)
|
|
.setBaseUrl(BASE_URL)
|
|
.setModule(moduleName)
|
|
.setMillisPerSlice((int) sliceDuration.getMillis())
|
|
.build());
|
|
}
|
|
|
|
/**
|
|
* Kick off a mapreduce job.
|
|
*
|
|
* <p>For simplicity, the mapreduce is hard-coded with {@link NoOutput}, on the assumption that
|
|
* all work will be accomplished via side effects during the map or reduce phases.
|
|
*
|
|
* @see #createMapreduceJob for creating and running a mapreduce as part of a pipeline
|
|
*
|
|
* @param mapper instance of a mapper class
|
|
* @param reducer instance of a reducer class
|
|
* @param inputs input sources for the mapper
|
|
* @param <I> mapper input type
|
|
* @param <K> emitted key type
|
|
* @param <V> emitted value type
|
|
* @return the job id
|
|
*/
|
|
public final <I, K extends Serializable, V extends Serializable> String runMapreduce(
|
|
Mapper<I, K, V> mapper,
|
|
Reducer<K, V, Void> reducer,
|
|
Iterable<? extends Input<? extends I>> inputs) {
|
|
return runMapreduce(mapper, reducer, inputs, new NoOutput<Void, Void>());
|
|
}
|
|
|
|
/**
|
|
* Kick off a mapreduce job with specified Output handler.
|
|
*
|
|
* @see #createMapreduceJob for creating and running a mapreduce as part of a pipeline
|
|
*
|
|
* @param mapper instance of a mapper class
|
|
* @param reducer instance of a reducer class
|
|
* @param inputs input sources for the mapper
|
|
* @param <I> mapper input type
|
|
* @param <K> emitted key type
|
|
* @param <V> emitted value type
|
|
* @param <O> emitted output type
|
|
* @param <R> return value of output
|
|
* @return the job id
|
|
*/
|
|
public final <I, K extends Serializable, V extends Serializable, O, R> String runMapreduce(
|
|
Mapper<I, K, V> mapper,
|
|
Reducer<K, V, O> reducer,
|
|
Iterable<? extends Input<? extends I>> inputs,
|
|
Output<O, R> output) {
|
|
return runAsPipeline(createMapreduceJob(mapper, reducer, inputs, output));
|
|
}
|
|
|
|
private void checkCommonRequiredFields(Iterable<?> inputs, Mapper<?, ?, ?> mapper) {
|
|
checkNotNull(jobName, "jobName");
|
|
checkNotNull(moduleName, "moduleName");
|
|
checkArgumentNotNull(inputs, "inputs");
|
|
checkArgumentNotNull(mapper, "mapper");
|
|
}
|
|
|
|
private String runAsPipeline(Job0<?> job) {
|
|
String jobId = newPipelineService().startNewPipeline(
|
|
job,
|
|
new JobSetting.OnModule(moduleName),
|
|
new JobSetting.OnQueue(QUEUE_NAME));
|
|
logger.infofmt(
|
|
"Started '%s' %s job: %s",
|
|
jobName,
|
|
job instanceof MapJob ? "map" : "mapreduce",
|
|
PipelineUtils.createJobPath(jobId));
|
|
return jobId;
|
|
}
|
|
}
|