diff --git a/core/src/main/java/google/registry/beam/common/RegistryJpaIO.java b/core/src/main/java/google/registry/beam/common/RegistryJpaIO.java index 2f169c0ef..b224e2547 100644 --- a/core/src/main/java/google/registry/beam/common/RegistryJpaIO.java +++ b/core/src/main/java/google/registry/beam/common/RegistryJpaIO.java @@ -34,11 +34,11 @@ import org.apache.beam.sdk.coders.SerializableCoder; import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.transforms.Create; -import org.apache.beam.sdk.transforms.Deduplicate; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.GroupIntoBatches; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.sdk.transforms.Reshuffle; import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.transforms.WithKeys; import org.apache.beam.sdk.util.ShardedKey; @@ -83,22 +83,6 @@ public final class RegistryJpaIO { * A {@link PTransform transform} that executes a JPA {@link CriteriaQuery} and adds the results * to the BEAM pipeline. Users have the option to transform the results before sending them to the * next stages. - * - *
The BEAM pipeline may execute this transform multiple times due to transient failures, - * loading duplicate results into the pipeline. Before we add dedepuplication support, the easiest - * workaround is to map results to {@link KV} pairs, and apply the {@link Deduplicate} transform - * to the output of this transform: - * - *
{@code - * PCollection*/ @AutoValue public abstract static class ReadcontactIds = - * pipeline - * .apply(RegistryJpaIO.read( - * (JpaTransactionManager tm) -> tm.createQueryComposer..., - * contact -> KV.of(contact.getRepoId(), contact.getContactId())) - * .withCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()))) - * .apply(Deduplicate.keyedValues()) - * .apply(Values.create()); - * }
With many JDBC drivers, including PostgreSQL, a positive fetch size is required for + * streaming large result sets. A zero value, often the drivers' default setting, causes + * the entire result set to be buffered. + * + *
The fetch size value, the default as well as the user-provided one, will be applied if and
+ * only if the underlying query implementor supports it. The Hibernate implementations do support
+ * this.
+ */
+ public QueryComposer