Improve logs in the GCP proxy

Tweaked a few logging levels to avoid spamming error-level logs. Also made it easier to debug issues in case a relay retry fails. [1] Put non-fatal exceptions that should be logged at warning level in their own explicit sets. Also always use the root cause to determine whether an exception is non-fatal, because sometimes the actual causes are wrapped inside other exceptions. [2] Record the cause of a relay failure, and record whether a relay retry is successful. This way we can look at the log and figure out whether a relay is eventually successful. [3] Add a log when the frontend connection from the client is terminated. [4] Always close the relay channel when a relay has failed, which, depending on whether the channel is frontend or backend, will reconnect and trigger a retry. [5] Lastly, changed the failure tests to use assertThrows instead of fail. ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=208649916
2025-08-26 19:13:48 +02:00 · 2018-08-14 08:22:31 -07:00 · 2018-08-14 08:22:31 -07:00 · 0e64015cdf
commit 0e64015cdf
parent b552c1d115
10 changed files with 154 additions and 95 deletions
--- a/java/google/registry/proxy/handler/HttpsRelayServiceHandler.java
+++ b/java/google/registry/proxy/handler/HttpsRelayServiceHandler.java
@ -14,13 +14,14 @@

 package google.registry.proxy.handler;

-import static com.google.common.base.Preconditions.checkArgument;
 import static java.nio.charset.StandardCharsets.UTF_8;

 import com.google.common.base.Throwables;
+import com.google.common.collect.ImmutableSet;
 import com.google.common.flogger.FluentLogger;
 import google.registry.proxy.metric.FrontendMetrics;
 import io.netty.buffer.ByteBuf;
+import io.netty.channel.Channel;
 import io.netty.channel.ChannelFuture;
 import io.netty.channel.ChannelHandlerContext;
 import io.netty.channel.ChannelPromise;
@ -58,10 +59,16 @@ import javax.net.ssl.SSLHandshakeException;
 * <p>This handler is session aware and will store all the session cookies that the are contained in
 * the HTTP response headers, which are added back to headers of subsequent HTTP requests.
 */
-abstract class HttpsRelayServiceHandler extends ByteToMessageCodec<FullHttpResponse> {
+public abstract class HttpsRelayServiceHandler extends ByteToMessageCodec<FullHttpResponse> {

  private static final FluentLogger logger = FluentLogger.forEnclosingClass();

+  protected static final ImmutableSet<Class<? extends Exception>> NON_FATAL_INBOUND_EXCEPTIONS =
+      ImmutableSet.of(ReadTimeoutException.class, SSLHandshakeException.class);
+
+  protected static final ImmutableSet<Class<? extends Exception>> NON_FATAL_OUTBOUND_EXCEPTIONS =
+      ImmutableSet.of(NonOkHttpResponseException.class);
+
  private final Map<String, Cookie> cookieStore = new LinkedHashMap<>();
  private final String relayHost;
  private final String relayPath;
@ -153,12 +160,9 @@ abstract class HttpsRelayServiceHandler extends ByteToMessageCodec<FullHttpRespo
  @Override
  protected void encode(ChannelHandlerContext ctx, FullHttpResponse response, ByteBuf byteBuf)
      throws Exception {
-    checkArgument(
-        response.status().equals(HttpResponseStatus.OK),
-        "Cannot relay HTTP response status \"%s\" in channel %s:\n%s",
-        response.status(),
-        ctx.channel(),
-        response.content().toString(UTF_8));
+    if (!response.status().equals(HttpResponseStatus.OK)) {
+      throw new NonOkHttpResponseException(response, ctx.channel());
+    }
    saveCookies(response);
    byteBuf.writeBytes(encodeFullHttpResponse(response));
  }
@ -166,10 +170,7 @@ abstract class HttpsRelayServiceHandler extends ByteToMessageCodec<FullHttpRespo
  /** Terminates connection upon inbound exception. */
  @Override
  public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) throws Exception {
-    // ReadTimeoutException is non fatal as the client times out due to inactivity.
-    // SslHandshakeException is caused by the client not able to complete the handshake, we should
-    // not log it at error as we do not control client behavior.
-    if (cause instanceof ReadTimeoutException || cause instanceof SSLHandshakeException) {
+    if (NON_FATAL_INBOUND_EXCEPTIONS.contains(Throwables.getRootCause(cause).getClass())) {
      logger.atWarning().withCause(cause).log(
          "Inbound exception caught for channel %s", ctx.channel());
    } else {
@ -187,10 +188,7 @@ abstract class HttpsRelayServiceHandler extends ByteToMessageCodec<FullHttpRespo
        (ChannelFuture channelFuture) -> {
          if (!channelFuture.isSuccess()) {
            Throwable cause = channelFuture.cause();
-            // If the failure is caused by IllegalArgumentException, we know that it is because we
-            // got a non 200 response. This is an expected error from the backend and should not be
-            // logged at severe.
-            if (Throwables.getRootCause(cause) instanceof IllegalArgumentException) {
+            if (NON_FATAL_OUTBOUND_EXCEPTIONS.contains(Throwables.getRootCause(cause).getClass())) {
              logger.atWarning().withCause(channelFuture.cause()).log(
                  "Outbound exception caught for channel %s", channelFuture.channel());
            } else {
@ -202,4 +200,14 @@ abstract class HttpsRelayServiceHandler extends ByteToMessageCodec<FullHttpRespo
        });
    super.write(ctx, msg, promise);
  }
+
+  /** Exception thrown when the response status from GAE is not 200. */
+  public static class NonOkHttpResponseException extends Exception {
+    NonOkHttpResponseException(FullHttpResponse response, Channel channel) {
+      super(
+          String.format(
+              "Cannot relay HTTP response status \"%s\" in channel %s:\n%s",
+              response.status(), channel, response.content().toString(UTF_8)));
+    }
+  }
 }
--- a/java/google/registry/proxy/handler/RelayHandler.java
+++ b/java/google/registry/proxy/handler/RelayHandler.java
@ -66,39 +66,51 @@ public class RelayHandler<I> extends SimpleChannelInboundHandler<I> {
      logger.atSevere().log("Relay channel not specified for channel: %s", channel);
      ChannelFuture unusedFuture = channel.close();
    } else {
-      writeToRelayChannel(channel, relayChannel, msg);
+      writeToRelayChannel(channel, relayChannel, msg, false);
    }
  }

-  public static void writeToRelayChannel(Channel channel, Channel relayChannel, Object msg) {
+  public static void writeToRelayChannel(
+      Channel channel, Channel relayChannel, Object msg, boolean retry) {
    ChannelFuture unusedFuture =
        relayChannel
            .writeAndFlush(msg)
            .addListener(
                future -> {
                  if (!future.isSuccess()) {
-                    logger.atWarning().log(
-                        "Relay failed: %s --> %s\nINBOUND: %s\nOUTBOUND: %s",
+                    // TODO (jianglai): do not log the message once retry behavior is confirmed.
+                    logger.atWarning().withCause(future.cause()).log(
+                        "Relay failed: %s --> %s\nINBOUND: %s\nOUTBOUND: %s\nMESSAGE: %s",
                        channel.attr(PROTOCOL_KEY).get().name(),
                        relayChannel.attr(PROTOCOL_KEY).get().name(),
                        channel,
-                        relayChannel);
+                        relayChannel,
+                        msg);
                    // If we cannot write to the relay channel and the originating channel has
                    // a relay buffer (i. e. we tried to relay the frontend to the backend), store
-                    // the message in the buffer for retry later. Otherwise, we are relaying from
-                    // the backend to the frontend, and this relay failure cannot be recovered
-                    // from, we should just kill the relay (frontend) channel, which in turn will
-                    // kill the backend channel. We should not kill any backend channel while the
-                    // the frontend channel is open, because that will just trigger a reconnect.
-                    // It is fine to just save the message object itself, not a clone of it,
-                    // because if the relay is not successful, its content is not read, therefore
-                    // its buffer is not cleared.
+                    // the message in the buffer for retry later. The relay channel (backend) should
+                    // be killed (if it is not already dead, usually the relay is unsuccessful
+                    // because the connection is closed), and a new backend channel will re-connect
+                    // as long as the frontend channel is open. Otherwise, we are relaying from the
+                    // backend to the frontend, and this relay failure cannot be recovered from: we
+                    // should just kill the relay (frontend) channel, which in turn will kill the
+                    // backend channel. It is fine to just save the message object itself, not a
+                    // clone of it, because if the relay is not successful, its content is not read,
+                    // therefore its buffer is not cleared.
                    Queue<Object> relayBuffer = channel.attr(RELAY_BUFFER_KEY).get();
                    if (relayBuffer != null) {
                      channel.attr(RELAY_BUFFER_KEY).get().add(msg);
-                    } else {
-                      ChannelFuture unusedFuture2 = relayChannel.close();
                    }
+                    ChannelFuture unusedFuture2 = relayChannel.close();
+                  } else if (retry) {
+                    // TODO (jianglai): do not log the message once retry behavior is confirmed.
+                    logger.atInfo().log(
+                        "Relay retry succeeded: %s --> %s\nINBOUND: %s\nOUTBOUND: %s\nMESSAGE: %s",
+                        channel.attr(PROTOCOL_KEY).get().name(),
+                        relayChannel.attr(PROTOCOL_KEY).get().name(),
+                        channel,
+                        relayChannel,
+                        msg);
                  }
                });
  }