Change ICANN SRS/EPP activity query to use new JSON log line

This finally fixes b/37629674 by cutting over the ICANN activity reporting query for EPP/SRS metrics to use the new JSON-based structured log line in FlowReporter, which is much easier to parse and interpret correctly than the old XML logging, which was not designed to be ingested into BigQuery.

-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=159633467
2025-07-10 21:23:22 +02:00 · 2017-06-20 16:42:22 -07:00 · 2017-06-20 16:42:22 -07:00 · dce0daafc3
commit dce0daafc3
parent 02a5e3d20f
2 changed files with 95 additions and 210 deletions
--- a/python/google/registry/reporting/icann_report_query_builder.py
+++ b/python/google/registry/reporting/icann_report_query_builder.py
@ -24,18 +24,9 @@ reports (not transaction reports).
 """
 import datetime
-# This regex pattern matches the full signature of the 'EPP Command' log line
+# This signature must match the one logged by FlowReporter - see
-# from FlowRunner.run(), i.e. it matches the logging class/method that prefixes
+# cs/symbol:google.registry.flows.FlowReporter.METADATA_LOG_SIGNATURE
-# the log message, plus the 'EPP Command' string, up to the newline.
+FLOWREPORTER_LOG_SIGNATURE = 'FLOW-LOG-SIGNATURE-METADATA'
 # Queries used below depend on matching this log line and parsing its
 # exact format, so it must be kept in sync with the logging site.
 # TODO(b/20725722): make the log statement format more robust.
 FLOWRUNNER_LOG_SIGNATURE_PATTERN = '(?:{}): EPP Command'.format('|'.join([
    'google.registry.flows.FlowRunner run',
    'com.google.domain.registry.flows.FlowRunner run',
    # TODO(b/29397966): figure out why this is FormattingLogger vs FlowRunner.
    'com.google.domain.registry.util.FormattingLogger log',
    'google.registry.util.FormattingLogger log']))
 class IcannReportQueryBuilder(object):
@ -68,14 +59,13 @@ class IcannReportQueryBuilder(object):
    # Construct the queries themselves.
    logs_query = self._MakeMonthlyLogsQuery(this_yearmonth, next_yearmonth)
    epp_xml_logs_query = self._MakeEppXmlLogsQuery(logs_query)
    data_source_queries = [
        self._MakeActivityOperationalRegistrarsQuery(next_yearmonth),
        self._MakeActivityAllRampedUpRegistrarsQuery(next_yearmonth),
        self._MakeActivityAllRegistrarsQuery(registrar_count),
        self._MakeActivityWhoisQuery(logs_query),
        self._MakeActivityDnsQuery(),
-        self._MakeActivityEppSrsMetricsQuery(epp_xml_logs_query)
+        self._MakeActivityEppSrsMetricsQuery(logs_query)
    ]
    return _StripTrailingWhitespaceFromLines(self._MakeActivityReportQuery(
        data_source_queries))
@ -99,32 +89,6 @@ class IcannReportQueryBuilder(object):
    return query % {'this_yearmonth': this_yearmonth,
                    'next_yearmonth': next_yearmonth}
  def _MakeEppXmlLogsQuery(self, logs_query):
    # TODO(b/20725722): add a real docstring.
    # pylint: disable=missing-docstring
    # This query relies on regex-parsing the precise format of the 'EPP Command'
    # log line from FlowRunner.run(), so it must be kept in sync.
    # TODO(b/20725722): make the log statement format more robust.
    query = r"""
      -- Query EPP request logs and extract the clientId and raw EPP XML.
      SELECT
        REGEXP_EXTRACT(logMessage, r'^%(log_signature)s\n\t.*\n\t(.*)\n') AS clientId,
        REGEXP_EXTRACT(logMessage, r'^%(log_signature)s\n\t.*\n\t.*\n\t.*\n\t((?s).*)$') AS xml,
      FROM (
        -- BEGIN LOGS QUERY --
        %(logs_query)s
        -- END LOGS QUERY --
      )
      WHERE
        -- EPP endpoints from the proxy, regtool, and console respectively.
        (requestPath IN ('/_dr/epp', '/_dr/epptool', '/registrar-xhr')
          OR LEFT(requestPath, 7) = '/check?')
        AND REGEXP_MATCH(logMessage, r'^%(log_signature)s')
        AND NOT logMessage CONTAINS 'DRY_RUN'
    """
    return query % {'logs_query': logs_query,
                    'log_signature': FLOWRUNNER_LOG_SIGNATURE_PATTERN}
  def _MakeActivityReportQuery(self, data_source_queries):
    """Make the overall activity report query.
@ -316,91 +280,60 @@ class IcannReportQueryBuilder(object):
    """
    return query
-  def _MakeActivityEppSrsMetricsQuery(self, epp_xml_logs_query):
+  def _MakeActivityEppSrsMetricsQuery(self, logs_query):
    # TODO(b/20725722): add a real docstring.
    # pylint: disable=missing-docstring
    query = r"""
-      -- Query EPP XML messages and calculate SRS metrics.
+      -- Query FlowReporter JSON log messages and calculate SRS metrics.
      SELECT * FROM (
      SELECT
-        LOWER(domainTld) AS tld,
+        tld,
-        -- SRS metric names follow a set pattern corresponding to the EPP
+        activityReportField AS metricName,
-        -- protocol elements.  First we extract the 'inner' command element in
+        -- Manual INTEGER cast to work around a BigQuery bug (b/14560012).
-        -- EPP, e.g. <domain:create>, which is the resource type followed by
+        INTEGER(COUNT(*)) AS count,
-        -- the standard EPP command type.  To get the metric name, we add the
+      FROM
-        -- prefix 'srs-', abbreviate 'domain' as 'dom' and 'contact' as 'cont',
+        -- Flatten the "tld" column (repeated) so that domain checks for names
-        -- and replace ':' with '-' to produce 'srs-dom-create'.
+        -- across multiple TLDs are counted towards each checked TLD as though
-        --
+        -- there were one copy of this row per TLD (the effect of flattening).
-        -- Transfers have subcommands indicated by an 'op' attribute, which we
+        FLATTEN((
-        -- extract and add as an extra suffix for transfer commands, so e.g.
+          SELECT
-        -- 'srs-cont-transfer-approve'.  Domain restores are domain updates
+            -- Use some ugly regex hackery to convert JSON list of strings into
-        -- with a special <rgp:restore> element; if present, the command counts
+            -- repeated string values, since there's no built-in for this.
-        -- under the srs-dom-rgp-restore-{request,report} metric (depending on
+            -- TODO(b/20829992): replace with "JSON.parse()" inside a JS UDF
-        -- the value of the 'op' attribute) instead of srs-dom-update.
+            --   once we can use GoogleSQL; example in b/37629674#comment2.
-        CONCAT(
+            REGEXP_EXTRACT(
-          'srs-',
+              SPLIT(
-          REPLACE(REPLACE(REPLACE(
+                REGEXP_EXTRACT(
-            CASE
+                  JSON_EXTRACT(json, '$.tlds'),
-              WHEN NOT restoreOp IS NULL THEN CONCAT('domain-rgp-restore-', restoreOp)
+                  r'^\[(.*)\]$')),
-              WHEN commandType = 'transfer' THEN CONCAT(innerCommand, '-', commandOpArg)
+              '^"(.*)"$') AS tld,
-              ELSE innerCommand
+            -- TODO(b/XXX): remove rawTlds after June 2017 (see below).
-            END,
+            JSON_EXTRACT_SCALAR(json, '$.resourceType') AS resourceType,
-          ':', '-'), 'domain', 'dom'), 'contact', 'cont')
+            JSON_EXTRACT_SCALAR(json, '$.icannActivityReportField')
-        ) AS metricName,
+              AS activityReportField,
        INTEGER(COUNT(xml)) AS count,
          FROM (
            SELECT
-        -- Extract salient bits of the EPP XML using regexes.  This is fairly
+              -- Extract JSON payload following log signature.
-        -- safe since the EPP gets schema-validated and pretty-printed before
+              REGEXP_EXTRACT(logMessage, r'%(log_signature)s: (.*)\n?$')
-        -- getting logged, and so it looks something like this:
+                AS json,
        --
        --   <command>
        --     <transfer op="request">
        --       <domain:transfer ...
        --
        -- From that, we parse out 'transfer' as the command type from the name
        -- of the first element after <command>, 'request' as the value of the
        -- 'op' attribute of that element (if any), and 'domain:transfer' as
        -- the inner command from the name of the subsequent element.
        --
        -- Domain commands all have at least one <domain:name> element (more
        -- than one for domain checks, but we just count the first), from which
        -- we extract the domain TLD as everything after the first dot in the
        -- element value.  This won't work if the client mistakenly sends a
        -- hostname (e.g. 'www.foo.example') as the domain name, but we prefer
        -- this over taking everything after the last dot so that multipart
        -- TLDs like 'co.uk' can be supported.
        --
        -- Domain restores are indicated by an <rgp:restore> element, from
        -- which we extract the value of the 'op' attribute.
        --
        -- TODO(b/20725722): preprocess the XML in FlowRunner so we don't need
        -- regex parsing of XML here (http://stackoverflow.com/a/1732454).
        --
        REGEXP_EXTRACT(xml, '(?s)<command>.*?<([a-z]+)') AS commandType,
        REGEXP_EXTRACT(xml, '(?s)<command>.*?<[a-z]+ op="(.+?)"') AS commandOpArg,
        REGEXP_EXTRACT(xml, '(?s)<command>.*?<.+?>.*?<([a-z]+:[a-z]+)') AS innerCommand,
        REGEXP_EXTRACT(xml, '<domain:name.*?>[^.]*[.](.+)</domain:name>') AS domainTld,
        REGEXP_EXTRACT(xml, '<rgp:restore op="(.+?)"/>') AS restoreOp,
        xml,
            FROM (
-        -- BEGIN EPP XML LOGS QUERY --
+              -- BEGIN LOGS QUERY --
-        %(epp_xml_logs_query)s
+              %(logs_query)s
-        -- END EPP XML LOGS QUERY --
+              -- END LOGS QUERY --
            )
-      -- Filter to just XML that contains a <command> element (no <hello>s).
+            WHERE logMessage CONTAINS '%(log_signature)s'
      WHERE xml CONTAINS '<command>'
          )
-      -- Whitelist of EPP command types that we care about for metrics;
+        ),
-      -- excludes login, logout, and poll.
+        -- Second argument to flatten (see above).
-      WHERE commandType IN ('check', 'create', 'delete', 'info', 'renew', 'transfer', 'update')
+        tld)
      -- Exclude cases that can't be tabulated correctly - activity report field
      -- is null/empty, or the TLD is null/empty even though it's a domain flow.
      WHERE
        activityReportField != '' AND (tld != '' OR resourceType != 'domain')
      GROUP BY tld, metricName
-      )
+      ORDER BY tld, metricName
      -- Exclude domain-related EPP requests with no parsed TLD, otherwise
      -- a NULL tld will make them apply to all TLDs like contact/host requests.
      WHERE NOT (metricName CONTAINS 'srs-dom' AND tld IS NULL)
    """
-    return query % {'epp_xml_logs_query': epp_xml_logs_query}
+    return query % {'logs_query': logs_query,
                    'log_signature': FLOWREPORTER_LOG_SIGNATURE}
 def _StripTrailingWhitespaceFromLines(string):
--- a/python/google/registry/reporting/testdata/golden_activity_query.sql
+++ b/python/google/registry/reporting/testdata/golden_activity_query.sql
@ -175,76 +175,37 @@
 ),
 (
-      -- Query EPP XML messages and calculate SRS metrics.
+      -- Query FlowReporter JSON log messages and calculate SRS metrics.
      SELECT * FROM (
      SELECT
-        LOWER(domainTld) AS tld,
+        tld,
-        -- SRS metric names follow a set pattern corresponding to the EPP
+        activityReportField AS metricName,
-        -- protocol elements.  First we extract the 'inner' command element in
+        -- Manual INTEGER cast to work around a BigQuery bug (b/14560012).
-        -- EPP, e.g. <domain:create>, which is the resource type followed by
+        INTEGER(COUNT(*)) AS count,
-        -- the standard EPP command type.  To get the metric name, we add the
+      FROM
-        -- prefix 'srs-', abbreviate 'domain' as 'dom' and 'contact' as 'cont',
+        -- Flatten the "tld" column (repeated) so that domain checks for names
-        -- and replace ':' with '-' to produce 'srs-dom-create'.
+        -- across multiple TLDs are counted towards each checked TLD as though
-        --
+        -- there were one copy of this row per TLD (the effect of flattening).
-        -- Transfers have subcommands indicated by an 'op' attribute, which we
+        FLATTEN((
-        -- extract and add as an extra suffix for transfer commands, so e.g.
+          SELECT
-        -- 'srs-cont-transfer-approve'.  Domain restores are domain updates
+            -- Use some ugly regex hackery to convert JSON list of strings into
-        -- with a special <rgp:restore> element; if present, the command counts
+            -- repeated string values, since there's no built-in for this.
-        -- under the srs-dom-rgp-restore-{request,report} metric (depending on
+            -- TODO(b/20829992): replace with "JSON.parse()" inside a JS UDF
-        -- the value of the 'op' attribute) instead of srs-dom-update.
+            --   once we can use GoogleSQL; example in b/37629674#comment2.
-        CONCAT(
+            REGEXP_EXTRACT(
-          'srs-',
+              SPLIT(
-          REPLACE(REPLACE(REPLACE(
+                REGEXP_EXTRACT(
-            CASE
+                  JSON_EXTRACT(json, '$.tlds'),
-              WHEN NOT restoreOp IS NULL THEN CONCAT('domain-rgp-restore-', restoreOp)
+                  r'^\[(.*)\]$')),
-              WHEN commandType = 'transfer' THEN CONCAT(innerCommand, '-', commandOpArg)
+              '^"(.*)"$') AS tld,
-              ELSE innerCommand
+            -- TODO(b/XXX): remove rawTlds after June 2017 (see below).
-            END,
+            JSON_EXTRACT_SCALAR(json, '$.resourceType') AS resourceType,
-          ':', '-'), 'domain', 'dom'), 'contact', 'cont')
+            JSON_EXTRACT_SCALAR(json, '$.icannActivityReportField')
-        ) AS metricName,
+              AS activityReportField,
        INTEGER(COUNT(xml)) AS count,
          FROM (
            SELECT
-        -- Extract salient bits of the EPP XML using regexes.  This is fairly
+              -- Extract JSON payload following log signature.
-        -- safe since the EPP gets schema-validated and pretty-printed before
+              REGEXP_EXTRACT(logMessage, r'FLOW-LOG-SIGNATURE-METADATA: (.*)\n?$')
-        -- getting logged, and so it looks something like this:
+                AS json,
        --
        --   <command>
        --     <transfer op="request">
        --       <domain:transfer ...
        --
        -- From that, we parse out 'transfer' as the command type from the name
        -- of the first element after <command>, 'request' as the value of the
        -- 'op' attribute of that element (if any), and 'domain:transfer' as
        -- the inner command from the name of the subsequent element.
        --
        -- Domain commands all have at least one <domain:name> element (more
        -- than one for domain checks, but we just count the first), from which
        -- we extract the domain TLD as everything after the first dot in the
        -- element value.  This won't work if the client mistakenly sends a
        -- hostname (e.g. 'www.foo.example') as the domain name, but we prefer
        -- this over taking everything after the last dot so that multipart
        -- TLDs like 'co.uk' can be supported.
        --
        -- Domain restores are indicated by an <rgp:restore> element, from
        -- which we extract the value of the 'op' attribute.
        --
        -- TODO(b/20725722): preprocess the XML in FlowRunner so we don't need
        -- regex parsing of XML here (http://stackoverflow.com/a/1732454).
        --
        REGEXP_EXTRACT(xml, '(?s)<command>.*?<([a-z]+)') AS commandType,
        REGEXP_EXTRACT(xml, '(?s)<command>.*?<[a-z]+ op="(.+?)"') AS commandOpArg,
        REGEXP_EXTRACT(xml, '(?s)<command>.*?<.+?>.*?<([a-z]+:[a-z]+)') AS innerCommand,
        REGEXP_EXTRACT(xml, '<domain:name.*?>[^.]*[.](.+)</domain:name>') AS domainTld,
        REGEXP_EXTRACT(xml, '<rgp:restore op="(.+?)"/>') AS restoreOp,
        xml,
      FROM (
        -- BEGIN EPP XML LOGS QUERY --
      -- Query EPP request logs and extract the clientId and raw EPP XML.
      SELECT
        REGEXP_EXTRACT(logMessage, r'^(?:google.registry.flows.FlowRunner run|com.google.domain.registry.flows.FlowRunner run|com.google.domain.registry.util.FormattingLogger log|google.registry.util.FormattingLogger log): EPP Command\n\t.*\n\t(.*)\n') AS clientId,
        REGEXP_EXTRACT(logMessage, r'^(?:google.registry.flows.FlowRunner run|com.google.domain.registry.flows.FlowRunner run|com.google.domain.registry.util.FormattingLogger log|google.registry.util.FormattingLogger log): EPP Command\n\t.*\n\t.*\n\t.*\n\t((?s).*)$') AS xml,
            FROM (
              -- BEGIN LOGS QUERY --
@ -262,26 +223,17 @@
              -- END LOGS QUERY --
            )
            WHERE logMessage CONTAINS 'FLOW-LOG-SIGNATURE-METADATA'
          )
        ),
        -- Second argument to flatten (see above).
        tld)
      -- Exclude cases that can't be tabulated correctly - activity report field
      -- is null/empty, or the TLD is null/empty even though it's a domain flow.
      WHERE
-        -- EPP endpoints from the proxy, regtool, and console respectively.
+        activityReportField != '' AND (tld != '' OR resourceType != 'domain')
        (requestPath IN ('/_dr/epp', '/_dr/epptool', '/registrar-xhr')
          OR LEFT(requestPath, 7) = '/check?')
        AND REGEXP_MATCH(logMessage, r'^(?:google.registry.flows.FlowRunner run|com.google.domain.registry.flows.FlowRunner run|com.google.domain.registry.util.FormattingLogger log|google.registry.util.FormattingLogger log): EPP Command')
        AND NOT logMessage CONTAINS 'DRY_RUN'
        -- END EPP XML LOGS QUERY --
       )
      -- Filter to just XML that contains a <command> element (no <hello>s).
      WHERE xml CONTAINS '<command>'
      )
      -- Whitelist of EPP command types that we care about for metrics;
      -- excludes login, logout, and poll.
      WHERE commandType IN ('check', 'create', 'delete', 'info', 'renew', 'transfer', 'update')
      GROUP BY tld, metricName
-      )
+      ORDER BY tld, metricName
      -- Exclude domain-related EPP requests with no parsed TLD, otherwise
      -- a NULL tld will make them apply to all TLDs like contact/host requests.
      WHERE NOT (metricName CONTAINS 'srs-dom' AND tld IS NULL)
 )
          -- END JOINED DATA SOURCES --