Change ICANN SRS/EPP activity query to use new JSON log line

This finally fixes b/37629674 by cutting over the ICANN activity reporting query for EPP/SRS metrics to use the new JSON-based structured log line in FlowReporter. The JSON log line is much easier to parse and interpret correctly than the old XML logging, which was not designed to be ingested into BigQuery.

-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=159633467
This commit is contained in:
nickfelt 2017-06-20 16:42:22 -07:00 committed by Ben McIlwain
parent 02a5e3d20f
commit dce0daafc3
2 changed files with 95 additions and 210 deletions

View file

@ -24,18 +24,9 @@ reports (not transaction reports).
""" """
import datetime import datetime
# This regex pattern matches the full signature of the 'EPP Command' log line # This signature must match the one logged by FlowReporter - see
# from FlowRunner.run(), i.e. it matches the logging class/method that prefixes # cs/symbol:google.registry.flows.FlowReporter.METADATA_LOG_SIGNATURE
# the log message, plus the 'EPP Command' string, up to the newline. FLOWREPORTER_LOG_SIGNATURE = 'FLOW-LOG-SIGNATURE-METADATA'
# Queries used below depend on matching this log line and parsing its
# exact format, so it must be kept in sync with the logging site.
# TODO(b/20725722): make the log statement format more robust.
FLOWRUNNER_LOG_SIGNATURE_PATTERN = '(?:{}): EPP Command'.format('|'.join([
'google.registry.flows.FlowRunner run',
'com.google.domain.registry.flows.FlowRunner run',
# TODO(b/29397966): figure out why this is FormattingLogger vs FlowRunner.
'com.google.domain.registry.util.FormattingLogger log',
'google.registry.util.FormattingLogger log']))
class IcannReportQueryBuilder(object): class IcannReportQueryBuilder(object):
@ -68,14 +59,13 @@ class IcannReportQueryBuilder(object):
# Construct the queries themselves. # Construct the queries themselves.
logs_query = self._MakeMonthlyLogsQuery(this_yearmonth, next_yearmonth) logs_query = self._MakeMonthlyLogsQuery(this_yearmonth, next_yearmonth)
epp_xml_logs_query = self._MakeEppXmlLogsQuery(logs_query)
data_source_queries = [ data_source_queries = [
self._MakeActivityOperationalRegistrarsQuery(next_yearmonth), self._MakeActivityOperationalRegistrarsQuery(next_yearmonth),
self._MakeActivityAllRampedUpRegistrarsQuery(next_yearmonth), self._MakeActivityAllRampedUpRegistrarsQuery(next_yearmonth),
self._MakeActivityAllRegistrarsQuery(registrar_count), self._MakeActivityAllRegistrarsQuery(registrar_count),
self._MakeActivityWhoisQuery(logs_query), self._MakeActivityWhoisQuery(logs_query),
self._MakeActivityDnsQuery(), self._MakeActivityDnsQuery(),
self._MakeActivityEppSrsMetricsQuery(epp_xml_logs_query) self._MakeActivityEppSrsMetricsQuery(logs_query)
] ]
return _StripTrailingWhitespaceFromLines(self._MakeActivityReportQuery( return _StripTrailingWhitespaceFromLines(self._MakeActivityReportQuery(
data_source_queries)) data_source_queries))
@ -99,32 +89,6 @@ class IcannReportQueryBuilder(object):
return query % {'this_yearmonth': this_yearmonth, return query % {'this_yearmonth': this_yearmonth,
'next_yearmonth': next_yearmonth} 'next_yearmonth': next_yearmonth}
def _MakeEppXmlLogsQuery(self, logs_query):
# TODO(b/20725722): add a real docstring.
# pylint: disable=missing-docstring
# This query relies on regex-parsing the precise format of the 'EPP Command'
# log line from FlowRunner.run(), so it must be kept in sync.
# TODO(b/20725722): make the log statement format more robust.
query = r"""
-- Query EPP request logs and extract the clientId and raw EPP XML.
SELECT
REGEXP_EXTRACT(logMessage, r'^%(log_signature)s\n\t.*\n\t(.*)\n') AS clientId,
REGEXP_EXTRACT(logMessage, r'^%(log_signature)s\n\t.*\n\t.*\n\t.*\n\t((?s).*)$') AS xml,
FROM (
-- BEGIN LOGS QUERY --
%(logs_query)s
-- END LOGS QUERY --
)
WHERE
-- EPP endpoints from the proxy, regtool, and console respectively.
(requestPath IN ('/_dr/epp', '/_dr/epptool', '/registrar-xhr')
OR LEFT(requestPath, 7) = '/check?')
AND REGEXP_MATCH(logMessage, r'^%(log_signature)s')
AND NOT logMessage CONTAINS 'DRY_RUN'
"""
return query % {'logs_query': logs_query,
'log_signature': FLOWRUNNER_LOG_SIGNATURE_PATTERN}
def _MakeActivityReportQuery(self, data_source_queries): def _MakeActivityReportQuery(self, data_source_queries):
"""Make the overall activity report query. """Make the overall activity report query.
@ -316,91 +280,60 @@ class IcannReportQueryBuilder(object):
""" """
return query return query
def _MakeActivityEppSrsMetricsQuery(self, epp_xml_logs_query): def _MakeActivityEppSrsMetricsQuery(self, logs_query):
# TODO(b/20725722): add a real docstring. # TODO(b/20725722): add a real docstring.
# pylint: disable=missing-docstring # pylint: disable=missing-docstring
query = r""" query = r"""
-- Query EPP XML messages and calculate SRS metrics. -- Query FlowReporter JSON log messages and calculate SRS metrics.
SELECT * FROM (
SELECT SELECT
LOWER(domainTld) AS tld, tld,
-- SRS metric names follow a set pattern corresponding to the EPP activityReportField AS metricName,
-- protocol elements. First we extract the 'inner' command element in -- Manual INTEGER cast to work around a BigQuery bug (b/14560012).
-- EPP, e.g. <domain:create>, which is the resource type followed by INTEGER(COUNT(*)) AS count,
-- the standard EPP command type. To get the metric name, we add the FROM
-- prefix 'srs-', abbreviate 'domain' as 'dom' and 'contact' as 'cont', -- Flatten the "tld" column (repeated) so that domain checks for names
-- and replace ':' with '-' to produce 'srs-dom-create'. -- across multiple TLDs are counted towards each checked TLD as though
-- -- there were one copy of this row per TLD (the effect of flattening).
-- Transfers have subcommands indicated by an 'op' attribute, which we FLATTEN((
-- extract and add as an extra suffix for transfer commands, so e.g. SELECT
-- 'srs-cont-transfer-approve'. Domain restores are domain updates -- Use some ugly regex hackery to convert JSON list of strings into
-- with a special <rgp:restore> element; if present, the command counts -- repeated string values, since there's no built-in for this.
-- under the srs-dom-rgp-restore-{request,report} metric (depending on -- TODO(b/20829992): replace with "JSON.parse()" inside a JS UDF
-- the value of the 'op' attribute) instead of srs-dom-update. -- once we can use GoogleSQL; example in b/37629674#comment2.
CONCAT( REGEXP_EXTRACT(
'srs-', SPLIT(
REPLACE(REPLACE(REPLACE( REGEXP_EXTRACT(
CASE JSON_EXTRACT(json, '$.tlds'),
WHEN NOT restoreOp IS NULL THEN CONCAT('domain-rgp-restore-', restoreOp) r'^\[(.*)\]$')),
WHEN commandType = 'transfer' THEN CONCAT(innerCommand, '-', commandOpArg) '^"(.*)"$') AS tld,
ELSE innerCommand -- TODO(b/XXX): remove rawTlds after June 2017 (see below).
END, JSON_EXTRACT_SCALAR(json, '$.resourceType') AS resourceType,
':', '-'), 'domain', 'dom'), 'contact', 'cont') JSON_EXTRACT_SCALAR(json, '$.icannActivityReportField')
) AS metricName, AS activityReportField,
INTEGER(COUNT(xml)) AS count, FROM (
FROM ( SELECT
SELECT -- Extract JSON payload following log signature.
-- Extract salient bits of the EPP XML using regexes. This is fairly REGEXP_EXTRACT(logMessage, r'%(log_signature)s: (.*)\n?$')
-- safe since the EPP gets schema-validated and pretty-printed before AS json,
-- getting logged, and so it looks something like this: FROM (
-- -- BEGIN LOGS QUERY --
-- <command> %(logs_query)s
-- <transfer op="request"> -- END LOGS QUERY --
-- <domain:transfer ... )
-- WHERE logMessage CONTAINS '%(log_signature)s'
-- From that, we parse out 'transfer' as the command type from the name )
-- of the first element after <command>, 'request' as the value of the ),
-- 'op' attribute of that element (if any), and 'domain:transfer' as -- Second argument to flatten (see above).
-- the inner command from the name of the subsequent element. tld)
-- -- Exclude cases that can't be tabulated correctly - activity report field
-- Domain commands all have at least one <domain:name> element (more -- is null/empty, or the TLD is null/empty even though it's a domain flow.
-- than one for domain checks, but we just count the first), from which WHERE
-- we extract the domain TLD as everything after the first dot in the activityReportField != '' AND (tld != '' OR resourceType != 'domain')
-- element value. This won't work if the client mistakenly sends a
-- hostname (e.g. 'www.foo.example') as the domain name, but we prefer
-- this over taking everything after the last dot so that multipart
-- TLDs like 'co.uk' can be supported.
--
-- Domain restores are indicated by an <rgp:restore> element, from
-- which we extract the value of the 'op' attribute.
--
-- TODO(b/20725722): preprocess the XML in FlowRunner so we don't need
-- regex parsing of XML here (http://stackoverflow.com/a/1732454).
--
REGEXP_EXTRACT(xml, '(?s)<command>.*?<([a-z]+)') AS commandType,
REGEXP_EXTRACT(xml, '(?s)<command>.*?<[a-z]+ op="(.+?)"') AS commandOpArg,
REGEXP_EXTRACT(xml, '(?s)<command>.*?<.+?>.*?<([a-z]+:[a-z]+)') AS innerCommand,
REGEXP_EXTRACT(xml, '<domain:name.*?>[^.]*[.](.+)</domain:name>') AS domainTld,
REGEXP_EXTRACT(xml, '<rgp:restore op="(.+?)"/>') AS restoreOp,
xml,
FROM (
-- BEGIN EPP XML LOGS QUERY --
%(epp_xml_logs_query)s
-- END EPP XML LOGS QUERY --
)
-- Filter to just XML that contains a <command> element (no <hello>s).
WHERE xml CONTAINS '<command>'
)
-- Whitelist of EPP command types that we care about for metrics;
-- excludes login, logout, and poll.
WHERE commandType IN ('check', 'create', 'delete', 'info', 'renew', 'transfer', 'update')
GROUP BY tld, metricName GROUP BY tld, metricName
) ORDER BY tld, metricName
-- Exclude domain-related EPP requests with no parsed TLD, otherwise
-- a NULL tld will make them apply to all TLDs like contact/host requests.
WHERE NOT (metricName CONTAINS 'srs-dom' AND tld IS NULL)
""" """
return query % {'epp_xml_logs_query': epp_xml_logs_query} return query % {'logs_query': logs_query,
'log_signature': FLOWREPORTER_LOG_SIGNATURE}
def _StripTrailingWhitespaceFromLines(string): def _StripTrailingWhitespaceFromLines(string):

View file

@ -175,78 +175,39 @@
), ),
( (
-- Query EPP XML messages and calculate SRS metrics. -- Query FlowReporter JSON log messages and calculate SRS metrics.
SELECT * FROM (
SELECT SELECT
LOWER(domainTld) AS tld, tld,
-- SRS metric names follow a set pattern corresponding to the EPP activityReportField AS metricName,
-- protocol elements. First we extract the 'inner' command element in -- Manual INTEGER cast to work around a BigQuery bug (b/14560012).
-- EPP, e.g. <domain:create>, which is the resource type followed by INTEGER(COUNT(*)) AS count,
-- the standard EPP command type. To get the metric name, we add the FROM
-- prefix 'srs-', abbreviate 'domain' as 'dom' and 'contact' as 'cont', -- Flatten the "tld" column (repeated) so that domain checks for names
-- and replace ':' with '-' to produce 'srs-dom-create'. -- across multiple TLDs are counted towards each checked TLD as though
-- -- there were one copy of this row per TLD (the effect of flattening).
-- Transfers have subcommands indicated by an 'op' attribute, which we FLATTEN((
-- extract and add as an extra suffix for transfer commands, so e.g. SELECT
-- 'srs-cont-transfer-approve'. Domain restores are domain updates -- Use some ugly regex hackery to convert JSON list of strings into
-- with a special <rgp:restore> element; if present, the command counts -- repeated string values, since there's no built-in for this.
-- under the srs-dom-rgp-restore-{request,report} metric (depending on -- TODO(b/20829992): replace with "JSON.parse()" inside a JS UDF
-- the value of the 'op' attribute) instead of srs-dom-update. -- once we can use GoogleSQL; example in b/37629674#comment2.
CONCAT( REGEXP_EXTRACT(
'srs-', SPLIT(
REPLACE(REPLACE(REPLACE( REGEXP_EXTRACT(
CASE JSON_EXTRACT(json, '$.tlds'),
WHEN NOT restoreOp IS NULL THEN CONCAT('domain-rgp-restore-', restoreOp) r'^\[(.*)\]$')),
WHEN commandType = 'transfer' THEN CONCAT(innerCommand, '-', commandOpArg) '^"(.*)"$') AS tld,
ELSE innerCommand -- TODO(b/XXX): remove rawTlds after June 2017 (see below).
END, JSON_EXTRACT_SCALAR(json, '$.resourceType') AS resourceType,
':', '-'), 'domain', 'dom'), 'contact', 'cont') JSON_EXTRACT_SCALAR(json, '$.icannActivityReportField')
) AS metricName, AS activityReportField,
INTEGER(COUNT(xml)) AS count, FROM (
FROM ( SELECT
SELECT -- Extract JSON payload following log signature.
-- Extract salient bits of the EPP XML using regexes. This is fairly REGEXP_EXTRACT(logMessage, r'FLOW-LOG-SIGNATURE-METADATA: (.*)\n?$')
-- safe since the EPP gets schema-validated and pretty-printed before AS json,
-- getting logged, and so it looks something like this: FROM (
-- -- BEGIN LOGS QUERY --
-- <command>
-- <transfer op="request">
-- <domain:transfer ...
--
-- From that, we parse out 'transfer' as the command type from the name
-- of the first element after <command>, 'request' as the value of the
-- 'op' attribute of that element (if any), and 'domain:transfer' as
-- the inner command from the name of the subsequent element.
--
-- Domain commands all have at least one <domain:name> element (more
-- than one for domain checks, but we just count the first), from which
-- we extract the domain TLD as everything after the first dot in the
-- element value. This won't work if the client mistakenly sends a
-- hostname (e.g. 'www.foo.example') as the domain name, but we prefer
-- this over taking everything after the last dot so that multipart
-- TLDs like 'co.uk' can be supported.
--
-- Domain restores are indicated by an <rgp:restore> element, from
-- which we extract the value of the 'op' attribute.
--
-- TODO(b/20725722): preprocess the XML in FlowRunner so we don't need
-- regex parsing of XML here (http://stackoverflow.com/a/1732454).
--
REGEXP_EXTRACT(xml, '(?s)<command>.*?<([a-z]+)') AS commandType,
REGEXP_EXTRACT(xml, '(?s)<command>.*?<[a-z]+ op="(.+?)"') AS commandOpArg,
REGEXP_EXTRACT(xml, '(?s)<command>.*?<.+?>.*?<([a-z]+:[a-z]+)') AS innerCommand,
REGEXP_EXTRACT(xml, '<domain:name.*?>[^.]*[.](.+)</domain:name>') AS domainTld,
REGEXP_EXTRACT(xml, '<rgp:restore op="(.+?)"/>') AS restoreOp,
xml,
FROM (
-- BEGIN EPP XML LOGS QUERY --
-- Query EPP request logs and extract the clientId and raw EPP XML.
SELECT
REGEXP_EXTRACT(logMessage, r'^(?:google.registry.flows.FlowRunner run|com.google.domain.registry.flows.FlowRunner run|com.google.domain.registry.util.FormattingLogger log|google.registry.util.FormattingLogger log): EPP Command\n\t.*\n\t(.*)\n') AS clientId,
REGEXP_EXTRACT(logMessage, r'^(?:google.registry.flows.FlowRunner run|com.google.domain.registry.flows.FlowRunner run|com.google.domain.registry.util.FormattingLogger log|google.registry.util.FormattingLogger log): EPP Command\n\t.*\n\t.*\n\t.*\n\t((?s).*)$') AS xml,
FROM (
-- BEGIN LOGS QUERY --
-- Query AppEngine request logs for the report month. -- Query AppEngine request logs for the report month.
SELECT SELECT
@ -260,28 +221,19 @@
-- timestamp representing the start of the next month. -- timestamp representing the start of the next month.
DATE_ADD(TIMESTAMP('2016-07-01'), -1, 'SECOND')) DATE_ADD(TIMESTAMP('2016-07-01'), -1, 'SECOND'))
-- END LOGS QUERY -- -- END LOGS QUERY --
) )
WHERE logMessage CONTAINS 'FLOW-LOG-SIGNATURE-METADATA'
)
),
-- Second argument to flatten (see above).
tld)
-- Exclude cases that can't be tabulated correctly - activity report field
-- is null/empty, or the TLD is null/empty even though it's a domain flow.
WHERE WHERE
-- EPP endpoints from the proxy, regtool, and console respectively. activityReportField != '' AND (tld != '' OR resourceType != 'domain')
(requestPath IN ('/_dr/epp', '/_dr/epptool', '/registrar-xhr')
OR LEFT(requestPath, 7) = '/check?')
AND REGEXP_MATCH(logMessage, r'^(?:google.registry.flows.FlowRunner run|com.google.domain.registry.flows.FlowRunner run|com.google.domain.registry.util.FormattingLogger log|google.registry.util.FormattingLogger log): EPP Command')
AND NOT logMessage CONTAINS 'DRY_RUN'
-- END EPP XML LOGS QUERY --
)
-- Filter to just XML that contains a <command> element (no <hello>s).
WHERE xml CONTAINS '<command>'
)
-- Whitelist of EPP command types that we care about for metrics;
-- excludes login, logout, and poll.
WHERE commandType IN ('check', 'create', 'delete', 'info', 'renew', 'transfer', 'update')
GROUP BY tld, metricName GROUP BY tld, metricName
) ORDER BY tld, metricName
-- Exclude domain-related EPP requests with no parsed TLD, otherwise
-- a NULL tld will make them apply to all TLDs like contact/host requests.
WHERE NOT (metricName CONTAINS 'srs-dom' AND tld IS NULL)
) )
-- END JOINED DATA SOURCES -- -- END JOINED DATA SOURCES --