diff --git a/python/google/registry/reporting/icann_report_query_builder.py b/python/google/registry/reporting/icann_report_query_builder.py index fc647f245..d107db907 100644 --- a/python/google/registry/reporting/icann_report_query_builder.py +++ b/python/google/registry/reporting/icann_report_query_builder.py @@ -24,18 +24,9 @@ reports (not transaction reports). """ import datetime -# This regex pattern matches the full signature of the 'EPP Command' log line -# from FlowRunner.run(), i.e. it matches the logging class/method that prefixes -# the log message, plus the 'EPP Command' string, up to the newline. -# Queries used below depend on matching this log line and parsing its -# exact format, so it must be kept in sync with the logging site. -# TODO(b/20725722): make the log statement format more robust. -FLOWRUNNER_LOG_SIGNATURE_PATTERN = '(?:{}): EPP Command'.format('|'.join([ - 'google.registry.flows.FlowRunner run', - 'com.google.domain.registry.flows.FlowRunner run', - # TODO(b/29397966): figure out why this is FormattingLogger vs FlowRunner. - 'com.google.domain.registry.util.FormattingLogger log', - 'google.registry.util.FormattingLogger log'])) +# This signature must match the one logged by FlowReporter - see +# cs/symbol:google.registry.flows.FlowReporter.METADATA_LOG_SIGNATURE +FLOWREPORTER_LOG_SIGNATURE = 'FLOW-LOG-SIGNATURE-METADATA' class IcannReportQueryBuilder(object): @@ -68,14 +59,13 @@ class IcannReportQueryBuilder(object): # Construct the queries themselves. logs_query = self._MakeMonthlyLogsQuery(this_yearmonth, next_yearmonth) - epp_xml_logs_query = self._MakeEppXmlLogsQuery(logs_query) data_source_queries = [ self._MakeActivityOperationalRegistrarsQuery(next_yearmonth), self._MakeActivityAllRampedUpRegistrarsQuery(next_yearmonth), self._MakeActivityAllRegistrarsQuery(registrar_count), self._MakeActivityWhoisQuery(logs_query), self._MakeActivityDnsQuery(), - self._MakeActivityEppSrsMetricsQuery(epp_xml_logs_query) + self._MakeActivityEppSrsMetricsQuery(logs_query) ] return _StripTrailingWhitespaceFromLines(self._MakeActivityReportQuery( data_source_queries)) @@ -99,32 +89,6 @@ class IcannReportQueryBuilder(object): return query % {'this_yearmonth': this_yearmonth, 'next_yearmonth': next_yearmonth} - def _MakeEppXmlLogsQuery(self, logs_query): - # TODO(b/20725722): add a real docstring. - # pylint: disable=missing-docstring - # This query relies on regex-parsing the precise format of the 'EPP Command' - # log line from FlowRunner.run(), so it must be kept in sync. - # TODO(b/20725722): make the log statement format more robust. - query = r""" - -- Query EPP request logs and extract the clientId and raw EPP XML. - SELECT - REGEXP_EXTRACT(logMessage, r'^%(log_signature)s\n\t.*\n\t(.*)\n') AS clientId, - REGEXP_EXTRACT(logMessage, r'^%(log_signature)s\n\t.*\n\t.*\n\t.*\n\t((?s).*)$') AS xml, - FROM ( - -- BEGIN LOGS QUERY -- - %(logs_query)s - -- END LOGS QUERY -- - ) - WHERE - -- EPP endpoints from the proxy, regtool, and console respectively. - (requestPath IN ('/_dr/epp', '/_dr/epptool', '/registrar-xhr') - OR LEFT(requestPath, 7) = '/check?') - AND REGEXP_MATCH(logMessage, r'^%(log_signature)s') - AND NOT logMessage CONTAINS 'DRY_RUN' - """ - return query % {'logs_query': logs_query, - 'log_signature': FLOWRUNNER_LOG_SIGNATURE_PATTERN} - def _MakeActivityReportQuery(self, data_source_queries): """Make the overall activity report query. @@ -316,91 +280,60 @@ class IcannReportQueryBuilder(object): """ return query - def _MakeActivityEppSrsMetricsQuery(self, epp_xml_logs_query): + def _MakeActivityEppSrsMetricsQuery(self, logs_query): # TODO(b/20725722): add a real docstring. # pylint: disable=missing-docstring query = r""" - -- Query EPP XML messages and calculate SRS metrics. - SELECT * FROM ( + -- Query FlowReporter JSON log messages and calculate SRS metrics. SELECT - LOWER(domainTld) AS tld, - -- SRS metric names follow a set pattern corresponding to the EPP - -- protocol elements. First we extract the 'inner' command element in - -- EPP, e.g. , which is the resource type followed by - -- the standard EPP command type. To get the metric name, we add the - -- prefix 'srs-', abbreviate 'domain' as 'dom' and 'contact' as 'cont', - -- and replace ':' with '-' to produce 'srs-dom-create'. - -- - -- Transfers have subcommands indicated by an 'op' attribute, which we - -- extract and add as an extra suffix for transfer commands, so e.g. - -- 'srs-cont-transfer-approve'. Domain restores are domain updates - -- with a special element; if present, the command counts - -- under the srs-dom-rgp-restore-{request,report} metric (depending on - -- the value of the 'op' attribute) instead of srs-dom-update. - CONCAT( - 'srs-', - REPLACE(REPLACE(REPLACE( - CASE - WHEN NOT restoreOp IS NULL THEN CONCAT('domain-rgp-restore-', restoreOp) - WHEN commandType = 'transfer' THEN CONCAT(innerCommand, '-', commandOpArg) - ELSE innerCommand - END, - ':', '-'), 'domain', 'dom'), 'contact', 'cont') - ) AS metricName, - INTEGER(COUNT(xml)) AS count, - FROM ( - SELECT - -- Extract salient bits of the EPP XML using regexes. This is fairly - -- safe since the EPP gets schema-validated and pretty-printed before - -- getting logged, and so it looks something like this: - -- - -- - -- - -- , 'request' as the value of the - -- 'op' attribute of that element (if any), and 'domain:transfer' as - -- the inner command from the name of the subsequent element. - -- - -- Domain commands all have at least one element (more - -- than one for domain checks, but we just count the first), from which - -- we extract the domain TLD as everything after the first dot in the - -- element value. This won't work if the client mistakenly sends a - -- hostname (e.g. 'www.foo.example') as the domain name, but we prefer - -- this over taking everything after the last dot so that multipart - -- TLDs like 'co.uk' can be supported. - -- - -- Domain restores are indicated by an element, from - -- which we extract the value of the 'op' attribute. - -- - -- TODO(b/20725722): preprocess the XML in FlowRunner so we don't need - -- regex parsing of XML here (http://stackoverflow.com/a/1732454). - -- - REGEXP_EXTRACT(xml, '(?s).*?<([a-z]+)') AS commandType, - REGEXP_EXTRACT(xml, '(?s).*?<[a-z]+ op="(.+?)"') AS commandOpArg, - REGEXP_EXTRACT(xml, '(?s).*?<.+?>.*?<([a-z]+:[a-z]+)') AS innerCommand, - REGEXP_EXTRACT(xml, '[^.]*[.](.+)') AS domainTld, - REGEXP_EXTRACT(xml, '') AS restoreOp, - xml, - FROM ( - -- BEGIN EPP XML LOGS QUERY -- - %(epp_xml_logs_query)s - -- END EPP XML LOGS QUERY -- - ) - -- Filter to just XML that contains a element (no s). - WHERE xml CONTAINS '' - ) - -- Whitelist of EPP command types that we care about for metrics; - -- excludes login, logout, and poll. - WHERE commandType IN ('check', 'create', 'delete', 'info', 'renew', 'transfer', 'update') + tld, + activityReportField AS metricName, + -- Manual INTEGER cast to work around a BigQuery bug (b/14560012). + INTEGER(COUNT(*)) AS count, + FROM + -- Flatten the "tld" column (repeated) so that domain checks for names + -- across multiple TLDs are counted towards each checked TLD as though + -- there were one copy of this row per TLD (the effect of flattening). + FLATTEN(( + SELECT + -- Use some ugly regex hackery to convert JSON list of strings into + -- repeated string values, since there's no built-in for this. + -- TODO(b/20829992): replace with "JSON.parse()" inside a JS UDF + -- once we can use GoogleSQL; example in b/37629674#comment2. + REGEXP_EXTRACT( + SPLIT( + REGEXP_EXTRACT( + JSON_EXTRACT(json, '$.tlds'), + r'^\[(.*)\]$')), + '^"(.*)"$') AS tld, + -- TODO(b/XXX): remove rawTlds after June 2017 (see below). + JSON_EXTRACT_SCALAR(json, '$.resourceType') AS resourceType, + JSON_EXTRACT_SCALAR(json, '$.icannActivityReportField') + AS activityReportField, + FROM ( + SELECT + -- Extract JSON payload following log signature. + REGEXP_EXTRACT(logMessage, r'%(log_signature)s: (.*)\n?$') + AS json, + FROM ( + -- BEGIN LOGS QUERY -- + %(logs_query)s + -- END LOGS QUERY -- + ) + WHERE logMessage CONTAINS '%(log_signature)s' + ) + ), + -- Second argument to flatten (see above). + tld) + -- Exclude cases that can't be tabulated correctly - activity report field + -- is null/empty, or the TLD is null/empty even though it's a domain flow. + WHERE + activityReportField != '' AND (tld != '' OR resourceType != 'domain') GROUP BY tld, metricName - ) - -- Exclude domain-related EPP requests with no parsed TLD, otherwise - -- a NULL tld will make them apply to all TLDs like contact/host requests. - WHERE NOT (metricName CONTAINS 'srs-dom' AND tld IS NULL) + ORDER BY tld, metricName """ - return query % {'epp_xml_logs_query': epp_xml_logs_query} + return query % {'logs_query': logs_query, + 'log_signature': FLOWREPORTER_LOG_SIGNATURE} def _StripTrailingWhitespaceFromLines(string): diff --git a/python/google/registry/reporting/testdata/golden_activity_query.sql b/python/google/registry/reporting/testdata/golden_activity_query.sql index 96f79e6af..c81885f63 100644 --- a/python/google/registry/reporting/testdata/golden_activity_query.sql +++ b/python/google/registry/reporting/testdata/golden_activity_query.sql @@ -175,78 +175,39 @@ ), ( - -- Query EPP XML messages and calculate SRS metrics. - SELECT * FROM ( + -- Query FlowReporter JSON log messages and calculate SRS metrics. SELECT - LOWER(domainTld) AS tld, - -- SRS metric names follow a set pattern corresponding to the EPP - -- protocol elements. First we extract the 'inner' command element in - -- EPP, e.g. , which is the resource type followed by - -- the standard EPP command type. To get the metric name, we add the - -- prefix 'srs-', abbreviate 'domain' as 'dom' and 'contact' as 'cont', - -- and replace ':' with '-' to produce 'srs-dom-create'. - -- - -- Transfers have subcommands indicated by an 'op' attribute, which we - -- extract and add as an extra suffix for transfer commands, so e.g. - -- 'srs-cont-transfer-approve'. Domain restores are domain updates - -- with a special element; if present, the command counts - -- under the srs-dom-rgp-restore-{request,report} metric (depending on - -- the value of the 'op' attribute) instead of srs-dom-update. - CONCAT( - 'srs-', - REPLACE(REPLACE(REPLACE( - CASE - WHEN NOT restoreOp IS NULL THEN CONCAT('domain-rgp-restore-', restoreOp) - WHEN commandType = 'transfer' THEN CONCAT(innerCommand, '-', commandOpArg) - ELSE innerCommand - END, - ':', '-'), 'domain', 'dom'), 'contact', 'cont') - ) AS metricName, - INTEGER(COUNT(xml)) AS count, - FROM ( - SELECT - -- Extract salient bits of the EPP XML using regexes. This is fairly - -- safe since the EPP gets schema-validated and pretty-printed before - -- getting logged, and so it looks something like this: - -- - -- - -- - -- , 'request' as the value of the - -- 'op' attribute of that element (if any), and 'domain:transfer' as - -- the inner command from the name of the subsequent element. - -- - -- Domain commands all have at least one element (more - -- than one for domain checks, but we just count the first), from which - -- we extract the domain TLD as everything after the first dot in the - -- element value. This won't work if the client mistakenly sends a - -- hostname (e.g. 'www.foo.example') as the domain name, but we prefer - -- this over taking everything after the last dot so that multipart - -- TLDs like 'co.uk' can be supported. - -- - -- Domain restores are indicated by an element, from - -- which we extract the value of the 'op' attribute. - -- - -- TODO(b/20725722): preprocess the XML in FlowRunner so we don't need - -- regex parsing of XML here (http://stackoverflow.com/a/1732454). - -- - REGEXP_EXTRACT(xml, '(?s).*?<([a-z]+)') AS commandType, - REGEXP_EXTRACT(xml, '(?s).*?<[a-z]+ op="(.+?)"') AS commandOpArg, - REGEXP_EXTRACT(xml, '(?s).*?<.+?>.*?<([a-z]+:[a-z]+)') AS innerCommand, - REGEXP_EXTRACT(xml, '[^.]*[.](.+)') AS domainTld, - REGEXP_EXTRACT(xml, '') AS restoreOp, - xml, - FROM ( - -- BEGIN EPP XML LOGS QUERY -- - - -- Query EPP request logs and extract the clientId and raw EPP XML. - SELECT - REGEXP_EXTRACT(logMessage, r'^(?:google.registry.flows.FlowRunner run|com.google.domain.registry.flows.FlowRunner run|com.google.domain.registry.util.FormattingLogger log|google.registry.util.FormattingLogger log): EPP Command\n\t.*\n\t(.*)\n') AS clientId, - REGEXP_EXTRACT(logMessage, r'^(?:google.registry.flows.FlowRunner run|com.google.domain.registry.flows.FlowRunner run|com.google.domain.registry.util.FormattingLogger log|google.registry.util.FormattingLogger log): EPP Command\n\t.*\n\t.*\n\t.*\n\t((?s).*)$') AS xml, - FROM ( - -- BEGIN LOGS QUERY -- + tld, + activityReportField AS metricName, + -- Manual INTEGER cast to work around a BigQuery bug (b/14560012). + INTEGER(COUNT(*)) AS count, + FROM + -- Flatten the "tld" column (repeated) so that domain checks for names + -- across multiple TLDs are counted towards each checked TLD as though + -- there were one copy of this row per TLD (the effect of flattening). + FLATTEN(( + SELECT + -- Use some ugly regex hackery to convert JSON list of strings into + -- repeated string values, since there's no built-in for this. + -- TODO(b/20829992): replace with "JSON.parse()" inside a JS UDF + -- once we can use GoogleSQL; example in b/37629674#comment2. + REGEXP_EXTRACT( + SPLIT( + REGEXP_EXTRACT( + JSON_EXTRACT(json, '$.tlds'), + r'^\[(.*)\]$')), + '^"(.*)"$') AS tld, + -- TODO(b/XXX): remove rawTlds after June 2017 (see below). + JSON_EXTRACT_SCALAR(json, '$.resourceType') AS resourceType, + JSON_EXTRACT_SCALAR(json, '$.icannActivityReportField') + AS activityReportField, + FROM ( + SELECT + -- Extract JSON payload following log signature. + REGEXP_EXTRACT(logMessage, r'FLOW-LOG-SIGNATURE-METADATA: (.*)\n?$') + AS json, + FROM ( + -- BEGIN LOGS QUERY -- -- Query AppEngine request logs for the report month. SELECT @@ -260,28 +221,19 @@ -- timestamp representing the start of the next month. DATE_ADD(TIMESTAMP('2016-07-01'), -1, 'SECOND')) - -- END LOGS QUERY -- - ) + -- END LOGS QUERY -- + ) + WHERE logMessage CONTAINS 'FLOW-LOG-SIGNATURE-METADATA' + ) + ), + -- Second argument to flatten (see above). + tld) + -- Exclude cases that can't be tabulated correctly - activity report field + -- is null/empty, or the TLD is null/empty even though it's a domain flow. WHERE - -- EPP endpoints from the proxy, regtool, and console respectively. - (requestPath IN ('/_dr/epp', '/_dr/epptool', '/registrar-xhr') - OR LEFT(requestPath, 7) = '/check?') - AND REGEXP_MATCH(logMessage, r'^(?:google.registry.flows.FlowRunner run|com.google.domain.registry.flows.FlowRunner run|com.google.domain.registry.util.FormattingLogger log|google.registry.util.FormattingLogger log): EPP Command') - AND NOT logMessage CONTAINS 'DRY_RUN' - - -- END EPP XML LOGS QUERY -- - ) - -- Filter to just XML that contains a element (no s). - WHERE xml CONTAINS '' - ) - -- Whitelist of EPP command types that we care about for metrics; - -- excludes login, logout, and poll. - WHERE commandType IN ('check', 'create', 'delete', 'info', 'renew', 'transfer', 'update') + activityReportField != '' AND (tld != '' OR resourceType != 'domain') GROUP BY tld, metricName - ) - -- Exclude domain-related EPP requests with no parsed TLD, otherwise - -- a NULL tld will make them apply to all TLDs like contact/host requests. - WHERE NOT (metricName CONTAINS 'srs-dom' AND tld IS NULL) + ORDER BY tld, metricName ) -- END JOINED DATA SOURCES --