Upgrade activity reporting queries to StandardSQL

This also brings the SQL template parameters in-line with the anticipated Bigquery dataset format, and switches from DateTime to the more appropriate LocalDate (since we only need monthly granularity).

-------------
Created by MOE: https://github.com/google/moe
MOE_MIGRATED_REVID=162747692
This commit is contained in:
larryruili 2017-07-21 09:13:50 -07:00 committed by Ben McIlwain
parent 8a921f08ed
commit 33eb5f1c87
14 changed files with 160 additions and 171 deletions

View file

@ -11,8 +11,10 @@
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
-- This query pulls from all intermediary tables to create the activity
-- report csv, via a table transpose and sum over all activity report fields.
SELECT
Tld.tld AS tld,
SUM(IF(metricName = 'operational-registrars', count, 0)) AS operational_registrars,
@ -79,10 +81,10 @@ LEFT OUTER JOIN (
metricName,
count FROM
-- BEGIN INTERMEDIARY DATA SOURCES --
[%ACTIVITY_REPORTING_DATA_SET%.%REGISTRAR_OPERATING_STATUS_TABLE%],
[%ACTIVITY_REPORTING_DATA_SET%.%DNS_COUNTS_TABLE%],
[%ACTIVITY_REPORTING_DATA_SET%.%EPP_METRICS_TABLE%],
[%ACTIVITY_REPORTING_DATA_SET%.%WHOIS_COUNTS_TABLE%],
[%ICANN_REPORTING_DATA_SET%.%REGISTRAR_OPERATING_STATUS_TABLE%],
[%ICANN_REPORTING_DATA_SET%.%DNS_COUNTS_TABLE%],
[%ICANN_REPORTING_DATA_SET%.%EPP_METRICS_TABLE%],
[%ICANN_REPORTING_DATA_SET%.%WHOIS_COUNTS_TABLE%],
-- END INTERMEDIARY DATA SOURCES --
) AS TldMetrics
ON

View file

@ -14,20 +14,16 @@
-- Query for DNS metrics.
-- Our DNS provider exports logs for all queries received. However, these
-- tables only have a TTL of 7 days. We make daily exports of the data
-- relevant to us, which allows us to get the full month's UDP and TCP
-- queries when generating activity reports.
-- This is a no-op until after we transition to Google Cloud DNS, which
-- will likely export metrics via Stackdriver.
SELECT
-- DNS metrics apply to all tlds, which requires the 'null' magic value.
STRING(NULL) AS tld,
metricName,
-- TODO(b/63388735): Change this to actually query the DNS tables when ready.
-1 AS count,
FROM (
SELECT
'dns-udp-queries' AS metricName),
(
SELECT
'dns-tcp-queries' AS metricName)
-- TODO(b/63388735): Change this to actually query Google Cloud DNS when ready.
-1 AS count
FROM ((
SELECT 'dns-udp-queries' AS metricName)
UNION ALL
(SELECT 'dns-tcp-queries' AS metricName))

View file

@ -14,51 +14,44 @@
-- Query FlowReporter JSON log messages and calculate SRS metrics.
-- We use regex's over the monthly appengine logs to determine how many
-- EPP requests we received for each command.
-- We use ugly regex's over the monthly appengine logs to determine how many
-- EPP requests we received for each command. For example:
-- {"commandType":"check"...,"targetIds":["ais.a.how"],
-- "tld":"","tlds":["a.how"],"icannActivityReportField":"srs-dom-check"}
SELECT
tld,
-- Remove quotation marks from tld fields.
REGEXP_EXTRACT(tld, '^"(.*)"$') AS tld,
activityReportField AS metricName,
-- Manual INTEGER cast to work around a BigQuery bug (b/14560012).
INTEGER(COUNT(*)) AS count,
FROM
-- Flatten the "tld" column (repeated) so that domain checks for names
-- across multiple TLDs are counted towards each checked TLD as though
-- there were one copy of this row per TLD (the effect of flattening).
FLATTEN((
COUNT(*) AS count
FROM (
SELECT
-- TODO(b/32486667): Replace with JSON.parse() UDF when available for views
SPLIT(
REGEXP_EXTRACT(JSON_EXTRACT(json, '$.tlds'), r'^\[(.*)\]$')) AS tlds,
JSON_EXTRACT_SCALAR(json,
'$.resourceType') AS resourceType,
JSON_EXTRACT_SCALAR(json,
'$.icannActivityReportField') AS activityReportField
FROM (
SELECT
-- Use some ugly regex hackery to convert JSON list of strings into
-- repeated string values, since there's no built-in for this.
-- TODO(b/20829992): replace with "JSON.parse()" inside a JS UDF
-- once we can use GoogleSQL; example in b/37629674#comment2.
-- e.g. JSON:"{"commandType":"check"...,"targetIds":["ais.a.how"],
-- "tld":"","tlds":["a.how"],"icannActivityReportField":"srs-dom-check"}
REGEXP_EXTRACT(
SPLIT(
REGEXP_EXTRACT(
JSON_EXTRACT(json, '$.tlds'),
r'^\[(.*)\]$')),
'^"(.*)"$') AS tld,
-- TODO(b/XXX): remove rawTlds after June 2017 (see below).
JSON_EXTRACT_SCALAR(json, '$.resourceType') AS resourceType,
JSON_EXTRACT_SCALAR(json, '$.icannActivityReportField')
AS activityReportField,
FROM (
SELECT
-- Extract JSON payload following log signature.
REGEXP_EXTRACT(logMessage, r'FLOW-LOG-SIGNATURE-METADATA: (.*)\n?$')
AS json,
FROM
[%MONTHLY_LOGS_DATA_SET%.%MONTHLY_LOGS_TABLE%]
WHERE logMessage CONTAINS 'FLOW-LOG-SIGNATURE-METADATA'
)
),
-- Second argument to flatten (see above).
tld)
-- Exclude cases that can't be tabulated correctly - activity report field
-- is null/empty, or the TLD is null/empty even though it's a domain flow.
-- Extract the logged JSON payload.
REGEXP_EXTRACT(logMessage, r'FLOW-LOG-SIGNATURE-METADATA: (.*)\n?$')
AS json
FROM `%PROJECT_ID%.%ICANN_REPORTING_DATA_SET%.%MONTHLY_LOGS_TABLE%` AS logs
JOIN
UNNEST(logs.logMessage) AS logMessage
WHERE
logMessage LIKE "%FLOW-LOG-SIGNATURE-METADATA%")) AS regexes
JOIN
-- Unnest the JSON-parsed tlds.
UNNEST(regexes.tlds) AS tld
-- Exclude cases that can't be tabulated correctly, where activityReportField
-- is null/empty, or TLD is null/empty despite being a domain flow.
WHERE
activityReportField != '' AND (tld != '' OR resourceType != 'domain')
GROUP BY tld, metricName
ORDER BY tld, metricName
activityReportField != ''
AND (tld != '' OR resourceType != 'domain')
GROUP BY
tld, metricName
ORDER BY
tld, metricName

View file

@ -14,15 +14,16 @@
-- Query to fetch AppEngine request logs for the report month.
-- START_OF_MONTH and END_OF_MONTH should be in YYYY-MM-01 format.
-- START_OF_MONTH and END_OF_MONTH should be in YYYYMM01 format.
SELECT
protoPayload.resource AS requestPath,
protoPayload.line.logMessage AS logMessage,
ARRAY(
SELECT
logMessage
FROM
UNNEST(protoPayload.line)) AS logMessage
FROM
TABLE_DATE_RANGE_STRICT(
[%APPENGINE_LOGS_DATA_SET%.%REQUEST_TABLE%],
TIMESTAMP('%START_OF_MONTH%'),
-- End timestamp is inclusive, so subtract 1 day from the
-- timestamp representing the start of the next month.
DATE_ADD(TIMESTAMP('%END_OF_MONTH%'), -1, 'DAY'))
`%PROJECT_ID%.%APPENGINE_LOGS_DATA_SET%.%REQUEST_TABLE%*`
WHERE
_TABLE_SUFFIX BETWEEN '%START_OF_MONTH%' AND '%END_OF_MONTH%'

View file

@ -26,7 +26,7 @@ SELECT
THEN 'pre-ramp-up-registrars'
-- The import process is imprecise; filter out invalid rows.
ELSE 'not-applicable' END AS metricName,
INTEGER(COUNT(registrar_id)) AS count
COUNT(registrar_id) AS count
FROM
[%REGISTRAR_DATA_SET%.%REGISTRAR_STATUS_TABLE%]
`%PROJECT_ID%.%REGISTRAR_DATA_SET%.%REGISTRAR_STATUS_TABLE%`
GROUP BY metricName

View file

@ -18,16 +18,15 @@
-- counts the number of hits via both endpoints (port 43 and the web).
SELECT
-- Whois applies to all TLDs, hence the 'null' magic value.
STRING(NULL) AS tld,
-- Whois queries over port 43 get forwarded by the proxy to /_dr/whois,
-- while web queries come in via /whois/<params>.
CASE
WHEN requestPath = '/_dr/whois' THEN 'whois-43-queries'
WHEN LEFT(requestPath, 7) = '/whois/' THEN 'web-whois-queries'
WHEN SUBSTR(requestPath, 0, 7) = '/whois/' THEN 'web-whois-queries'
END AS metricName,
INTEGER(COUNT(requestPath)) AS count,
COUNT(requestPath) AS count
FROM
[%MONTHLY_LOGS_DATA_SET%.%MONTHLY_LOGS_TABLE%]
GROUP BY metricName
HAVING metricName IS NOT NULL
`%PROJECT_ID%.%ICANN_REPORTING_DATA_SET%.%MONTHLY_LOGS_TABLE%`
GROUP BY
metricName
HAVING
metricName IS NOT NULL