add scripts for generating icann lookup and response code csvs (#2)

* add scripts for generating icann lookup and response code csvs

* add docstring at the top of scripts

* add error catching for whois command

* exclude data for local use only
commit b5b3defe41 (parent 3fbced1f62)
Author: Logan McDonald, committed by GitHub
Date: 2022-08-09 09:03:51 -04:00
3 changed files with 71 additions and 0 deletions

.gitignore (1 addition)

@@ -0,0 +1 @@
docs/research/data/**


@@ -0,0 +1,36 @@
"""
This script takes each domain in a dataset of non-.gov government domains and looks for
which registrar they are currently registered with.
This script can be run locally to generate data and currently takes some time to run.
"""
import csv
import requests
import whois
GOV_URLS_CSV_URL = "https://raw.githubusercontent.com/GSA/govt-urls/master/1_govt_urls_full.csv"
data = requests.get(GOV_URLS_CSV_URL).text
csv_data = list(csv.reader(data.splitlines(), delimiter=','))
domains = csv_data[1:]
fields = csv_data[0] + ['Registrar']
def check_registration(name):
try:
domain_info = whois.whois(name)
return domain_info['registrar']
except:
print('Something went wrong')
full_data = []
for domain in domains:
domain_name = domain[0].lower()
if domain_name.endswith('.com') or domain_name.endswith('.edu') or domain_name.endswith('.net'):
registrar = check_registration(domain_name)
full_data.append(domain + [registrar])
with open('../data/registrar_data.csv', 'w') as f:
writer = csv.writer(f)
writer.writerow(fields)
writer.writerows(full_data)
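For a quick manual check of the lookup this script relies on, here is a minimal sketch, assuming the `python-whois` package provides the `whois.whois()` call imported above; `example.com` is only an illustrative input, not part of the dataset.

import whois

# Query a single domain and print the same 'registrar' field the script extracts.
# "example.com" is a hypothetical test input.
result = whois.whois("example.com")
print(result['registrar'])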


@@ -0,0 +1,34 @@
"""
This script performs a basic request to each of the domains in the current list of
dotgov domains hosted at https://flatgithub.com/cisagov/dotgov-data/blob/main/?filename=current-full.csv
This script can be run locally to generate data and currently takes some time to run.
"""
import csv
import requests
DOMAIN_LIST_URL = "https://raw.githubusercontent.com/cisagov/dotgov-data/main/current-full.csv"
data = requests.get(DOMAIN_LIST_URL).content.decode('utf-8')
csv_data = list(csv.reader(data.splitlines(), delimiter=','))
domains = csv_data[1:]
fields = csv_data[0] + ['Response']
def check_status_response(domain):
try:
response = requests.get(f"https://{domain}", timeout=3).status_code
except Exception as e:
response = type(e).__name__
return response
full_data = []
for domain in domains:
domain_name = domain[0]
response = check_status_response(domain_name)
full_data.append(domain + [response])
with open('../data/response_codes.csv', 'w') as f:
writer = csv.writer(f)
writer.writerow(fields)
writer.writerows(full_data)
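Once the script has run, the generated CSV can be summarized locally. A minimal sketch using only the standard library, assuming the file was written to ../data/response_codes.csv as above:

import csv
from collections import Counter

# Tally how many domains returned each status code or exception name.
with open('../data/response_codes.csv', newline='') as f:
    rows = list(csv.DictReader(f))

counts = Counter(row['Response'] for row in rows)
for response, count in counts.most_common():
    print(response, count)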