diff --git a/.gitignore b/.gitignore
index e69de29bb..91f50ef72 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+docs/research/data/**
\ No newline at end of file
diff --git a/docs/research/scripts/icann_lookup.py b/docs/research/scripts/icann_lookup.py
new file mode 100644
index 000000000..368fc93c5
--- /dev/null
+++ b/docs/research/scripts/icann_lookup.py
@@ -0,0 +1,56 @@
+"""
+This script takes each domain in a dataset of non-.gov government domains and looks up
+which registrar it is currently registered with.
+
+This script can be run locally to generate data and currently takes some time to run.
+"""
+
+import csv
+import os
+
+import requests
+import whois
+
+GOV_URLS_CSV_URL = "https://raw.githubusercontent.com/GSA/govt-urls/master/1_govt_urls_full.csv"
+OUTPUT_PATH = '../data/registrar_data.csv'
+# Only domains under these TLDs are looked up; all other rows are left out of the output.
+LOOKUP_TLDS = ('.com', '.edu', '.net')
+
+# Fail fast on a bad download instead of parsing an HTTP error page as CSV.
+list_response = requests.get(GOV_URLS_CSV_URL, timeout=30)
+list_response.raise_for_status()
+data = list_response.text
+csv_data = list(csv.reader(data.splitlines(), delimiter=','))
+domains = csv_data[1:]
+fields = csv_data[0] + ['Registrar']
+
+
+def check_registration(name):
+    """Return the registrar WHOIS reports for `name`, or None if the lookup fails."""
+    try:
+        domain_info = whois.whois(name)
+        return domain_info['registrar']
+    except Exception as e:
+        # Best effort: one failed lookup must not abort the whole run, but unlike a
+        # bare except this still lets Ctrl-C (KeyboardInterrupt) stop the script.
+        print(f'Something went wrong looking up {name}: {type(e).__name__}: {e}')
+        return None
+
+
+full_data = []
+for domain in domains:
+    if not domain:
+        # csv.reader yields an empty list for blank lines.
+        continue
+    domain_name = domain[0].lower()
+    if domain_name.endswith(LOOKUP_TLDS):
+        registrar = check_registration(domain_name)
+        full_data.append(domain + [registrar])
+
+# The data directory is git-ignored, so it may not exist on a fresh checkout.
+os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
+# newline='' lets the csv module control line endings (no blank rows on Windows).
+with open(OUTPUT_PATH, 'w', newline='', encoding='utf-8') as f:
+    writer = csv.writer(f)
+    writer.writerow(fields)
+    writer.writerows(full_data)
diff --git a/docs/research/scripts/response_codes.py b/docs/research/scripts/response_codes.py
new file mode 100644
index 000000000..073d0d20f
--- /dev/null
+++ b/docs/research/scripts/response_codes.py
@@ -0,0 +1,51 @@
+"""
+This script performs a basic request to each of the domains in the current list of
+dotgov domains hosted at https://flatgithub.com/cisagov/dotgov-data/blob/main/?filename=current-full.csv
+
+This script can be run locally to generate data and currently takes some time to run.
+"""
+
+import csv
+import os
+
+import requests
+
+DOMAIN_LIST_URL = "https://raw.githubusercontent.com/cisagov/dotgov-data/main/current-full.csv"
+OUTPUT_PATH = '../data/response_codes.csv'
+
+# Fail fast on a bad download instead of parsing an HTTP error page as CSV.
+list_response = requests.get(DOMAIN_LIST_URL, timeout=30)
+list_response.raise_for_status()
+data = list_response.content.decode('utf-8')
+csv_data = list(csv.reader(data.splitlines(), delimiter=','))
+domains = csv_data[1:]
+fields = csv_data[0] + ['Response']
+
+
+def check_status_response(domain):
+    """Return the HTTPS status code for `domain`, or the exception class name on failure."""
+    try:
+        response = requests.get(f"https://{domain}", timeout=3).status_code
+    except Exception as e:
+        # Deliberately broad: the failure type (e.g. ConnectTimeout, SSLError) is the
+        # data point recorded for domains that do not answer.
+        response = type(e).__name__
+    return response
+
+
+full_data = []
+for domain in domains:
+    if not domain:
+        # csv.reader yields an empty list for blank lines.
+        continue
+    domain_name = domain[0]
+    response = check_status_response(domain_name)
+    full_data.append(domain + [response])
+
+# The data directory is git-ignored, so it may not exist on a fresh checkout.
+os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
+# newline='' lets the csv module control line endings (no blank rows on Windows).
+with open(OUTPUT_PATH, 'w', newline='', encoding='utf-8') as f:
+    writer = csv.writer(f)
+    writer.writerow(fields)
+    writer.writerows(full_data)