add scripts for generating icann lookup and response code csvs (#2)

* add scripts for generating icann lookup and response code csvs

* add docstring at the top of scripts

* add error catching for whois command

* exclude data for local use only
commit b5b3defe41 (parent 3fbced1f62)
Author: Logan McDonald, committed by GitHub
Date: 2022-08-09 09:03:51 -04:00
3 changed files with 71 additions and 0 deletions

.gitignore (1 addition)

@@ -0,0 +1 @@
docs/research/data/**


@@ -0,0 +1,36 @@
"""
This script takes each domain in a dataset of non-.gov government domains and looks for
which registrar they are currently registered with.
This script can be run locally to generate data and currently takes some time to run.
"""
import csv
import requests
import whois
GOV_URLS_CSV_URL = "https://raw.githubusercontent.com/GSA/govt-urls/master/1_govt_urls_full.csv"
data = requests.get(GOV_URLS_CSV_URL).text
csv_data = list(csv.reader(data.splitlines(), delimiter=','))
domains = csv_data[1:]
fields = csv_data[0] + ['Registrar']
def check_registration(name):
try:
domain_info = whois.whois(name)
return domain_info['registrar']
except:
print('Something went wrong')
full_data = []
for domain in domains:
domain_name = domain[0].lower()
if domain_name.endswith('.com') or domain_name.endswith('.edu') or domain_name.endswith('.net'):
registrar = check_registration(domain_name)
full_data.append(domain + [registrar])
with open('../data/registrar_data.csv', 'w') as f:
writer = csv.writer(f)
writer.writerow(fields)
writer.writerows(full_data)
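For a quick manual check of the lookup this script relies on, here is a minimal sketch, assuming the `python-whois` package provides the `whois.whois()` call imported above; `example.com` is only an illustrative input, not part of the dataset.

import whois

# Query a single domain and print the same 'registrar' field the script extracts.
# "example.com" is a hypothetical test input.
result = whois.whois("example.com")
print(result['registrar'])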


@@ -0,0 +1,34 @@
"""
This script performs a basic request to each of the domains in the current list of
dotgov domains hosted at https://flatgithub.com/cisagov/dotgov-data/blob/main/?filename=current-full.csv
This script can be run locally to generate data and currently takes some time to run.
"""
import csv
import requests
DOMAIN_LIST_URL = "https://raw.githubusercontent.com/cisagov/dotgov-data/main/current-full.csv"
data = requests.get(DOMAIN_LIST_URL).content.decode('utf-8')
csv_data = list(csv.reader(data.splitlines(), delimiter=','))
domains = csv_data[1:]
fields = csv_data[0] + ['Response']
def check_status_response(domain):
try:
response = requests.get(f"https://{domain}", timeout=3).status_code
except Exception as e:
response = type(e).__name__
return response
full_data = []
for domain in domains:
domain_name = domain[0]
response = check_status_response(domain_name)
full_data.append(domain + [response])
with open('../data/response_codes.csv', 'w') as f:
writer = csv.writer(f)
writer.writerow(fields)
writer.writerows(full_data)
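Once the script has run, the generated CSV can be summarized locally. A minimal sketch using only the standard library, assuming the file was written to ../data/response_codes.csv as above:

import csv
from collections import Counter

# Tally how many domains returned each status code or exception name.
with open('../data/response_codes.csv', newline='') as f:
    rows = list(csv.DictReader(f))

counts = Counter(row['Response'] for row in rows)
for response, count in counts.most_common():
    print(response, count)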