mirror of
https://github.com/cisagov/manage.get.gov.git
synced 2025-05-14 16:47:02 +02:00
add scripts for generating icann lookup and response code csvs (#2)
* add scripts for generating icann lookup and response code csvs * add docstring at the top of scripts * add error catching for whois command * exclude data for local use only
This commit is contained in:
parent
3fbced1f62
commit
b5b3defe41
3 changed files with 71 additions and 0 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -0,0 +1 @@
|
|||
docs/research/data/**
|
36
docs/research/scripts/icann_lookup.py
Normal file
36
docs/research/scripts/icann_lookup.py
Normal file
|
@ -0,0 +1,36 @@
|
|||
"""
|
||||
This script takes each domain in a dataset of non-.gov government domains and looks for
|
||||
which registrar they are currently registered with.
|
||||
|
||||
This script can be run locally to generate data and currently takes some time to run.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import requests
|
||||
import whois
|
||||
|
||||
# Raw CSV of the GSA govt-urls dataset that this script annotates.
GOV_URLS_CSV_URL = "https://raw.githubusercontent.com/GSA/govt-urls/master/1_govt_urls_full.csv"

# Download the dataset. Bound the wait so the script cannot hang forever, and
# fail loudly on an HTTP error instead of parsing an error page as CSV.
http_response = requests.get(GOV_URLS_CSV_URL, timeout=30)
http_response.raise_for_status()
data = http_response.text

# First row is the header; every following row is one domain record.
csv_data = list(csv.reader(data.splitlines(), delimiter=','))
domains = csv_data[1:]
# Output keeps the input columns and appends the looked-up registrar.
fields = csv_data[0] + ['Registrar']
|
||||
|
||||
def check_registration(name):
    """Return the registrar that WHOIS reports for ``name``.

    Returns ``None`` when the lookup (or reading its result) fails, so that a
    single bad domain does not abort the whole run.
    """
    try:
        domain_info = whois.whois(name)
        return domain_info['registrar']
    except Exception as err:
        # Catch Exception rather than using a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) can still stop this long-running script.
        print(f'Something went wrong looking up {name}: {type(err).__name__}: {err}')
        return None
|
||||
|
||||
# Build the output rows: each matching input row plus its registrar.
full_data = []
for domain in domains:
    # csv.reader yields an empty list for a blank line; skip those instead of
    # crashing on domain[0].
    if not domain:
        continue
    domain_name = domain[0].lower()
    # Only .com, .edu and .net domains are looked up; other rows are dropped.
    if domain_name.endswith(('.com', '.edu', '.net')):
        registrar = check_registration(domain_name)
        full_data.append(domain + [registrar])
|
||||
|
||||
# Write the header and annotated rows. The path is relative to the current
# working directory. newline='' is required by the csv module so the writer
# controls line endings (otherwise Windows output gets blank rows between
# records); the explicit encoding keeps non-ASCII values from failing on
# platforms whose default encoding is not UTF-8.
with open('../data/registrar_data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    writer.writerows(full_data)
|
34
docs/research/scripts/response_codes.py
Normal file
34
docs/research/scripts/response_codes.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
"""
|
||||
This script performs a basic request to each of the domains in the current list of
|
||||
dotgov domains hosted at https://flatgithub.com/cisagov/dotgov-data/blob/main/?filename=current-full.csv
|
||||
|
||||
This script can be run locally to generate data and currently takes some time to run.
|
||||
"""
|
||||
|
||||
import csv
|
||||
import requests
|
||||
|
||||
# Raw CSV of the current dotgov domain list that this script annotates.
DOMAIN_LIST_URL = "https://raw.githubusercontent.com/cisagov/dotgov-data/main/current-full.csv"

# Download the list. Bound the wait so the script cannot hang forever, and
# fail loudly on an HTTP error instead of parsing an error page as CSV.
http_response = requests.get(DOMAIN_LIST_URL, timeout=30)
http_response.raise_for_status()
data = http_response.content.decode('utf-8')

# First row is the header; every following row is one domain record.
csv_data = list(csv.reader(data.splitlines(), delimiter=','))
domains = csv_data[1:]
# Output keeps the input columns and appends the observed response.
fields = csv_data[0] + ['Response']
|
||||
|
||||
def check_status_response(domain, timeout=3):
    """Request ``https://<domain>`` and report what happened.

    Args:
        domain: Host name to request (without a scheme).
        timeout: Seconds to wait before giving up; defaults to 3.

    Returns:
        The integer HTTP status code when a response is received, otherwise
        the class name of the exception that was raised (for example a
        timeout or connection error), so every domain gets a recorded outcome.
    """
    try:
        response = requests.get(f"https://{domain}", timeout=timeout).status_code
    except Exception as e:
        # Deliberately broad: any failure is recorded by exception type
        # rather than aborting the run.
        response = type(e).__name__
    return response
|
||||
|
||||
# Build the output rows: each input row plus the response observed for it.
full_data = []
for domain in domains:
    # csv.reader yields an empty list for a blank line; skip those instead of
    # crashing on domain[0].
    if not domain:
        continue
    domain_name = domain[0]
    response = check_status_response(domain_name)
    full_data.append(domain + [response])
|
||||
|
||||
# Write the header and annotated rows. The path is relative to the current
# working directory. newline='' is required by the csv module so the writer
# controls line endings (otherwise Windows output gets blank rows between
# records); the explicit encoding keeps non-ASCII values from failing on
# platforms whose default encoding is not UTF-8.
with open('../data/response_codes.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    writer.writerows(full_data)
|
Loading…
Add table
Add a link
Reference in a new issue