Prepare for parsing expiration date

This commit is contained in:
zandercymatics 2023-10-31 15:09:19 -06:00
parent 353079e7dd
commit 13172870fb
No known key found for this signature in database
GPG key ID: FF4636ABEC9682B7
5 changed files with 189 additions and 71 deletions

View file

@ -6,6 +6,7 @@ import argparse
from collections import defaultdict from collections import defaultdict
from django.core.management import BaseCommand from django.core.management import BaseCommand
from registrar.management.commands.utility.epp_data_containers import EnumFilenames
from registrar.models import TransitionDomain from registrar.models import TransitionDomain
@ -14,6 +15,9 @@ from registrar.management.commands.utility.terminal_helper import (
TerminalHelper, TerminalHelper,
) )
from .utility.transition_domain_arguments import TransitionDomainArguments
from .utility.extra_transition_domain import LoadExtraTransitionDomain
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -61,6 +65,31 @@ class Command(BaseCommand):
action=argparse.BooleanOptionalAction, action=argparse.BooleanOptionalAction,
) )
# TODO - Narrow this down
parser.add_argument(
"--directory", default="migrationdata", help="Desired directory"
)
parser.add_argument(
"--agency_adhoc_filename",
default=EnumFilenames.AGENCY_ADHOC.value[1],
help="Defines the filename for agency adhocs",
)
parser.add_argument(
"--domain_additional_filename",
default=EnumFilenames.DOMAIN_ADDITIONAL.value[1],
help="Defines the filename for additional domain data",
)
parser.add_argument(
"--domain_adhoc_filename",
default=EnumFilenames.DOMAIN_ADHOC.value[1],
help="Defines the filename for domain type adhocs",
)
# Filename override for the organization adhoc file; the default comes
# from EnumFilenames.ORGANIZATION_ADHOC.
parser.add_argument(
    "--organization_adhoc_filename",
    default=EnumFilenames.ORGANIZATION_ADHOC.value[1],
    help="Defines the filename for organization adhocs",
)
def print_debug_mode_statements( def print_debug_mode_statements(
self, debug_on: bool, debug_max_entries_to_parse: int self, debug_on: bool, debug_max_entries_to_parse: int
): ):
@ -255,7 +284,6 @@ class Command(BaseCommand):
): ):
"""Parse the data files and create TransitionDomains.""" """Parse the data files and create TransitionDomains."""
sep = options.get("sep") sep = options.get("sep")
load_extra_data = options.get("loadExtraData")
# If --resetTable was used, prompt user to confirm # If --resetTable was used, prompt user to confirm
# deletion of table data # deletion of table data
@ -286,7 +314,6 @@ class Command(BaseCommand):
# STEP 3: # STEP 3:
# Parse the domain_contacts file and create TransitionDomain objects, # Parse the domain_contacts file and create TransitionDomain objects,
# using the dictionaries from steps 1 & 2 to lookup needed information. # using the dictionaries from steps 1 & 2 to lookup needed information.
to_create = [] to_create = []
# keep track of statuses that don't match our available # keep track of statuses that don't match our available
@ -472,3 +499,17 @@ class Command(BaseCommand):
duplicate_domain_user_combos, duplicate_domains, users_without_email duplicate_domain_user_combos, duplicate_domains, users_without_email
) )
self.print_summary_status_findings(domains_without_status, outlier_statuses) self.print_summary_status_findings(domains_without_status, outlier_statuses)
# Prompt the user if they want to load additional data on the domains
# TODO - add this logic into the core of this file
arguments = TransitionDomainArguments(**options)
do_parse_extra = TerminalHelper.prompt_for_execution(
True,
"./manage.py test",
"Running load_extra_transition_domains script",
)
if do_parse_extra:
extra = LoadExtraTransitionDomain(arguments)
extra_logs = extra.parse_logs.logs

View file

@ -6,6 +6,7 @@ Not intended to be used as models but rather as an alternative to storing as a d
By keeping it as a dataclass instead of a dictionary, we can maintain data consistency. By keeping it as a dataclass instead of a dictionary, we can maintain data consistency.
""" """
from dataclasses import dataclass from dataclasses import dataclass
from datetime import date
from enum import Enum from enum import Enum
from typing import List, Optional from typing import List, Optional
@ -64,6 +65,13 @@ class AuthorityAdhoc:
agencyid: Optional[int] = None agencyid: Optional[int] = None
addlinfo: Optional[List[str]] = None addlinfo: Optional[List[str]] = None
@dataclass
class DomainEscrow:
    """Defines the structure given in the DOMAIN_ESCROW file.

    All fields are optional and default to None.
    """

    # Name of the domain; the escrow reader keys its result dict on this value.
    domainname: Optional[str] = None
    # Creation timestamp for the domain, as parsed from the escrow file.
    creationdate: Optional[date] = None
    # Expiration timestamp for the domain, as parsed from the escrow file.
    expirationdate: Optional[date] = None
class EnumFilenames(Enum): class EnumFilenames(Enum):
"""Returns a tuple mapping for (filetype, default_file_name). """Returns a tuple mapping for (filetype, default_file_name).
@ -79,6 +87,7 @@ class EnumFilenames(Enum):
"domain_additional", "domain_additional",
"domainadditionaldatalink.adhoc.dotgov.txt", "domainadditionaldatalink.adhoc.dotgov.txt",
) )
DOMAIN_ESCROW = ("domain_escrow", "escrow_domains.daily.dotgov.GOV.txt")
DOMAIN_ADHOC = ("domain_adhoc", "domaintypes.adhoc.dotgov.txt") DOMAIN_ADHOC = ("domain_adhoc", "domaintypes.adhoc.dotgov.txt")
ORGANIZATION_ADHOC = ("organization_adhoc", "organization.adhoc.dotgov.txt") ORGANIZATION_ADHOC = ("organization_adhoc", "organization.adhoc.dotgov.txt")
AUTHORITY_ADHOC = ("authority_adhoc", "authority.adhoc.dotgov.txt") AUTHORITY_ADHOC = ("authority_adhoc", "authority.adhoc.dotgov.txt")

View file

@ -1,25 +1,29 @@
"""""" """"""
import csv import csv
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime
from enum import Enum from enum import Enum
import glob import glob
import re import re
import logging import logging
import os import os
from typing import List from typing import List, Tuple
from registrar.models.transition_domain import TransitionDomain from registrar.models.transition_domain import TransitionDomain
from transition_domain_arguments import TransitionDomainArguments
from epp_data_containers import ( from .epp_data_containers import (
AgencyAdhoc, AgencyAdhoc,
DomainAdditionalData, DomainAdditionalData,
DomainEscrow,
DomainTypeAdhoc, DomainTypeAdhoc,
OrganizationAdhoc, OrganizationAdhoc,
AuthorityAdhoc, AuthorityAdhoc,
EnumFilenames, EnumFilenames,
) )
from .transition_domain_arguments import TransitionDomainArguments
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class LogCode(Enum): class LogCode(Enum):
@ -105,15 +109,9 @@ class LoadExtraTransitionDomain:
# Stores event logs and organizes them # Stores event logs and organizes them
self.parse_logs = FileTransitionLog() self.parse_logs = FileTransitionLog()
arguments = options.args_extra_transition_domain()
# Reads and parses migration files # Reads and parses migration files
self.domain_object = ExtraTransitionDomain( self.domain_object = ExtraTransitionDomain(**arguments)
agency_adhoc_filename=options.agency_adhoc_filename,
domain_additional_filename=options.domain_additional_filename,
domain_adhoc_filename=options.domain_adhoc_filename,
organization_adhoc_filename=options.organization_adhoc_filename,
directory=options.directory,
seperator=options.seperator,
)
self.domain_object.parse_all_files() self.domain_object.parse_all_files()
# Given the data we just parsed, update each # Given the data we just parsed, update each
@ -131,6 +129,7 @@ class LoadExtraTransitionDomain:
for transition_domain in all_transition_domains: for transition_domain in all_transition_domains:
domain_name = transition_domain.domain_name.upper() domain_name = transition_domain.domain_name.upper()
updated_transition_domain = transition_domain updated_transition_domain = transition_domain
# STEP 1: Parse organization data # STEP 1: Parse organization data
updated_transition_domain = self.parse_org_data( updated_transition_domain = self.parse_org_data(
domain_name, transition_domain domain_name, transition_domain
@ -526,59 +525,91 @@ class PatternMap:
class ExtraTransitionDomain: class ExtraTransitionDomain:
"""Helper class to aid in storing TransitionDomain data spread across """Helper class to aid in storing TransitionDomain data spread across
multiple files.""" multiple files."""
filenames = EnumFilenames
#strip_date_regex = re.compile(r"\d+\.(.+)")
strip_date_regex = re.compile(r"(?:.*\/)?(\d+)\.(.+)") strip_date_regex = re.compile(r"(?:.*\/)?(\d+)\.(.+)")
def __init__( def __init__(
self, self,
agency_adhoc_filename=filenames.AGENCY_ADHOC.value[1], agency_adhoc_filename=EnumFilenames.AGENCY_ADHOC.value[1],
domain_additional_filename=filenames.DOMAIN_ADDITIONAL.value[1], domain_additional_filename=EnumFilenames.DOMAIN_ADDITIONAL.value[1],
domain_adhoc_filename=filenames.DOMAIN_ADHOC.value[1], domain_escrow_filename=EnumFilenames.DOMAIN_ESCROW.value[1],
organization_adhoc_filename=filenames.ORGANIZATION_ADHOC.value[1], domain_adhoc_filename=EnumFilenames.DOMAIN_ADHOC.value[1],
authority_adhoc_filename=filenames.AUTHORITY_ADHOC.value[1], organization_adhoc_filename=EnumFilenames.ORGANIZATION_ADHOC.value[1],
authority_adhoc_filename=EnumFilenames.AUTHORITY_ADHOC.value[1],
directory="migrationdata", directory="migrationdata",
seperator="|", sep="|",
): ):
# Add a slash if the last character isn't one # Add a slash if the last character isn't one
if directory and directory[-1] != "/": if directory and directory[-1] != "/":
directory += "/" directory += "/"
self.directory = directory self.directory = directory
self.seperator = seperator self.seperator = sep
self.all_files = glob.glob(f"{directory}*") self.all_files = glob.glob(f"{directory}*")
# Create a set with filenames as keys for quick lookup # Create a set with filenames as keys for quick lookup
self.all_files_set = {os.path.basename(file) for file in self.all_files} self.all_files_set = {os.path.basename(file) for file in self.all_files}
self.file_data = {
# (filename, default_url): metadata about the desired file # Used for a container of values at each filename.
self.filenames.AGENCY_ADHOC: PatternMap( # Instead of tracking each in a seperate variable, we can declare
agency_adhoc_filename, self.strip_date_regex, AgencyAdhoc, "agencyid" # metadata about each file and associate it with an enum.
), # That way if we want the data located at the agency_adhoc file,
self.filenames.DOMAIN_ADDITIONAL: PatternMap( # we can just call EnumFilenames.AGENCY_ADHOC.
domain_additional_filename, pattern_map_params = [
self.strip_date_regex, (EnumFilenames.AGENCY_ADHOC, agency_adhoc_filename, AgencyAdhoc, "agencyid"),
DomainAdditionalData, (EnumFilenames.DOMAIN_ADDITIONAL, domain_additional_filename, DomainAdditionalData, "domainname"),
"domainname", (EnumFilenames.DOMAIN_ESCROW, domain_escrow_filename, DomainEscrow, "domainname"),
), (EnumFilenames.DOMAIN_ADHOC, domain_adhoc_filename, DomainTypeAdhoc, "domaintypeid"),
self.filenames.DOMAIN_ADHOC: PatternMap( (EnumFilenames.ORGANIZATION_ADHOC, organization_adhoc_filename, OrganizationAdhoc, "orgid"),
domain_adhoc_filename, (EnumFilenames.AUTHORITY_ADHOC, authority_adhoc_filename, AuthorityAdhoc, "authorityid"),
self.strip_date_regex, ]
DomainTypeAdhoc, self.file_data = self.populate_file_data(pattern_map_params)
"domaintypeid",
), def populate_file_data(
self.filenames.ORGANIZATION_ADHOC: PatternMap( self,
organization_adhoc_filename, pattern_map_params: List[Tuple[EnumFilenames, str, type, str]]
self.strip_date_regex, ):
OrganizationAdhoc, """Populates the self.file_data field given a set
"orgid", of tuple params.
),
self.filenames.AUTHORITY_ADHOC: PatternMap( pattern_map_params must adhere to this format:
[
(file_type, filename, data_type, id_field),
]
vars:
file_type (EnumFilenames) -> The name of the dictionary.
Defined as a value on EnumFilenames, such as
EnumFilenames.AGENCY_ADHOC
filename (str) -> The filepath of the given
"file_type", such as migrationdata/test123.txt
data_type (type) -> The type of data to be read
at the location of the filename. For instance,
each row of test123.txt may return data of type AgencyAdhoc
id_field (str) -> Given the "data_type" of each row,
this specifies what the "id" of that row is.
For example, "agencyid". This is used so we can
store each record in a dictionary rather than
a list of values.
return example:
EnumFilenames.AUTHORITY_ADHOC: PatternMap(
authority_adhoc_filename, authority_adhoc_filename,
self.strip_date_regex, self.strip_date_regex,
AuthorityAdhoc, AuthorityAdhoc,
"authorityid", "authorityid",
), ),
} """
file_data = {}
for file_type, filename, data_type, id_field in pattern_map_params:
file_data[file_type] = PatternMap(
filename,
self.strip_date_regex,
data_type,
id_field,
)
return file_data
def parse_all_files(self, infer_filenames=True): def parse_all_files(self, infer_filenames=True):
"""Clears all preexisting data then parses each related CSV file. """Clears all preexisting data then parses each related CSV file.
@ -588,15 +619,16 @@ class ExtraTransitionDomain:
""" """
self.clear_file_data() self.clear_file_data()
for name, value in self.file_data.items(): for name, value in self.file_data.items():
is_domain_escrow = name == EnumFilenames.DOMAIN_ESCROW
filename = f"{value.filename}" filename = f"{value.filename}"
if filename in self.all_files_set: if filename in self.all_files_set:
_file = f"{self.directory}{value.filename}" _file = f"{self.directory}{value.filename}"
value.data = self._read_csv_file( value.data = self.parse_csv_file(
_file, _file,
self.seperator, self.seperator,
value.data_type, value.data_type,
value.id_field, value.id_field,
is_domain_escrow,
) )
else: else:
if not infer_filenames: if not infer_filenames:
@ -618,11 +650,12 @@ class ExtraTransitionDomain:
if filename in self.all_files_set: if filename in self.all_files_set:
logger.info(f"Infer success. Found file {filename}") logger.info(f"Infer success. Found file {filename}")
_file = f"{self.directory}{filename}" _file = f"{self.directory}{filename}"
value.data = self._read_csv_file( value.data = self.parse_csv_file(
_file, _file,
self.seperator, self.seperator,
value.data_type, value.data_type,
value.id_field, value.id_field,
is_domain_escrow,
) )
continue continue
# Log if we can't find the desired file # Log if we can't find the desired file
@ -633,6 +666,32 @@ class ExtraTransitionDomain:
file_type: PatternMap = item file_type: PatternMap = item
file_type.data = {} file_type.data = {}
def parse_csv_file(
    self, file, seperator, dataclass_type, id_field, is_domain_escrow=False
):
    """Read `file` and return its parsed contents.

    The domain escrow file is handled by its own reader; every other
    file goes through the generic CSV reader, which receives the
    dataclass type and id field for the rows.
    """
    if not is_domain_escrow:
        return self._read_csv_file(file, seperator, dataclass_type, id_field)

    # Domain escrow is an edge case
    return self._read_domain_escrow(file, seperator)
# Domain escrow is an edge case given that it's structured differently data-wise.
def _read_domain_escrow(self, file, seperator):
dict_data = {}
with open(file, "r", encoding="utf-8-sig") as requested_file:
reader = csv.reader(requested_file, delimiter=seperator)
for row in reader:
domain_name = row[0]
date_format = "%Y-%m-%dT%H:%M:%SZ"
# TODO - add error handling
creation_date = datetime.strptime(row[8], date_format)
expiration_date = datetime.strptime(row[10], date_format)
dict_data[domain_name] = DomainEscrow(
domain_name,
creation_date,
expiration_date
)
return dict_data
def _read_csv_file(self, file, seperator, dataclass_type, id_field): def _read_csv_file(self, file, seperator, dataclass_type, id_field):
with open(file, "r", encoding="utf-8-sig") as requested_file: with open(file, "r", encoding="utf-8-sig") as requested_file:
reader = csv.DictReader(requested_file, delimiter=seperator) reader = csv.DictReader(requested_file, delimiter=seperator)

View file

@ -1,26 +1,35 @@
from dataclasses import dataclass
@dataclass
class TransitionDomainArguments: class TransitionDomainArguments:
"""Stores arguments for load_transition_domain""" """Stores arguments for load_transition_domain"""
# Settings #
directory: str def __init__(self, **options):
seperator: str # Settings #
limit_parse: int self.directory = options.get('directory')
self.sep = options.get('sep')
self.limitParse = options.get('limitParse')
# Filenames #
## Adhocs ##
self.agency_adhoc_filename = options.get('agency_adhoc_filename')
self.domain_adhoc_filename = options.get('domain_adhoc_filename')
self.organization_adhoc_filename = options.get('organization_adhoc_filename')
## Data files ##
self.domain_additional_filename = options.get('domain_additional_filename')
self.domain_contacts_filename = options.get('domain_contacts_filename')
self.domain_statuses_filename = options.get('domain_statuses_filename')
# Flags #
self.debug = options.get('debug')
self.resetTable = options.get('resetTable')
# Filenames # def args_extra_transition_domain(self):
## Adhocs ## return {
agency_adhoc_filename: str "agency_adhoc_filename": self.agency_adhoc_filename,
domain_adhoc_filename: str "domain_adhoc_filename": self.domain_adhoc_filename,
organization_adhoc_filename: str "organization_adhoc_filename": self.organization_adhoc_filename,
"domain_additional_filename": self.domain_additional_filename,
## Data files ## "directory": self.directory,
domain_additional_filename: str "sep": self.sep,
domain_contacts_filename: str }
domain_statuses_filename: str
# Flags #
debug: bool
reset_table: bool
load_extra: bool