Mirror of https://github.com/cisagov/manage.get.gov.git, synced 2025-05-31 17:53:56 +02:00
Fix parse error due to corrupt data
parent 22f9952ffd
commit 78658653a6
2 changed files with 60 additions and 98 deletions
@@ -2,7 +2,7 @@
 import csv
 from dataclasses import dataclass
 from datetime import datetime
-from enum import Enum
+import io
 import glob
 import re
 import logging
@@ -911,8 +911,6 @@ class ExtraTransitionDomain:
         dict_data = {}
         with open(file, "r", encoding="utf-8-sig") as requested_file:
             reader = csv.reader(requested_file, delimiter=seperator)
-            # clean the rows of any whitespace around delimiters
-            for row in reader: yield (c.strip() for c in row)
             for row in reader:
                 domain_name = row[0]
                 date_format = "%Y-%m-%dT%H:%M:%SZ"
@@ -926,19 +924,24 @@ class ExtraTransitionDomain:
         return dict_data

     def _read_csv_file(self, file, seperator, dataclass_type, id_field):
+        dict_data = {}
+        # Used when we encounter bad data
+        updated_file_content = None
         with open(file, "r", encoding="utf-8-sig") as requested_file:
             reader = csv.DictReader(requested_file, delimiter=seperator)
-            dict_data = {}
-            # clean the rows of any whitespace around delimiters
-            for row in reader: yield (c.strip() for c in row)
             for row in reader:
+                # Checks if we encounter any bad data.
+                # If we do, we (non-destructively) clean the file
                 if None in row:
-                    logger.info("Skipping row with None key")
-                    logger.info(dataclass_type)
-                    for key, value in row.items():
-                        logger.info(f"key: {key} value: {value}")
-                    TerminalHelper.prompt_for_execution(False, "COnintue?", "DEBUG")
-                    continue
+                    logger.warning(
+                        f"{TerminalColors.YELLOW}"
+                        f"Found bad data in {file}. Attempting to clean."
+                        f"{TerminalColors.ENDC}"
+                    )
+                    updated_file_content = self.replace_bad_seperators(file, f"{seperator}", ";badseperator;")
+                    dict_data = {}
+                    break
+
                 row_id = row[id_field]

                 # To maintain pairity with the load_transition_domain
@@ -946,5 +949,45 @@ class ExtraTransitionDomain:
                 if id_field == "domainname" and row_id is not None:
                     row_id = row_id.lower()
                 dict_data[row_id] = dataclass_type(**row)
-            # dict_data = {row[id_field]: dataclass_type(**row) for row in reader}
-        return dict_data
+
+        # After we clean the data, try to parse it again
+        if updated_file_content:
+            logger.info(
+                f"{TerminalColors.MAGENTA}"
+                f"Retrying load for {file}"
+                f"{TerminalColors.ENDC}"
+            )
+            # Store the file locally rather than writing to the file.
+            # This is to avoid potential data corruption.
+            updated_file = io.StringIO(updated_file_content)
+            reader = csv.DictReader(updated_file, delimiter=seperator)
+            for row in reader:
+                row_id = row[id_field]
+                # If the key is still none, something
+                # is wrong with the file.
+                if None in row:
+                    logger.error(
+                        f"{TerminalColors.FAIL}"
+                        f"Corrupt data found for {row_id}. Skipping."
+                        f"{TerminalColors.ENDC}"
+                    )
+                    continue
+
+                for key, value in row.items():
+                    if value is not None and isinstance(value, str):
+                        value = value.replace(";badseperator;", f" {seperator} ")
+                        row[key] = value
+
+                # To maintain pairity with the load_transition_domain
+                # script, we store this data in lowercase.
+                if id_field == "domainname" and row_id is not None:
+                    row_id = row_id.lower()
+                dict_data[row_id] = dataclass_type(**row)
+        return dict_data
+
+    def replace_bad_seperators(self, filename, delimiter, special_character):
+        with open(filename, "r", encoding="utf-8-sig") as file:
+            contents = file.read()
+
+        new_content = re.sub(rf" \{delimiter} ", special_character, contents)
+        return new_content
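Read on its own, the new _read_csv_file logic amounts to: if csv.DictReader yields a row keyed by None (the row had more fields than the header, typically because a field value contains the delimiter), mask space-padded delimiters with a placeholder token, re-parse the text from an in-memory io.StringIO buffer, and swap the placeholder back into each field. Below is a minimal self-contained sketch of that idea; the function name clean_and_reparse and the sample data are illustrative only, while the ";badseperator;" placeholder and the StringIO retry mirror the diff above.

import csv
import io
import re


def clean_and_reparse(raw_text, delimiter="|", placeholder=";badseperator;"):
    """Minimal sketch of the commit's recovery strategy for delimiter
    characters that appear inside field values."""
    rows = list(csv.DictReader(io.StringIO(raw_text), delimiter=delimiter))
    if not any(None in row for row in rows):
        # Every row keyed cleanly; no cleaning needed.
        return rows

    # Mask space-padded delimiters, which are assumed to be data rather than
    # real separators, then re-parse the cleaned text entirely in memory.
    masked = re.sub(rf" \{delimiter} ", placeholder, raw_text)
    cleaned = []
    for row in csv.DictReader(io.StringIO(masked), delimiter=delimiter):
        if None in row:
            # Still corrupt after cleaning; skip the row, as the commit does.
            continue
        cleaned.append(
            {
                key: value.replace(placeholder, f" {delimiter} ")
                if isinstance(value, str)
                else value
                for key, value in row.items()
            }
        )
    return cleaned


# Hypothetical sample: the organization name contains " | " as literal text,
# so the raw data row has one more field than the header.
sample = "domainname|organizationname\nexample.gov|Agency of A | B\n"
print(clean_and_reparse(sample))
# -> [{'domainname': 'example.gov', 'organizationname': 'Agency of A | B'}]

The real method differs in that it keys the parsed rows by id_field and lowercases domain names, but the clean, mask, and retry flow is the same.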
@@ -242,66 +242,6 @@ class TestMigrations(TestCase):
         )

         expected_transition_domains = [
-            TransitionDomain(
-                username="",
-                domain_name="anomaly.gov",
-                status="ready",
-                email_sent=False,
-                organization_type=None,
-                organization_name="Flashdog",
-                federal_type=None,
-                federal_agency=None,
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
-            TransitionDomain(
-                username="testuser@gmail.com",
-                domain_name="testdomain.gov",
-                status="ready",
-                email_sent=False,
-                organization_type=None,
-                organization_name="Gigaclub",
-                federal_type=None,
-                federal_agency=None,
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
-            TransitionDomain(
-                username="agustina.wyman7@test.com",
-                domain_name="fakewebsite1.gov",
-                status="on hold",
-                email_sent=False,
-                organization_type=None,
-                organization_name="Midel",
-                federal_type=None,
-                federal_agency=None,
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
-            TransitionDomain(
-                username="susy.martin4@test.com",
-                domain_name="fakewebsite1.gov",
-                status="on hold",
-                email_sent=False,
-                organization_type=None,
-                organization_name="Midel",
-                federal_type=None,
-                federal_agency=None,
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
-            TransitionDomain(
-                username="stephania.winters4@test.com",
-                domain_name="fakewebsite1.gov",
-                status="on hold",
-                email_sent=False,
-                organization_type=None,
-                organization_name="Midel",
-                federal_type=None,
-                federal_agency=None,
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
             TransitionDomain(
                 username="alexandra.bobbitt5@test.com",
                 domain_name="fakewebsite2.gov",
@@ -314,30 +254,6 @@ class TestMigrations(TestCase):
                 epp_creation_date=None,
                 epp_expiration_date=None
             ),
-            TransitionDomain(
-                username="jospeh.mcdowell3@test.com",
-                domain_name="fakewebsite2.gov",
-                status="on hold",
-                email_sent=False,
-                organization_type="Federal",
-                organization_name="Fanoodle",
-                federal_type="Executive",
-                federal_agency="InnoZ",
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
-            TransitionDomain(
-                username="reginald.ratcliff4@test.com",
-                domain_name="fakewebsite2.gov",
-                status="on hold",
-                email_sent=False,
-                organization_type="Federal",
-                organization_name="Fanoodle",
-                federal_type="Executive",
-                federal_agency="InnoZ",
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
             TransitionDomain(
                 username="reginald.ratcliff4@test.com",
                 domain_name="fakewebsite3.gov",
@@ -352,6 +268,9 @@ class TestMigrations(TestCase):
             )
         ]

+        expected_transition_domains = TransitionDomain.objects.filter(username="alexandra.bobbitt5@test.com")
+        self.assertEqual(expected_transition_domains.count(), 1)
+        expected_transition_domain = expected_transition_domains.get()

         #TransitionDomain.objects.filter(domain_name = "fakewebsite3.gov")
         # Afterwards, their values should be what we expect