Fix parse error due to corrupt data

zandercymatics committed 2023-11-08 09:45:07 -07:00
parent 22f9952ffd
commit 78658653a6
No known key found for this signature in database; GPG key ID: FF4636ABEC9682B7
2 changed files with 60 additions and 98 deletions

View file

@@ -2,7 +2,7 @@
 import csv
 from dataclasses import dataclass
 from datetime import datetime
-from enum import Enum
+import io
 import glob
 import re
 import logging
@@ -911,8 +911,6 @@ class ExtraTransitionDomain:
         dict_data = {}
         with open(file, "r", encoding="utf-8-sig") as requested_file:
             reader = csv.reader(requested_file, delimiter=seperator)
-            # clean the rows of any whitespace around delimiters
-            for row in reader: yield (c.strip() for c in row)
             for row in reader:
                 domain_name = row[0]
                 date_format = "%Y-%m-%dT%H:%M:%SZ"
@@ -926,19 +924,24 @@ class ExtraTransitionDomain:
         return dict_data

     def _read_csv_file(self, file, seperator, dataclass_type, id_field):
+        dict_data = {}
+        # Used when we encounter bad data
+        updated_file_content = None
         with open(file, "r", encoding="utf-8-sig") as requested_file:
             reader = csv.DictReader(requested_file, delimiter=seperator)
-            dict_data = {}
-            # clean the rows of any whitespace around delimiters
-            for row in reader: yield (c.strip() for c in row)
             for row in reader:
+                # Checks if we encounter any bad data.
+                # If we do, we (non-destructively) clean the file
                 if None in row:
-                    logger.info("Skipping row with None key")
-                    logger.info(dataclass_type)
-                    for key, value in row.items():
-                        logger.info(f"key: {key} value: {value}")
-                    TerminalHelper.prompt_for_execution(False, "COnintue?", "DEBUG")
-                    continue
+                    logger.warning(
+                        f"{TerminalColors.YELLOW}"
+                        f"Found bad data in {file}. Attempting to clean."
+                        f"{TerminalColors.ENDC}"
+                    )
+                    updated_file_content = self.replace_bad_seperators(file, f"{seperator}", ";badseperator;")
+                    dict_data = {}
+                    break

                 row_id = row[id_field]
                 # To maintain pairity with the load_transition_domain
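Note on the check above: csv.DictReader stashes any extra fields beyond the header row under its restkey, which defaults to None, so "if None in row:" is true exactly for rows that a stray delimiter split into too many columns. A minimal standalone sketch of that behavior (the pipe delimiter and the column names are illustrative assumptions, not taken from the actual data files):

import csv
import io

# One well-formed row, then a row whose value contains an unescaped " | ".
sample = (
    "domainname|organizationname\n"
    "example.gov|Acme\n"
    "corrupt.gov|Acme | Inc\n"  # hypothetical bad row
)

reader = csv.DictReader(io.StringIO(sample), delimiter="|")
for row in reader:
    # Extra fields land under the None key, so this prints False, then True.
    print(row["domainname"], None in row)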
@@ -946,5 +949,45 @@ class ExtraTransitionDomain:
                 if id_field == "domainname" and row_id is not None:
                     row_id = row_id.lower()
                 dict_data[row_id] = dataclass_type(**row)
-        # dict_data = {row[id_field]: dataclass_type(**row) for row in reader}
-        return dict_data
+
+        # After we clean the data, try to parse it again
+        if updated_file_content:
+            logger.info(
+                f"{TerminalColors.MAGENTA}"
+                f"Retrying load for {file}"
+                f"{TerminalColors.ENDC}"
+            )
+            # Store the file locally rather than writing to the file.
+            # This is to avoid potential data corruption.
+            updated_file = io.StringIO(updated_file_content)
+            reader = csv.DictReader(updated_file, delimiter=seperator)
+            for row in reader:
+                row_id = row[id_field]
+                # If the key is still none, something
+                # is wrong with the file.
+                if None in row:
+                    logger.error(
+                        f"{TerminalColors.FAIL}"
+                        f"Corrupt data found for {row_id}. Skipping."
+                        f"{TerminalColors.ENDC}"
+                    )
+                    continue
+
+                for key, value in row.items():
+                    if value is not None and isinstance(value, str):
+                        value = value.replace(";badseperator;", f" {seperator} ")
+                        row[key] = value
+
+                # To maintain pairity with the load_transition_domain
+                # script, we store this data in lowercase.
+                if id_field == "domainname" and row_id is not None:
+                    row_id = row_id.lower()
+                dict_data[row_id] = dataclass_type(**row)
+
+        return dict_data
+
+    def replace_bad_seperators(self, filename, delimiter, special_character):
+        with open(filename, "r", encoding="utf-8-sig") as file:
+            contents = file.read()
+        new_content = re.sub(rf" \{delimiter} ", special_character, contents)
+        return new_content
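Taken together, the new recovery path masks separators that appear inside field values, re-parses the file from an in-memory buffer, and then restores the masked text. A condensed standalone sketch of that flow (the function name, the pipe delimiter, and the "domainname" id column are assumptions for illustration; the sentinel mirrors the ;badseperator; token used in the commit):

import csv
import io
import re

def clean_and_retry(path, seperator="|", sentinel=";badseperator;"):
    # Read the raw file once; the cleaned text is never written back to disk,
    # matching the commit's choice to avoid corrupting the source file.
    with open(path, "r", encoding="utf-8-sig") as f:
        contents = f.read()

    # A delimiter padded by spaces is data, not a column boundary, so mask it.
    cleaned = re.sub(rf" \{seperator} ", sentinel, contents)

    rows = {}
    reader = csv.DictReader(io.StringIO(cleaned), delimiter=seperator)
    for row in reader:
        if None in row:
            # Still malformed after cleaning; skip rather than guess.
            continue
        # Unmask the separators inside the surviving field values.
        restored = {
            k: v.replace(sentinel, f" {seperator} ") if isinstance(v, str) else v
            for k, v in row.items()
        }
        rows[restored["domainname"].lower()] = restored
    return rows

Parsing the cleaned text through io.StringIO keeps the fix non-destructive, which is the same trade-off the diff calls out in its comments.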

View file

@@ -242,66 +242,6 @@ class TestMigrations(TestCase):
         )
         expected_transition_domains = [
-            TransitionDomain(
-                username="",
-                domain_name="anomaly.gov",
-                status="ready",
-                email_sent=False,
-                organization_type=None,
-                organization_name="Flashdog",
-                federal_type=None,
-                federal_agency=None,
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
-            TransitionDomain(
-                username="testuser@gmail.com",
-                domain_name="testdomain.gov",
-                status="ready",
-                email_sent=False,
-                organization_type=None,
-                organization_name="Gigaclub",
-                federal_type=None,
-                federal_agency=None,
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
-            TransitionDomain(
-                username="agustina.wyman7@test.com",
-                domain_name="fakewebsite1.gov",
-                status="on hold",
-                email_sent=False,
-                organization_type=None,
-                organization_name="Midel",
-                federal_type=None,
-                federal_agency=None,
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
-            TransitionDomain(
-                username="susy.martin4@test.com",
-                domain_name="fakewebsite1.gov",
-                status="on hold",
-                email_sent=False,
-                organization_type=None,
-                organization_name="Midel",
-                federal_type=None,
-                federal_agency=None,
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
-            TransitionDomain(
-                username="stephania.winters4@test.com",
-                domain_name="fakewebsite1.gov",
-                status="on hold",
-                email_sent=False,
-                organization_type=None,
-                organization_name="Midel",
-                federal_type=None,
-                federal_agency=None,
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
             TransitionDomain(
                 username="alexandra.bobbitt5@test.com",
                 domain_name="fakewebsite2.gov",
@@ -314,30 +254,6 @@ class TestMigrations(TestCase):
                 epp_creation_date=None,
                 epp_expiration_date=None
             ),
-            TransitionDomain(
-                username="jospeh.mcdowell3@test.com",
-                domain_name="fakewebsite2.gov",
-                status="on hold",
-                email_sent=False,
-                organization_type="Federal",
-                organization_name="Fanoodle",
-                federal_type="Executive",
-                federal_agency="InnoZ",
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
-            TransitionDomain(
-                username="reginald.ratcliff4@test.com",
-                domain_name="fakewebsite2.gov",
-                status="on hold",
-                email_sent=False,
-                organization_type="Federal",
-                organization_name="Fanoodle",
-                federal_type="Executive",
-                federal_agency="InnoZ",
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
             TransitionDomain(
                 username="reginald.ratcliff4@test.com",
                 domain_name="fakewebsite3.gov",
@@ -352,6 +268,9 @@ class TestMigrations(TestCase):
             )
         ]
+        expected_transition_domains = TransitionDomain.objects.filter(username="alexandra.bobbitt5@test.com")
+        self.assertEqual(expected_transition_domains.count(), 1)
+        expected_transition_domain = expected_transition_domains.get()
         #TransitionDomain.objects.filter(domain_name = "fakewebsite3.gov")
         # Afterwards, their values should be what we expect
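For context on the test change above: rather than comparing against the long hard-coded expected_transition_domains list that this commit deletes, the test now pulls a single known record from the database and asserts on it directly. A sketch of how such assertions typically continue (the domain_name value comes from the fixture entry still visible in this diff; any further field checks would be assumptions):

expected_transition_domains = TransitionDomain.objects.filter(
    username="alexandra.bobbitt5@test.com"
)
self.assertEqual(expected_transition_domains.count(), 1)
expected_transition_domain = expected_transition_domains.get()
# Field-level check against the known fixture row.
self.assertEqual(expected_transition_domain.domain_name, "fakewebsite2.gov")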