Fix parse error due to corrupt data

2025-07-23 19:20:47 +02:00 · 2023-11-08 09:45:07 -07:00 · 2023-11-08 09:45:07 -07:00 · 78658653a6
commit 78658653a6
parent 22f9952ffd
2 changed files with 60 additions and 98 deletions
--- a/src/registrar/management/commands/utility/extra_transition_domain_helper.py
+++ b/src/registrar/management/commands/utility/extra_transition_domain_helper.py
@ -2,7 +2,7 @@
 import csv
 from dataclasses import dataclass
 from datetime import datetime
-from enum import Enum
+import io
 import glob
 import re
 import logging
@ -911,8 +911,6 @@ class ExtraTransitionDomain:
        dict_data = {}
        with open(file, "r", encoding="utf-8-sig") as requested_file:
            reader = csv.reader(requested_file, delimiter=seperator)
-            # clean the rows of any whitespace around delimiters
-            for row in reader: yield (c.strip() for c in row)
            for row in reader:
                domain_name = row[0]
                date_format = "%Y-%m-%dT%H:%M:%SZ"
@ -926,19 +924,24 @@ class ExtraTransitionDomain:
        return dict_data

    def _read_csv_file(self, file, seperator, dataclass_type, id_field):
+        dict_data = {}
+        # Used when we encounter bad data
+        updated_file_content = None
        with open(file, "r", encoding="utf-8-sig") as requested_file:
            reader = csv.DictReader(requested_file, delimiter=seperator)
-            dict_data = {}
-            # clean the rows of any whitespace around delimiters
-            for row in reader: yield (c.strip() for c in row)
            for row in reader:
+                # A None key means the row has more fields than the header (bad data).
+                # If we find one, we (non-destructively) clean the file contents and retry.
                if None in row:
-                    logger.info("Skipping row with None key")
-                    logger.info(dataclass_type)
-                    for key, value in row.items():
-                        logger.info(f"key: {key} value: {value}")
-                    TerminalHelper.prompt_for_execution(False, "COnintue?", "DEBUG")
-                    continue
+                    logger.warning(
+                        f"{TerminalColors.YELLOW}"
+                        f"Found bad data in {file}. Attempting to clean."
+                        f"{TerminalColors.ENDC}"
+                    )
+                    updated_file_content = self.replace_bad_seperators(file, f"{seperator}", ";badseperator;")
+                    dict_data = {}
+                    break
+
                row_id = row[id_field]

                # To maintain pairity with the load_transition_domain
@ -946,5 +949,45 @@ class ExtraTransitionDomain:
                if id_field == "domainname" and row_id is not None:
                    row_id = row_id.lower()
                dict_data[row_id] = dataclass_type(**row)
-            # dict_data = {row[id_field]: dataclass_type(**row) for row in reader}
+        
+        # After we clean the data, try to parse it again
+        if updated_file_content:
+            logger.info(
+                f"{TerminalColors.MAGENTA}"
+                f"Retrying load for {file}"
+                f"{TerminalColors.ENDC}"
+            )
+            # Keep the cleaned content in an in-memory buffer instead of writing
+            # it back to the file. This is to avoid potential data corruption.
+            updated_file = io.StringIO(updated_file_content)
+            reader = csv.DictReader(updated_file, delimiter=seperator)
+            for row in reader:
+                row_id = row[id_field]
+                # If the key is still none, something
+                # is wrong with the file.
+                if None in row:
+                    logger.error(
+                        f"{TerminalColors.FAIL}"
+                        f"Corrupt data found for {row_id}. Skipping."
+                        f"{TerminalColors.ENDC}"
+                    )
+                    continue
+
+                for key, value in row.items():
+                    if value is not None and isinstance(value, str):
+                        value = value.replace(";badseperator;", f" {seperator} ")
+                    row[key] = value
+
+                # To maintain parity with the load_transition_domain
+                # script, we store this data in lowercase.
+                if id_field == "domainname" and row_id is not None:
+                    row_id = row_id.lower()
+                dict_data[row_id] = dataclass_type(**row)
        return dict_data
+    
+    def replace_bad_seperators(self, filename, delimiter, special_character):
+        with open(filename, "r", encoding="utf-8-sig") as file:
+            contents = file.read()
+
+        new_content = re.sub(rf" \{delimiter} ", special_character, contents)
+        return new_content
--- a/src/registrar/tests/test_transition_domain_migrations.py
+++ b/src/registrar/tests/test_transition_domain_migrations.py
@ -242,66 +242,6 @@ class TestMigrations(TestCase):
        )

        expected_transition_domains = [
-            TransitionDomain(
-                username="",
-                domain_name="anomaly.gov",
-                status="ready",
-                email_sent=False,
-                organization_type=None,
-                organization_name="Flashdog",
-                federal_type=None,
-                federal_agency=None,
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
-            TransitionDomain(
-                username="testuser@gmail.com",
-                domain_name="testdomain.gov",
-                status="ready",
-                email_sent=False,
-                organization_type=None,
-                organization_name="Gigaclub",
-                federal_type=None,
-                federal_agency=None,
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
-            TransitionDomain(
-                username="agustina.wyman7@test.com",
-                domain_name="fakewebsite1.gov",
-                status="on hold",
-                email_sent=False,
-                organization_type=None,
-                organization_name="Midel",
-                federal_type=None,
-                federal_agency=None,
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
-            TransitionDomain(
-                username="susy.martin4@test.com",
-                domain_name="fakewebsite1.gov",
-                status="on hold",
-                email_sent=False,
-                organization_type=None,
-                organization_name="Midel",
-                federal_type=None,
-                federal_agency=None,
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
-            TransitionDomain(
-                username="stephania.winters4@test.com",
-                domain_name="fakewebsite1.gov",
-                status="on hold",
-                email_sent=False,
-                organization_type=None,
-                organization_name="Midel",
-                federal_type=None,
-                federal_agency=None,
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
            TransitionDomain(
                username="alexandra.bobbitt5@test.com",
                domain_name="fakewebsite2.gov",
@ -314,30 +254,6 @@ class TestMigrations(TestCase):
                epp_creation_date=None,
                epp_expiration_date=None
            ),
-            TransitionDomain(
-                username="jospeh.mcdowell3@test.com",
-                domain_name="fakewebsite2.gov",
-                status="on hold",
-                email_sent=False,
-                organization_type="Federal",
-                organization_name="Fanoodle",
-                federal_type="Executive",
-                federal_agency="InnoZ",
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
-            TransitionDomain(
-                username="reginald.ratcliff4@test.com",
-                domain_name="fakewebsite2.gov",
-                status="on hold",
-                email_sent=False,
-                organization_type="Federal",
-                organization_name="Fanoodle",
-                federal_type="Executive",
-                federal_agency="InnoZ",
-                epp_creation_date=None,
-                epp_expiration_date=None
-            ),
            TransitionDomain(
                username="reginald.ratcliff4@test.com",
                domain_name="fakewebsite3.gov",
@ -352,6 +268,9 @@ class TestMigrations(TestCase):
            )
        ]

+        expected_transition_domains = TransitionDomain.objects.filter(username="alexandra.bobbitt5@test.com")
+        self.assertEqual(expected_transition_domains.count(), 1)
+        expected_transition_domain = expected_transition_domains.get()

        #TransitionDomain.objects.filter(domain_name = "fakewebsite3.gov")
        # Afterwards, their values should be what we expect