From 1b6667ab73097a63ed869f76e71391291ee92a10 Mon Sep 17 00:00:00 2001 From: zandercymatics <141044360+zandercymatics@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:01:15 -0700 Subject: [PATCH] Cleanup script --- .../commands/create_federal_portfolio.py | 10 +- .../commands/patch_suborganizations.py | 216 +++++++----------- .../commands/utility/terminal_helper.py | 19 +- .../models/utility/generic_helper.py | 15 ++ 4 files changed, 107 insertions(+), 153 deletions(-) diff --git a/src/registrar/management/commands/create_federal_portfolio.py b/src/registrar/management/commands/create_federal_portfolio.py index 37bd4765d..43030d078 100644 --- a/src/registrar/management/commands/create_federal_portfolio.py +++ b/src/registrar/management/commands/create_federal_portfolio.py @@ -133,15 +133,15 @@ class Command(BaseCommand): portfolio__isnull=True, organization_name__isnull=True, sub_organization__isnull=True, - organization_name__iexact=F("portfolio__organization_name") - ).in_bulk("organization_name") + organization_name__iexact=F("portfolio__organization_name"), + ).in_bulk(field_name="organization_name") requests = requests.exclude( portfolio__isnull=True, organization_name__isnull=True, sub_organization__isnull=True, - organization_name__iexact=F("portfolio__organization_name") - ).in_bulk("organization_name") + organization_name__iexact=F("portfolio__organization_name"), + ).in_bulk(field_name="organization_name") for suborg in suborganizations: domain = domains.get(suborg.name, None) @@ -169,7 +169,7 @@ class Command(BaseCommand): if city: suborg.city = city - + if suborg: suborg.state_territory = state_territory diff --git a/src/registrar/management/commands/patch_suborganizations.py b/src/registrar/management/commands/patch_suborganizations.py index 07fb94f3b..2c471aad9 100644 --- a/src/registrar/management/commands/patch_suborganizations.py +++ b/src/registrar/management/commands/patch_suborganizations.py @@ -1,9 +1,8 @@ -@ -0,0 +1,123 @@ import logging from django.core.management import BaseCommand from registrar.models import Suborganization, DomainRequest, DomainInformation from registrar.management.commands.utility.terminal_helper import TerminalColors, TerminalHelper -from registrar.models.utility.generic_helper import normalize_string +from registrar.models.utility.generic_helper import count_capitals, normalize_string logger = logging.getLogger(__name__) @@ -13,157 +12,98 @@ class Command(BaseCommand): help = "Clean up duplicate suborganizations that differ only by spaces and capitalization" def handle(self, **kwargs): - manual_records = [ - "Assistant Secretary for Preparedness and Response Office of the Secretary", - "US Geological Survey", - "USDA/OC", - ] - duplicates = {} - for record in Suborganization.objects.filter(name__in=manual_records): - if record.name: - norm_name = normalize_string(record.name) - duplicates[norm_name] = { - "keep": None, - "delete": [record] - } + """Process manual deletions and find/remove duplicates. Shows preview + and updates DomainInformation / DomainRequest sub_organization references before deletion.""" - records_to_delete.update(self.handle_suborganization_duplicates()) + # First: get a preset list of records we want to delete. + # The key gets deleted, the value gets kept. + additional_records_to_delete = { + normalize_string("Assistant Secretary for Preparedness and Response Office of the Secretary"): { + "keep": Suborganization.objects.none() + }, + normalize_string("US Geological Survey"): {"keep": Suborganization.objects.none()}, + normalize_string("USDA/OC"): {"keep": Suborganization.objects.none()}, + } - # Get confirmation and execute deletions - if TerminalHelper.prompt_for_execution( - system_exit_on_terminate=True, - prompt_message=preview, - prompt_title="Clean up duplicate suborganizations?", - verify_message="*** WARNING: This will delete suborganizations! ***" - ): - # Update all references to point to the right suborg before deletion - for record in duplicates.values(): - best_record = record.get("keep") - delete_ids = [dupe.id for dupe in record.get("delete")] - - # Update domain requests - DomainRequest.objects.filter( - sub_organization_id__in=delete_ids - ).update(sub_organization=best_record) - - # Update domain information - DomainInformation.objects.filter( - sub_organization_id__in=delete_ids - ).update(sub_organization=best_record) + # First: Group all suborganization names by their "normalized" names (finding duplicates) + name_groups = {} + for suborg in Suborganization.objects.all(): + normalized_name = normalize_string(suborg.name) + if normalized_name not in name_groups: + name_groups[normalized_name] = [] + name_groups[normalized_name].append(suborg) - records_to_delete = set( - dupe.id - for data in duplicates.values() - for dupe in data["delete"] - ) - try: - delete_count, _ = Suborganization.objects.filter(id__in=records_to_delete).delete() - logger.info(f"{TerminalColors.OKGREEN}Successfully deleted {delete_count} suborganizations{TerminalColors.ENDC}") - except Exception as e: - logger.error(f"{TerminalColors.FAIL}Failed to clean up suborganizations: {str(e)}{TerminalColors.ENDC}") - - - def handle_suborganization_duplicates(self, duplicates): - # Find duplicates - all_suborgs = Suborganization.objects.all() - for suborg in all_suborgs: - # Normalize name by removing extra spaces and converting to lowercase - normalized_name = " ".join(suborg.name.split()).lower() - - # First occurrence of this name - if normalized_name not in duplicates: - duplicates[normalized_name] = { - "keep": suborg, - "delete": [] - } + # Second: find the record we should keep, and the duplicate records we should delete + records_to_prune = {} + for normalized_name, duplicate_suborgs in name_groups.items(): + if normalized_name in additional_records_to_delete: + record = additional_records_to_delete.get(normalized_name) + records_to_prune[normalized_name] = {"keep": record.get("keep"), "delete": duplicate_suborgs} continue - # Compare with our current best - current_best = duplicates[normalized_name]["keep"] + if len(duplicate_suborgs) > 1: + # Pick the best record to keep. + # The fewest spaces and most capitals (at the beginning of each word) wins. + best_record = duplicate_suborgs[0] + for suborg in duplicate_suborgs: + has_fewer_spaces = suborg.name.count(" ") < best_record.name.count(" ") + has_more_capitals = count_capitals(suborg.name, leading_only=True) > count_capitals( + best_record.name, leading_only=True + ) + if has_fewer_spaces or has_more_capitals: + best_record = suborg - # Check if all other fields match. - # If they don't, we should inspect this record manually. - fields_to_compare = ["portfolio", "city", "state_territory"] - fields_match = all( - getattr(suborg, field) == getattr(current_best, field) - for field in fields_to_compare - ) - if not fields_match: - logger.warning( - f"{TerminalColors.YELLOW}" - f"\nSkipping potential duplicate: {suborg.name} (id: {suborg.id})" - f"\nData mismatch with {current_best.name} (id: {current_best.id})" - f"{TerminalColors.ENDC}" - ) - continue - - # Determine if new suborg is better than current best. - # The fewest spaces and most capitals wins. - new_has_fewer_spaces = suborg.name.count(" ") < current_best.name.count(" ") - new_has_more_capitals = sum(1 for c in suborg.name if c.isupper()) > sum(1 for c in current_best.name if c.isupper()) - # TODO - # Split into words and count properly capitalized first letters - # new_proper_caps = sum( - # 1 for word in suborg.name.split() - # if word and word[0].isupper() - # ) - # current_proper_caps = sum( - # 1 for word in current_best.name.split() - # if word and word[0].isupper() - # ) - # new_has_better_caps = new_proper_caps > current_proper_caps + records_to_prune[normalized_name] = { + "keep": best_record, + "delete": [s for s in duplicate_suborgs if s != best_record], + } - if new_has_fewer_spaces or new_has_more_capitals: - # New suborg is better - demote the old one to the delete list - duplicates[normalized_name]["delete"].append(current_best) - duplicates[normalized_name]["keep"] = suborg - else: - # If it is not better, just delete the old one - duplicates[normalized_name]["delete"].append(suborg) - - # Filter out entries without duplicates - duplicates = {k: v for k, v in duplicates.items() if v.get("delete")} - if not duplicates: - logger.info(f"No duplicate suborganizations found.") + if len(records_to_prune) == 0: + TerminalHelper.colorful_logger(logger.error, TerminalColors.FAIL, "No suborganizations to delete.") return - # Show preview of changes - preview = "The following duplicates will be removed:\n" - for data in duplicates.values(): - best = data.get("keep") - preview += f"\nKeeping: '{best.name}' (id: {best.id})" - + # Third: Show a preview of the changes + total_records_to_remove = 0 + preview = "The following records will be removed:\n" + for data in records_to_prune.values(): + keep = data.get("keep") + if keep: + preview += f"\nKeeping: '{keep.name}' (id: {keep.id})" + for duplicate in data.get("delete"): preview += f"\nRemoving: '{duplicate.name}' (id: {duplicate.id})" + total_records_to_remove += 1 preview += "\n" - # Get confirmation and execute deletions + # Fourth: Get user confirmation and execute deletions if TerminalHelper.prompt_for_execution( system_exit_on_terminate=True, prompt_message=preview, - prompt_title="Clean up duplicate suborganizations?", - verify_message="*** WARNING: This will delete suborganizations! ***" + prompt_title=f"Remove {total_records_to_remove} suborganizations?", + verify_message="*** WARNING: This will replace the record on DomainInformation and DomainRequest! ***", ): - # Update all references to point to the right suborg before deletion - for record in duplicates.values(): - best_record = record.get("keep") - delete_ids = [dupe.id for dupe in record.get("delete")] - - # Update domain requests - DomainRequest.objects.filter( - sub_organization_id__in=delete_ids - ).update(sub_organization=best_record) - - # Update domain information - DomainInformation.objects.filter( - sub_organization_id__in=delete_ids - ).update(sub_organization=best_record) + try: + # Update all references to point to the right suborg before deletion + all_suborgs_to_remove = set() + for record in records_to_prune.values(): + best_record = record["keep"] + suborgs_to_remove = {dupe.id for dupe in record["delete"]} + # Update domain requests + DomainRequest.objects.filter(sub_organization_id__in=suborgs_to_remove).update( + sub_organization=best_record + ) - records_to_delete = set( - dupe.id - for data in duplicates.values() - for dupe in data["delete"] - ) - return records_to_delete - else: - return set() + # Update domain information + DomainInformation.objects.filter(sub_organization_id__in=suborgs_to_remove).update( + sub_organization=best_record + ) + + all_suborgs_to_remove.update(suborgs_to_remove) + delete_count, _ = Suborganization.objects.filter(id__in=all_suborgs_to_remove).delete() + TerminalHelper.colorful_logger( + logger.info, TerminalColors.MAGENTA, f"Successfully deleted {delete_count} suborganizations." + ) + except Exception as e: + TerminalHelper.colorful_logger( + logger.error, TerminalColors.FAIL, f"Failed to delete suborganizations: {str(e)}" + ) diff --git a/src/registrar/management/commands/utility/terminal_helper.py b/src/registrar/management/commands/utility/terminal_helper.py index eed1027f7..b16ca72f2 100644 --- a/src/registrar/management/commands/utility/terminal_helper.py +++ b/src/registrar/management/commands/utility/terminal_helper.py @@ -401,16 +401,15 @@ class TerminalHelper: # Allow the user to inspect the command string # and ask if they wish to proceed proceed_execution = TerminalHelper.query_yes_no_exit( - f"""{TerminalColors.OKCYAN} - ===================================================== - {prompt_title} - ===================================================== - {verify_message} - - {prompt_message} - {TerminalColors.FAIL} - Proceed? (Y = proceed, N = {action_description_for_selecting_no}) - {TerminalColors.ENDC}""" + f"\n{TerminalColors.OKCYAN}" + "=====================================================" + f"\n{prompt_title}\n" + "=====================================================" + f"\n{verify_message}\n" + f"\n{prompt_message}\n" + f"{TerminalColors.FAIL}" + f"Proceed? (Y = proceed, N = {action_description_for_selecting_no})" + f"{TerminalColors.ENDC}" ) # If the user decided to proceed return true. diff --git a/src/registrar/models/utility/generic_helper.py b/src/registrar/models/utility/generic_helper.py index 84dc28db1..af7780194 100644 --- a/src/registrar/models/utility/generic_helper.py +++ b/src/registrar/models/utility/generic_helper.py @@ -344,6 +344,7 @@ def value_of_attribute(obj, attribute_name: str): value = value() return value + def normalize_string(string_to_normalize, lowercase=True): """Normalizes a given string. Returns a string without extra spaces, in all lowercase.""" if not isinstance(string_to_normalize, str): @@ -352,3 +353,17 @@ def normalize_string(string_to_normalize, lowercase=True): new_string = " ".join(string_to_normalize.split()) return new_string.lower() if lowercase else new_string + + +def count_capitals(text: str, leading_only: bool): + """Counts capital letters in a string. + Args: + text (str): The string to analyze. + leading_only (bool): If False, counts all capital letters. + If True, only counts capitals at the start of words. + Returns: + int: Number of capital letters found. + """ + if leading_only: + return sum(word[0].isupper() for word in text.split() if word) + return sum(c.isupper() for c in text if c)