mirror of
https://github.com/cisagov/manage.get.gov.git
synced 2025-08-01 07:26:34 +02:00
Cleanup script
This commit is contained in:
parent
ae2e4c5999
commit
1b6667ab73
4 changed files with 107 additions and 153 deletions
|
@ -133,15 +133,15 @@ class Command(BaseCommand):
|
|||
portfolio__isnull=True,
|
||||
organization_name__isnull=True,
|
||||
sub_organization__isnull=True,
|
||||
organization_name__iexact=F("portfolio__organization_name")
|
||||
).in_bulk("organization_name")
|
||||
organization_name__iexact=F("portfolio__organization_name"),
|
||||
).in_bulk(field_name="organization_name")
|
||||
|
||||
requests = requests.exclude(
|
||||
portfolio__isnull=True,
|
||||
organization_name__isnull=True,
|
||||
sub_organization__isnull=True,
|
||||
organization_name__iexact=F("portfolio__organization_name")
|
||||
).in_bulk("organization_name")
|
||||
organization_name__iexact=F("portfolio__organization_name"),
|
||||
).in_bulk(field_name="organization_name")
|
||||
|
||||
for suborg in suborganizations:
|
||||
domain = domains.get(suborg.name, None)
|
||||
|
@ -169,7 +169,7 @@ class Command(BaseCommand):
|
|||
|
||||
if city:
|
||||
suborg.city = city
|
||||
|
||||
|
||||
if suborg:
|
||||
suborg.state_territory = state_territory
|
||||
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
@ -0,0 +1,123 @@
|
||||
import logging
|
||||
from django.core.management import BaseCommand
|
||||
from registrar.models import Suborganization, DomainRequest, DomainInformation
|
||||
from registrar.management.commands.utility.terminal_helper import TerminalColors, TerminalHelper
|
||||
from registrar.models.utility.generic_helper import normalize_string
|
||||
from registrar.models.utility.generic_helper import count_capitals, normalize_string
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -13,157 +12,98 @@ class Command(BaseCommand):
|
|||
help = "Clean up duplicate suborganizations that differ only by spaces and capitalization"
|
||||
|
||||
def handle(self, **kwargs):
|
||||
manual_records = [
|
||||
"Assistant Secretary for Preparedness and Response Office of the Secretary",
|
||||
"US Geological Survey",
|
||||
"USDA/OC",
|
||||
]
|
||||
duplicates = {}
|
||||
for record in Suborganization.objects.filter(name__in=manual_records):
|
||||
if record.name:
|
||||
norm_name = normalize_string(record.name)
|
||||
duplicates[norm_name] = {
|
||||
"keep": None,
|
||||
"delete": [record]
|
||||
}
|
||||
"""Process manual deletions and find/remove duplicates. Shows preview
|
||||
and updates DomainInformation / DomainRequest sub_organization references before deletion."""
|
||||
|
||||
records_to_delete.update(self.handle_suborganization_duplicates())
|
||||
# First: get a preset list of records we want to delete.
|
||||
# The key gets deleted, the value gets kept.
|
||||
additional_records_to_delete = {
|
||||
normalize_string("Assistant Secretary for Preparedness and Response Office of the Secretary"): {
|
||||
"keep": Suborganization.objects.none()
|
||||
},
|
||||
normalize_string("US Geological Survey"): {"keep": Suborganization.objects.none()},
|
||||
normalize_string("USDA/OC"): {"keep": Suborganization.objects.none()},
|
||||
}
|
||||
|
||||
# Get confirmation and execute deletions
|
||||
if TerminalHelper.prompt_for_execution(
|
||||
system_exit_on_terminate=True,
|
||||
prompt_message=preview,
|
||||
prompt_title="Clean up duplicate suborganizations?",
|
||||
verify_message="*** WARNING: This will delete suborganizations! ***"
|
||||
):
|
||||
# Update all references to point to the right suborg before deletion
|
||||
for record in duplicates.values():
|
||||
best_record = record.get("keep")
|
||||
delete_ids = [dupe.id for dupe in record.get("delete")]
|
||||
|
||||
# Update domain requests
|
||||
DomainRequest.objects.filter(
|
||||
sub_organization_id__in=delete_ids
|
||||
).update(sub_organization=best_record)
|
||||
|
||||
# Update domain information
|
||||
DomainInformation.objects.filter(
|
||||
sub_organization_id__in=delete_ids
|
||||
).update(sub_organization=best_record)
|
||||
# First: Group all suborganization names by their "normalized" names (finding duplicates)
|
||||
name_groups = {}
|
||||
for suborg in Suborganization.objects.all():
|
||||
normalized_name = normalize_string(suborg.name)
|
||||
if normalized_name not in name_groups:
|
||||
name_groups[normalized_name] = []
|
||||
name_groups[normalized_name].append(suborg)
|
||||
|
||||
records_to_delete = set(
|
||||
dupe.id
|
||||
for data in duplicates.values()
|
||||
for dupe in data["delete"]
|
||||
)
|
||||
try:
|
||||
delete_count, _ = Suborganization.objects.filter(id__in=records_to_delete).delete()
|
||||
logger.info(f"{TerminalColors.OKGREEN}Successfully deleted {delete_count} suborganizations{TerminalColors.ENDC}")
|
||||
except Exception as e:
|
||||
logger.error(f"{TerminalColors.FAIL}Failed to clean up suborganizations: {str(e)}{TerminalColors.ENDC}")
|
||||
|
||||
|
||||
def handle_suborganization_duplicates(self, duplicates):
|
||||
# Find duplicates
|
||||
all_suborgs = Suborganization.objects.all()
|
||||
for suborg in all_suborgs:
|
||||
# Normalize name by removing extra spaces and converting to lowercase
|
||||
normalized_name = " ".join(suborg.name.split()).lower()
|
||||
|
||||
# First occurrence of this name
|
||||
if normalized_name not in duplicates:
|
||||
duplicates[normalized_name] = {
|
||||
"keep": suborg,
|
||||
"delete": []
|
||||
}
|
||||
# Second: find the record we should keep, and the duplicate records we should delete
|
||||
records_to_prune = {}
|
||||
for normalized_name, duplicate_suborgs in name_groups.items():
|
||||
if normalized_name in additional_records_to_delete:
|
||||
record = additional_records_to_delete.get(normalized_name)
|
||||
records_to_prune[normalized_name] = {"keep": record.get("keep"), "delete": duplicate_suborgs}
|
||||
continue
|
||||
|
||||
# Compare with our current best
|
||||
current_best = duplicates[normalized_name]["keep"]
|
||||
if len(duplicate_suborgs) > 1:
|
||||
# Pick the best record to keep.
|
||||
# The fewest spaces and most capitals (at the beginning of each word) wins.
|
||||
best_record = duplicate_suborgs[0]
|
||||
for suborg in duplicate_suborgs:
|
||||
has_fewer_spaces = suborg.name.count(" ") < best_record.name.count(" ")
|
||||
has_more_capitals = count_capitals(suborg.name, leading_only=True) > count_capitals(
|
||||
best_record.name, leading_only=True
|
||||
)
|
||||
if has_fewer_spaces or has_more_capitals:
|
||||
best_record = suborg
|
||||
|
||||
# Check if all other fields match.
|
||||
# If they don't, we should inspect this record manually.
|
||||
fields_to_compare = ["portfolio", "city", "state_territory"]
|
||||
fields_match = all(
|
||||
getattr(suborg, field) == getattr(current_best, field)
|
||||
for field in fields_to_compare
|
||||
)
|
||||
if not fields_match:
|
||||
logger.warning(
|
||||
f"{TerminalColors.YELLOW}"
|
||||
f"\nSkipping potential duplicate: {suborg.name} (id: {suborg.id})"
|
||||
f"\nData mismatch with {current_best.name} (id: {current_best.id})"
|
||||
f"{TerminalColors.ENDC}"
|
||||
)
|
||||
continue
|
||||
|
||||
# Determine if new suborg is better than current best.
|
||||
# The fewest spaces and most capitals wins.
|
||||
new_has_fewer_spaces = suborg.name.count(" ") < current_best.name.count(" ")
|
||||
new_has_more_capitals = sum(1 for c in suborg.name if c.isupper()) > sum(1 for c in current_best.name if c.isupper())
|
||||
# TODO
|
||||
# Split into words and count properly capitalized first letters
|
||||
# new_proper_caps = sum(
|
||||
# 1 for word in suborg.name.split()
|
||||
# if word and word[0].isupper()
|
||||
# )
|
||||
# current_proper_caps = sum(
|
||||
# 1 for word in current_best.name.split()
|
||||
# if word and word[0].isupper()
|
||||
# )
|
||||
# new_has_better_caps = new_proper_caps > current_proper_caps
|
||||
records_to_prune[normalized_name] = {
|
||||
"keep": best_record,
|
||||
"delete": [s for s in duplicate_suborgs if s != best_record],
|
||||
}
|
||||
|
||||
if new_has_fewer_spaces or new_has_more_capitals:
|
||||
# New suborg is better - demote the old one to the delete list
|
||||
duplicates[normalized_name]["delete"].append(current_best)
|
||||
duplicates[normalized_name]["keep"] = suborg
|
||||
else:
|
||||
# If it is not better, just delete the old one
|
||||
duplicates[normalized_name]["delete"].append(suborg)
|
||||
|
||||
# Filter out entries without duplicates
|
||||
duplicates = {k: v for k, v in duplicates.items() if v.get("delete")}
|
||||
if not duplicates:
|
||||
logger.info(f"No duplicate suborganizations found.")
|
||||
if len(records_to_prune) == 0:
|
||||
TerminalHelper.colorful_logger(logger.error, TerminalColors.FAIL, "No suborganizations to delete.")
|
||||
return
|
||||
|
||||
# Show preview of changes
|
||||
preview = "The following duplicates will be removed:\n"
|
||||
for data in duplicates.values():
|
||||
best = data.get("keep")
|
||||
preview += f"\nKeeping: '{best.name}' (id: {best.id})"
|
||||
|
||||
# Third: Show a preview of the changes
|
||||
total_records_to_remove = 0
|
||||
preview = "The following records will be removed:\n"
|
||||
for data in records_to_prune.values():
|
||||
keep = data.get("keep")
|
||||
if keep:
|
||||
preview += f"\nKeeping: '{keep.name}' (id: {keep.id})"
|
||||
|
||||
for duplicate in data.get("delete"):
|
||||
preview += f"\nRemoving: '{duplicate.name}' (id: {duplicate.id})"
|
||||
total_records_to_remove += 1
|
||||
preview += "\n"
|
||||
|
||||
# Get confirmation and execute deletions
|
||||
# Fourth: Get user confirmation and execute deletions
|
||||
if TerminalHelper.prompt_for_execution(
|
||||
system_exit_on_terminate=True,
|
||||
prompt_message=preview,
|
||||
prompt_title="Clean up duplicate suborganizations?",
|
||||
verify_message="*** WARNING: This will delete suborganizations! ***"
|
||||
prompt_title=f"Remove {total_records_to_remove} suborganizations?",
|
||||
verify_message="*** WARNING: This will replace the record on DomainInformation and DomainRequest! ***",
|
||||
):
|
||||
# Update all references to point to the right suborg before deletion
|
||||
for record in duplicates.values():
|
||||
best_record = record.get("keep")
|
||||
delete_ids = [dupe.id for dupe in record.get("delete")]
|
||||
|
||||
# Update domain requests
|
||||
DomainRequest.objects.filter(
|
||||
sub_organization_id__in=delete_ids
|
||||
).update(sub_organization=best_record)
|
||||
|
||||
# Update domain information
|
||||
DomainInformation.objects.filter(
|
||||
sub_organization_id__in=delete_ids
|
||||
).update(sub_organization=best_record)
|
||||
try:
|
||||
# Update all references to point to the right suborg before deletion
|
||||
all_suborgs_to_remove = set()
|
||||
for record in records_to_prune.values():
|
||||
best_record = record["keep"]
|
||||
suborgs_to_remove = {dupe.id for dupe in record["delete"]}
|
||||
# Update domain requests
|
||||
DomainRequest.objects.filter(sub_organization_id__in=suborgs_to_remove).update(
|
||||
sub_organization=best_record
|
||||
)
|
||||
|
||||
records_to_delete = set(
|
||||
dupe.id
|
||||
for data in duplicates.values()
|
||||
for dupe in data["delete"]
|
||||
)
|
||||
return records_to_delete
|
||||
else:
|
||||
return set()
|
||||
# Update domain information
|
||||
DomainInformation.objects.filter(sub_organization_id__in=suborgs_to_remove).update(
|
||||
sub_organization=best_record
|
||||
)
|
||||
|
||||
all_suborgs_to_remove.update(suborgs_to_remove)
|
||||
delete_count, _ = Suborganization.objects.filter(id__in=all_suborgs_to_remove).delete()
|
||||
TerminalHelper.colorful_logger(
|
||||
logger.info, TerminalColors.MAGENTA, f"Successfully deleted {delete_count} suborganizations."
|
||||
)
|
||||
except Exception as e:
|
||||
TerminalHelper.colorful_logger(
|
||||
logger.error, TerminalColors.FAIL, f"Failed to delete suborganizations: {str(e)}"
|
||||
)
|
||||
|
|
|
@ -401,16 +401,15 @@ class TerminalHelper:
|
|||
# Allow the user to inspect the command string
|
||||
# and ask if they wish to proceed
|
||||
proceed_execution = TerminalHelper.query_yes_no_exit(
|
||||
f"""{TerminalColors.OKCYAN}
|
||||
=====================================================
|
||||
{prompt_title}
|
||||
=====================================================
|
||||
{verify_message}
|
||||
|
||||
{prompt_message}
|
||||
{TerminalColors.FAIL}
|
||||
Proceed? (Y = proceed, N = {action_description_for_selecting_no})
|
||||
{TerminalColors.ENDC}"""
|
||||
f"\n{TerminalColors.OKCYAN}"
|
||||
"====================================================="
|
||||
f"\n{prompt_title}\n"
|
||||
"====================================================="
|
||||
f"\n{verify_message}\n"
|
||||
f"\n{prompt_message}\n"
|
||||
f"{TerminalColors.FAIL}"
|
||||
f"Proceed? (Y = proceed, N = {action_description_for_selecting_no})"
|
||||
f"{TerminalColors.ENDC}"
|
||||
)
|
||||
|
||||
# If the user decided to proceed return true.
|
||||
|
|
|
@ -344,6 +344,7 @@ def value_of_attribute(obj, attribute_name: str):
|
|||
value = value()
|
||||
return value
|
||||
|
||||
|
||||
def normalize_string(string_to_normalize, lowercase=True):
|
||||
"""Normalizes a given string. Returns a string without extra spaces, in all lowercase."""
|
||||
if not isinstance(string_to_normalize, str):
|
||||
|
@ -352,3 +353,17 @@ def normalize_string(string_to_normalize, lowercase=True):
|
|||
|
||||
new_string = " ".join(string_to_normalize.split())
|
||||
return new_string.lower() if lowercase else new_string
|
||||
|
||||
|
||||
def count_capitals(text: str, leading_only: bool) -> int:
    """Counts capital letters in a string.

    Args:
        text (str): The string to analyze. Any non-string value
            (for example None) is treated as having no capitals.
        leading_only (bool): If False, counts all capital letters.
            If True, only counts capitals at the start of
            whitespace-separated words.

    Returns:
        int: Number of capital letters found (0 for empty or non-string input).
    """
    # Guard against non-string input instead of raising AttributeError/TypeError;
    # normalize_string in this module also type-checks its argument.
    if not isinstance(text, str):
        return 0

    if leading_only:
        # str.split() with no separator never yields empty strings,
        # so indexing word[0] is always safe here.
        return sum(1 for word in text.split() if word[0].isupper())

    return sum(1 for char in text if char.isupper())
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue