Add suborg data and clean duplicate script

This commit is contained in:
zandercymatics 2025-01-06 11:36:44 -07:00
parent 1e12459c10
commit 0118e1f00d
No known key found for this signature in database
GPG key ID: FF4636ABEC9682B7
2 changed files with 139 additions and 7 deletions

View file

@ -0,0 +1,123 @@
import logging
from django.core.management import BaseCommand
from registrar.models import Suborganization, DomainRequest, DomainInformation
from registrar.management.commands.utility.terminal_helper import TerminalColors, TerminalHelper
logger = logging.getLogger(__name__)
class Command(BaseCommand):
    """Merge suborganizations whose names differ only by whitespace and letter case.

    Suborganizations are grouped by a normalized name (whitespace runs collapsed
    to one space, lowercased). Within each group a single record is kept; the
    others are deleted after every DomainRequest and DomainInformation that
    points at them has been re-pointed to the kept record. Records whose
    portfolio, city, or state_territory differ from the kept record are never
    deleted automatically; they are logged for manual inspection instead.
    """

    help = "Clean up duplicate suborganizations that differ only by spaces and capitalization"

    # Besides the normalized name, these fields must be equal before two
    # records are treated as the same suborganization.
    _FIELDS_TO_COMPARE = ("portfolio", "city", "state_territory")

    def handle(self, **kwargs):
        """Find duplicates, show a preview, and remove them once the operator confirms.

        Args:
            **kwargs: Command options supplied by Django; none are read here.
        """
        duplicates = self._find_duplicates(Suborganization.objects.all())
        if not duplicates:
            logger.info("No duplicate suborganizations found.")
            return

        # Nothing is written unless the prompt returns a truthy value.
        # NOTE(review): with system_exit_on_terminate=True the helper presumably
        # exits the process when the operator declines - confirm in TerminalHelper.
        confirmed = TerminalHelper.prompt_for_execution(
            system_exit_on_terminate=True,
            prompt_message=self._build_preview(duplicates),
            prompt_title="Clean up duplicate suborganizations?",
            verify_message="*** WARNING: This will delete suborganizations! ***",
        )
        if confirmed:
            self._merge_and_delete(duplicates)

    def _find_duplicates(self, suborgs):
        """Group suborganizations by normalized name and pick one record to keep per group.

        Args:
            suborgs: Iterable of Suborganization objects to inspect.

        Returns:
            dict mapping normalized name -> {"keep": Suborganization, "delete": [Suborganization, ...]},
            restricted to the groups that have at least one record to delete.
        """
        groups = {}
        for suborg in suborgs:
            # Collapse runs of whitespace and ignore case so that e.g.
            # "Test  Org" and "test org" land in the same group.
            normalized_name = " ".join(suborg.name.split()).lower()

            group = groups.get(normalized_name)
            if group is None:
                # First occurrence of this name: it is the record to keep for now.
                groups[normalized_name] = {"keep": suborg, "delete": []}
                continue

            current_best = group["keep"]

            # If the other fields differ, this may not be a true duplicate;
            # leave it alone so it can be inspected manually.
            fields_match = all(
                getattr(suborg, field) == getattr(current_best, field) for field in self._FIELDS_TO_COMPARE
            )
            if not fields_match:
                logger.warning(
                    f"{TerminalColors.YELLOW}"
                    f"\nSkipping potential duplicate: {suborg.name} (id: {suborg.id})"
                    f"\nData mismatch with {current_best.name} (id: {current_best.id})"
                    f"{TerminalColors.ENDC}"
                )
                continue

            if self._is_better_name(suborg.name, current_best.name):
                # The new record is better - demote the current best to the delete list.
                group["delete"].append(current_best)
                group["keep"] = suborg
            else:
                # The new record is not better - keep the current best and delete the new one.
                group["delete"].append(suborg)

        # Drop the groups that turned out to have nothing to delete.
        return {name: group for name, group in groups.items() if group["delete"]}

    @staticmethod
    def _is_better_name(candidate: str, current: str) -> bool:
        """Return True when `candidate` should replace `current` as the name to keep.

        A candidate wins when it has fewer spaces OR more capital letters.

        NOTE: because the two criteria are OR-ed, they can disagree (fewer
        spaces but also fewer capitals), in which case the outcome depends on
        the order in which the records are visited.

        TODO: consider counting words whose first letter is capitalized rather
        than the raw number of capital letters.
        """
        has_fewer_spaces = candidate.count(" ") < current.count(" ")
        candidate_capitals = sum(1 for char in candidate if char.isupper())
        current_capitals = sum(1 for char in current if char.isupper())
        return has_fewer_spaces or candidate_capitals > current_capitals

    @staticmethod
    def _build_preview(duplicates) -> str:
        """Return the text shown to the operator: which record is kept and which are removed per group."""
        parts = ["The following duplicates will be removed:\n"]
        for group in duplicates.values():
            best = group["keep"]
            parts.append(f"\nKeeping: '{best.name}' (id: {best.id})")
            parts.extend(f"\nRemoving: '{duplicate.name}' (id: {duplicate.id})" for duplicate in group["delete"])
            parts.append("\n")
        return "".join(parts)

    def _merge_and_delete(self, duplicates):
        """Re-point references to the kept records, then delete the duplicates.

        All writes run inside one transaction so that a failure part-way
        through leaves the database untouched instead of half-merged. Errors
        are logged rather than re-raised.
        """
        # Imported locally so this command does not depend on a module-level import.
        from django.db import transaction

        try:
            with transaction.atomic():
                ids_to_delete = []
                for group in duplicates.values():
                    best_record = group["keep"]
                    delete_ids = [dupe.id for dupe in group["delete"]]

                    # Update all references to point to the kept suborg before deletion.
                    DomainRequest.objects.filter(sub_organization_id__in=delete_ids).update(
                        sub_organization=best_record
                    )
                    DomainInformation.objects.filter(sub_organization_id__in=delete_ids).update(
                        sub_organization=best_record
                    )
                    ids_to_delete.extend(delete_ids)

                # Bulk delete all duplicates.
                delete_count, _ = Suborganization.objects.filter(id__in=ids_to_delete).delete()

            # Only reached once the transaction has committed.
            logger.info(
                f"{TerminalColors.OKGREEN}"
                f"Successfully deleted {delete_count} suborganizations"
                f"{TerminalColors.ENDC}"
            )
        except Exception as e:
            # Caught outside the atomic block, so the rollback has already happened.
            logger.error(
                f"{TerminalColors.FAIL}"
                f"Failed to clean up suborganizations: {str(e)}"
                f"{TerminalColors.ENDC}"
            )

View file

@ -104,7 +104,11 @@ class Command(BaseCommand):
also create new suborganizations"""
portfolio, created = self.create_portfolio(federal_agency)
if created:
self.create_suborganizations(portfolio, federal_agency)
valid_agencies = DomainInformation.objects.filter(
federal_agency=federal_agency, organization_name__isnull=False
)
org_names = set(valid_agencies.values_list("organization_name", flat=True))
self.create_suborganizations(portfolio, federal_agency, org_names)
if parse_domains or both:
self.handle_portfolio_domains(portfolio, federal_agency)
@ -155,13 +159,8 @@ class Command(BaseCommand):
return portfolio, True
def create_suborganizations(self, portfolio: Portfolio, federal_agency: FederalAgency):
def create_suborganizations(self, portfolio: Portfolio, federal_agency: FederalAgency, org_names: set):
"""Create Suborganizations tied to the given portfolio based on DomainInformation objects"""
valid_agencies = DomainInformation.objects.filter(
federal_agency=federal_agency, organization_name__isnull=False
)
org_names = set(valid_agencies.values_list("organization_name", flat=True))
if not org_names:
message = (
"Could not add any suborganizations."
@ -232,6 +231,16 @@ class Command(BaseCommand):
domain_request.portfolio = portfolio
if domain_request.organization_name in suborgs:
domain_request.sub_organization = suborgs.get(domain_request.organization_name)
else:
# Fill in the requesting suborg fields if we have the data to do so
if domain_request.organization_name and domain_request.city and domain_request.state_territory:
domain_request.requested_suborganization = domain_request.organization_name
domain_request.suborganization_city = domain_request.city
domain_request.suborganization_state_territory = domain_request.state_territory
else:
message = f"No suborganization data found whatsoever for {domain_request}."
TerminalHelper.colorful_logger(logger.warning, TerminalColors.YELLOW, message)
self.updated_portfolios.add(portfolio)
DomainRequest.objects.bulk_update(domain_requests, ["portfolio", "sub_organization"])