mirror of
https://github.com/cisagov/manage.get.gov.git
synced 2025-07-31 06:56:33 +02:00
Add suborg data and clean duplicate script
This commit is contained in:
parent
1e12459c10
commit
0118e1f00d
2 changed files with 139 additions and 7 deletions
123
src/registrar/management/commands/clean_duplicate_suborgs.py
Normal file
123
src/registrar/management/commands/clean_duplicate_suborgs.py
Normal file
|
@ -0,0 +1,123 @@
|
|||
import logging
|
||||
from django.core.management import BaseCommand
|
||||
from registrar.models import Suborganization, DomainRequest, DomainInformation
|
||||
from registrar.management.commands.utility.terminal_helper import TerminalColors, TerminalHelper
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Merge suborganizations whose names differ only by whitespace and capitalization.

    Suborganizations are grouped by a normalized name (runs of whitespace collapsed,
    lowercased). Within a group, one record is kept and the others are deleted after
    every DomainRequest and DomainInformation that referenced them has been repointed
    at the kept record. Nothing is changed until the operator confirms the preview.
    """

    help = "Clean up duplicate suborganizations that differ only by spaces and capitalization"

    # A record is only treated as a duplicate when all of these fields equal those of
    # the record currently being kept; otherwise it is skipped for manual inspection.
    FIELDS_TO_COMPARE = ("portfolio", "city", "state_territory")

    def handle(self, **kwargs):
        """Find duplicate suborganizations, show a preview, and delete them on confirmation."""
        # Imported locally so this block does not depend on a change to the module's import list.
        from django.db import transaction

        duplicates = self._find_duplicates(Suborganization.objects.all())
        if not duplicates:
            logger.info("No duplicate suborganizations found.")
            return

        # Get confirmation before touching any data
        confirmed = TerminalHelper.prompt_for_execution(
            system_exit_on_terminate=True,
            prompt_message=self._build_preview(duplicates),
            prompt_title="Clean up duplicate suborganizations?",
            verify_message="*** WARNING: This will delete suborganizations! ***",
        )
        if not confirmed:
            return

        try:
            # Repointing references and deleting must succeed or fail together:
            # a partial run would leave some groups merged and others not.
            with transaction.atomic():
                ids_to_delete = []
                for record in duplicates.values():
                    best_record = record["keep"]
                    delete_ids = [dupe.id for dupe in record["delete"]]

                    # Update all references to point to the kept suborg before deletion
                    DomainRequest.objects.filter(sub_organization_id__in=delete_ids).update(
                        sub_organization=best_record
                    )
                    DomainInformation.objects.filter(sub_organization_id__in=delete_ids).update(
                        sub_organization=best_record
                    )
                    ids_to_delete.extend(delete_ids)

                # Bulk delete all duplicates
                delete_count, _ = Suborganization.objects.filter(id__in=ids_to_delete).delete()
        except Exception as e:
            # Top-level boundary of the command: report the failure (with traceback) instead of crashing.
            logger.error(
                f"{TerminalColors.FAIL}Failed to clean up suborganizations: {str(e)}{TerminalColors.ENDC}",
                exc_info=True,
            )
        else:
            logger.info(
                f"{TerminalColors.OKGREEN}Successfully deleted {delete_count} suborganizations{TerminalColors.ENDC}"
            )

    def _find_duplicates(self, all_suborgs):
        """Group suborganizations by normalized name and pick one record to keep per group.

        Returns a dict mapping normalized name -> {"keep": suborg, "delete": [suborg, ...]},
        containing only the groups that have at least one record to delete.
        """
        duplicates = {}
        for suborg in all_suborgs:
            normalized_name = self._normalize_name(suborg.name)

            # First occurrence of this name
            if normalized_name not in duplicates:
                duplicates[normalized_name] = {"keep": suborg, "delete": []}
                continue

            group = duplicates[normalized_name]
            current_best = group["keep"]

            # Check if all other fields match.
            # If they don't, we should inspect this record manually.
            # NOTE(review): a mismatching record is only ever compared against the current
            # "keep" record, so duplicates among the skipped records are not detected here.
            fields_match = all(
                getattr(suborg, field) == getattr(current_best, field) for field in self.FIELDS_TO_COMPARE
            )
            if not fields_match:
                logger.warning(
                    f"{TerminalColors.YELLOW}"
                    f"\nSkipping potential duplicate: {suborg.name} (id: {suborg.id})"
                    f"\nData mismatch with {current_best.name} (id: {current_best.id})"
                    f"{TerminalColors.ENDC}"
                )
                continue

            if self._is_better_name(suborg.name, current_best.name):
                # New suborg is better - demote the old one to the delete list
                group["delete"].append(current_best)
                group["keep"] = suborg
            else:
                # New suborg is not better - keep the current best and delete the new one
                group["delete"].append(suborg)

        # Filter out entries without duplicates
        return {name: group for name, group in duplicates.items() if group["delete"]}

    @staticmethod
    def _normalize_name(name):
        """Return the name with runs of whitespace collapsed to single spaces, lowercased."""
        return " ".join(name.split()).lower()

    @staticmethod
    def _is_better_name(candidate, current):
        """Return True if candidate has fewer spaces or more capital letters than current."""
        # TODO: consider also comparing the number of words whose first letter is capitalized.
        has_fewer_spaces = candidate.count(" ") < current.count(" ")
        has_more_capitals = sum(1 for c in candidate if c.isupper()) > sum(1 for c in current if c.isupper())
        return has_fewer_spaces or has_more_capitals

    @staticmethod
    def _build_preview(duplicates):
        """Return the confirmation text listing which records are kept and which are removed."""
        parts = ["The following duplicates will be removed:\n"]
        for data in duplicates.values():
            best = data["keep"]
            parts.append(f"\nKeeping: '{best.name}' (id: {best.id})")
            parts.extend(f"\nRemoving: '{duplicate.name}' (id: {duplicate.id})" for duplicate in data["delete"])
            parts.append("\n")
        return "".join(parts)
|
|
@ -104,7 +104,11 @@ class Command(BaseCommand):
|
|||
also create new suborganizations"""
|
||||
portfolio, created = self.create_portfolio(federal_agency)
|
||||
if created:
|
||||
self.create_suborganizations(portfolio, federal_agency)
|
||||
valid_agencies = DomainInformation.objects.filter(
|
||||
federal_agency=federal_agency, organization_name__isnull=False
|
||||
)
|
||||
org_names = set(valid_agencies.values_list("organization_name", flat=True))
|
||||
self.create_suborganizations(portfolio, federal_agency, org_names)
|
||||
if parse_domains or both:
|
||||
self.handle_portfolio_domains(portfolio, federal_agency)
|
||||
|
||||
|
@ -155,13 +159,8 @@ class Command(BaseCommand):
|
|||
|
||||
return portfolio, True
|
||||
|
||||
def create_suborganizations(self, portfolio: Portfolio, federal_agency: FederalAgency):
|
||||
def create_suborganizations(self, portfolio: Portfolio, federal_agency: FederalAgency, org_names: set):
|
||||
"""Create Suborganizations tied to the given portfolio based on DomainInformation objects"""
|
||||
valid_agencies = DomainInformation.objects.filter(
|
||||
federal_agency=federal_agency, organization_name__isnull=False
|
||||
)
|
||||
org_names = set(valid_agencies.values_list("organization_name", flat=True))
|
||||
|
||||
if not org_names:
|
||||
message = (
|
||||
"Could not add any suborganizations."
|
||||
|
@ -232,6 +231,16 @@ class Command(BaseCommand):
|
|||
domain_request.portfolio = portfolio
|
||||
if domain_request.organization_name in suborgs:
|
||||
domain_request.sub_organization = suborgs.get(domain_request.organization_name)
|
||||
else:
|
||||
# Fill in the requesting suborg fields if we have the data to do so
|
||||
if domain_request.organization_name and domain_request.city and domain_request.state_territory:
|
||||
domain_request.requested_suborganization = domain_request.organization_name
|
||||
domain_request.suborganization_city = domain_request.city
|
||||
domain_request.suborganization_state_territory = domain_request.state_territory
|
||||
else:
|
||||
message = f"No suborganization data found whatsoever for {domain_request}."
|
||||
TerminalHelper.colorful_logger(logger.warning, TerminalColors.YELLOW, message)
|
||||
|
||||
self.updated_portfolios.add(portfolio)
|
||||
|
||||
DomainRequest.objects.bulk_update(domain_requests, ["portfolio", "sub_organization"])
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue