Mirror of https://github.com/cisagov/manage.get.gov.git, synced 2025-08-03 00:12:16 +02:00
updated scripts to break datasets into a max of 10000 rows

parent 97ee855de6
commit 552e434096
2 changed files with 85 additions and 49 deletions
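
The split is plain ceiling division: a table with N rows is exported as ceil(N / 10000) numbered CSV files. A minimal sketch of that arithmetic (the sample row counts are illustrative, not from the commit):

import math

rows_per_file = 10000

# e.g. 25000 rows -> 3 files (10000 + 10000 + 5000); 0 rows -> 0 files
for total_rows in (0, 9999, 10000, 25000):
    num_files = math.ceil(total_rows / rows_per_file)
    print(f"{total_rows} rows -> {num_files} file(s)")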
@@ -1,6 +1,9 @@
+import glob
 import logging
+import math
 import os
 import pyzipper
+import tablib
 from django.core.management import BaseCommand
 
 import registrar.admin
@@ -37,17 +40,21 @@ class Command(BaseCommand):
         zip_filename = "tmp/exported_tables.zip"
         with pyzipper.AESZipFile(zip_filename, "w", compression=pyzipper.ZIP_DEFLATED) as zipf:
             for table_name in table_names:
-                csv_filename = f"tmp/{table_name}.csv"
-                if os.path.exists(csv_filename):
-                    zipf.write(csv_filename, os.path.basename(csv_filename))
-                    logger.info(f"Added {csv_filename} to zip archive {zip_filename}")
-
-        # Remove the CSV files after adding them to the zip file
-        for table_name in table_names:
-            csv_filename = f"tmp/{table_name}.csv"
-            if os.path.exists(csv_filename):
-                os.remove(csv_filename)
-                logger.info(f"Removed temporary file {csv_filename}")
+                # Define the directory and the pattern
+                tmp_dir = 'tmp'
+                pattern = os.path.join(tmp_dir, f'{table_name}_*.csv')
+                zip_file_path = os.path.join(tmp_dir, 'exported_files.zip')
+
+                # Find all files that match the pattern
+                for file_path in glob.glob(pattern):
+                    # Add each file to the zip archive
+                    zipf.write(file_path, os.path.basename(file_path))
+                    logger.info(f'Added {file_path} to {zip_file_path}')
+
+                    # Remove the file after adding to zip
+                    os.remove(file_path)
+                    logger.info(f'Removed {file_path}')
 
     def export_table(self, table_name):
         """Export a given table to a csv file in the tmp directory"""
@@ -55,11 +62,36 @@ class Command(BaseCommand):
         try:
             resourceclass = getattr(registrar.admin, resourcename)
             dataset = resourceclass().export()
-            filename = f"tmp/{table_name}.csv"
-            with open(filename, "w") as outputfile:
-                outputfile.write(dataset.csv)
-            logger.info(f"Successfully exported {table_name} to {filename}")
-        except AttributeError:
-            logger.error(f"Resource class {resourcename} not found in registrar.admin")
+            if not isinstance(dataset, tablib.Dataset):
+                raise ValueError(f"Exported data from {resourcename} is not a tablib.Dataset")
+
+            # Determine the number of rows per file
+            rows_per_file = 10000
+            total_rows = len(dataset)
+
+            # Calculate the number of files needed
+            num_files = math.ceil(total_rows / rows_per_file)
+            logger.info(f'splitting {table_name} into {num_files} files')
+
+            # Split the dataset and export each chunk to a separate file
+            for i in range(num_files):
+                start_row = i * rows_per_file
+                end_row = start_row + rows_per_file
+
+                # Create a new dataset for the chunk
+                chunk = tablib.Dataset(headers=dataset.headers)
+                for row in dataset[start_row:end_row]:
+                    chunk.append(row)
+
+                # Export the chunk to a new file
+                filename = f'tmp/{table_name}_{i + 1}.csv'
+                with open(filename, 'w') as f:
+                    f.write(chunk.export('csv'))
+
+            logger.info(f'Successfully exported {table_name} into {num_files} files.')
+
+        except AttributeError as ae:
+            logger.error(f"Resource class {resourcename} not found in registrar.admin: {ae}")
+        except Exception as e:
+            logger.error(f"Failed to export {table_name}: {e}")
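
The copy loop above exists because slicing a tablib.Dataset returns a plain list of row tuples rather than a new Dataset, so each chunk has to be rebuilt with its headers. The same logic factored into a reusable generator (a sketch; iter_chunks is a hypothetical helper, not in the codebase):

import math

import tablib

def iter_chunks(dataset, rows_per_file=10000):
    """Yield successive tablib.Dataset chunks of at most rows_per_file rows."""
    num_files = math.ceil(len(dataset) / rows_per_file)
    for i in range(num_files):
        chunk = tablib.Dataset(headers=dataset.headers)
        # Slicing a Dataset yields row tuples, so re-append them to a fresh Dataset
        for row in dataset[i * rows_per_file:(i + 1) * rows_per_file]:
            chunk.append(row)
        yield chunk

# Usage: 25 rows with a chunk size of 10 -> chunks of 10, 10, and 5 rows
data = tablib.Dataset(headers=["id", "name"])
for n in range(25):
    data.append((n, f"row-{n}"))
for i, chunk in enumerate(iter_chunks(data, rows_per_file=10), start=1):
    print(f"chunk {i}: {len(chunk)} rows")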

@@ -1,4 +1,5 @@
 import argparse
+import glob
 import logging
 import os
 import pyzipper
@@ -64,43 +65,46 @@ class Command(BaseCommand):
         """Import data from a CSV file into the given table"""
 
         resourcename = f"{table_name}Resource"
-        csv_filename = f"tmp/{table_name}.csv"
-        try:
-            if not os.path.exists(csv_filename):
-                logger.error(f"CSV file {csv_filename} not found.")
-                return
-
-            # if table_name is Contact, clean the table first
-            # User table is loaded before Contact, and signals create
-            # rows in Contact table which break the import, so need
-            # to be cleaned again before running import on Contact table
-            if table_name == "Contact":
-                self.clean_table(table_name)
+
+        # if table_name is Contact, clean the table first
+        # User table is loaded before Contact, and signals create
+        # rows in Contact table which break the import, so need
+        # to be cleaned again before running import on Contact table
+        if table_name == "Contact":
+            self.clean_table(table_name)
 
-            resourceclass = getattr(registrar.admin, resourcename)
-            resource_instance = resourceclass()
-            with open(csv_filename, "r") as csvfile:
-                dataset = tablib.Dataset().load(csvfile.read(), format="csv")
-            result = resource_instance.import_data(dataset, dry_run=False, skip_epp_save=self.skip_epp_save)
+        # Define the directory and the pattern for csv filenames
+        tmp_dir = 'tmp'
+        pattern = os.path.join(tmp_dir, f'{table_name}_*.csv')
 
-            if result.has_errors():
-                logger.error(f"Errors occurred while importing {csv_filename}:")
-                for row_error in result.row_errors():
-                    row_index = row_error[0]
-                    errors = row_error[1]
-                    for error in errors:
-                        logger.error(f"Row {row_index} - {error.error} - {error.row}")
-            else:
-                logger.info(f"Successfully imported {csv_filename} into {table_name}")
+        resourceclass = getattr(registrar.admin, resourcename)
+        resource_instance = resourceclass()
 
-        except AttributeError:
-            logger.error(f"Resource class {resourcename} not found in registrar.admin")
-        except Exception as e:
-            logger.error(f"Failed to import {csv_filename}: {e}")
-        finally:
-            if os.path.exists(csv_filename):
-                os.remove(csv_filename)
-                logger.info(f"Removed temporary file {csv_filename}")
+        # Find all files that match the pattern
+        for csv_filename in glob.glob(pattern):
+            try:
+                with open(csv_filename, "r") as csvfile:
+                    dataset = tablib.Dataset().load(csvfile.read(), format="csv")
+                result = resource_instance.import_data(dataset, dry_run=False, skip_epp_save=self.skip_epp_save)
+
+                if result.has_errors():
+                    logger.error(f"Errors occurred while importing {csv_filename}:")
+                    for row_error in result.row_errors():
+                        row_index = row_error[0]
+                        errors = row_error[1]
+                        for error in errors:
+                            logger.error(f"Row {row_index} - {error.error} - {error.row}")
+                else:
+                    logger.info(f"Successfully imported {csv_filename} into {table_name}")
+
+            except AttributeError:
+                logger.error(f"Resource class {resourcename} not found in registrar.admin")
+            except Exception as e:
+                logger.error(f"Failed to import {csv_filename}: {e}")
+            finally:
+                if os.path.exists(csv_filename):
+                    os.remove(csv_filename)
+                    logger.info(f"Removed temporary file {csv_filename}")
 
     def clean_table(self, table_name):
         """Delete all rows in the given table"""
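
One caveat in the import loop above: glob.glob returns matches in an arbitrary, OS-dependent order, so chunk files may be processed out of sequence. Import order does not matter for correctness here, but if deterministic ordering is wanted, the matches can be sorted by their numeric suffix. A sketch, assuming the tmp/{table_name}_{i}.csv naming used by the export script ("Domain" is an illustrative table name):

import glob
import os
import re

pattern = os.path.join("tmp", "Domain_*.csv")

def chunk_index(path):
    """Extract the trailing chunk number from names like tmp/Domain_12.csv."""
    match = re.search(r"_(\d+)\.csv$", path)
    return int(match.group(1)) if match else 0

for csv_filename in sorted(glob.glob(pattern), key=chunk_index):
    print(f"would import {csv_filename}")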