updated scripts to break datasets into max of 10000 rows

David Kennedy 2024-06-13 09:42:33 -04:00
parent 97ee855de6
commit 552e434096
2 changed files with 85 additions and 49 deletions
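Concretely, each exported table is now written as numbered CSV chunks of at most 10,000 rows instead of one file per table. A quick sketch of the resulting arithmetic and naming scheme (the 25,000-row figure and the Contact table name are invented for illustration):

import math

rows_per_file = 10000
total_rows = 25000  # hypothetical row count, for illustration only
num_files = math.ceil(total_rows / rows_per_file)  # ceil(2.5) -> 3 files
filenames = [f"Contact_{i + 1}.csv" for i in range(num_files)]
# -> ['Contact_1.csv', 'Contact_2.csv', 'Contact_3.csv']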

View file

@@ -1,6 +1,9 @@
+import glob
 import logging
+import math
 import os
 import pyzipper
+import tablib
 from django.core.management import BaseCommand
 import registrar.admin
@@ -37,17 +40,21 @@ class Command(BaseCommand):
         zip_filename = "tmp/exported_tables.zip"
         with pyzipper.AESZipFile(zip_filename, "w", compression=pyzipper.ZIP_DEFLATED) as zipf:
             for table_name in table_names:
-                csv_filename = f"tmp/{table_name}.csv"
-                if os.path.exists(csv_filename):
-                    zipf.write(csv_filename, os.path.basename(csv_filename))
-                    logger.info(f"Added {csv_filename} to zip archive {zip_filename}")
-        # Remove the CSV files after adding them to the zip file
-        for table_name in table_names:
-            csv_filename = f"tmp/{table_name}.csv"
-            if os.path.exists(csv_filename):
-                os.remove(csv_filename)
-                logger.info(f"Removed temporary file {csv_filename}")
+                # Define the directory and the pattern
+                tmp_dir = 'tmp'
+                pattern = os.path.join(tmp_dir, f'{table_name}_*.csv')
+                zip_file_path = os.path.join(tmp_dir, 'exported_files.zip')
+                # Find all files that match the pattern
+                for file_path in glob.glob(pattern):
+                    # Add each file to the zip archive
+                    zipf.write(file_path, os.path.basename(file_path))
+                    logger.info(f'Added {file_path} to {zip_file_path}')
+                    # Remove the file after adding to zip
+                    os.remove(file_path)
+                    logger.info(f'Removed {file_path}')

     def export_table(self, table_name):
         """Export a given table to a csv file in the tmp directory"""
@@ -55,11 +62,36 @@ class Command(BaseCommand):
         try:
             resourceclass = getattr(registrar.admin, resourcename)
             dataset = resourceclass().export()
-            filename = f"tmp/{table_name}.csv"
-            with open(filename, "w") as outputfile:
-                outputfile.write(dataset.csv)
-            logger.info(f"Successfully exported {table_name} to {filename}")
-        except AttributeError:
-            logger.error(f"Resource class {resourcename} not found in registrar.admin")
+            if not isinstance(dataset, tablib.Dataset):
+                raise ValueError(f"Exported data from {resourcename} is not a tablib.Dataset")
+            # Determine the number of rows per file
+            rows_per_file = 10000
+            total_rows = len(dataset)
+            # Calculate the number of files needed
+            num_files = math.ceil(total_rows / rows_per_file)
+            logger.info(f'splitting {table_name} into {num_files} files')
+            # Split the dataset and export each chunk to a separate file
+            for i in range(num_files):
+                start_row = i * rows_per_file
+                end_row = start_row + rows_per_file
+                # Create a new dataset for the chunk
+                chunk = tablib.Dataset(headers=dataset.headers)
+                for row in dataset[start_row:end_row]:
+                    chunk.append(row)
+                # chunk = dataset[start_row:end_row]
+                # Export the chunk to a new file
+                filename = f'tmp/{table_name}_{i + 1}.csv'
+                with open(filename, 'w') as f:
+                    f.write(chunk.export('csv'))
+            logger.info(f'Successfully exported {table_name} into {num_files} files.')
+        except AttributeError as ae:
+            logger.error(f"Resource class {resourcename} not found in registrar.admin: {ae}")
         except Exception as e:
             logger.error(f"Failed to export {table_name}: {e}")

View file

@@ -1,4 +1,5 @@
 import argparse
+import glob
 import logging
 import os
 import pyzipper
@@ -64,43 +65,46 @@ class Command(BaseCommand):
         """Import data from a CSV file into the given table"""
         resourcename = f"{table_name}Resource"
-        csv_filename = f"tmp/{table_name}.csv"
-        try:
-            if not os.path.exists(csv_filename):
-                logger.error(f"CSV file {csv_filename} not found.")
-                return
-            # if table_name is Contact, clean the table first
-            # User table is loaded before Contact, and signals create
-            # rows in Contact table which break the import, so need
-            # to be cleaned again before running import on Contact table
-            if table_name == "Contact":
-                self.clean_table(table_name)
-            resourceclass = getattr(registrar.admin, resourcename)
-            resource_instance = resourceclass()
-            with open(csv_filename, "r") as csvfile:
-                dataset = tablib.Dataset().load(csvfile.read(), format="csv")
-            result = resource_instance.import_data(dataset, dry_run=False, skip_epp_save=self.skip_epp_save)
-            if result.has_errors():
-                logger.error(f"Errors occurred while importing {csv_filename}:")
-                for row_error in result.row_errors():
-                    row_index = row_error[0]
-                    errors = row_error[1]
-                    for error in errors:
-                        logger.error(f"Row {row_index} - {error.error} - {error.row}")
-            else:
-                logger.info(f"Successfully imported {csv_filename} into {table_name}")
-        except AttributeError:
-            logger.error(f"Resource class {resourcename} not found in registrar.admin")
-        except Exception as e:
-            logger.error(f"Failed to import {csv_filename}: {e}")
-        finally:
-            if os.path.exists(csv_filename):
-                os.remove(csv_filename)
-                logger.info(f"Removed temporary file {csv_filename}")
+        # if table_name is Contact, clean the table first
+        # User table is loaded before Contact, and signals create
+        # rows in Contact table which break the import, so need
+        # to be cleaned again before running import on Contact table
+        if table_name == "Contact":
+            self.clean_table(table_name)
+        resourceclass = getattr(registrar.admin, resourcename)
+        resource_instance = resourceclass()
+        # Define the directory and the pattern for csv filenames
+        tmp_dir = 'tmp'
+        pattern = os.path.join(tmp_dir, f'{table_name}_*.csv')
+        # Find all files that match the pattern
+        for csv_filename in glob.glob(pattern):
+            try:
+                with open(csv_filename, "r") as csvfile:
+                    dataset = tablib.Dataset().load(csvfile.read(), format="csv")
+                result = resource_instance.import_data(dataset, dry_run=False, skip_epp_save=self.skip_epp_save)
+                if result.has_errors():
+                    logger.error(f"Errors occurred while importing {csv_filename}:")
+                    for row_error in result.row_errors():
+                        row_index = row_error[0]
+                        errors = row_error[1]
+                        for error in errors:
+                            logger.error(f"Row {row_index} - {error.error} - {error.row}")
+                else:
+                    logger.info(f"Successfully imported {csv_filename} into {table_name}")
+            except AttributeError:
+                logger.error(f"Resource class {resourcename} not found in registrar.admin")
+            except Exception as e:
+                logger.error(f"Failed to import {csv_filename}: {e}")
+            finally:
+                if os.path.exists(csv_filename):
+                    os.remove(csv_filename)
+                    logger.info(f"Removed temporary file {csv_filename}")

     def clean_table(self, table_name):
         """Delete all rows in the given table"""