mirror of
https://github.com/cisagov/manage.get.gov.git
synced 2025-08-12 20:49:41 +02:00
Updated export/import scripts to split each table's dataset into CSV files of at most 10,000 rows
This commit is contained in:
parent
97ee855de6
commit
552e434096
2 changed files with 85 additions and 49 deletions
|
@ -1,6 +1,9 @@
|
||||||
|
import glob
|
||||||
import logging
|
import logging
|
||||||
|
import math
|
||||||
import os
|
import os
|
||||||
import pyzipper
|
import pyzipper
|
||||||
|
import tablib
|
||||||
from django.core.management import BaseCommand
|
from django.core.management import BaseCommand
|
||||||
import registrar.admin
|
import registrar.admin
|
||||||
|
|
||||||
|
@ -37,17 +40,21 @@ class Command(BaseCommand):
|
||||||
zip_filename = "tmp/exported_tables.zip"
|
zip_filename = "tmp/exported_tables.zip"
|
||||||
with pyzipper.AESZipFile(zip_filename, "w", compression=pyzipper.ZIP_DEFLATED) as zipf:
|
with pyzipper.AESZipFile(zip_filename, "w", compression=pyzipper.ZIP_DEFLATED) as zipf:
|
||||||
for table_name in table_names:
|
for table_name in table_names:
|
||||||
csv_filename = f"tmp/{table_name}.csv"
|
|
||||||
if os.path.exists(csv_filename):
|
|
||||||
zipf.write(csv_filename, os.path.basename(csv_filename))
|
|
||||||
logger.info(f"Added {csv_filename} to zip archive {zip_filename}")
|
|
||||||
|
|
||||||
# Remove the CSV files after adding them to the zip file
|
# Define the directory and the pattern
|
||||||
for table_name in table_names:
|
tmp_dir = 'tmp'
|
||||||
csv_filename = f"tmp/{table_name}.csv"
|
pattern = os.path.join(tmp_dir, f'{table_name}_*.csv')
|
||||||
if os.path.exists(csv_filename):
|
zip_file_path = os.path.join(tmp_dir, 'exported_files.zip')
|
||||||
os.remove(csv_filename)
|
|
||||||
logger.info(f"Removed temporary file {csv_filename}")
|
# Find all files that match the pattern
|
||||||
|
for file_path in glob.glob(pattern):
|
||||||
|
# Add each file to the zip archive
|
||||||
|
zipf.write(file_path, os.path.basename(file_path))
|
||||||
|
logger.info(f'Added {file_path} to {zip_file_path}')
|
||||||
|
|
||||||
|
# Remove the file after adding to zip
|
||||||
|
os.remove(file_path)
|
||||||
|
logger.info(f'Removed {file_path}')
|
||||||
|
|
||||||
def export_table(self, table_name):
|
def export_table(self, table_name):
|
||||||
"""Export a given table to a csv file in the tmp directory"""
|
"""Export a given table to a csv file in the tmp directory"""
|
||||||
|
@ -55,11 +62,36 @@ class Command(BaseCommand):
|
||||||
try:
|
try:
|
||||||
resourceclass = getattr(registrar.admin, resourcename)
|
resourceclass = getattr(registrar.admin, resourcename)
|
||||||
dataset = resourceclass().export()
|
dataset = resourceclass().export()
|
||||||
filename = f"tmp/{table_name}.csv"
|
if not isinstance(dataset, tablib.Dataset):
|
||||||
with open(filename, "w") as outputfile:
|
raise ValueError(f"Exported data from {resourcename} is not a tablib.Dataset")
|
||||||
outputfile.write(dataset.csv)
|
|
||||||
logger.info(f"Successfully exported {table_name} to {filename}")
|
# Determine the number of rows per file
|
||||||
except AttributeError:
|
rows_per_file = 10000
|
||||||
logger.error(f"Resource class {resourcename} not found in registrar.admin")
|
total_rows = len(dataset)
|
||||||
|
|
||||||
|
# Calculate the number of files needed
|
||||||
|
num_files = math.ceil(total_rows / rows_per_file)
|
||||||
|
logger.info(f'splitting {table_name} into {num_files} files')
|
||||||
|
|
||||||
|
# Split the dataset and export each chunk to a separate file
|
||||||
|
for i in range(num_files):
|
||||||
|
start_row = i * rows_per_file
|
||||||
|
end_row = start_row + rows_per_file
|
||||||
|
|
||||||
|
# Create a new dataset for the chunk
|
||||||
|
chunk = tablib.Dataset(headers=dataset.headers)
|
||||||
|
for row in dataset[start_row:end_row]:
|
||||||
|
chunk.append(row)
|
||||||
|
#chunk = dataset[start_row:end_row]
|
||||||
|
|
||||||
|
# Export the chunk to a new file
|
||||||
|
filename = f'tmp/{table_name}_{i + 1}.csv'
|
||||||
|
with open(filename, 'w') as f:
|
||||||
|
f.write(chunk.export('csv'))
|
||||||
|
|
||||||
|
logger.info(f'Successfully exported {table_name} into {num_files} files.')
|
||||||
|
|
||||||
|
except AttributeError as ae:
|
||||||
|
logger.error(f"Resource class {resourcename} not found in registrar.admin: {ae}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to export {table_name}: {e}")
|
logger.error(f"Failed to export {table_name}: {e}")
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import argparse
|
import argparse
|
||||||
|
import glob
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import pyzipper
|
import pyzipper
|
||||||
|
@ -64,11 +65,6 @@ class Command(BaseCommand):
|
||||||
"""Import data from a CSV file into the given table"""
|
"""Import data from a CSV file into the given table"""
|
||||||
|
|
||||||
resourcename = f"{table_name}Resource"
|
resourcename = f"{table_name}Resource"
|
||||||
csv_filename = f"tmp/{table_name}.csv"
|
|
||||||
try:
|
|
||||||
if not os.path.exists(csv_filename):
|
|
||||||
logger.error(f"CSV file {csv_filename} not found.")
|
|
||||||
return
|
|
||||||
|
|
||||||
# if table_name is Contact, clean the table first
|
# if table_name is Contact, clean the table first
|
||||||
# User table is loaded before Contact, and signals create
|
# User table is loaded before Contact, and signals create
|
||||||
|
@ -77,8 +73,16 @@ class Command(BaseCommand):
|
||||||
if table_name == "Contact":
|
if table_name == "Contact":
|
||||||
self.clean_table(table_name)
|
self.clean_table(table_name)
|
||||||
|
|
||||||
|
# Define the directory and the pattern for csv filenames
|
||||||
|
tmp_dir = 'tmp'
|
||||||
|
pattern = os.path.join(tmp_dir, f'{table_name}_*.csv')
|
||||||
|
|
||||||
resourceclass = getattr(registrar.admin, resourcename)
|
resourceclass = getattr(registrar.admin, resourcename)
|
||||||
resource_instance = resourceclass()
|
resource_instance = resourceclass()
|
||||||
|
|
||||||
|
# Find all files that match the pattern
|
||||||
|
for csv_filename in glob.glob(pattern):
|
||||||
|
try:
|
||||||
with open(csv_filename, "r") as csvfile:
|
with open(csv_filename, "r") as csvfile:
|
||||||
dataset = tablib.Dataset().load(csvfile.read(), format="csv")
|
dataset = tablib.Dataset().load(csvfile.read(), format="csv")
|
||||||
result = resource_instance.import_data(dataset, dry_run=False, skip_epp_save=self.skip_epp_save)
|
result = resource_instance.import_data(dataset, dry_run=False, skip_epp_save=self.skip_epp_save)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue