colouring-montreal/maintenance/extract_data/extract_data.py

119 lines
3.1 KiB
Python
Raw Normal View History

2019-08-29 08:17:09 -04:00
#!/usr/bin/env python3
import csv
import datetime
from io import StringIO
import os
from pathlib import Path
import zipfile
import psycopg2
class ZipFileExistsError(Exception):
    """Raised when an extract archive already exists at the target path."""
2019-08-29 08:17:09 -04:00
def get_connection():
    """Open a database connection configured from the environment.

    Reads PGHOST, PGDATABASE, PGUSER and PGPASSWORD from ``os.environ`` and
    passes them to ``psycopg2.connect``. A missing variable raises KeyError.
    """
    env = os.environ
    settings = {
        'host': env['PGHOST'],
        'dbname': env['PGDATABASE'],
        'user': env['PGUSER'],
        'password': env['PGPASSWORD'],
    }
    return psycopg2.connect(**settings)
def fetch_with_server_side_cursor(
        connection,
        query,
        on_row,
        row_batch_size=10000
):
    """Run ``query`` on a named cursor and feed the results to ``on_row``.

    ``on_row`` is called first with the list of column names, then once per
    result row. When the query yields no rows the column names are still
    delivered (provided the cursor exposes a description), so consumers such
    as a CSV writer always receive a header.

    Args:
        connection: object whose ``cursor(name)`` returns a context-manager
            cursor (presumably a psycopg2 connection -- confirm at callers).
        query: SQL text handed to ``cursor.execute``.
        on_row: callable accepting one sequence (header list or row).
        row_batch_size: value assigned to the cursor's ``itersize``.
    """
    def column_names(cursor):
        # Each description entry starts with the column name.
        return [column[0] for column in cursor.description]

    with connection.cursor('server_side') as cur:
        cur.itersize = row_batch_size
        cur.execute(query)

        header_sent = False
        for row in cur:
            if not header_sent:
                # The description is read lazily, once iteration has begun.
                # NOTE(review): named cursors in psycopg2 appear to populate
                # it only after the first fetch -- keep this inside the loop.
                on_row(column_names(cur))
                header_sent = True
            on_row(row)

        # Empty result set: the loop body never ran, so emit the header here
        # if the cursor can tell us the columns.
        if not header_sent and cur.description is not None:
            on_row(column_names(cur))
def db_to_csv(connection, query):
    """Execute ``query`` and return everything it produced as one CSV string.

    Every sequence delivered by ``fetch_with_server_side_cursor`` (column
    names, then rows) is written as a CSV record into an in-memory buffer.
    """
    buffer = StringIO()
    write_record = csv.writer(buffer).writerow
    fetch_with_server_side_cursor(connection, query, write_record)
    return buffer.getvalue()
def get_extract_zip_file_path(current_time):
    """Return the archive path for an extract taken at ``current_time``.

    The directory comes from the EXTRACTS_DIRECTORY environment variable
    (KeyError if unset); the file name embeds the timestamp formatted as
    ``%Y-%m-%d-%H_%M_%S``.
    """
    extracts_root = os.environ['EXTRACTS_DIRECTORY']
    archive_name = f"data-extract-{current_time:%Y-%m-%d-%H_%M_%S}.zip"
    return Path(extracts_root).joinpath(archive_name)
def add_extract_record_to_database(connection, zip_file_path, extracted_time):
    """Insert a row describing the extract into ``bulk_extracts`` and commit.

    The stored timestamp is ``extracted_time`` with seconds and microseconds
    zeroed; the stored path is ``str(zip_file_path)``.
    """
    insert_sql = '''INSERT INTO
bulk_extracts (extracted_on, extract_path)
VALUES
(%s, %s)
'''
    with connection.cursor() as cur:
        minute_precision_time = extracted_time.replace(second=0, microsecond=0)
        row_values = (minute_precision_time, str(zip_file_path))
        cur.execute(insert_sql, row_values)
        connection.commit()
def read_sql(rel_path_from_script):
    """Return the text of a file addressed relative to this script's folder."""
    here = Path(__file__).resolve().parent
    return here.joinpath(rel_path_from_script).read_text()
# Export queries, loaded at import time from the .sql files addressed
# relative to this script (see read_sql).
(
    building_attr_query,
    building_uprn_query,
    edit_history_query,
) = (
    read_sql(sql_file)
    for sql_file in (
        './export_attributes.sql',
        './export_uprns.sql',
        './export_edit_history.sql',
    )
)
def make_data_extract(current_time, connection, zip_file_path):
    """Write the CSV extracts into a new zip archive and record it.

    Three CSV members are produced from the module-level export queries and,
    once the archive has been closed, a row is added to the database via
    ``add_extract_record_to_database``. If anything fails, the partially
    written archive is removed and the original error is re-raised.

    Args:
        current_time: timestamp recorded for the extract.
        connection: database connection used for the queries and the insert.
        zip_file_path: ``pathlib.Path`` of the archive to create.

    Raises:
        ZipFileExistsError: if ``zip_file_path`` already exists.
    """
    if zip_file_path.exists():
        raise ZipFileExistsError('Archive file under specified name already exists')

    zip_file_path.parent.mkdir(parents=True, exist_ok=True)

    members = (
        ('building_attributes.csv', building_attr_query),
        ('building_uprns.csv', building_uprn_query),
        ('edit_history.csv', edit_history_query),
    )

    try:
        with zipfile.ZipFile(zip_file_path, mode='w') as newzip:
            for member_name, member_query in members:
                newzip.writestr(member_name, db_to_csv(connection, member_query))

            # TODO: add README

        add_extract_record_to_database(connection, zip_file_path, current_time)
    except BaseException:
        # Clean up on any failure (including interrupts), then re-raise.
        # The archive may never have been created (e.g. opening it failed);
        # ignoring that case keeps the original error from being masked by a
        # FileNotFoundError raised from the cleanup itself.
        try:
            zip_file_path.unlink()
        except FileNotFoundError:
            pass
        raise
def main():
    """Create one data extract stamped with the current UTC time.

    Opens a database connection, derives the archive path from the current
    time, builds the extract, and always closes the connection afterwards.
    """
    # Naive UTC timestamp, equivalent to the deprecated datetime.utcnow().
    current_time = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    conn = get_connection()
    try:
        zip_file_path = get_extract_zip_file_path(current_time)
        make_data_extract(current_time, conn, zip_file_path)
    finally:
        # Release the connection even when the extract fails.
        conn.close()
# Run the extract only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()