From be13cd94f59aa9041b05ae5f2d7e493012b8cfed Mon Sep 17 00:00:00 2001
From: Maciej Ziarkowski
Date: Thu, 29 Aug 2019 13:17:09 +0100
Subject: [PATCH] Add data extract maintenance script

---
 .../extract_data/export_attributes.sql        |  51 ++++++
 .../extract_data/export_edit_history.sql      |   3 +
 maintenance/extract_data/export_uprns.sql     |   3 +
 maintenance/extract_data/extract_data.py      | 145 ++++++++++++++++++
 maintenance/requirements.txt                  |   1 +
 5 files changed, 203 insertions(+)
 create mode 100644 maintenance/extract_data/export_attributes.sql
 create mode 100644 maintenance/extract_data/export_edit_history.sql
 create mode 100644 maintenance/extract_data/export_uprns.sql
 create mode 100644 maintenance/extract_data/extract_data.py
 create mode 100644 maintenance/requirements.txt

diff --git a/maintenance/extract_data/export_attributes.sql b/maintenance/extract_data/export_attributes.sql
new file mode 100644
index 00000000..82bb67ba
--- /dev/null
+++ b/maintenance/extract_data/export_attributes.sql
@@ -0,0 +1,51 @@
+SELECT
+    building_id,
+    ref_toid,
+    ref_osm_id,
+    revision_id,
+    location_name,
+    location_number,
+    location_street,
+    location_line_two,
+    location_town,
+    location_postcode,
+    location_latitude,
+    location_longitude,
+    date_year,
+    date_lower,
+    date_upper,
+    date_source,
+    date_source_detail,
+    facade_year,
+    facade_upper,
+    facade_lower,
+    facade_source,
+    facade_source_detail,
+    size_storeys_attic,
+    size_storeys_core,
+    size_storeys_basement,
+    size_height_apex,
+    size_floor_area_ground,
+    size_floor_area_total,
+    size_width_frontage,
+    likes_total,
+    planning_portal_link,
+    planning_in_conservation_area,
+    planning_conservation_area_name,
+    planning_in_list,
+    planning_list_id,
+    planning_heritage_at_risk_id,
+    planning_world_list_id,
+    planning_in_glher,
+    planning_glher_url,
+    planning_in_apa,
+    planning_apa_name,
+    planning_apa_tier,
+    planning_in_local_list,
+    planning_local_list_url,
+    planning_in_historic_area_assessment,
+    planning_historic_area_assessment_url,
+    planning_list_cat,
+    planning_list_grade,
+    date_link
+FROM buildings
\ No newline at end of file
diff --git a/maintenance/extract_data/export_edit_history.sql b/maintenance/extract_data/export_edit_history.sql
new file mode 100644
index 00000000..d142b3fc
--- /dev/null
+++ b/maintenance/extract_data/export_edit_history.sql
@@ -0,0 +1,3 @@
+SELECT log_id as revision_id, log_timestamp as revision_timestamp, building_id, forward_patch, reverse_patch, u.username as user
+FROM logs l
+JOIN users u ON l.user_id = u.user_id
\ No newline at end of file
diff --git a/maintenance/extract_data/export_uprns.sql b/maintenance/extract_data/export_uprns.sql
new file mode 100644
index 00000000..cb6378ef
--- /dev/null
+++ b/maintenance/extract_data/export_uprns.sql
@@ -0,0 +1,3 @@
+SELECT building_id, uprn, parent_uprn
+FROM building_properties
+WHERE building_id IS NOT NULL
\ No newline at end of file
diff --git a/maintenance/extract_data/extract_data.py b/maintenance/extract_data/extract_data.py
new file mode 100644
index 00000000..93697f87
--- /dev/null
+++ b/maintenance/extract_data/extract_data.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+"""Export building data to a dated zip of CSV files and record the extract.
+
+Connection settings come from the PGHOST, PGDATABASE, PGUSER and PGPASSWORD
+environment variables; the output directory comes from EXTRACTS_DIRECTORY.
+"""
+
+import csv
+import datetime
+from io import StringIO
+import os
+from pathlib import Path
+import zipfile
+
+import psycopg2
+
+
+def get_connection():
+    """Open a database connection using the PG* environment variables."""
+    return psycopg2.connect(
+        host=os.environ['PGHOST'],
+        dbname=os.environ['PGDATABASE'],
+        user=os.environ['PGUSER'],
+        password=os.environ['PGPASSWORD']
+    )
+
+
+def fetch_with_server_side_cursor(
+    connection,
+    query,
+    on_row,
+    row_batch_size=10000
+):
+    """Run query on a named (server-side) cursor, streaming rows to on_row.
+
+    on_row is called once with the list of column names, then once per row.
+    Rows are fetched from the server row_batch_size at a time.
+    """
+    with connection.cursor('server_side') as cur:
+        cur.itersize = row_batch_size
+        cur.execute(query)
+
+        header_saved = False
+
+        for row in cur:
+            if not header_saved:
+                # A named cursor has no description until the first fetch,
+                # so the header can only be produced once iteration starts.
+                on_row([c[0] for c in cur.description])
+                header_saved = True
+            on_row(row)
+
+        # An empty result set should still produce a header row.
+        if not header_saved and cur.description is not None:
+            on_row([c[0] for c in cur.description])
+
+
+def db_to_csv(connection, query):
+    """Return the result of query as CSV text, header row first."""
+    string_io = StringIO()
+    writer = csv.writer(string_io)
+
+    fetch_with_server_side_cursor(connection, query, writer.writerow)
+
+    return string_io.getvalue()
+
+
+def get_extract_zip_file_path(current_time):
+    """Return the dated zip path under EXTRACTS_DIRECTORY for current_time."""
+    base_dir = Path(os.environ['EXTRACTS_DIRECTORY'])
+    file_name = f"data-extract-{current_time:%Y-%m-%d}.zip"
+    return base_dir / file_name
+
+
+def add_extract_record_to_database(connection, zip_file_path, extracted_time):
+    """Insert a bulk_extracts row for the zip file and commit."""
+    with connection.cursor() as cur:
+        # Seconds and microseconds are zeroed before the time is stored.
+        truncated_time = extracted_time.replace(second=0, microsecond=0)
+        cur.execute('''INSERT INTO
+                bulk_extracts (extracted_on, extract_path)
+            VALUES
+                (%s, %s)
+        ''', (truncated_time, str(zip_file_path)))
+
+        connection.commit()
+
+
+def read_sql(rel_path_from_script):
+    """Return the text of a SQL file found relative to this script."""
+    script_directory = Path(__file__).resolve().parent
+    sql_path = script_directory / rel_path_from_script
+    return sql_path.read_text()
+
+
+building_attr_query = read_sql('./export_attributes.sql')
+building_uprn_query = read_sql('./export_uprns.sql')
+edit_history_query = read_sql('./export_edit_history.sql')
+
+
+def make_data_extract(current_time, connection, zip_file_path):
+    """Write the CSV extracts to zip_file_path and record the extract.
+
+    If anything fails, the zip file is removed and the exception re-raised.
+    """
+    zip_file_path.parent.mkdir(parents=True, exist_ok=True)
+
+    try:
+        # CSV compresses well; the zipfile default is to store uncompressed.
+        with zipfile.ZipFile(
+            zip_file_path, mode='w', compression=zipfile.ZIP_DEFLATED
+        ) as newzip:
+            newzip.writestr('building_attributes.csv',
+                            db_to_csv(connection, building_attr_query))
+            newzip.writestr('building_uprns.csv',
+                            db_to_csv(connection, building_uprn_query))
+            newzip.writestr('edit_history.csv',
+                            db_to_csv(connection, edit_history_query))
+
+            # TODO: add README
+
+        add_extract_record_to_database(connection, zip_file_path, current_time)
+    except BaseException:
+        # The zip may not exist yet (e.g. it could not be created); a failing
+        # unlink here would otherwise hide the error that got us here.
+        try:
+            zip_file_path.unlink()
+        except FileNotFoundError:
+            pass
+        raise
+
+
+def main():
+    """Create a data extract stamped with the current UTC time."""
+    current_time = datetime.datetime.utcnow()
+    zip_file_path = get_extract_zip_file_path(current_time)
+    conn = get_connection()
+    try:
+        make_data_extract(current_time, conn, zip_file_path)
+    finally:
+        conn.close()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/maintenance/requirements.txt b/maintenance/requirements.txt
new file mode 100644
index 00000000..cfe86dd1
--- /dev/null
+++ b/maintenance/requirements.txt
@@ -0,0 +1 @@
+psycopg2==2.8.3
\ No newline at end of file