From 0a86566821e99c821df1317c56f99647998d0d7d Mon Sep 17 00:00:00 2001
From: Tom Russell
Date: Wed, 2 Oct 2019 15:03:54 +0100
Subject: [PATCH] Use Postgres COPY to extract data

- COPY typically runs faster than going via Python
- properly formatted JSON in edit history patches
- assumes postgres and maintenance user both have access to /tmp
---
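
Reviewer note (text between the --- above and the diffstat is ignored by
git am): server-side COPY ... TO '/tmp/...' is written by the postgres
server process itself, so the connecting role needs superuser rights or,
on Postgres 11+, membership of pg_write_server_files, in addition to the
shared /tmp noted above. Where that cannot be granted, the same COPY can
be streamed through the client connection with psycopg2's copy_expert.
A minimal sketch, not part of this patch (copy_query_to_file is a
hypothetical helper and expects a bare SELECT, not the full COPY
statements used here):

    def copy_query_to_file(connection, select_query, path):
        # Wrap the SELECT in COPY ... TO STDOUT and stream the CSV
        # through the client connection: no server filesystem access
        # or superuser rights required.
        copy_sql = f"COPY ({select_query}) TO STDOUT WITH CSV HEADER"
        with connection.cursor() as cur, open(path, 'w', newline='') as f:
            cur.copy_expert(copy_sql, f)

This keeps the transfer inside the COPY protocol, which is still much
faster than fetching rows one by one in Python, at the cost of routing
the bytes through the client.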
 .../extract_data/export_attributes.sql    | 14 +++--
 .../extract_data/export_edit_history.sql  | 13 ++++-
 maintenance/extract_data/export_uprns.sql |  9 ++-
 maintenance/extract_data/extract_data.py  | 58 +++++--------
 4 files changed, 40 insertions(+), 54 deletions(-)

diff --git a/maintenance/extract_data/export_attributes.sql b/maintenance/extract_data/export_attributes.sql
index 82bb67ba..7174a0d2 100644
--- a/maintenance/extract_data/export_attributes.sql
+++ b/maintenance/extract_data/export_attributes.sql
@@ -1,4 +1,4 @@
-SELECT
+COPY (SELECT
     building_id,
     ref_toid,
     ref_osm_id,
@@ -16,6 +16,7 @@ SELECT
     date_upper,
     date_source,
     date_source_detail,
+    date_link,
     facade_year,
     facade_upper,
     facade_lower,
@@ -34,6 +35,8 @@ SELECT
     planning_conservation_area_name,
     planning_in_list,
     planning_list_id,
+    planning_list_cat,
+    planning_list_grade,
     planning_heritage_at_risk_id,
     planning_world_list_id,
     planning_in_glher,
@@ -44,8 +47,7 @@ SELECT
     planning_in_local_list,
     planning_local_list_url,
     planning_in_historic_area_assessment,
-    planning_historic_area_assessment_url,
-    planning_list_cat,
-    planning_list_grade,
-    date_link
-FROM buildings
\ No newline at end of file
+    planning_historic_area_assessment_url
+FROM buildings)
+TO '/tmp/building_attributes.csv'
+WITH CSV HEADER
diff --git a/maintenance/extract_data/export_edit_history.sql b/maintenance/extract_data/export_edit_history.sql
index d142b3fc..e38e3bab 100644
--- a/maintenance/extract_data/export_edit_history.sql
+++ b/maintenance/extract_data/export_edit_history.sql
@@ -1,3 +1,12 @@
-SELECT log_id as revision_id, log_timestamp as revision_timestamp, building_id, forward_patch, reverse_patch, u.username as user
+COPY(SELECT
+    log_id as revision_id,
+    date_trunc('second', log_timestamp) as revision_timestamp,
+    building_id,
+    forward_patch,
+    reverse_patch,
+    u.username as user
 FROM logs l
-JOIN users u ON l.user_id = u.user_id
\ No newline at end of file
+JOIN users u
+    ON l.user_id = u.user_id)
+TO '/tmp/edit_history.csv'
+WITH CSV HEADER
diff --git a/maintenance/extract_data/export_uprns.sql b/maintenance/extract_data/export_uprns.sql
index cb6378ef..c9a59c26 100644
--- a/maintenance/extract_data/export_uprns.sql
+++ b/maintenance/extract_data/export_uprns.sql
@@ -1,3 +1,8 @@
-SELECT building_id, uprn, parent_uprn
+COPY(SELECT
+    building_id,
+    uprn,
+    parent_uprn
 FROM building_properties
-WHERE building_id IS NOT NULL
\ No newline at end of file
+    WHERE building_id IS NOT NULL)
+TO '/tmp/building_uprns.csv'
+WITH CSV HEADER
diff --git a/maintenance/extract_data/extract_data.py b/maintenance/extract_data/extract_data.py
index 666abc6a..e90f0397 100644
--- a/maintenance/extract_data/extract_data.py
+++ b/maintenance/extract_data/extract_data.py
@@ -22,39 +22,6 @@ def get_connection():
     )
 
 
-def fetch_with_server_side_cursor(
-        connection,
-        query,
-        on_row,
-        row_batch_size=10000
-):
-    with connection.cursor('server_side') as cur:
-        cur.itersize = row_batch_size
-        cur.execute(query)
-
-        header_saved = False
-
-        for row in cur:
-            if not header_saved:
-                columns = [c[0] for c in cur.description]
-                on_row(columns)
-                header_saved = True
-            on_row(row)
-
-
-def db_to_csv(connection, query):
-    string_io = StringIO()
-    writer = csv.writer(string_io)
-
-    fetch_with_server_side_cursor(
-        connection,
-        query,
-        lambda row: writer.writerow(row)
-    )
-
-    return string_io.getvalue()
-
-
 def get_extract_zip_file_path(current_time):
     base_dir = Path(os.environ['EXTRACTS_DIRECTORY'])
     file_name = f"data-extract-{current_time:%Y-%m-%d-%H_%M_%S}.zip"
@@ -79,27 +46,30 @@ def read_sql(rel_path_from_script):
     return sql_path.read_text()
 
 
-building_attr_query = read_sql('./export_attributes.sql')
-building_uprn_query = read_sql('./export_uprns.sql')
-edit_history_query = read_sql('./export_edit_history.sql')
 
 
 def make_data_extract(current_time, connection, zip_file_path):
     if zip_file_path.exists():
         raise ZipFileExistsError('Archive file under specified name already exists')
 
+    # Execute data dump as Postgres COPY commands, write from server to /tmp
+    with connection.cursor() as cur:
+        cur.execute(read_sql('./export_attributes.sql'))
+
+    with connection.cursor() as cur:
+        cur.execute(read_sql('./export_uprns.sql'))
+
+    with connection.cursor() as cur:
+        cur.execute(read_sql('./export_edit_history.sql'))
+
     zip_file_path.parent.mkdir(parents=True, exist_ok=True)
 
     try:
         with zipfile.ZipFile(zip_file_path, mode='w') as newzip:
-            newzip.writestr('building_attributes.csv',
-                            db_to_csv(connection, building_attr_query))
-            newzip.writestr('building_uprns.csv',
-                            db_to_csv(connection, building_uprn_query))
-            newzip.writestr('edit_history.csv',
-                            db_to_csv(connection, edit_history_query))
-
-            # TODO: add README
+            newzip.write('README.txt')
+            newzip.write('/tmp/building_attributes.csv', arcname='building_attributes.csv')
+            newzip.write('/tmp/building_uprns.csv', arcname='building_uprns.csv')
+            newzip.write('/tmp/edit_history.csv', arcname='edit_history.csv')
 
         add_extract_record_to_database(connection, zip_file_path, current_time)
     except:
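
A rough usage sketch for the new flow (an assumed driver based on the
function names in this file; the script's real entry point is not shown
in this patch):

    from datetime import datetime

    if __name__ == '__main__':
        current_time = datetime.utcnow()
        connection = get_connection()
        zip_path = get_extract_zip_file_path(current_time)
        make_data_extract(current_time, connection, zip_path)

Note that newzip.write('README.txt') resolves README.txt against the
process working directory, and README.txt itself is not added by this
patch, so the script is presumably run from a directory that already
contains one (e.g. maintenance/extract_data).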