From 0a86566821e99c821df1317c56f99647998d0d7d Mon Sep 17 00:00:00 2001
From: Tom Russell
Date: Wed, 2 Oct 2019 15:03:54 +0100
Subject: [PATCH] Use Postgres COPY to extract data

- COPY typically runs faster than going via Python
- properly formatted JSON in edit history patches
- assumes postgres and maintenance user both have access to /tmp
---
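
Reviewer note (text between the --- above and the diffstat is ignored by
git am): server-side COPY ... TO '/tmp/...' is written by the postgres
server process itself, so the connecting role needs superuser rights or,
on Postgres 11+, membership of pg_write_server_files, in addition to the
shared /tmp noted above. Where that cannot be granted, the same COPY can
be streamed through the client connection with psycopg2's copy_expert.
A minimal sketch, not part of this patch (copy_query_to_file is a
hypothetical helper and expects a bare SELECT, not the full COPY
statements used here):

    def copy_query_to_file(connection, select_query, path):
        # Wrap the SELECT in COPY ... TO STDOUT and stream the CSV
        # through the client connection: no server filesystem access
        # or superuser rights required.
        copy_sql = f"COPY ({select_query}) TO STDOUT WITH CSV HEADER"
        with connection.cursor() as cur, open(path, 'w', newline='') as f:
            cur.copy_expert(copy_sql, f)

This keeps the transfer inside the COPY protocol, which is still much
faster than fetching rows one by one in Python, at the cost of routing
the bytes through the client.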
 .../extract_data/export_attributes.sql    | 14 +++--
 .../extract_data/export_edit_history.sql  | 13 ++++-
 maintenance/extract_data/export_uprns.sql |  9 ++-
 maintenance/extract_data/extract_data.py  | 58 +++++--------
 4 files changed, 40 insertions(+), 54 deletions(-)

diff --git a/maintenance/extract_data/export_attributes.sql b/maintenance/extract_data/export_attributes.sql
index 82bb67ba..7174a0d2 100644
--- a/maintenance/extract_data/export_attributes.sql
+++ b/maintenance/extract_data/export_attributes.sql
@@ -1,4 +1,4 @@
-SELECT
+COPY (SELECT
     building_id,
     ref_toid,
     ref_osm_id,
@@ -16,6 +16,7 @@ SELECT
     date_upper,
     date_source,
     date_source_detail,
+    date_link,
     facade_year,
     facade_upper,
     facade_lower,
@@ -34,6 +35,8 @@ SELECT
     planning_conservation_area_name,
     planning_in_list,
     planning_list_id,
+    planning_list_cat,
+    planning_list_grade,
     planning_heritage_at_risk_id,
     planning_world_list_id,
     planning_in_glher,
@@ -44,8 +47,7 @@ SELECT
     planning_in_local_list,
     planning_local_list_url,
     planning_in_historic_area_assessment,
-    planning_historic_area_assessment_url,
-    planning_list_cat,
-    planning_list_grade,
-    date_link
-FROM buildings
\ No newline at end of file
+    planning_historic_area_assessment_url
+FROM buildings)
+TO '/tmp/building_attributes.csv'
+WITH CSV HEADER
diff --git a/maintenance/extract_data/export_edit_history.sql b/maintenance/extract_data/export_edit_history.sql
index d142b3fc..e38e3bab 100644
--- a/maintenance/extract_data/export_edit_history.sql
+++ b/maintenance/extract_data/export_edit_history.sql
@@ -1,3 +1,12 @@
-SELECT log_id as revision_id, log_timestamp as revision_timestamp, building_id, forward_patch, reverse_patch, u.username as user
+COPY(SELECT
+    log_id as revision_id,
+    date_trunc('second', log_timestamp) as revision_timestamp,
+    building_id,
+    forward_patch,
+    reverse_patch,
+    u.username as user
 FROM logs l
-JOIN users u ON l.user_id = u.user_id
\ No newline at end of file
+JOIN users u
+    ON l.user_id = u.user_id)
+TO '/tmp/edit_history.csv'
+WITH CSV HEADER
diff --git a/maintenance/extract_data/export_uprns.sql b/maintenance/extract_data/export_uprns.sql
index cb6378ef..c9a59c26 100644
--- a/maintenance/extract_data/export_uprns.sql
+++ b/maintenance/extract_data/export_uprns.sql
@@ -1,3 +1,8 @@
-SELECT building_id, uprn, parent_uprn
+COPY(SELECT
+    building_id,
+    uprn,
+    parent_uprn
 FROM building_properties
-WHERE building_id IS NOT NULL
\ No newline at end of file
+    WHERE building_id IS NOT NULL)
+TO '/tmp/building_uprns.csv'
+WITH CSV HEADER
diff --git a/maintenance/extract_data/extract_data.py b/maintenance/extract_data/extract_data.py
index 666abc6a..e90f0397 100644
--- a/maintenance/extract_data/extract_data.py
+++ b/maintenance/extract_data/extract_data.py
@@ -22,39 +22,6 @@ def get_connection():
     )
 
 
-def fetch_with_server_side_cursor(
-        connection,
-        query,
-        on_row,
-        row_batch_size=10000
-):
-    with connection.cursor('server_side') as cur:
-        cur.itersize = row_batch_size
-        cur.execute(query)
-
-        header_saved = False
-
-        for row in cur:
-            if not header_saved:
-                columns = [c[0] for c in cur.description]
-                on_row(columns)
-                header_saved = True
-            on_row(row)
-
-
-def db_to_csv(connection, query):
-    string_io = StringIO()
-    writer = csv.writer(string_io)
-
-    fetch_with_server_side_cursor(
-        connection,
-        query,
-        lambda row: writer.writerow(row)
-    )
-
-    return string_io.getvalue()
-
-
 def get_extract_zip_file_path(current_time):
     base_dir = Path(os.environ['EXTRACTS_DIRECTORY'])
     file_name = f"data-extract-{current_time:%Y-%m-%d-%H_%M_%S}.zip"
@@ -79,27 +46,30 @@ def read_sql(rel_path_from_script):
     return sql_path.read_text()
 
 
-building_attr_query = read_sql('./export_attributes.sql')
-building_uprn_query = read_sql('./export_uprns.sql')
-edit_history_query = read_sql('./export_edit_history.sql')
 
 
 def make_data_extract(current_time, connection, zip_file_path):
     if zip_file_path.exists():
         raise ZipFileExistsError('Archive file under specified name already exists')
 
+    # Execute data dump as Postgres COPY commands, write from server to /tmp
+    with connection.cursor() as cur:
+        cur.execute(read_sql('./export_attributes.sql'))
+
+    with connection.cursor() as cur:
+        cur.execute(read_sql('./export_uprns.sql'))
+
+    with connection.cursor() as cur:
+        cur.execute(read_sql('./export_edit_history.sql'))
+
     zip_file_path.parent.mkdir(parents=True, exist_ok=True)
 
     try:
         with zipfile.ZipFile(zip_file_path, mode='w') as newzip:
-            newzip.writestr('building_attributes.csv',
-                            db_to_csv(connection, building_attr_query))
-            newzip.writestr('building_uprns.csv',
-                            db_to_csv(connection, building_uprn_query))
-            newzip.writestr('edit_history.csv',
-                            db_to_csv(connection, edit_history_query))
-
-            # TODO: add README
+            newzip.write('README.txt')
+            newzip.write('/tmp/building_attributes.csv', arcname='building_attributes.csv')
+            newzip.write('/tmp/building_uprns.csv', arcname='building_uprns.csv')
+            newzip.write('/tmp/edit_history.csv', arcname='edit_history.csv')
 
         add_extract_record_to_database(connection, zip_file_path, current_time)
     except:
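
A rough usage sketch for the new flow (an assumed driver based on the
function names in this file; the script's real entry point is not shown
in this patch):

    from datetime import datetime

    if __name__ == '__main__':
        current_time = datetime.utcnow()
        connection = get_connection()
        zip_path = get_extract_zip_file_path(current_time)
        make_data_extract(current_time, connection, zip_path)

Note that newzip.write('README.txt') resolves README.txt against the
process working directory, and README.txt itself is not added by this
patch, so the script is presumably run from a directory that already
contains one (e.g. maintenance/extract_data).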