Add data extract maintenance script

This commit is contained in:
Maciej Ziarkowski 2019-08-29 13:17:09 +01:00
parent 0dee6ae167
commit be13cd94f5
5 changed files with 170 additions and 0 deletions

View File

@ -0,0 +1,51 @@
-- Export one row per building with all released attributes:
-- identifiers, location, construction/facade dates, size, likes,
-- and planning/heritage designations.
SELECT
    building_id,
    ref_toid,
    ref_osm_id,
    revision_id,
    location_name,
    location_number,
    location_street,
    location_line_two,
    location_town,
    location_postcode,
    location_latitude,
    location_longitude,
    date_year,
    date_lower,
    date_upper,
    date_source,
    date_source_detail,
    facade_year,
    facade_upper,
    facade_lower,
    facade_source,
    facade_source_detail,
    size_storeys_attic,
    size_storeys_core,
    size_storeys_basement,
    size_height_apex,
    size_floor_area_ground,
    size_floor_area_total,
    size_width_frontage,
    likes_total,
    planning_portal_link,
    planning_in_conservation_area,
    planning_conservation_area_name,
    planning_in_list,
    planning_list_id,
    planning_heritage_at_risk_id,
    planning_world_list_id,
    planning_in_glher,
    planning_glher_url,
    planning_in_apa,
    planning_apa_name,
    planning_apa_tier,
    planning_in_local_list,
    planning_local_list_url,
    planning_in_historic_area_assessment,
    planning_historic_area_assessment_url,
    planning_list_cat,
    planning_list_grade,
    date_link
FROM buildings

View File

@ -0,0 +1,3 @@
-- Export the full edit history: one row per log entry with its forward
-- and reverse patches, joined to the editing user's name.
-- NOTE: "user" is a reserved word in Postgres but is valid as a column
-- alias after AS.
SELECT log_id as revision_id, log_timestamp as revision_timestamp, building_id, forward_patch, reverse_patch, u.username as user
FROM logs l
JOIN users u ON l.user_id = u.user_id

View File

@ -0,0 +1,3 @@
-- Export building-to-UPRN mappings, skipping property records that
-- have not been matched to a building.
SELECT building_id, uprn, parent_uprn
FROM building_properties
WHERE building_id IS NOT NULL

View File

@ -0,0 +1,112 @@
#!/usr/bin/env python3
import csv
import datetime
from io import StringIO
import os
from pathlib import Path
import zipfile
import psycopg2
def get_connection():
    """Open a psycopg2 connection configured from PG* environment variables.

    Raises KeyError if any of PGHOST/PGDATABASE/PGUSER/PGPASSWORD is unset.
    """
    env = os.environ
    return psycopg2.connect(
        host=env['PGHOST'],
        user=env['PGUSER'],
        password=env['PGPASSWORD'],
        dbname=env['PGDATABASE'],
    )
def fetch_with_server_side_cursor(
        connection,
        query,
        on_row,
        row_batch_size=10000
):
    """Stream query results through on_row using a named (server-side) cursor.

    on_row receives the list of column names first, then each data row in
    turn. The header is emitted only after the first row arrives (psycopg2
    named cursors populate the description lazily), so for an empty result
    set on_row is never called — matching the original behaviour.
    """
    with connection.cursor('server_side') as cursor:
        cursor.itersize = row_batch_size
        cursor.execute(query)
        rows = iter(cursor)
        try:
            first_row = next(rows)
        except StopIteration:
            return  # no rows, no header
        on_row([column[0] for column in cursor.description])
        on_row(first_row)
        for row in rows:
            on_row(row)
def db_to_csv(connection, query):
    """Run query and return its entire result set as a CSV string.

    The header row is the column names; data is streamed through a
    server-side cursor so memory stays bounded while fetching.
    """
    buffer = StringIO()
    write_row = csv.writer(buffer).writerow
    fetch_with_server_side_cursor(connection, query, write_row)
    return buffer.getvalue()
def get_extract_zip_file_path(current_time):
    """Build the destination path for the extract archive.

    The archive lives under EXTRACTS_DIRECTORY and is named by date,
    e.g. data-extract-2019-08-29.zip.
    """
    extracts_dir = Path(os.environ['EXTRACTS_DIRECTORY'])
    date_stamp = current_time.strftime('%Y-%m-%d')
    return extracts_dir / f'data-extract-{date_stamp}.zip'
def add_extract_record_to_database(connection, zip_file_path, extracted_time):
    """Record a completed extract in the bulk_extracts table.

    The timestamp is truncated to whole minutes before being stored, and
    the insert is committed immediately.
    """
    recorded_at = extracted_time.replace(second=0, microsecond=0)
    with connection.cursor() as cursor:
        cursor.execute('''INSERT INTO
            bulk_extracts (extracted_on, extract_path)
        VALUES
            (%s, %s)
        ''', (recorded_at, str(zip_file_path)))
    connection.commit()
def read_sql(rel_path_from_script):
    """Load a SQL file's text, resolving the path relative to this script."""
    here = Path(__file__).resolve().parent
    return (here / rel_path_from_script).read_text()
# Load the export queries once at import time; the .sql files are
# expected to live alongside this script.
building_attr_query = read_sql('./export_attributes.sql')
building_uprn_query = read_sql('./export_uprns.sql')
edit_history_query = read_sql('./export_edit_history.sql')
def make_data_extract(current_time, connection, zip_file_path):
    """Write the attribute/UPRN/edit-history CSVs into a zip archive.

    Creates parent directories as needed, writes the archive at
    zip_file_path, then records the extract in bulk_extracts. On any
    failure the partially-written archive is removed and the original
    error is re-raised.
    """
    zip_file_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        with zipfile.ZipFile(zip_file_path, mode='w') as newzip:
            newzip.writestr('building_attributes.csv',
                            db_to_csv(connection, building_attr_query))
            newzip.writestr('building_uprns.csv',
                            db_to_csv(connection, building_uprn_query))
            newzip.writestr('edit_history.csv',
                            db_to_csv(connection, edit_history_query))
        # TODO: add README
        add_extract_record_to_database(connection, zip_file_path, current_time)
    except BaseException:
        # Was a bare `except:`; made explicit. Guard the unlink so a
        # missing file (e.g. ZipFile failed before creating it) cannot
        # raise FileNotFoundError and mask the original exception.
        if zip_file_path.exists():
            zip_file_path.unlink()
        raise
def main():
    """Entry point: create today's data extract and record it in the DB."""
    current_time = datetime.datetime.utcnow()
    connection = get_connection()
    try:
        zip_file_path = get_extract_zip_file_path(current_time)
        make_data_extract(current_time, connection, zip_file_path)
    finally:
        # The original leaked the connection; always release it, even
        # when the extract fails.
        connection.close()


if __name__ == '__main__':
    main()

View File

@ -0,0 +1 @@
psycopg2==2.8.3