Use Postgres COPY to extract data
- COPY typically runs faster than going via Python
- properly formatted JSON in edit history patches
- assumes the postgres and maintenance users both have access to /tmp
This commit is contained in:
parent
01f82ea18e
commit
0a86566821
@ -1,4 +1,4 @@
|
|||||||
SELECT
|
COPY (SELECT
|
||||||
building_id,
|
building_id,
|
||||||
ref_toid,
|
ref_toid,
|
||||||
ref_osm_id,
|
ref_osm_id,
|
||||||
@ -16,6 +16,7 @@ SELECT
|
|||||||
date_upper,
|
date_upper,
|
||||||
date_source,
|
date_source,
|
||||||
date_source_detail,
|
date_source_detail,
|
||||||
|
date_link,
|
||||||
facade_year,
|
facade_year,
|
||||||
facade_upper,
|
facade_upper,
|
||||||
facade_lower,
|
facade_lower,
|
||||||
@ -34,6 +35,8 @@ SELECT
|
|||||||
planning_conservation_area_name,
|
planning_conservation_area_name,
|
||||||
planning_in_list,
|
planning_in_list,
|
||||||
planning_list_id,
|
planning_list_id,
|
||||||
|
planning_list_cat,
|
||||||
|
planning_list_grade,
|
||||||
planning_heritage_at_risk_id,
|
planning_heritage_at_risk_id,
|
||||||
planning_world_list_id,
|
planning_world_list_id,
|
||||||
planning_in_glher,
|
planning_in_glher,
|
||||||
@ -44,8 +47,7 @@ SELECT
|
|||||||
planning_in_local_list,
|
planning_in_local_list,
|
||||||
planning_local_list_url,
|
planning_local_list_url,
|
||||||
planning_in_historic_area_assessment,
|
planning_in_historic_area_assessment,
|
||||||
planning_historic_area_assessment_url,
|
planning_historic_area_assessment_url
|
||||||
planning_list_cat,
|
FROM buildings)
|
||||||
planning_list_grade,
|
TO '/tmp/building_attributes.csv'
|
||||||
date_link
|
WITH CSV HEADER
|
||||||
FROM buildings
|
|
||||||
|
@ -1,3 +1,12 @@
|
|||||||
SELECT log_id as revision_id, log_timestamp as revision_timestamp, building_id, forward_patch, reverse_patch, u.username as user
|
COPY(SELECT
|
||||||
|
log_id as revision_id,
|
||||||
|
date_trunc('second', log_timestamp) as revision_timestamp,
|
||||||
|
building_id,
|
||||||
|
forward_patch,
|
||||||
|
reverse_patch,
|
||||||
|
u.username as user
|
||||||
FROM logs l
|
FROM logs l
|
||||||
JOIN users u ON l.user_id = u.user_id
|
JOIN users u
|
||||||
|
ON l.user_id = u.user_id)
|
||||||
|
TO '/tmp/edit_history.csv'
|
||||||
|
WITH CSV HEADER
|
||||||
|
@ -1,3 +1,8 @@
|
|||||||
SELECT building_id, uprn, parent_uprn
|
COPY(SELECT
|
||||||
|
building_id,
|
||||||
|
uprn,
|
||||||
|
parent_uprn
|
||||||
FROM building_properties
|
FROM building_properties
|
||||||
WHERE building_id IS NOT NULL
|
WHERE building_id IS NOT NULL)
|
||||||
|
TO '/tmp/building_uprns.csv'
|
||||||
|
WITH CSV HEADER
|
||||||
|
@ -22,39 +22,6 @@ def get_connection():
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def fetch_with_server_side_cursor(
|
|
||||||
connection,
|
|
||||||
query,
|
|
||||||
on_row,
|
|
||||||
row_batch_size=10000
|
|
||||||
):
|
|
||||||
with connection.cursor('server_side') as cur:
|
|
||||||
cur.itersize = row_batch_size
|
|
||||||
cur.execute(query)
|
|
||||||
|
|
||||||
header_saved = False
|
|
||||||
|
|
||||||
for row in cur:
|
|
||||||
if not header_saved:
|
|
||||||
columns = [c[0] for c in cur.description]
|
|
||||||
on_row(columns)
|
|
||||||
header_saved = True
|
|
||||||
on_row(row)
|
|
||||||
|
|
||||||
|
|
||||||
def db_to_csv(connection, query):
|
|
||||||
string_io = StringIO()
|
|
||||||
writer = csv.writer(string_io)
|
|
||||||
|
|
||||||
fetch_with_server_side_cursor(
|
|
||||||
connection,
|
|
||||||
query,
|
|
||||||
lambda row: writer.writerow(row)
|
|
||||||
)
|
|
||||||
|
|
||||||
return string_io.getvalue()
|
|
||||||
|
|
||||||
|
|
||||||
def get_extract_zip_file_path(current_time):
|
def get_extract_zip_file_path(current_time):
|
||||||
base_dir = Path(os.environ['EXTRACTS_DIRECTORY'])
|
base_dir = Path(os.environ['EXTRACTS_DIRECTORY'])
|
||||||
file_name = f"data-extract-{current_time:%Y-%m-%d-%H_%M_%S}.zip"
|
file_name = f"data-extract-{current_time:%Y-%m-%d-%H_%M_%S}.zip"
|
||||||
@ -79,27 +46,30 @@ def read_sql(rel_path_from_script):
|
|||||||
return sql_path.read_text()
|
return sql_path.read_text()
|
||||||
|
|
||||||
|
|
||||||
building_attr_query = read_sql('./export_attributes.sql')
|
|
||||||
building_uprn_query = read_sql('./export_uprns.sql')
|
|
||||||
edit_history_query = read_sql('./export_edit_history.sql')
|
|
||||||
|
|
||||||
|
|
||||||
def make_data_extract(current_time, connection, zip_file_path):
|
def make_data_extract(current_time, connection, zip_file_path):
|
||||||
if zip_file_path.exists():
|
if zip_file_path.exists():
|
||||||
raise ZipFileExistsError('Archive file under specified name already exists')
|
raise ZipFileExistsError('Archive file under specified name already exists')
|
||||||
|
|
||||||
|
# Execute data dump as Postgres COPY commands, write from server to /tmp
|
||||||
|
with connection.cursor() as cur:
|
||||||
|
cur.execute(read_sql('./export_attributes.sql'))
|
||||||
|
|
||||||
|
with connection.cursor() as cur:
|
||||||
|
cur.execute(read_sql('./export_uprns.sql'))
|
||||||
|
|
||||||
|
with connection.cursor() as cur:
|
||||||
|
cur.execute(read_sql('./export_edit_history.sql'))
|
||||||
|
|
||||||
zip_file_path.parent.mkdir(parents=True, exist_ok=True)
|
zip_file_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with zipfile.ZipFile(zip_file_path, mode='w') as newzip:
|
with zipfile.ZipFile(zip_file_path, mode='w') as newzip:
|
||||||
newzip.writestr('building_attributes.csv',
|
newzip.write('README.txt')
|
||||||
db_to_csv(connection, building_attr_query))
|
newzip.write('/tmp/building_attributes.csv', arcname='building_attributes.csv')
|
||||||
newzip.writestr('building_uprns.csv',
|
newzip.write('/tmp/building_uprns.csv', arcname='building_uprns.csv')
|
||||||
db_to_csv(connection, building_uprn_query))
|
newzip.write('/tmp/edit_history.csv', arcname='edit_history.csv')
|
||||||
newzip.writestr('edit_history.csv',
|
|
||||||
db_to_csv(connection, edit_history_query))
|
|
||||||
|
|
||||||
# TODO: add README
|
|
||||||
|
|
||||||
add_extract_record_to_database(connection, zip_file_path, current_time)
|
add_extract_record_to_database(connection, zip_file_path, current_time)
|
||||||
except:
|
except:
|
||||||
|
Loading…
Reference in New Issue
Block a user