Rename etl scripts

This commit is contained in:
Tom Russell 2018-09-25 20:46:16 +01:00
parent ab32c36c98
commit bddd7e769f
11 changed files with 46 additions and 14 deletions

25
etl/README.md Normal file
View File

@ -0,0 +1,25 @@
# Data loading
The scripts in this directory are used to extract, transform and load (ETL) the core datasets
for Colouring London:
1. Building geometries, sourced from Ordnance Survey MasterMap (Topography Layer)
1. Unique Property Reference Numbers (UPRNs), sourced from Ordnance Survey AddressBase
## Prerequisites
Before running any of these scripts, you will need the OS data for your area of
interest. AddressBase and MasterMap are available directly from [Ordnance
Survey](https://www.ordnancesurvey.co.uk/).
To help test the Colouring London app, `get_test_polygons.py` will attempt to save a small
(1.5km²) extract from OpenStreetMap in a format suitable for loading into the database.
To load the Ordnance Survey data, the scripts should be run in the following order:
1. extract_addressbase.sh
1. extract_mastermap.sh
1. filter_transform_mastermap_for_loading.sh
1. load_geometries.sh
1. create_building_records.sh
1. load_uprns.py

View File

@ -0,0 +1,9 @@
#!/usr/bin/env bash
#
# Create a corresponding 'building' record for every row in 'geometries', with
# id: <building-guid>,
# doc: {},
# geometry_id: <polygon-guid>
#
# Only geometry_id is written by the statement below (copied from
# geometries.geometry_id). id and doc are not set here - presumably they are
# filled by column defaults on 'buildings'; TODO confirm against the schema.
#
# No connection options are passed to psql, so it uses its default connection
# settings (e.g. the PGHOST / PGDATABASE / PGUSER environment variables).
#
psql -c "INSERT INTO buildings ( geometry_id ) SELECT geometry_id from geometries;"

View File

@ -1,4 +1,10 @@
"""Download and load a small open dataset for testing """Download and load a small open dataset for testing
Run this to create a CSV of buildings geometries.
Then run:
- load_geometries.sh (loading geometries to the database)
- create_building_records.sh (creating empty building records for each geometry)
""" """
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os import os
@ -21,8 +27,8 @@ fig, ax = osmnx.plot_buildings(gdf_proj, bgcolor='#333333', color='w', figsize=(
save=True, show=False, close=True, save=True, show=False, close=True,
filename='test_buildings_preview', dpi=600) filename='test_buildings_preview', dpi=600)
# save as geojson # save
test_data_file = os.path.join(os.path.dirname(__file__), 'test_buildings.geojson') test_data_file = os.path.join(os.path.dirname(__file__), 'test_buildings.csv')
gdf_to_save = gdf_proj.reset_index( gdf_to_save = gdf_proj.reset_index(
)[ )[
@ -32,5 +38,5 @@ gdf_to_save = gdf_proj.reset_index(
gdf_to_save.rename( gdf_to_save.rename(
columns={'index': 'fid'} columns={'index': 'fid'}
).to_file( ).to_file(
test_data_file, driver='GeoJSON' test_data_file, driver='CSV'
) )

View File

@ -30,11 +30,3 @@ psql -c "DELETE FROM geometries a USING (
) b ) b
WHERE a.source_id = b.source_id WHERE a.source_id = b.source_id
AND a.ctid <> b.ctid;" AND a.ctid <> b.ctid;"
#
# Create corresponding 'building' record with
# id: <building-guid>,
# doc: {},
# geom_id: <polygon-guid>
#
psql -c "INSERT INTO buildings ( geometry_id ) SELECT geometry_id from geometries;"

View File

@ -122,8 +122,8 @@ def read_config(config_path):
if __name__ == '__main__': if __name__ == '__main__':
if len(sys.argv) != 3: if len(sys.argv) != 2:
print("Usage: {} ./path/to/source/file.csv ./path/to/dbconfig.json".format( print("Usage: {} ./path/to/source/file.csv".format(
os.path.basename(__file__) os.path.basename(__file__)
)) ))
exit() exit()