diff --git a/etl/README.md b/etl/README.md new file mode 100644 index 00000000..fafd34d6 --- /dev/null +++ b/etl/README.md @@ -0,0 +1,25 @@ +# Data loading + +The scripts in this directory are used to extract, transform and load (ETL) the core datasets +for Colouring London: + +1. Building geometries, sourced from Ordnance Survey MasterMap (Topography Layer) +1. Unique Property Reference Numbers (UPRNs), sourced from Ordnance Survey AddressBase + +## Prerequisites + +Before running any of these scripts, you will need the OS data for your area of +interest. AddressBase and MasterMap are available directly from [Ordnance +Survey](https://www.ordnancesurvey.co.uk/). + +To help test the Colouring London app, `get_test_polygons.py` will attempt to save a small +(1.5km²) extract from OpenStreetMap to a format suitable for loading to the database. + +The scripts should be run in the following order: + +1. extract_addressbase.sh +1. extract_mastermap.sh +1. filter_transform_mastermap_for_loading.sh +1. load_geometries.sh +1. create_building_records.sh +1. 
load_uprns.py diff --git a/etl/create_building_records.sh b/etl/create_building_records.sh new file mode 100644 index 00000000..07d6985b --- /dev/null +++ b/etl/create_building_records.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +# +# Create corresponding 'building' record with +# id: , +# doc: {}, +# geom_id: +# +psql -c "INSERT INTO buildings ( geometry_id ) SELECT geometry_id from geometries;" diff --git a/etl/0_extract_addressbase.sh b/etl/extract_addressbase.sh similarity index 100% rename from etl/0_extract_addressbase.sh rename to etl/extract_addressbase.sh diff --git a/etl/0_extract_mastermap.sh b/etl/extract_mastermap.sh similarity index 100% rename from etl/0_extract_mastermap.sh rename to etl/extract_mastermap.sh diff --git a/etl/0_filter_for_loading.sh b/etl/filter_transform_mastermap_for_loading.sh similarity index 100% rename from etl/0_filter_for_loading.sh rename to etl/filter_transform_mastermap_for_loading.sh diff --git a/etl/0_test_polygons.py b/etl/get_test_polygons.py similarity index 77% rename from etl/0_test_polygons.py rename to etl/get_test_polygons.py index 53e8c1db..aa9a2b82 100644 --- a/etl/0_test_polygons.py +++ b/etl/get_test_polygons.py @@ -1,4 +1,10 @@ """Download and load a small open dataset for testing + +Run this to create a CSV of buildings geometries. 
+ +Then run: +- load_geometries.sh (loading geometries to the database) +- create_building_records.sh (creating empty building records for each geometry) """ # -*- coding: utf-8 -*- import os @@ -21,8 +27,8 @@ fig, ax = osmnx.plot_buildings(gdf_proj, bgcolor='#333333', color='w', figsize=( save=True, show=False, close=True, filename='test_buildings_preview', dpi=600) -# save as geojson -test_data_file = os.path.join(os.path.dirname(__file__), 'test_buildings.geojson') +# save +test_data_file = os.path.join(os.path.dirname(__file__), 'test_buildings.csv') gdf_to_save = gdf_proj.reset_index( )[ @@ -32,5 +38,5 @@ gdf_to_save = gdf_proj.reset_index( gdf_to_save.rename( columns={'index': 'fid'} ).to_file( - test_data_file, driver='GeoJSON' + test_data_file, driver='CSV' ) diff --git a/etl/join-camden.json b/etl/join_building_data/join-camden.json similarity index 100% rename from etl/join-camden.json rename to etl/join_building_data/join-camden.json diff --git a/etl/3_join-data.py b/etl/join_building_data/join-data.py similarity index 100% rename from etl/3_join-data.py rename to etl/join_building_data/join-data.py diff --git a/etl/join-fitzrovia.json b/etl/join_building_data/join-fitzrovia.json similarity index 100% rename from etl/join-fitzrovia.json rename to etl/join_building_data/join-fitzrovia.json diff --git a/etl/1_load_geometries.sh b/etl/load_geometries.sh similarity index 78% rename from etl/1_load_geometries.sh rename to etl/load_geometries.sh index 0342e73b..82265f7c 100755 --- a/etl/1_load_geometries.sh +++ b/etl/load_geometries.sh @@ -30,11 +30,3 @@ psql -c "DELETE FROM geometries a USING ( ) b WHERE a.source_id = b.source_id AND a.ctid <> b.ctid;" - -# -# Create corresponding 'building' record with -# id: , -# doc: {}, -# geom_id: -# -psql -c "INSERT INTO buildings ( geometry_id ) SELECT geometry_id from geometries;" diff --git a/etl/2_load_buildings.py b/etl/load_uprns.py similarity index 95% rename from etl/2_load_buildings.py rename to etl/load_uprns.py 
index d5c6c375..2e47d792 100644 --- a/etl/2_load_buildings.py +++ b/etl/load_uprns.py @@ -73,7 +73,7 @@ def save_building(cur, uprn, geometry_id): geometry_id, ) ) - building = cur.fetchone() + building = cur.fetchone() if building is None: cur.execute( """INSERT INTO buildings @@ -122,8 +122,8 @@ def read_config(config_path): if __name__ == '__main__': - if len(sys.argv) != 3: - print("Usage: {} ./path/to/source/file.csv ./path/to/dbconfig.json".format( + if len(sys.argv) != 2: + print("Usage: {} ./path/to/source/file.csv".format( os.path.basename(__file__) )) exit()