diff --git a/etl/README.md b/etl/README.md
index 49611071..5f879177 100644
--- a/etl/README.md
+++ b/etl/README.md
@@ -6,7 +6,6 @@ The scripts in this directory are used to extract, transform and load (ETL) the
 for Colouring London:
 
 Building geometries, sourced from Ordnance Survey (OS) MasterMap (Topography Layer)
 
-
 To get the required datasets, you'll need to complete the following steps:
 
@@ -17,10 +16,7 @@ To get the required datasets, you'll need to complete the following steps:
 
 ![](screenshot/MasterMap.png)
 
-
- 4. You should be then able to check out your basket and download the files. Note: there may be multiple `.zip` files to download for MasterMap due to the size of the dataset.
- 6. Unzip the MasterMap `.zip` files and move all the `.gz` files from each to a single folder in a convenient location. We will use this folder in later steps.
 
 ## Prerequisites
 
@@ -60,14 +56,6 @@ cd ~/colouring-london/etl
 chmod +x *.sh
 ```
 
-
-Extract the addressbase data (this step could take a while).
-
-
-```bash
-./extract_addressbase.sh ./addressbase_dir
-```
-
 Extract the MasterMap data (this step could take a while).
 
 ```bash
@@ -104,6 +92,14 @@ Index geometries.
 psql < ../migrations/002.index-geometries.up.sql
 ```
 
+TODO: Drop outside limit.
+
+
+
+```bash
+./drop_outside_limit.sh ./path/to/boundary_file
+```
+
 Create a building record per outline.
 
 
@@ -114,29 +110,12 @@ Create a building record per outline.
 
 
 
-Add UPRNs where they match.
-
-
-
-```bash
-./load_uprns.sh ./addressbase_dir
-````
-
 Run the remaining migrations in `../migrations` to create the rest of the database structure.
 
 ```bash
 ls ~/colouring-london/migrations/*.up.sql 2>/dev/null | while read -r migration; do psql < $migration; done;
 ```
 
-TODO: Drop outside limit.
-
-
-
-```bash
-./drop_outside_limit.sh ./path/to/boundary_file
-````
-
-
 # [WIP] Updating the Colouring London database with new OS data
 
 TODO: this section should instruct how to update an existing db
\ No newline at end of file
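The README step added above invokes `./drop_outside_limit.sh ./path/to/boundary_file`, but the script itself does not appear in this diff. As a rough illustration of the kind of spatial filter such a step implies (not the actual implementation), here is a minimal Python sketch using the third-party `shapely` package; the file layout and the `fid`/`WKT` column names are assumptions for this example only.

```python
"""Illustrative sketch only -- not the real drop_outside_limit.sh.

Keeps building outlines that intersect a boundary polygon, assuming the
boundary file holds a single WKT geometry and the outlines sit in a CSV
with 'fid' and 'WKT' columns (hypothetical names for this example).
"""
import csv
import sys

from shapely import wkt  # third-party: pip install shapely


def main(boundary_path, outlines_path):
    # Read the boundary as one WKT geometry
    with open(boundary_path) as fh:
        boundary = wkt.loads(fh.read())

    # Print the id of every outline that falls inside (or touches) it
    with open(outlines_path) as fh:
        for row in csv.DictReader(fh):
            outline = wkt.loads(row["WKT"])
            if outline.intersects(boundary):
                print(row["fid"])


if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2])
```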
diff --git a/etl/check_ab_mm_match.py b/etl/check_ab_mm_match.py
deleted file mode 100644
index 98d82684..00000000
--- a/etl/check_ab_mm_match.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""Check if AddressBase TOIDs will match MasterMap
-"""
-import csv
-import glob
-import os
-import sys
-
-from multiprocessing import Pool
-
-csv.field_size_limit(sys.maxsize)
-
-def main(ab_path, mm_path):
-    ab_paths = sorted(glob.glob(os.path.join(ab_path, "*.gml.csv.filtered.csv")))
-    mm_paths = sorted(glob.glob(os.path.join(mm_path, "*.gml.csv")))
-
-    try:
-        assert len(ab_paths) == len(mm_paths)
-    except AssertionError:
-        print(ab_paths)
-        print(mm_paths)
-
-    zipped_paths = zip(ab_paths, mm_paths)
-
-    # parallel map over tiles
-    with Pool() as p:
-        p.starmap(check, zipped_paths)
-
-def check(ab_path, mm_path):
-    tile = str(os.path.basename(ab_path)).split(".")[0]
-    output_base = os.path.dirname(ab_path)
-    ab_toids = set()
-    mm_toids = set()
-
-    with open(ab_path, 'r') as fh:
-        r = csv.DictReader(fh)
-        for line in r:
-            ab_toids.add(line['toid'])
-
-    with open(mm_path, 'r') as fh:
-        r = csv.DictReader(fh)
-        for line in r:
-            mm_toids.add(line['fid'])
-
-    missing = ab_toids - mm_toids
-    print(tile, "MasterMap:", len(mm_toids), "Addressbase:", len(ab_toids), "AB but not MM:", len(missing))
-
-    with open(os.path.join(output_base, 'missing_toids_{}.txt'.format(tile)), 'w') as fh:
-        for toid in missing:
-            fh.write("{}\n".format(toid))
-
-    with open(os.path.join(output_base, 'ab_toids_{}.txt'.format(tile)), 'w') as fh:
-        for toid in ab_toids:
-            fh.write("{}\n".format(toid))
-
-
-if __name__ == '__main__':
-    if len(sys.argv) != 3:
-        print("Usage: check_ab_mm_match.py ./path/to/addressbase/dir ./path/to/mastermap/dir")
-        exit(-1)
-    main(sys.argv[1], sys.argv[2])
diff --git a/etl/extract_addressbase.sh b/etl/extract_addressbase.sh
deleted file mode 100755
index b6b94480..00000000
--- a/etl/extract_addressbase.sh
+++ /dev/null
@@ -1,63 +0,0 @@
-#!/usr/bin/env bash
-
-#
-# Extract address points from OS Addressbase GML
-# - as supplied in 5km tiles, zip/gz archives
-#
-: ${1?"Usage: $0 ./path/to/data/dir"}
-
-data_dir=$1
-
-#
-# Unzip to GML
-#
-
-find $data_dir -type f -name '*.zip' -printf "%f\n" | \
-parallel \
-unzip -u $data_dir/{} -d $data_dir
-
-#
-# Extract to CSV
-#
-# Relevant fields:
-#   WKT
-#   crossReference (list of TOID/other references)
-#   source (list of cross-reference sources: 7666MT refers to MasterMap Topo)
-#   uprn
-#   parentUPRN
-#   logicalStatus: 1 (one) is approved (otherwise historical, provisional)
-#
-find $data_dir -type f -name '*.gml' -printf "%f\n" | \
-parallel \
-ogr2ogr -f CSV \
-    -select crossReference,source,uprn,parentUPRN,logicalStatus \
-    $data_dir/{}.csv $data_dir/{} BasicLandPropertyUnit \
-    -lco GEOMETRY=AS_WKT
-
-#
-# Filter
-#
-find $data_dir -type f -name '*.gml.csv' -printf "%f\n" | \
-parallel \
-colouringlondon/bin/python filter_addressbase_csv.py $data_dir/{}
-
-
-#
-# Transform to 3857 (web mercator)
-#
-find $data_dir -type f -name '*.filtered.csv' -printf "%f\n" | \
-parallel \
-ogr2ogr \
-    -f CSV $data_dir/{}.3857.csv \
-    -s_srs "EPSG:4326" \
-    -t_srs "EPSG:3857" \
-    $data_dir/{} \
-    -lco GEOMETRY=AS_WKT
-
-#
-# Update to EWKT (with SRID indicator for loading to Postgres)
-#
-find $data_dir -type f -name '*.3857.csv' -printf "%f\n" | \
-parallel \
-cat $data_dir/{} "|" sed "'s/^\"POINT/\"SRID=3857;POINT/'" "|" cut -f 1,3,4,5 -d "','" ">" $data_dir/{}.loadable
-
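For reference, the per-tile TOID comparison performed by the deleted `check_ab_mm_match.py` above reduces to a set difference. A minimal standalone distillation follows, using the same column names as the deleted script (`toid` in filtered AddressBase CSVs, `fid` in MasterMap CSVs); the tile file paths are placeholders.

```python
"""Distilled from the deleted check_ab_mm_match.py: for one tile, count
AddressBase TOIDs that have no matching MasterMap TOID."""
import csv


def read_column(path, column):
    """Collect one CSV column into a set of values."""
    with open(path) as fh:
        return {row[column] for row in csv.DictReader(fh)}


# Placeholder paths; the deleted script globbed these per 5km tile
ab_toids = read_column("tile.gml.csv.filtered.csv", "toid")  # AddressBase
mm_toids = read_column("tile.gml.csv", "fid")                # MasterMap

missing = ab_toids - mm_toids
print("MasterMap:", len(mm_toids),
      "AddressBase:", len(ab_toids),
      "AB but not MM:", len(missing))
```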
diff --git a/etl/filter_addressbase_csv.py b/etl/filter_addressbase_csv.py
deleted file mode 100755
index c6d273c8..00000000
--- a/etl/filter_addressbase_csv.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env python
-"""Read ogr2ogr-converted CSV, filter to get OSMM TOID reference, only active addresses
-"""
-import csv
-import json
-import sys
-
-
-def main(input_path):
-    output_path = "{}.filtered.csv".format(input_path)
-    fieldnames = (
-        'wkt', 'toid', 'uprn', 'parent_uprn'
-    )
-    with open(input_path) as input_fh:
-        with open(output_path, 'w') as output_fh:
-            w = csv.DictWriter(output_fh, fieldnames=fieldnames)
-            w.writeheader()
-            r = csv.DictReader(input_fh)
-            for line in r:
-                if line['logicalStatus'] != "1":
-                    continue
-
-                refs = json.loads(line['crossReference'])
-                sources = json.loads(line['source'])
-                toid = ""
-                for ref, source in zip(refs, sources):
-                    if source == "7666MT":
-                        toid = ref
-
-                w.writerow({
-                    'uprn': line['uprn'],
-                    'parent_uprn': line['parentUPRN'],
-                    'toid': toid,
-                    'wkt': line['WKT'],
-                })
-
-
-if __name__ == '__main__':
-    if len(sys.argv) != 2:
-        print("Usage: filter_addressbase_csv.py ./path/to/data.csv")
-        exit(-1)
-    main(sys.argv[1])
diff --git a/etl/filter_transform_mastermap_for_loading.sh b/etl/filter_transform_mastermap_for_loading.sh
index 45c62b2e..85c68b51 100755
--- a/etl/filter_transform_mastermap_for_loading.sh
+++ b/etl/filter_transform_mastermap_for_loading.sh
@@ -7,11 +7,6 @@
 
 mastermap_dir=$1
 
-#
-# Check which TOIDs are matched against UPRNs
-#
-# colouringlondon/bin/python check_ab_mm_match.py $addressbase_dir $mastermap_dir
-
 #
 # Filter
 # - WHERE descriptiveGroup = '(1:Building)'
diff --git a/etl/load_uprns.sh b/etl/load_uprns.sh
deleted file mode 100755
index 6001f65c..00000000
--- a/etl/load_uprns.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env bash
-
-#
-# Load UPRNS from CSV to Postgres
-# - assume postgres connection details are set in the environment using PGUSER, PGHOST etc.
-#
-: ${1?"Usage: $0 ./path/to/addressbase/dir"}
-
-data_dir=$1
-
-#
-# Create 'building_properties' record with
-# uprn:
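The document cuts off partway through the deleted `load_uprns.sh`. One detail worth preserving from the deleted `filter_addressbase_csv.py` above is how a UPRN is tied to a MasterMap TOID: AddressBase supplies parallel JSON lists of cross-references and their sources, and source code "7666MT" marks the MasterMap Topography reference. A standalone distillation of that lookup follows; the record values are made-up examples, and the second source code is hypothetical.

```python
"""Distilled from the deleted filter_addressbase_csv.py: pick out the
MasterMap TOID from an AddressBase record's parallel crossReference /
source lists ("7666MT" marks a MasterMap Topo reference, per the
comments in the deleted extract_addressbase.sh)."""
import json

# Made-up example record, shaped like one row of the ogr2ogr CSV output
line = {
    "crossReference": '["osgb1000012345678", "E05000001"]',
    "source": '["7666MT", "XXXX"]',  # "XXXX" is a hypothetical source code
}

refs = json.loads(line["crossReference"])
sources = json.loads(line["source"])

# Walk the paired lists; keep the reference whose source is MasterMap Topo
toid = ""
for ref, source in zip(refs, sources):
    if source == "7666MT":
        toid = ref

print(toid)  # -> osgb1000012345678
```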