remove addressbase from all steps and reorder readme

2022-03-18 11:07:15 +00:00 · 2022-03-18 11:07:15 +00:00 · 3653e30362
commit 3653e30362
parent f55ce63d84
8 changed files with 17 additions and 252 deletions
--- a/etl/README.md
+++ b/etl/README.md
@ -6,7 +6,6 @@ The scripts in this directory are used to extract, transform and load (ETL) the
 for Colouring London:

 Building geometries, sourced from Ordnance Survey (OS) MasterMap (Topography Layer)
-<!-- 1. Unique Property Reference Numbers (UPRNs), sourced from Ordnance Survey AddressBase -->

 To get the required datasets, you'll need to complete the following steps:

@ -17,10 +16,7 @@ To get the required datasets, you'll need to complete the following steps:
 ![](screenshot/MasterMap.png)
 <p></p>

-<!-- ![](screenshot/AddressBase.png) -->
-
 4. You should be then able to check out your basket and download the files. Note: there may be multiple `.zip` files to download for MasterMap due to the size of the dataset.
-<!-- 5. Unzip the AddressBase `.zip` in a convenient location. We will use the unzipped folder in later steps. Rename the folder as appropriate (make sure this folder doesn't contain the original `.zip` file). Note: this folder also contains `.zip` files, do not unzip at this stage as a script will do this later. -->
 6. Unzip the MasterMap `.zip` files and move all the `.gz` files from each to a single folder in a convenient location. We will use this folder in later steps.

 ## Prerequisites
@ -60,14 +56,6 @@ cd ~/colouring-london/etl
 chmod +x *.sh
 ```

-<!-- Extract the addressBase dataset.
-
-```bash
-./extract_addressbase.sh ./addressbase_dir
-``` -->
-
-<!-- ERROR 1: Couldn't fetch requested layer 'BasicLandPropertyUnit'! -->
-
 Extract the MasterMap data (this step could take a while).

 ```bash
@ -104,6 +92,14 @@ Index geometries.
 psql < ../migrations/002.index-geometries.up.sql
 ```

+TODO: Drop outside limit.
+
+<!-- But what is the boundary file? -->
+
+```bash
+./drop_outside_limit.sh ./path/to/boundary_file
+```
+
 Create a building record per outline.

 <!-- I had to edit the below file to set the psql vars before running -->
@ -114,29 +110,12 @@ Create a building record per outline.

 <!-- Insert 0.... -->

-Add UPRNs where they match.
-
-<!-- I had to edit the below file to set the psql vars before running -->
-
-```bash
-./load_uprns.sh ./addressbase_dir
-````
-
 Run the remaining migrations in `../migrations` to create the rest of the database structure.

 ```bash
 ls ~/colouring-london/migrations/*.up.sql 2>/dev/null | while read -r migration; do psql < $migration; done;
 ```

-TODO: Drop outside limit.
-
-<!-- But what is the bounddary file? -->
-
-```bash
-./drop_outside_limit.sh ./path/to/boundary_file
-````
-
-
 # [WIP] Updating the Colouring London database with new OS data

 TODO: this section should instruct how to update and existing db
--- a/etl/check_ab_mm_match.py
+++ b/etl/check_ab_mm_match.py
@ -1,60 +0,0 @@
-"""Check if AddressBase TOIDs will match MasterMap
-"""
-import csv
-import glob
-import os
-import sys
-
-from multiprocessing import Pool
-
-csv.field_size_limit(sys.maxsize)
-
-def main(ab_path, mm_path):
-    ab_paths = sorted(glob.glob(os.path.join(ab_path, "*.gml.csv.filtered.csv")))
-    mm_paths = sorted(glob.glob(os.path.join(mm_path, "*.gml.csv")))
-
-    try:
-        assert len(ab_paths) == len(mm_paths)
-    except AssertionError:
-        print(ab_paths)
-        print(mm_paths)
-
-    zipped_paths = zip(ab_paths, mm_paths)
-
-    # parallel map over tiles
-    with Pool() as p:
-        p.starmap(check, zipped_paths)
-
-def check(ab_path, mm_path):
-    tile = str(os.path.basename(ab_path)).split(".")[0]
-    output_base = os.path.dirname(ab_path)
-    ab_toids = set()
-    mm_toids = set()
-
-    with open(ab_path, 'r') as fh:
-        r = csv.DictReader(fh)
-        for line in r:
-            ab_toids.add(line['toid'])
-
-    with open(mm_path, 'r') as fh:
-        r = csv.DictReader(fh)
-        for line in r:
-            mm_toids.add(line['fid'])
-
-    missing = ab_toids - mm_toids
-    print(tile, "MasterMap:", len(mm_toids), "Addressbase:", len(ab_toids), "AB but not MM:", len(missing))
-
-    with open(os.path.join(output_base, 'missing_toids_{}.txt'.format(tile)), 'w') as fh:
-        for toid in missing:
-            fh.write("{}\n".format(toid))
-
-    with open(os.path.join(output_base, 'ab_toids_{}.txt'.format(tile)), 'w') as fh:
-        for toid in ab_toids:
-            fh.write("{}\n".format(toid))
-
-
-if __name__ == '__main__':
-    if len(sys.argv) != 3:
-        print("Usage: check_ab_mm_match.py ./path/to/addressbase/dir ./path/to/mastermap/dir")
-        exit(-1)
-    main(sys.argv[1], sys.argv[2])
--- a/etl/extract_addressbase.sh
+++ b/etl/extract_addressbase.sh
@ -1,63 +0,0 @@
-#!/usr/bin/env bash
-
-#
-# Extract address points from OS Addressbase GML
-# - as supplied in 5km tiles, zip/gz archives
-#
-: ${1?"Usage: $0 ./path/to/data/dir"}
-
-data_dir=$1
-
-#
-# Unzip to GML
-#
-
-find $data_dir -type f -name '*.zip' -printf "%f\n" | \
-parallel \
-unzip -u $data_dir/{} -d $data_dir
-
-#
-# Extract to CSV
-#
-# Relevant fields:
-# WKT
-# crossReference (list of TOID/other references)
-# source (list of cross-reference sources: 7666MT refers to MasterMap Topo)
-# uprn
-# parentUPRN
-# logicalStatus: 1 (one) is approved (otherwise historical, provisional)
-#
-
-find $data_dir -type f -name '*.gml' -printf "%f\n"  | \
-parallel \
-ogr2ogr -f CSV \
-    -select crossReference,source,uprn,parentUPRN,logicalStatus \
-    $data_dir/{}.csv $data_dir/{} BasicLandPropertyUnit \
-    -lco GEOMETRY=AS_WKT
-
-#
-# Filter
-#
-find $data_dir -type f -name '*.gml.csv' -printf "%f\n"  | \
-parallel \
-colouringlondon/bin/python filter_addressbase_csv.py $data_dir/{}
-
-
-#
-# Transform to 3857 (web mercator)
-#
-find $data_dir -type f -name '*.filtered.csv' -printf "%f\n" | \
-parallel \
-ogr2ogr \
-    -f CSV $data_dir/{}.3857.csv \
-    -s_srs "EPSG:4326" \
-    -t_srs "EPSG:3857" \
-    $data_dir/{} \
-    -lco GEOMETRY=AS_WKT
-
-#
-# Update to EWKT (with SRID indicator for loading to Postgres)
-#
-find $data_dir -type f -name '*.3857.csv' -printf "%f\n" | \
-parallel \
-cat $data_dir/{} "|" sed "'s/^\"POINT/\"SRID=3857;POINT/'" "|" cut -f 1,3,4,5 -d "','" ">" $data_dir/{}.loadable
--- a/etl/filter_addressbase_csv.py
+++ b/etl/filter_addressbase_csv.py
@ -1,42 +0,0 @@
-#!/usr/bin/env python
-"""Read ogr2ogr-converted CSV, filter to get OSMM TOID reference, only active addresses
-"""
-import csv
-import json
-import sys
-
-
-def main(input_path):
-    output_path = "{}.filtered.csv".format(input_path)
-    fieldnames = (
-        'wkt', 'toid', 'uprn', 'parent_uprn'
-    )
-    with open(input_path) as input_fh:
-        with open(output_path, 'w') as output_fh:
-            w = csv.DictWriter(output_fh, fieldnames=fieldnames)
-            w.writeheader()
-            r = csv.DictReader(input_fh)
-            for line in r:
-                if line['logicalStatus'] != "1":
-                    continue
-
-                refs = json.loads(line['crossReference'])
-                sources = json.loads(line['source'])
-                toid = ""
-                for ref, source in zip(refs, sources):
-                    if source == "7666MT":
-                        toid = ref
-
-                w.writerow({
-                    'uprn': line['uprn'],
-                    'parent_uprn': line['parentUPRN'],
-                    'toid': toid,
-                    'wkt': line['WKT'],
-                })
-
-
-if __name__ == '__main__':
-    if len(sys.argv) != 2:
-        print("Usage: filter_addressbase_csv.py ./path/to/data.csv")
-        exit(-1)
-    main(sys.argv[1])
--- a/etl/filter_transform_mastermap_for_loading.sh
+++ b/etl/filter_transform_mastermap_for_loading.sh
@ -7,11 +7,6 @@

 mastermap_dir=$1

-#
-# Check which TOIDs are matched against UPRNs
-#
-# colouringlondon/bin/python check_ab_mm_match.py $addressbase_dir $mastermap_dir
-
 #
 # Filter
 # - WHERE descriptiveGroup = '(1:Building)'
--- a/etl/load_uprns.sh
+++ b/etl/load_uprns.sh
@ -1,36 +0,0 @@
-#!/usr/bin/env bash
-
-#
-# Load UPRNS from CSV to Postgres
-# - assume postgres connection details are set in the environment using PGUSER, PGHOST etc.
-#
-: ${1?"Usage: $0 ./path/to/addressbase/dir"}
-
-data_dir=$1
-
-#
-# Create 'building_properties' record with
-#     uprn: <uprn>,
-#     parent_uprn: <parent_uprn>,
-#     toid: <toid>,
-#     uprn_geom: <point>
-#
-find $data_dir -type f -name '*.3857.csv.loadable' \
-printf "$data_dir/%f\n" | \
-parallel \
-cat {} '|' psql -c "\"COPY building_properties ( uprn_geom, toid, uprn, parent_uprn ) FROM stdin WITH CSV HEADER;\""
-
-#
-# Create references
-#
-
-# index essential for speeed here
-psql -c "CREATE INDEX IF NOT EXISTS building_toid_idx ON buildings ( ref_toid );"
-# link to buildings
-psql -c "UPDATE building_properties
-SET building_id = (
-    SELECT b.building_id
-    FROM buildings as b
-    WHERE
-    building_properties.toid = b.ref_toid
-);"
--- a/etl/run_all.sh
+++ b/etl/run_all.sh
@ -3,13 +3,11 @@
 #
 # Extract, transform and load building outlines and property records
 #
-: ${1?"Usage: $0 ./path/to/addressbase/dir ./path/to/mastermap/dir ./path/to/boundary"}
-: ${2?"Usage: $0 ./path/to/addressbase/dir ./path/to/mastermap/dir ./path/to/boundary"}
-: ${3?"Usage: $0 ./path/to/addressbase/dir ./path/to/mastermap/dir ./path/to/boundary"}
+: ${1?"Usage: $0 ./path/to/mastermap/dir ./path/to/boundary"}
+: ${2?"Usage: $0 ./path/to/mastermap/dir ./path/to/boundary"}

-addressbase_dir=$1
-mastermap_dir=$2
-boundary_file=$3
+mastermap_dir=$1
+boundary_file=$2
 script_dir=${0%/*}

 #
@ -17,10 +15,9 @@ script_dir=${0%/*}
 #

 # extract both datasets
-$script_dir/extract_addressbase.sh $addressbase_dir
 $script_dir/extract_mastermap.sh $mastermap_dir
 # filter mastermap ('building' polygons and any others referenced by addressbase)
-$script_dir/filter_transform_mastermap_for_loading.sh $addressbase_dir $mastermap_dir
+$script_dir/filter_transform_mastermap_for_loading.sh $mastermap_dir

 #
 # Load
@ -33,7 +30,5 @@ psql < $script_dir/../migrations/002.index-geometries.up.sql
 $script_dir/drop_outside_limit.sh $boundary_file
 # create a building record per outline
 $script_dir/create_building_records.sh
-# add UPRNs where they match
-$script_dir/load_uprns.sh $addressbase_dir
-# index building records
-psql < $script_dir/../migrations/003.index-buildings.up.sql
+# Run remaining migrations
+ls $script_dir/../migrations/*.up.sql 2>/dev/null | while read -r migration; do psql < $migration; done;
--- a/etl/run_clean.sh
+++ b/etl/run_clean.sh
@ -3,11 +3,8 @@
 #
 # Filter and transform for loading
 #
-: ${1?"Usage: $0 ./path/to/addressbase/dir ./path/to/mastermap/dir"}
-: ${2?"Usage: $0 ./path/to/addressbase/dir ./path/to/mastermap/dir"}
+: ${1?"Usage: $0 ./path/to/mastermap/dir"}

-addressbase_dir=$1
-mastermap_dir=$2
+mastermap_dir=$1

-rm -f $addressbase_dir/*.{csv,gml,txt,filtered,gfs}
 rm -f $mastermap_dir/*.{csv,gml,txt,filtered,gfs}