remove addressbase from all steps and reorder readme
This commit is contained in:
parent f55ce63d84
commit 3653e30362
@@ -6,7 +6,6 @@ The scripts in this directory are used to extract, transform and load (ETL) the
 for Colouring London:
 
 Building geometries, sourced from Ordnance Survey (OS) MasterMap (Topography Layer)
-<!-- 1. Unique Property Reference Numbers (UPRNs), sourced from Ordnance Survey AddressBase -->
 
 To get the required datasets, you'll need to complete the following steps:
 
@@ -17,10 +16,7 @@ To get the required datasets, you'll need to complete the following steps:
 ![](screenshot/MasterMap.png)
 <p></p>
 
-<!-- ![](screenshot/AddressBase.png) -->
-
 4. You should be then able to check out your basket and download the files. Note: there may be multiple `.zip` files to download for MasterMap due to the size of the dataset.
-<!-- 5. Unzip the AddressBase `.zip` in a convenient location. We will use the unzipped folder in later steps. Rename the folder as appropriate (make sure this folder doesn't contain the original `.zip` file). Note: this folder also contains `.zip` files, do not unzip at this stage as a script will do this later. -->
 6. Unzip the MasterMap `.zip` files and move all the `.gz` files from each to a single folder in a convenient location. We will use this folder in later steps.
 
 ## Prerequisites
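Step 6 above is straightforward to script. A minimal sketch, assuming the downloaded archives sit in `~/Downloads` and the target folder is `~/mastermap` (both paths are illustrative, not taken from the repository):

```bash
# Gather all MasterMap .gz tiles into one flat folder (paths are examples only)
mkdir -p ~/mastermap ~/mastermap_unzipped
for z in ~/Downloads/*.zip; do
  unzip -o "$z" -d ~/mastermap_unzipped
done
# the later ETL steps expect the .gz files together in a single directory
find ~/mastermap_unzipped -name '*.gz' -exec mv {} ~/mastermap/ \;
```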
@@ -60,14 +56,6 @@ cd ~/colouring-london/etl
 chmod +x *.sh
 ```
 
-<!-- Extract the addressBase dataset.
-
-```bash
-./extract_addressbase.sh ./addressbase_dir
-``` -->
-
-<!-- ERROR 1: Couldn't fetch requested layer 'BasicLandPropertyUnit'! -->
-
 Extract the MasterMap data (this step could take a while).
 
 ```bash
@@ -104,6 +92,14 @@ Index geometries.
 psql < ../migrations/002.index-geometries.up.sql
 ```
 
+TODO: Drop outside limit.
+
+<!-- But what is the boundary file? -->
+
+```bash
+./drop_outside_limit.sh ./path/to/boundary_file
+````
+
 Create a building record per outline.
 
 <!-- I had to edit the below file to set the psql vars before running -->
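`drop_outside_limit.sh` itself is not shown in this diff, so the following is only a guess at the kind of operation it performs: a sketch that deletes geometries falling outside a boundary polygon, assuming the boundary file has been loaded into a `boundary` table (all table and column names here are hypothetical):

```bash
# Hypothetical sketch only: 'geometries', 'geometry_geom', 'boundary' and
# 'wkb_geometry' are placeholder names, not taken from the actual script.
psql -c "DELETE FROM geometries
         WHERE NOT ST_Intersects(
           geometry_geom,
           (SELECT ST_Union(wkb_geometry) FROM boundary)
         );"
```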
@@ -114,29 +110,12 @@ Create a building record per outline.
 
 <!-- Insert 0.... -->
 
-Add UPRNs where they match.
-
-<!-- I had to edit the below file to set the psql vars before running -->
-
-```bash
-./load_uprns.sh ./addressbase_dir
-````
-
 Run the remaining migrations in `../migrations` to create the rest of the database structure.
 
 ```bash
 ls ~/colouring-london/migrations/*.up.sql 2>/dev/null | while read -r migration; do psql < $migration; done;
 ```
 
-TODO: Drop outside limit.
-
-<!-- But what is the bounddary file? -->
-
-```bash
-./drop_outside_limit.sh ./path/to/boundary_file
-````
-
 
 # [WIP] Updating the Colouring London database with new OS data
 
 TODO: this section should instruct how to update and existing db
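Several comments in this README (and the deleted `load_uprns.sh`) assume the standard libpq environment variables are already set, so `psql` can connect without flags. A minimal sketch with placeholder values:

```bash
# Placeholder connection details; substitute your own before running the scripts
export PGHOST=localhost
export PGPORT=5432
export PGDATABASE=colouringlondon
export PGUSER=cl_user
export PGPASSWORD='change-me'
# with these set, the migration loop above runs unmodified
ls ~/colouring-london/migrations/*.up.sql 2>/dev/null | while read -r migration; do psql < $migration; done;
```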
@@ -1,60 +0,0 @@
-"""Check if AddressBase TOIDs will match MasterMap
-"""
-import csv
-import glob
-import os
-import sys
-
-from multiprocessing import Pool
-
-csv.field_size_limit(sys.maxsize)
-
-def main(ab_path, mm_path):
-    ab_paths = sorted(glob.glob(os.path.join(ab_path, "*.gml.csv.filtered.csv")))
-    mm_paths = sorted(glob.glob(os.path.join(mm_path, "*.gml.csv")))
-
-    try:
-        assert len(ab_paths) == len(mm_paths)
-    except AssertionError:
-        print(ab_paths)
-        print(mm_paths)
-
-    zipped_paths = zip(ab_paths, mm_paths)
-
-    # parallel map over tiles
-    with Pool() as p:
-        p.starmap(check, zipped_paths)
-
-def check(ab_path, mm_path):
-    tile = str(os.path.basename(ab_path)).split(".")[0]
-    output_base = os.path.dirname(ab_path)
-    ab_toids = set()
-    mm_toids = set()
-
-    with open(ab_path, 'r') as fh:
-        r = csv.DictReader(fh)
-        for line in r:
-            ab_toids.add(line['toid'])
-
-    with open(mm_path, 'r') as fh:
-        r = csv.DictReader(fh)
-        for line in r:
-            mm_toids.add(line['fid'])
-
-    missing = ab_toids - mm_toids
-    print(tile, "MasterMap:", len(mm_toids), "Addressbase:", len(ab_toids), "AB but not MM:", len(missing))
-
-    with open(os.path.join(output_base, 'missing_toids_{}.txt'.format(tile)), 'w') as fh:
-        for toid in missing:
-            fh.write("{}\n".format(toid))
-
-    with open(os.path.join(output_base, 'ab_toids_{}.txt'.format(tile)), 'w') as fh:
-        for toid in ab_toids:
-            fh.write("{}\n".format(toid))
-
-
-if __name__ == '__main__':
-    if len(sys.argv) != 3:
-        print("Usage: check_ab_mm_match.py ./path/to/addressbase/dir ./path/to/mastermap/dir")
-        exit(-1)
-    main(sys.argv[1], sys.argv[2])
@@ -1,63 +0,0 @@
-#!/usr/bin/env bash
-
-#
-# Extract address points from OS Addressbase GML
-# - as supplied in 5km tiles, zip/gz archives
-#
-: ${1?"Usage: $0 ./path/to/data/dir"}
-
-data_dir=$1
-
-#
-# Unzip to GML
-#
-
-find $data_dir -type f -name '*.zip' -printf "%f\n" | \
-    parallel \
-    unzip -u $data_dir/{} -d $data_dir
-
-#
-# Extract to CSV
-#
-# Relevant fields:
-# WKT
-# crossReference (list of TOID/other references)
-# source (list of cross-reference sources: 7666MT refers to MasterMap Topo)
-# uprn
-# parentUPRN
-# logicalStatus: 1 (one) is approved (otherwise historical, provisional)
-#
-
-find $data_dir -type f -name '*.gml' -printf "%f\n" | \
-    parallel \
-    ogr2ogr -f CSV \
-        -select crossReference,source,uprn,parentUPRN,logicalStatus \
-        $data_dir/{}.csv $data_dir/{} BasicLandPropertyUnit \
-        -lco GEOMETRY=AS_WKT
-
-#
-# Filter
-#
-find $data_dir -type f -name '*.gml.csv' -printf "%f\n" | \
-    parallel \
-    colouringlondon/bin/python filter_addressbase_csv.py $data_dir/{}
-
-
-#
-# Transform to 3857 (web mercator)
-#
-find $data_dir -type f -name '*.filtered.csv' -printf "%f\n" | \
-    parallel \
-    ogr2ogr \
-        -f CSV $data_dir/{}.3857.csv \
-        -s_srs "EPSG:4326" \
-        -t_srs "EPSG:3857" \
-        $data_dir/{} \
-        -lco GEOMETRY=AS_WKT
-
-#
-# Update to EWKT (with SRID indicator for loading to Postgres)
-#
-find $data_dir -type f -name '*.3857.csv' -printf "%f\n" | \
-    parallel \
-    cat $data_dir/{} "|" sed "'s/^\"POINT/\"SRID=3857;POINT/'" "|" cut -f 1,3,4,5 -d "','" ">" $data_dir/{}.loadable
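The final pipeline in the deleted script is dense: `sed` rewrites the leading WKT into EWKT with an explicit SRID, and `cut` drops one column. A made-up row run through the same two commands illustrates the effect (field names and values are invented):

```bash
# Illustrative only: fake row standing in for a line of the intermediate CSV
echo '"POINT (-11000 6710000)",old_wkt,toid_value,uprn_value,parent_uprn_value' \
  | sed 's/^"POINT/"SRID=3857;POINT/' \
  | cut -d ',' -f 1,3,4,5
# prints: "SRID=3857;POINT (-11000 6710000)",toid_value,uprn_value,parent_uprn_value
```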
@@ -1,42 +0,0 @@
-#!/usr/bin/env python
-"""Read ogr2ogr-converted CSV, filter to get OSMM TOID reference, only active addresses
-"""
-import csv
-import json
-import sys
-
-
-def main(input_path):
-    output_path = "{}.filtered.csv".format(input_path)
-    fieldnames = (
-        'wkt', 'toid', 'uprn', 'parent_uprn'
-    )
-    with open(input_path) as input_fh:
-        with open(output_path, 'w') as output_fh:
-            w = csv.DictWriter(output_fh, fieldnames=fieldnames)
-            w.writeheader()
-            r = csv.DictReader(input_fh)
-            for line in r:
-                if line['logicalStatus'] != "1":
-                    continue
-
-                refs = json.loads(line['crossReference'])
-                sources = json.loads(line['source'])
-                toid = ""
-                for ref, source in zip(refs, sources):
-                    if source == "7666MT":
-                        toid = ref
-
-                w.writerow({
-                    'uprn': line['uprn'],
-                    'parent_uprn': line['parentUPRN'],
-                    'toid': toid,
-                    'wkt': line['WKT'],
-                })
-
-
-if __name__ == '__main__':
-    if len(sys.argv) != 2:
-        print("Usage: filter_addressbase_csv.py ./path/to/data.csv")
-        exit(-1)
-    main(sys.argv[1])
@@ -7,11 +7,6 @@
 
 mastermap_dir=$1
 
-#
-# Check which TOIDs are matched against UPRNs
-#
-# colouringlondon/bin/python check_ab_mm_match.py $addressbase_dir $mastermap_dir
-
 #
 # Filter
 # - WHERE descriptiveGroup = '(1:Building)'
@@ -1,36 +0,0 @@
-#!/usr/bin/env bash
-
-#
-# Load UPRNS from CSV to Postgres
-# - assume postgres connection details are set in the environment using PGUSER, PGHOST etc.
-#
-: ${1?"Usage: $0 ./path/to/addressbase/dir"}
-
-data_dir=$1
-
-#
-# Create 'building_properties' record with
-#   uprn: <uprn>,
-#   parent_uprn: <parent_uprn>,
-#   toid: <toid>,
-#   uprn_geom: <point>
-#
-find $data_dir -type f -name '*.3857.csv.loadable' \
-    -printf "$data_dir/%f\n" | \
-    parallel \
-    cat {} '|' psql -c "\"COPY building_properties ( uprn_geom, toid, uprn, parent_uprn ) FROM stdin WITH CSV HEADER;\""
-
-#
-# Create references
-#
-
-# index essential for speeed here
-psql -c "CREATE INDEX IF NOT EXISTS building_toid_idx ON buildings ( ref_toid );"
-# link to buildings
-psql -c "UPDATE building_properties
-SET building_id = (
-    SELECT b.building_id
-    FROM buildings as b
-    WHERE
-    building_properties.toid = b.ref_toid
-);"
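After the `COPY` and the linking `UPDATE` in the deleted script, a quick sanity check is to count how many property records actually received a `building_id`. This query is not part of the script, just an illustrative check using the table and column names it defines:

```bash
# Illustrative check: how many UPRN records were linked to a building outline
psql -c "SELECT count(*) FILTER (WHERE building_id IS NOT NULL) AS linked,
                count(*) FILTER (WHERE building_id IS NULL)     AS unlinked
         FROM building_properties;"
```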
@@ -3,13 +3,11 @@
 #
 # Extract, transform and load building outlines and property records
 #
-: ${1?"Usage: $0 ./path/to/addressbase/dir ./path/to/mastermap/dir ./path/to/boundary"}
-: ${2?"Usage: $0 ./path/to/addressbase/dir ./path/to/mastermap/dir ./path/to/boundary"}
-: ${3?"Usage: $0 ./path/to/addressbase/dir ./path/to/mastermap/dir ./path/to/boundary"}
+: ${1?"Usage: $0 ./path/to/mastermap/dir ./path/to/boundary"}
+: ${2?"Usage: $0 ./path/to/mastermap/dir ./path/to/boundary"}
 
-addressbase_dir=$1
-mastermap_dir=$2
-boundary_file=$3
+mastermap_dir=$1
+boundary_file=$2
 
 script_dir=${0%/*}
 
 #
@@ -17,10 +15,9 @@ script_dir=${0%/*}
 #
 
 # extract both datasets
-$script_dir/extract_addressbase.sh $addressbase_dir
 $script_dir/extract_mastermap.sh $mastermap_dir
 # filter mastermap ('building' polygons and any others referenced by addressbase)
-$script_dir/filter_transform_mastermap_for_loading.sh $addressbase_dir $mastermap_dir
+$script_dir/filter_transform_mastermap_for_loading.sh $mastermap_dir
 
 #
 # Load
@@ -33,7 +30,5 @@ psql < $script_dir/../migrations/002.index-geometries.up.sql
 $script_dir/drop_outside_limit.sh $boundary_file
 # create a building record per outline
 $script_dir/create_building_records.sh
-# add UPRNs where they match
-$script_dir/load_uprns.sh $addressbase_dir
-# index building records
-psql < $script_dir/../migrations/003.index-buildings.up.sql
+# Run remaining migrations
+ls $script_dir/../migrations/*.up.sql 2>/dev/null | while read -r migration; do psql < $migration; done;
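With these changes the wrapper script drops its AddressBase argument and takes two parameters instead of three. The diff does not show the script's filename, so the invocation below uses a hypothetical name:

```bash
# 'etl_wrapper.sh' is a stand-in name; the real filename is not shown in this diff
./etl_wrapper.sh ./path/to/mastermap/dir ./path/to/boundary
```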
@@ -3,11 +3,8 @@
 #
 # Filter and transform for loading
 #
-: ${1?"Usage: $0 ./path/to/addressbase/dir ./path/to/mastermap/dir"}
-: ${2?"Usage: $0 ./path/to/addressbase/dir ./path/to/mastermap/dir"}
+: ${1?"Usage: $0 ./path/to/mastermap/dir"}
 
-addressbase_dir=$1
-mastermap_dir=$2
+mastermap_dir=$1
 
-rm -f $addressbase_dir/*.{csv,gml,txt,filtered,gfs}
 rm -f $mastermap_dir/*.{csv,gml,txt,filtered,gfs}