Extract and load addressbase/mastermap using id, gml, parallel

Tom Russell 2018-09-25 19:20:41 +01:00
parent 7c3b9d222e
commit c6b3d3d5ca
7 changed files with 185 additions and 137 deletions


@@ -12,10 +12,12 @@ data_dir=$1
 # Unzip to GML
 #
-# find $data_dir -name '*.zip' -print0 | xargs -0 -P 4 -n 1 unzip
+find $data_dir -type f -name '*.zip' -printf "%f\n" | \
+parallel \
+unzip -u $data_dir/{} -d $data_dir
 #
-# Extract (subset) to CSV
+# Extract to CSV
 #
 # Relevant fields:
 #  WKT
@@ -26,15 +28,17 @@ data_dir=$1
 # logicalStatus: 1 (one) is approved (otherwise historical, provisional)
 #
-# find $data_dir -type f -name '*.gml' -printf "%f\n" | \
-# parallel \
-# ogr2ogr -f CSV \
-# -select crossReference,source,uprn,parentUPRN,logicalStatus \
-# {}.csv {} BasicLandPropertyUnit \
-# -lco GEOMETRY=AS_WKT
-rm $data_dir/*.gml
-find $data_dir -type f -name '*.gml.csv' -printf "$data_dir%f\n" | \
-parallel \
-python filter_addressbase_csv.py {}
+find $data_dir -type f -name '*.gml' -printf "%f\n" | \
+parallel \
+ogr2ogr -f CSV \
+-select crossReference,source,uprn,parentUPRN,logicalStatus \
+$data_dir/{}.csv $data_dir/{} BasicLandPropertyUnit \
+-lco GEOMETRY=AS_WKT
+
+#
+# Filter, grouping by TOID
+#
+find $data_dir -type f -name '*.gml.csv' -printf "%f\n" | \
+parallel \
+python filter_addressbase_csv.py $data_dir/{}
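
Note: filter_addressbase_csv.py itself is not touched by this commit, so its contents do not appear above. As rough orientation only, a per-tile filter of the kind the "Filter, grouping by TOID" step calls could look like the sketch below; the column names follow the ogr2ogr -select above, while the ',' list separator and the '7666MT' cross-reference source code for MasterMap TOIDs are assumptions.

import csv
import sys
from collections import defaultdict

def filter_tile(csv_path):
    """Group approved UPRNs by TOID and write <csv_path>.filtered (sketch only)."""
    uprns_by_toid = defaultdict(list)
    with open(csv_path, 'r') as fh:
        for line in csv.DictReader(fh):
            if line['logicalStatus'] != '1':
                continue  # 1 (one) is approved; drop historical/provisional
            # crossReference/source hold parallel lists; the ',' separator and
            # the '7666MT' source code for MasterMap TOIDs are assumptions
            refs = line['crossReference'].split(',')
            sources = line['source'].split(',')
            for ref, source in zip(refs, sources):
                if source == '7666MT':
                    uprns_by_toid[ref].append(line['uprn'])
    with open(csv_path + '.filtered', 'w') as out_fh:
        w = csv.writer(out_fh)
        w.writerow(('toid', 'uprns'))
        for toid, uprns in uprns_by_toid.items():
            w.writerow((toid, ';'.join(uprns)))

if __name__ == '__main__':
    filter_tile(sys.argv[1])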


@@ -1,5 +1,13 @@
 #!/usr/bin/env bash
+#
+# Extract MasterMap
+#
+: ${1?"Usage: $0 ./path/to/mastermap/dir"}
+data_dir=$1
 #
 # Extract buildings from *.gz to CSV
 #
@@ -9,12 +17,14 @@
 # Use `fid` as source ID, aka TOID.
 #
-: ${1?"Usage: $0 ./path/to/data/dir"}
-data_dir=$1
 find $data_dir -type f -name '*.gz' -printf "%f\n" | \
 parallel \
+gunzip $data_dir/{} -k -S gml
+rename 's/$/.gml/' $data_dir/*[^gzt]
+find $data_dir -type f -name '*.gml' -printf "%f\n" | \
+parallel \
 ogr2ogr \
 -select fid,descriptiveGroup \
 -f CSV $data_dir/{}.csv \
@@ -22,9 +32,4 @@ ogr2ogr \
 TopographicArea \
 -lco GEOMETRY=AS_WKT
-# then filter
-# -where "\"descriptiveGroup='(1:Building)'\"" \
-# OR toid in addressbase_toids
-# finally load
-# -t_srs "EPSG:3857" \
+rm $data_dir/*.gfs

etl/0_filter_for_loading.sh (new executable file)

@@ -0,0 +1,40 @@
#!/usr/bin/env bash
#
# Filter and transform for loading
#
: ${1?"Usage: $0 ./path/to/addressbase/dir ./path/to/mastermap/dir"}
: ${2?"Usage: $0 ./path/to/addressbase/dir ./path/to/mastermap/dir"}
addressbase_dir=$1
mastermap_dir=$2
#
# Check which TOIDs are matched against UPRNs
#
python check_ab_mm_match.py $addressbase_dir $mastermap_dir
#
# Filter
# - WHERE descriptiveGroup = '(1:Building)'
# - OR toid in addressbase_toids
#
python filter_mastermap.py $addressbase_dir $mastermap_dir
#
# Transform to 3857 (web mercator)
#
find $mastermap_dir -type f -name '*.filtered.csv' -printf "%f\n" | \
parallel \
ogr2ogr \
-f CSV $mastermap_dir/{}.3857.csv \
-select fid \
-s_srs "EPSG:27700" \
-t_srs "EPSG:3857" \
$mastermap_dir/{} \
-lco GEOMETRY=AS_WKT
#
# Update to EWKT (with SRID indicator for loading to Postgres)
#
sed -i 's/^"POLYGON/"SRID=3857;POLYGON/' $mastermap_dir/*.3857.csv
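
The sed step rewrites the leading WKT cell of every row in place. An equivalent sketch in Python (illustration only, not part of this commit) shows the intended result: a 'SRID=3857;' prefix so Postgres/PostGIS parses the value as EWKT.

import re

def to_ewkt(csv_line, srid=3857):
    # '"POLYGON ((...))",osgb...'  ->  '"SRID=3857;POLYGON ((...))",osgb...'
    return re.sub(r'^"POLYGON', '"SRID={};POLYGON'.format(srid), csv_line)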


@@ -1,105 +0,0 @@
"""Load geometries from GeoJSON to Postgres
- create 'geometry' record with {
id: <polygon-guid>,
doc: {source_id: <toid>},
geom: <geom-wkb_hex>
}
- create corresponding 'building' record with {
id: <building-guid>,
doc: {},
geom_id: <polygon-guid>
}
"""
import glob
import json
import os
import sys
import fiona
import psycopg2
from shapely.geometry import shape
def main(source_dir, config_path):
    """Load config, read files and save features to the database
    """
    conf = read_config(config_path)
    dbconf = conf['database']
    conn = psycopg2.connect(**dbconf)

    source_files = glob.glob("{}/*.geojson".format(source_dir))
    loaded = {}
    for source_file in source_files:
        with fiona.open(source_file, 'r') as source:
            with conn.cursor() as cur:
                for feature in source:
                    fid = feature['properties']['fid']
                    if fid not in loaded:
                        save_feature(cur, feature)
                        loaded[fid] = True
                    else:
                        print("Skipping", fid)
        conn.commit()
    conn.close()


def save_feature(cur, feature):
    """Save a feature with geometry and source id
    """
    cur.execute(
        """INSERT INTO geometries
        (
            geometry_doc,
            geometry_geom
        )
        VALUES
        (
            %s::jsonb,
            ST_SetSRID(%s::geometry, %s)
        )
        RETURNING geometry_id
        """, (
            json.dumps({
                'source_id': feature['properties']['fid']
            }),
            shape(feature['geometry']).wkb_hex,
            3857
        )
    )
    geom_id, = cur.fetchone()
    cur.execute(
        """INSERT INTO buildings
        (
            building_doc,
            geometry_id
        )
        VALUES
        (
            %s::jsonb,
            %s
        )
        """, (
            json.dumps({}),
            geom_id
        )
    )


def read_config(config_path):
    """Read a JSON config file containing database connection details
    """
    with open(config_path, 'r') as conf_fh:
        conf = json.load(conf_fh)
    return conf


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: {} ./path/to/source/dir ./path/to/dbconfig.json".format(
            os.path.basename(__file__)
        ))
        exit()
    main(sys.argv[1], sys.argv[2])

etl/1_load_geometries.sh (new executable file)

@@ -0,0 +1,40 @@
#!/usr/bin/env bash
#
# Load geometries from CSV to Postgres
# - assume postgres connection details are set in the environment using PGUSER, PGHOST etc.
#
: ${1?"Usage: $0 ./path/to/mastermap/dir"}
mastermap_dir=$1
#
# Create 'geometry' record with
# id: <polygon-guid>,
# source_id: <toid>,
# geom: <geom>
#
find $mastermap_dir -type f -name '*.3857.csv' \
-printf "COPY geometries ( geometry_geom, source_id ) FROM '$mastermap_dir/%f' WITH CSV HEADER;\n" | \
parallel \
psql -c {}
#
# Delete any duplicated geometries (by TOID)
#
psql -c "DELETE FROM geometries a USING (
SELECT MIN(ctid) as ctid, source_id
FROM geometries
GROUP BY source_id
HAVING COUNT(*) > 1
) b
WHERE a.source_id = b.source_id
AND a.ctid <> b.ctid;"
#
# Create corresponding 'building' record with
# id: <building-guid>,
# doc: {},
# geom_id: <polygon-guid>
#
psql -c "INSERT INTO buildings ( geometry_id ) SELECT geometry_id from geometries;"


@@ -11,17 +11,23 @@ csv.field_size_limit(sys.maxsize)
 def main(ab_path, mm_path):
     ab_paths = sorted(glob.glob(os.path.join(ab_path, "*.gml.csv.filtered")))
-    mm_paths = sorted(glob.glob(os.path.join(mm_path, "*.gz.csv")))
+    mm_paths = sorted(glob.glob(os.path.join(mm_path, "*.gml.csv")))
+    try:
+        assert len(ab_paths) == len(mm_paths)
+    except AssertionError:
+        print(ab_paths)
+        print(mm_paths)
+        assert len(ab_paths) == len(mm_paths)
     zipped_paths = zip(ab_paths, mm_paths)
+    # parallel map over tiles
     with Pool(4) as p:
         p.starmap(check, zipped_paths)


 def check(ab_path, mm_path):
     tile = str(os.path.basename(ab_path)).split(".")[0]
-    print(tile)
+    output_base = os.path.dirname(ab_path)
     ab_toids = set()
     mm_toids = set()
@@ -35,22 +41,20 @@ def check(ab_path, mm_path):
         for line in r:
             mm_toids.add(line['fid'])

-    print("MasterMap", len(mm_toids))
-    print("Addressbase", len(ab_toids))
     missing = ab_toids - mm_toids
-    print("in AB but not MM", len(missing))
+    print(tile, "MasterMap:", len(mm_toids), "Addressbase:", len(ab_toids), "AB but not MM:", len(missing))

-    with open('missing_toids_{}.txt'.format(tile), 'w') as fh:
+    with open(os.path.join(output_base, 'missing_toids_{}.txt'.format(tile)), 'w') as fh:
         for toid in missing:
             fh.write("{}\n".format(toid))

-    with open('ab_toids_{}.txt'.format(tile), 'w') as fh:
+    with open(os.path.join(output_base, 'ab_toids_{}.txt'.format(tile)), 'w') as fh:
         for toid in ab_toids:
             fh.write("{}\n".format(toid))


 if __name__ == '__main__':
     if len(sys.argv) != 3:
-        print("Usage: filter_addressbase_csv.py ./path/to/addressbase/dir ./path/to/mastermap/dir")
+        print("Usage: check_ab_mm_match.py ./path/to/addressbase/dir ./path/to/mastermap/dir")
         exit(-1)
     main(sys.argv[1], sys.argv[2])

etl/filter_mastermap.py (new file)

@@ -0,0 +1,60 @@
"""Filter MasterMap to buildings and addressbase-matches
- WHERE descriptiveGroup = '(1:Building)'
- OR toid in addressbase_toids
"""
import csv
import glob
import json
import os
import sys
from multiprocessing import Pool
csv.field_size_limit(sys.maxsize)
def main(ab_path, mm_path):
    mm_paths = sorted(glob.glob(os.path.join(mm_path, "*.gml.csv")))
    toid_paths = sorted(glob.glob(os.path.join(ab_path, "ab_toids_*.txt")))
    try:
        assert len(mm_paths) == len(toid_paths)
    except AssertionError:
        print(mm_paths)
        print(toid_paths)
        raise  # re-raise rather than silently zip mismatched lists of tiles
    zipped_paths = zip(mm_paths, toid_paths)
    # parallel map over tiles
    with Pool(4) as p:
        p.starmap(filter, zipped_paths)
def filter(mm_path, toid_path):
    with open(toid_path, 'r') as fh:
        r = csv.reader(fh)
        toids = set(line[0] for line in r)

    output_path = "{}.filtered.csv".format(str(mm_path).replace(".gml.csv", ""))
    alt_output_path = "{}.filtered_not_building.csv".format(str(mm_path).replace(".gml.csv", ""))
    output_fieldnames = ('WKT', 'fid', 'descriptiveGroup')
    with open(mm_path, 'r') as fh:
        r = csv.DictReader(fh)
        with open(output_path, 'w') as output_fh:
            w = csv.DictWriter(output_fh, fieldnames=output_fieldnames)
            w.writeheader()
            with open(alt_output_path, 'w') as alt_output_fh:
                alt_w = csv.DictWriter(alt_output_fh, fieldnames=output_fieldnames)
                alt_w.writeheader()
                for line in r:
                    if 'Building' in line['descriptiveGroup']:
                        w.writerow(line)
                    elif line['fid'] in toids:
                        alt_w.writerow(line)
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("Usage: filter_mastermap.py ./path/to/addressbase/dir ./path/to/mastermap/dir")
        exit(-1)
    main(sys.argv[1], sys.argv[2])