From 3441bf88e2d74f2e04197abce172d126951f7d04 Mon Sep 17 00:00:00 2001 From: Tom Russell Date: Sat, 20 Oct 2018 18:37:02 +0100 Subject: [PATCH] Update load_data to use API --- etl/join_building_data/join-camden.json | 7 - etl/join_building_data/join-data.py | 162 --------------------- etl/join_building_data/join-fitzrovia.json | 6 - etl/join_building_data/load_data.py | 140 ++++++++++++++++++ 4 files changed, 140 insertions(+), 175 deletions(-) delete mode 100644 etl/join_building_data/join-camden.json delete mode 100644 etl/join_building_data/join-data.py delete mode 100644 etl/join_building_data/join-fitzrovia.json create mode 100644 etl/join_building_data/load_data.py diff --git a/etl/join_building_data/join-camden.json b/etl/join_building_data/join-camden.json deleted file mode 100644 index a761eb9b..00000000 --- a/etl/join_building_data/join-camden.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "crs": 27700, - "mapping": [ - ["Date_sou_1", "date_source", "lambda old_, new_: new_"], - ["Year_C", "date_year", "lambda old_, new_: int(new_)"] - ] -} \ No newline at end of file diff --git a/etl/join_building_data/join-data.py b/etl/join_building_data/join-data.py deleted file mode 100644 index 0e0ec6ec..00000000 --- a/etl/join_building_data/join-data.py +++ /dev/null @@ -1,162 +0,0 @@ -"""Join shapefile data to buildings - -- read through shapes -- locate building in current database (by centroid) -- update building with data -""" -import json -import os -import sys - -import fiona -import psycopg2 - -from shapely.geometry import shape - - -def main(source_file, config_path, transform_config_path): - """Load config, read files and save features to the database - """ - conf = read_config(config_path) - transform_config = read_config(transform_config_path) - data_mapping = [ - # from_fieldname, to_fieldname, mapping(old_val, source_val)->new_val - (from_, to_, eval(transform)) - for from_, to_, transform in transform_config['mapping'] - ] - dbconf = conf['database'] - conn = psycopg2.connect(**dbconf) - - with fiona.open(source_file, 'r') as source: - epsg_code = transform_config['crs'] - - with conn.cursor() as cur: - for feature in source: - geometry_id = find_geom(cur, feature, epsg_code) - if geometry_id is not None: - save_data( - cur, feature['properties'], data_mapping, geometry_id) - else: - print("Skipping", feature['properties']) - conn.commit() - conn.close() - - -def save_data(cur, props, data_conf, geometry_id): - """Save data to a building - """ - cur.execute( - """SELECT building_id, building_doc FROM buildings - WHERE - geometry_id = %s - """, ( - geometry_id, - ) - ) - building = cur.fetchone() - if building is None: - doc = update_from_props({}, props, data_conf) - cur.execute( - """INSERT INTO buildings - ( - building_doc, - geometry_id - ) - VALUES - ( - %s::jsonb, - %s - ) - """, ( - json.dumps(doc), - geometry_id - ) - ) - else: - building_id, old_doc = building - doc = update_from_props(old_doc, props, data_conf) - cur.execute( - """UPDATE buildings - SET - building_doc = %s::jsonb - WHERE - building_id = %s - """, ( - json.dumps(doc), - building_id - ) - ) - - -def find_geom(cur, feature, epsg_code): - """Find a building geometry - """ - # match on TOID - - # match on best intersection - wkb_hex = shape(feature['geometry']).wkb_hex - cur.execute( - """SELECT geometry_id, - ST_Area( - ST_Intersection( - ST_Transform( - ST_SetSRID(%s::geometry, %s), - 3857 - ), - geometry_geom - ) - ) as intersection_area - FROM geometries - WHERE - ST_Intersects( - ST_Transform( - ST_SetSRID(%s::geometry, %s), - 3857 - ), - geometry_geom - ) - ORDER BY intersection_area DESC - """, ( - wkb_hex, - epsg_code, - wkb_hex, - epsg_code - ) - ) - results = cur.fetchall() - if results: - # print(feature['properties']['fid'], "matched", len(results)) - return results[0] - else: - return results - - -def update_from_props(doc, props, mapping): - """Expect mapping to be a list of transforms - - from_fieldname (expect to find in source feature['properties']) - - to_fieldname (expect to create or find in existing doc) - - transform(old_val, new_val) function/lambda to do any processing - """ - for from_, to_, transform in mapping: - if to_ not in doc: - doc[to_] = None - doc[to_] = transform(doc[to_], props[from_]) - return doc - - -def read_config(config_path): - """Read a JSON config file containing database connection details - """ - with open(config_path, 'r') as fh: - conf = json.load(fh) - return conf - - -if __name__ == '__main__': - if len(sys.argv) != 4: - print( - "Usage: {} ./path/to/source/file.csv ./path/to/dbconfig.json ./path/to/mapping.json".format( - os.path.basename(__file__) - )) - exit() - main(sys.argv[1], sys.argv[2], sys.argv[3]) diff --git a/etl/join_building_data/join-fitzrovia.json b/etl/join_building_data/join-fitzrovia.json deleted file mode 100644 index d0646c80..00000000 --- a/etl/join_building_data/join-fitzrovia.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "crs": 27700, - "mapping": [ - ["Storeys", "size_storeys", "lambda old_, new_: int(new_) if new_ is not None else None"] - ] -} \ No newline at end of file diff --git a/etl/join_building_data/load_data.py b/etl/join_building_data/load_data.py new file mode 100644 index 00000000..28ac7eca --- /dev/null +++ b/etl/join_building_data/load_data.py @@ -0,0 +1,140 @@ +"""Join shapefile data to buildings + +This is effectively an example script using the HTTP API, tailored to particular collected +datasets for Camden (age data) and Fitzrovia (number of storeys). + +- read through shapes +- locate building by toid +- else locate building by representative point +- update building with data +""" +import json +import os +import sys +from functools import partial + +import fiona +import pyproj +import requests +from shapely.geometry import shape +from shapely.ops import transform + + +osgb_to_ll = partial( + pyproj.transform, + pyproj.Proj(init='epsg:27700'), + pyproj.Proj(init='epsg:4326') +) + + +def main(base_url, api_key, process, source_file): + """Read from file, update buildings + """ + with fiona.open(source_file, 'r') as source: + for feature in source: + props = feature['properties'] + + if process == "camden": + toid, data = process_camden(props) + else: + toid, data = process_fitzrovia(props) + + if data is None: + continue + + building_id = find_building(toid, feature['geometry'], base_url) + if not building_id: + print("no_match", toid, "-") + continue + + save_data(building_id, data, api_key, base_url) + + +def process_camden(props): + toid = osgb_toid(props['TOID']) + data = { + 'date_year': props['Year_C'], + 'date_source_detail': props['Date_sou_1'] + } + return toid, data + + +def process_fitzrovia(props): + toid = osgb_toid(props['TOID']) + storeys = props['Storeys'] + + if storeys is None: + return toid, None + + if props['Basement'] == 'Yes': + data = { + 'size_storeys_core': int(storeys) - 1, + 'size_storeys_basement': 1 + } + else: + data = { + 'size_storeys_core': int(storeys), + 'size_storeys_basement': 0 + } + return toid, data + + +def osgb_toid(toid): + if toid is None: + toid = "" + return "osgb" + toid.lstrip("0") + + +def save_data(building_id, data, api_key, base_url): + """Save data to a building + """ + r = requests.post( + "{}/building/{}.json?api_key={}".format(base_url, building_id, api_key), + json=data + ) + + +def find_building(toid, geom, base_url): + """Find building_id by TOID or location + """ + r = requests.get(base_url + "/buildings/reference", params={ + 'key': 'toid', + 'id': toid + }) + buildings = r.json() + if buildings and len(buildings) == 1: + bid = buildings[0]['building_id'] + print("match_by_toid", toid, bid) + return bid + + # try location + poly = shape(geom) + point_osgb = poly.centroid + if not poly.contains(point_osgb): + point_osgb = poly.representative_point() + + point_ll = transform(osgb_to_ll, point_osgb) + r = requests.get(base_url + "/buildings/locate", params={ + 'lng': point_ll.x, + 'lat': point_ll.y + }) + buildings = r.json() + if buildings and len(buildings) == 1: + bid = buildings[0]['building_id'] + print("match_by_location", toid, bid) + return bid + + return None + + +if __name__ == '__main__': + try: + url, api_key, process, filename = sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4] + except IndexError: + print( + "Usage: {} ./path/to/camden.shp".format( + os.path.basename(__file__) + )) + exit() + + main(url, api_key, process, filename)