Merge pull request #1224 from colouring-cities/fix/etl-test-polygons-1223

Update get_test_polygons.py for osmnx 1.6.0
Mike Simpson 2023-08-15 15:14:21 +01:00 committed by GitHub
commit fd11836bc8
14 changed files with 447 additions and 369 deletions
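
The headline change updates get_test_polygons.py for osmnx 1.6.0: the old footprints call (osmnx 0.13, the previous pin) is gone, replaced by the features module, which takes an explicit OSM tags filter. Most of the rest of the diff is mechanical restyling (double quotes, wrapped call arguments, identity comparisons with None), consistent with running an autoformatter such as black. A minimal sketch of the API change, assuming osmnx >= 1.6:

import osmnx

# before (osmnx 0.13): gdf = osmnx.footprints_from_point(point=point, dist=dist)
# after (osmnx 1.6): pass an explicit tags filter to the features module
point = (51.524498, -0.133874)  # lat/lon near UCL, as in the script below
tags = {"building": True}       # keep only building features
gdf = osmnx.features_from_point(point, tags, dist=612)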

View File

@@ -5,21 +5,20 @@ jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: '3.7'
- name: Install dependencies
run: |
sudo apt-get install libgeos-dev
python -m pip install --upgrade pip
python -m pip install pytest
python -m pip install flake8
python -m pip install -r etl/requirements.txt
- name: Run Flake8
run: |
ls etl/*py | grep -v 'join_building_data' | xargs flake8 --exclude etl/__init__.py
- name: Run tests
run: |
python -m pytest
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: "3.11"
- name: Install dependencies
run: |
sudo apt-get install libgeos-dev
python -m pip install --upgrade pip
python -m pip install pytest
python -m pip install flake8
python -m pip install -r etl/requirements.txt
- name: Run Flake8
run: |
flake8 etl --ignore=E501
- name: Run tests
run: |
python -m pytest

View File

@@ -1 +1,3 @@
from .filter_mastermap import filter_mastermap
from .filter_mastermap import filter_mastermap
__all__ = ["filter_mastermap"]
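
The added __all__ makes the re-export explicit: star-imports from the package now expose exactly the listed names, and pyflakes stops flagging the import itself as unused (F401). Hypothetical usage, assuming this is the etl package's __init__.py:

from etl import *  # brings in only the names listed in __all__

filter_mastermap("./path/to/mastermap.gml.csv")  # hypothetical path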

View File

@@ -20,24 +20,24 @@ def main(mastermap_path):
def filter_mastermap(mm_path):
output_path = str(mm_path).replace(".gml.csv", "")
output_path = "{}.filtered.csv".format(output_path)
output_fieldnames = ('WKT', 'fid', 'descriptiveGroup')
output_fieldnames = ("WKT", "fid", "descriptiveGroup")
# Open the input csv with all polygons, buildings and others
with open(mm_path, 'r') as fh:
with open(mm_path, "r") as fh:
r = csv.DictReader(fh)
# Open a new output csv that will contain just buildings
with open(output_path, 'w') as output_fh:
with open(output_path, "w") as output_fh:
w = csv.DictWriter(output_fh, fieldnames=output_fieldnames)
w.writeheader()
for line in r:
try:
if 'Building' in line['descriptiveGroup']:
if "Building" in line["descriptiveGroup"]:
w.writerow(line)
# when descriptiveGroup is missing, ignore this Polygon
except TypeError:
pass
if __name__ == '__main__':
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: filter_mastermap.py ./path/to/mastermap/dir")
exit(-1)
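
The except TypeError in the loop above works because a membership test against None raises instead of returning False, so rows with no descriptiveGroup are skipped cleanly. A minimal illustration of that behaviour:

line = {"descriptiveGroup": None}  # row where the field is missing
try:
    if "Building" in line["descriptiveGroup"]:
        print("keep")
except TypeError:
    pass  # "Building" in None raises TypeError, so the polygon is ignored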

View File

@@ -21,43 +21,49 @@ size = 256
# load buildings from about 1.5km² around UCL
point = (51.524498, -0.133874)
dist = 612
gdf = osmnx.footprints_from_point(point=point, dist=dist)
tags = {"building": True}
gdf = osmnx.features_from_point(point, tags, dist=dist)
# preview image
gdf_proj = osmnx.projection.project_gdf(gdf, to_crs={'init': 'epsg:3857'})
gdf_proj = gdf_proj[gdf_proj.geometry.apply(lambda g: g.geom_type != 'MultiPolygon')] # noqa
gdf_proj = osmnx.projection.project_gdf(gdf, to_crs={"init": "epsg:3857"})
gdf_proj = gdf_proj[gdf_proj.geometry.type == "Polygon"]
fig, ax = osmnx.plot_footprints(gdf_proj, bgcolor='#333333',
color='w', figsize=(4, 4),
save=True, show=False, close=True,
filename='test_buildings_preview', dpi=600)
fig, ax = osmnx.plot_footprints(
gdf_proj,
bgcolor="#333333",
color="w",
figsize=(4, 4),
save=True,
show=False,
close=True,
filepath="test_buildings_preview.png",
dpi=600,
)
# save
test_dir = os.path.dirname(__file__)
test_data_geojson = str(os.path.join(test_dir, 'test_buildings.geojson'))
test_data_geojson = str(os.path.join(test_dir, "test_buildings.geojson"))
subprocess.run(["rm", test_data_geojson])
gdf_to_save = gdf_proj.reset_index()[["osmid", "geometry"]]
gdf_to_save = gdf_proj.reset_index(
)[
['index', 'geometry']
]
gdf_to_save.rename(
columns={'index': 'fid'}
).to_file(
test_data_geojson, driver='GeoJSON'
gdf_to_save.rename(columns={"osmid": "fid"}).to_file(
test_data_geojson, driver="GeoJSON"
)
# convert to CSV
test_data_csv = str(os.path.join(test_dir, 'test_buildings.3857.csv'))
test_data_csv = str(os.path.join(test_dir, "test_buildings.3857.csv"))
subprocess.run(["rm", test_data_csv])
subprocess.run(
["ogr2ogr", "-f", "CSV", test_data_csv,
test_data_geojson, "-lco", "GEOMETRY=AS_WKT"]
[
"ogr2ogr",
"-f",
"CSV",
test_data_csv,
test_data_geojson,
"-lco",
"GEOMETRY=AS_WKT",
]
)
# add SRID for ease of loading to PostgreSQL
subprocess.run(
["sed", "-i", "s/^\"POLYGON/\"SRID=3857;POLYGON/",
test_data_csv]
)
subprocess.run(["sed", "-i", 's/^"POLYGON/"SRID=3857;POLYGON/', test_data_csv])

View File

@@ -17,7 +17,6 @@ Then with this script:
"""
import json
import csv
import os
import subprocess
@@ -28,50 +27,49 @@ from tqdm import tqdm
def main(base_url, api_key, source_file):
"""Read from file, update buildings
"""
with open(source_file, 'r') as source_fh:
"""Read from file, update buildings"""
with open(source_file, "r") as source_fh:
source = csv.DictReader(source_fh)
for feature in tqdm(source, total=line_count(source_file)):
building_id, data = process_ca(feature)
if building_id and building_id != 'building_id':
if building_id and building_id != "building_id":
save_data(building_id, data, api_key, base_url)
def line_count(fname):
"""Count lines - relies on 'wc'
"""
p = subprocess.run(['wc', '-l', fname], stdout=subprocess.PIPE)
"""Count lines - relies on 'wc'"""
p = subprocess.run(["wc", "-l", fname], stdout=subprocess.PIPE)
if p.returncode != 0:
raise IOError(err)
raise IOError(p.returncode)
return int(p.stdout.strip().split()[0])
def process_ca(props):
building_id = props['building_id']
building_id = props["building_id"]
data = {
'planning_in_conservation_area': True,
'planning_conservation_area_name': props['conservation_area_name']
"planning_in_conservation_area": True,
"planning_conservation_area_name": props["conservation_area_name"],
}
return building_id, data
def save_data(building_id, data, api_key, base_url):
"""Save data to a building
"""
r = requests.post(
"""Save data to a building"""
requests.post(
"{}/buildings/{}.json?api_key={}".format(base_url, building_id, api_key),
json=data
json=data,
)
if __name__ == '__main__':
if __name__ == "__main__":
try:
url, api_key, filename = sys.argv[1], sys.argv[2], sys.argv[3]
except IndexError:
print(
"Usage: {} <URL> <api_key> ./path/to/conservation_areas.csv".format(
os.path.basename(__file__)
))
os.path.basename(__file__)
)
)
exit()
main(url, api_key, filename)
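
For reference, one CSV row flowing through process_ca, with illustrative values:

row = {"building_id": "1234", "conservation_area_name": "Bloomsbury"}
building_id, data = process_ca(row)
# building_id == "1234"
# data == {"planning_in_conservation_area": True,
#          "planning_conservation_area_name": "Bloomsbury"}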

View File

@@ -44,8 +44,6 @@ TODO extend to allow latitude,longitude or easting,northing columns and lookup b
"""
import csv
import json
import os
import sys
import argparse
import requests
@@ -53,9 +51,8 @@ from retrying import retry
def main(base_url, api_key, source_file, json_columns, no_overwrite=False, debug=False):
"""Read from file, update buildings
"""
with open(source_file, 'r') as source:
"""Read from file, update buildings"""
with open(source_file, "r") as source:
reader = csv.DictReader(source)
for line in reader:
building_id = find_building(line, base_url)
@@ -64,78 +61,86 @@ def main(base_url, api_key, source_file, json_columns, no_overwrite=False, debug
if building_id is None:
continue
if 'sust_dec' in line and line['sust_dec'] == '':
del line['sust_dec']
if "sust_dec" in line and line["sust_dec"] == "":
del line["sust_dec"]
if no_overwrite:
try:
if check_data_present(building_id, line.keys(), base_url):
print(f'Building {building_id}: Not updating to avoid overwriting existing data')
print(
f"Building {building_id}: Not updating to avoid overwriting existing data"
)
continue
except ApiRequestError as e:
print(f'Error checking existing data for building {building_id}: status {e.code}, data: {e.data}')
print(
f"Error checking existing data for building {building_id}: status {e.code}, data: {e.data}"
)
raise
response_code, response_data = update_building(building_id, line, api_key, base_url)
response_code, response_data = update_building(
building_id, line, api_key, base_url
)
if response_code != 200:
print('ERROR', building_id, response_code, response_data)
print("ERROR", building_id, response_code, response_data)
elif debug:
print('DEBUG', building_id, response_code, response_data)
print("DEBUG", building_id, response_code, response_data)
class ApiRequestError(Exception):
def __init__(self, code, data, message=''):
def __init__(self, code, data, message=""):
self.code = code
self.data = data
super().__init__(message)
def check_data_present(building_id, fields, base_url):
response_code, current_state = get_building(building_id, base_url)
if response_code != 200:
raise ApiRequestError(response_code, current_state)
else:
id_fields = set(['building_id', 'toid', 'uprn'])
id_fields = set(["building_id", "toid", "uprn"])
field_names_without_ids = [k for k in fields if k not in id_fields]
return any([current_state.get(k, None) != None for k in field_names_without_ids])
return any(
[current_state.get(k, None) is not None for k in field_names_without_ids]
)
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def get_building(building_id, base_url):
"""Get data for a building
"""
"""Get data for a building"""
r = requests.get(f"{base_url}/api/buildings/{building_id}.json")
return r.status_code, r.json()
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def update_building(building_id, data, api_key, base_url):
"""Save data to a building
"""
"""Save data to a building"""
r = requests.post(
"{}/api/buildings/{}.json".format(base_url, building_id),
params={'api_key': api_key},
json=data
params={"api_key": api_key},
json=data,
)
return r.status_code, r.json()
def find_building(data, base_url):
if 'building_id' in data:
building_id = data['building_id']
if "building_id" in data:
building_id = data["building_id"]
if building_id is not None:
print("match_by_building_id", building_id)
return building_id
if 'toid' in data:
building_id = find_by_reference(base_url, 'toid', data['toid'])
if "toid" in data:
building_id = find_by_reference(base_url, "toid", data["toid"])
if building_id is not None:
print("match_by_toid", data['toid'], building_id)
print("match_by_toid", data["toid"], building_id)
return building_id
if 'uprn' in data:
building_id = find_by_reference(base_url, 'uprn', data['uprn'])
if "uprn" in data:
building_id = find_by_reference(base_url, "uprn", data["uprn"])
if building_id is not None:
print("match_by_uprn", data['uprn'], building_id)
print("match_by_uprn", data["uprn"], building_id)
return building_id
print("no_match", data)
@@ -144,21 +149,21 @@ def find_building(data, base_url):
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def find_by_reference(base_url, ref_key, ref_id):
"""Find building_id by TOID or UPRN
"""
r = requests.get("{}/api/buildings/reference".format(base_url), params={
'key': ref_key,
'id': ref_id
})
"""Find building_id by TOID or UPRN"""
r = requests.get(
"{}/api/buildings/reference".format(base_url),
params={"key": ref_key, "id": ref_id},
)
buildings = r.json()
if buildings and 'error' not in buildings and len(buildings) == 1:
building_id = buildings[0]['building_id']
if buildings and "error" not in buildings and len(buildings) == 1:
building_id = buildings[0]["building_id"]
else:
building_id = None
return building_id
def parse_json_columns(row, json_columns):
for col in json_columns:
row[col] = json.loads(row[col])
@@ -167,28 +172,41 @@ def parse_json_columns(row, json_columns):
def list_str(values):
return values.split(',')
return values.split(",")
if __name__ == '__main__':
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('url', help='URL for the app')
parser.add_argument('api_key', help='API key for the user')
parser.add_argument('path', help='Path to data CSV file')
parser.add_argument('json_columns',
nargs='?',
parser.add_argument("url", help="URL for the app")
parser.add_argument("api_key", help="API key for the user")
parser.add_argument("path", help="Path to data CSV file")
parser.add_argument(
"json_columns",
nargs="?",
type=list_str,
default=[],
help='A comma-separated list of columns which should be parsed as JSON')
help="A comma-separated list of columns which should be parsed as JSON",
)
parser.add_argument('--no-overwrite', '-n',
action='store_true',
dest='no_overwrite',
help='Don\'t overwrite building data if any of the fields supplied is already set')
parser.add_argument(
"--no-overwrite",
"-n",
action="store_true",
dest="no_overwrite",
help="Don't overwrite building data if any of the fields supplied is already set",
)
parser.add_argument('--debug', '-d',
action='store_true',
help='Print debug messages')
parser.add_argument(
"--debug", "-d", action="store_true", help="Print debug messages"
)
args = parser.parse_args()
main(args.url, args.api_key, args.path, args.json_columns, args.no_overwrite, args.debug)
main(
args.url,
args.api_key,
args.path,
args.json_columns,
args.no_overwrite,
args.debug,
)
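
The @retry decorators above come from the retrying package pinned in etl/requirements.txt: on any exception they retry, waiting 2^n * 1000 ms between attempts, capped at 10 seconds. A standalone sketch of the same pattern:

import requests
from retrying import retry

@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def fetch_json(url):
    r = requests.get(url)
    r.raise_for_status()  # raising triggers another, backed-off attempt
    return r.json()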

View File

@@ -23,18 +23,18 @@ The process:
TODO extend to allow latitude,longitude or easting,northing columns and lookup by location.
"""
import csv
import json
import os
import sys
import requests
session = requests.Session()
session.verify = False
def main(base_url, api_key, source_file):
"""Read from file, update buildings
"""
with open(source_file, 'r') as source:
"""Read from file, update buildings"""
with open(source_file, "r") as source:
reader = csv.DictReader(source)
for line in reader:
building_id = find_building(line, base_url)
@@ -42,40 +42,41 @@ def main(base_url, api_key, source_file):
if building_id is None:
continue
response_code, response_data = update_building(building_id, line, api_key, base_url)
response_code, response_data = update_building(
building_id, line, api_key, base_url
)
if response_code != 200:
print('ERROR', building_id, response_code, response_data)
print("ERROR", building_id, response_code, response_data)
def update_building(building_id, data, api_key, base_url):
"""Save data to a building
"""
"""Save data to a building"""
r = requests.post(
"{}/api/buildings/{}.json".format(base_url, building_id),
params={'api_key': api_key},
params={"api_key": api_key},
json=data,
verify=False
verify=False,
)
print(r)
return r.status_code, r.json()
def find_building(data, base_url):
if 'building_id' in data:
building_id = data['building_id']
if "building_id" in data:
building_id = data["building_id"]
if building_id is not None:
print("match_by_building_id", building_id)
return building_id
if 'toid' in data:
building_id = find_by_reference(base_url, 'toid', data['toid'])
if "toid" in data:
building_id = find_by_reference(base_url, "toid", data["toid"])
if building_id is not None:
print("match_by_toid", data['toid'], building_id)
print("match_by_toid", data["toid"], building_id)
return building_id
if 'uprn' in data:
building_id = find_by_reference(base_url, 'uprn', data['uprn'])
if "uprn" in data:
building_id = find_by_reference(base_url, "uprn", data["uprn"])
if building_id is not None:
print("match_by_uprn", data['uprn'], building_id)
print("match_by_uprn", data["uprn"], building_id)
return building_id
print("no_match", data)
@@ -83,32 +84,34 @@ def find_building(data, base_url):
def find_by_reference(base_url, ref_key, ref_id):
"""Find building_id by TOID or UPRN
"""
r = requests.get("{}/api/buildings/reference".format(base_url), params={
'key': ref_key,
'id': ref_id,
},
verify=False
"""Find building_id by TOID or UPRN"""
r = requests.get(
"{}/api/buildings/reference".format(base_url),
params={
"key": ref_key,
"id": ref_id,
},
verify=False,
)
buildings = r.json()
if buildings and 'error' not in buildings and len(buildings) == 1:
building_id = buildings[0]['building_id']
if buildings and "error" not in buildings and len(buildings) == 1:
building_id = buildings[0]["building_id"]
else:
building_id = None
return building_id
if __name__ == '__main__':
if __name__ == "__main__":
try:
url, api_key, filename = sys.argv[1], sys.argv[2], sys.argv[3]
except IndexError:
print(
"Usage: {} <URL> <api_key> ./path/to/data.csv".format(
os.path.basename(__file__)
))
os.path.basename(__file__)
)
)
exit()
main(url, api_key, filename)
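
Note that this variant sets session.verify = False at module level but the calls shown here go through requests.post and requests.get directly, so it appears to be the per-call verify=False arguments that actually disable certificate checks; each such call also emits an InsecureRequestWarning. If the insecure mode is deliberate (say, a trusted staging host), the warning can be silenced explicitly:

import urllib3

# only sensible against hosts you control; certificate checks stay off
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)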

View File

@@ -8,7 +8,6 @@ datasets for Camden (age data) and Fitzrovia (number of storeys).
- else locate building by representative point
- update building with data
"""
import json
import os
import sys
from functools import partial
@@ -21,18 +20,15 @@ from shapely.ops import transform
osgb_to_ll = partial(
pyproj.transform,
pyproj.Proj(init='epsg:27700'),
pyproj.Proj(init='epsg:4326')
pyproj.transform, pyproj.Proj(init="epsg:27700"), pyproj.Proj(init="epsg:4326")
)
def main(base_url, api_key, process, source_file):
"""Read from file, update buildings
"""
with fiona.open(source_file, 'r') as source:
"""Read from file, update buildings"""
with fiona.open(source_file, "r") as source:
for feature in source:
props = feature['properties']
props = feature["properties"]
if process == "camden":
toid, data = process_camden(props)
@@ -42,7 +38,7 @@ def main(base_url, api_key, process, source_file):
if data is None:
continue
building_id = find_building(toid, feature['geometry'], base_url)
building_id = find_building(toid, feature["geometry"], base_url)
if not building_id:
print("no_match", toid, "-")
continue
@@ -51,31 +47,22 @@ def main(base_url, api_key, process, source_file):
def process_camden(props):
toid = osgb_toid(props['TOID'])
data = {
'date_year': props['Year_C'],
'date_source_detail': props['Date_sou_1']
}
toid = osgb_toid(props["TOID"])
data = {"date_year": props["Year_C"], "date_source_detail": props["Date_sou_1"]}
return toid, data
def process_fitzrovia(props):
toid = osgb_toid(props['TOID'])
storeys = props['Storeys']
toid = osgb_toid(props["TOID"])
storeys = props["Storeys"]
if storeys is None:
return toid, None
if props['Basement'] == 'Yes':
data = {
'size_storeys_core': int(storeys) - 1,
'size_storeys_basement': 1
}
if props["Basement"] == "Yes":
data = {"size_storeys_core": int(storeys) - 1, "size_storeys_basement": 1}
else:
data = {
'size_storeys_core': int(storeys),
'size_storeys_basement': 0
}
data = {"size_storeys_core": int(storeys), "size_storeys_basement": 0}
return toid, data
@@ -86,24 +73,21 @@ def osgb_toid(toid):
def save_data(building_id, data, api_key, base_url):
"""Save data to a building
"""
r = requests.post(
"""Save data to a building"""
requests.post(
"{}/buildings/{}.json?api_key={}".format(base_url, building_id, api_key),
json=data
json=data,
)
def find_building(toid, geom, base_url):
"""Find building_id by TOID or location
"""
r = requests.get(base_url + "/buildings/reference", params={
'key': 'toid',
'id': toid
})
"""Find building_id by TOID or location"""
r = requests.get(
base_url + "/buildings/reference", params={"key": "toid", "id": toid}
)
buildings = r.json()
if buildings and len(buildings) == 1:
bid = buildings[0]['building_id']
bid = buildings[0]["building_id"]
print("match_by_toid", toid, bid)
return bid
@@ -114,27 +98,32 @@ def find_building(toid, geom, base_url):
point_osgb = poly.representative_point()
point_ll = transform(osgb_to_ll, point_osgb)
r = requests.get(base_url + "/buildings/locate", params={
'lng': point_ll.x,
'lat': point_ll.y
})
r = requests.get(
base_url + "/buildings/locate", params={"lng": point_ll.x, "lat": point_ll.y}
)
buildings = r.json()
if buildings and len(buildings) == 1:
bid = buildings[0]['building_id']
bid = buildings[0]["building_id"]
print("match_by_location", toid, bid)
return bid
return None
if __name__ == '__main__':
if __name__ == "__main__":
try:
url, api_key, process, filename = sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]
url, api_key, process, filename = (
sys.argv[1],
sys.argv[2],
sys.argv[3],
sys.argv[4],
)
except IndexError:
print(
"Usage: {} <URL> <api_key> <camden|fitzrovia> ./path/to/camden.shp".format(
os.path.basename(__file__)
))
os.path.basename(__file__)
)
)
exit()
main(url, api_key, process, filename)
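
This script (and the near-identical one below) still uses the legacy pyproj.transform / Proj(init=...) interface, deprecated since pyproj 2. A sketch of the modern replacement, should these scripts be updated later:

import pyproj

# always_xy=True keeps (x, y) axis order, matching the old behaviour;
# .transform is a drop-in callable for shapely.ops.transform
osgb_to_ll = pyproj.Transformer.from_crs(
    "EPSG:27700", "EPSG:4326", always_xy=True
).transform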

View File

@@ -8,7 +8,6 @@ datasets for Camden (age data) and Fitzrovia (number of storeys).
- else locate building by representative point
- update building with data
"""
import json
import os
import sys
from functools import partial
@@ -21,18 +20,15 @@ from shapely.ops import transform
osgb_to_ll = partial(
pyproj.transform,
pyproj.Proj(init='epsg:27700'),
pyproj.Proj(init='epsg:4326')
pyproj.transform, pyproj.Proj(init="epsg:27700"), pyproj.Proj(init="epsg:4326")
)
def main(base_url, api_key, process, source_file):
"""Read from file, update buildings
"""
with fiona.open(source_file, 'r') as source:
"""Read from file, update buildings"""
with fiona.open(source_file, "r") as source:
for feature in source:
props = feature['properties']
props = feature["properties"]
if process == "camden":
toid, data = process_camden(props)
@@ -42,7 +38,7 @@ def main(base_url, api_key, process, source_file):
if data is None:
continue
building_id = find_building(toid, feature['geometry'], base_url)
building_id = find_building(toid, feature["geometry"], base_url)
if not building_id:
print("no_match", toid, "-")
continue
@@ -51,31 +47,22 @@ def main(base_url, api_key, process, source_file):
def process_camden(props):
toid = osgb_toid(props['TOID'])
data = {
'date_year': props['Year_C'],
'date_source_detail': props['Date_sou_1']
}
toid = osgb_toid(props["TOID"])
data = {"date_year": props["Year_C"], "date_source_detail": props["Date_sou_1"]}
return toid, data
def process_fitzrovia(props):
toid = osgb_toid(props['TOID'])
storeys = props['Storeys']
toid = osgb_toid(props["TOID"])
storeys = props["Storeys"]
if storeys is None:
return toid, None
if props['Basement'] == 'Yes':
data = {
'size_storeys_core': int(storeys) - 1,
'size_storeys_basement': 1
}
if props["Basement"] == "Yes":
data = {"size_storeys_core": int(storeys) - 1, "size_storeys_basement": 1}
else:
data = {
'size_storeys_core': int(storeys),
'size_storeys_basement': 0
}
data = {"size_storeys_core": int(storeys), "size_storeys_basement": 0}
return toid, data
@@ -86,24 +73,21 @@ def osgb_toid(toid):
def save_data(building_id, data, api_key, base_url):
"""Save data to a building
"""
r = requests.post(
"""Save data to a building"""
requests.post(
"{}/buildings/{}.json?api_key={}".format(base_url, building_id, api_key),
json=data
json=data,
)
def find_building(toid, geom, base_url):
"""Find building_id by TOID or location
"""
r = requests.get(base_url + "/buildings/reference", params={
'key': 'toid',
'id': toid
})
"""Find building_id by TOID or location"""
r = requests.get(
base_url + "/buildings/reference", params={"key": "toid", "id": toid}
)
buildings = r.json()
if buildings and len(buildings) == 1:
bid = buildings[0]['building_id']
bid = buildings[0]["building_id"]
print("match_by_toid", toid, bid)
return bid
@@ -114,27 +98,32 @@ def find_building(toid, geom, base_url):
point_osgb = poly.representative_point()
point_ll = transform(osgb_to_ll, point_osgb)
r = requests.get(base_url + "/buildings/locate", params={
'lng': point_ll.x,
'lat': point_ll.y
})
r = requests.get(
base_url + "/buildings/locate", params={"lng": point_ll.x, "lat": point_ll.y}
)
buildings = r.json()
if buildings and len(buildings) == 1:
bid = buildings[0]['building_id']
bid = buildings[0]["building_id"]
print("match_by_location", toid, bid)
return bid
return None
if __name__ == '__main__':
if __name__ == "__main__":
try:
url, api_key, process, filename = sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]
url, api_key, process, filename = (
sys.argv[1],
sys.argv[2],
sys.argv[3],
sys.argv[4],
)
except IndexError:
print(
"Usage: {} <URL> <api_key> <camden|fitzrovia> ./path/to/camden.shp".format(
os.path.basename(__file__)
))
os.path.basename(__file__)
)
)
exit()
main(url, api_key, process, filename)

View File

@@ -1,9 +1,14 @@
def planning_data_entry_to_address(element):
site_name = element["_source"].get("site_name")
site_number = element["_source"].get("site_number")
street_name = element["_source"].get("street_name") # seems often misused - say "31 COPTHALL ROAD EAST" site_name getting Ickenham street_name
street_name = element["_source"].get("street_name")
# seems often misused - say "31 COPTHALL ROAD EAST" site_name
# getting Ickenham street_name
secondary_street_name = element["_source"].get("secondary_street_name")
return generate_address(site_name, site_number, street_name, secondary_street_name)['result']
return generate_address(site_name, site_number, street_name, secondary_street_name)[
"result"
]
def generate_address(site_name, site_number, street_name, secondary_street_name):
"""
@@ -11,13 +16,13 @@ def generate_address(site_name, site_number, street_name, secondary_street_name)
sadly it does not always work well and relies on many heuristics as data quality is limited
"""
if site_name != None:
if site_name is not None:
site_name = site_name.strip()
if site_number != None:
if site_number is not None:
site_number = site_number.strip()
if street_name != None:
if street_name is not None:
street_name = street_name.strip()
if secondary_street_name != None:
if secondary_street_name is not None:
secondary_street_name = secondary_street_name.strip()
if site_name == "":
@@ -29,68 +34,80 @@ def generate_address(site_name, site_number, street_name, secondary_street_name)
if secondary_street_name == "":
secondary_street_name = None
data = {
'site_name': site_name,
'site_number': site_number,
'street_name': street_name,
'secondary_street_name': secondary_street_name,
}
"site_name": site_name,
"site_number": site_number,
"street_name": street_name,
"secondary_street_name": secondary_street_name,
}
if site_name == site_number == street_name == secondary_street_name == None:
return {'result': None, 'data': data}
if site_name == site_number == street_name == secondary_street_name is None:
return {"result": None, "data": data}
if secondary_street_name != None:
if street_name == None:
print('"secondary_street_name != None, street_name == None"')
show_data(site_name, site_number, street_name, secondary_street_name, "???????")
if secondary_street_name is not None:
if street_name is None:
print('"secondary_street_name is not None, street_name is None"')
show_data(
site_name, site_number, street_name, secondary_street_name, "???????"
)
else:
street_name += " - with secondary road name: " + secondary_street_name
if site_number != None and street_name != None:
if site_number is not None and street_name is not None:
address = site_number + " " + street_name
if site_name != None:
print('"site_name != None and site_number != None and street_name != None"')
show_data(site_name, site_number, street_name, secondary_street_name, address)
if site_name is not None:
print(
'"site_name is not None and site_number is not None and street_name is not None"'
)
show_data(
site_name, site_number, street_name, secondary_street_name, address
)
return {'result': address, 'data': data}
return {"result": address, "data": data}
if site_name != None:
if street_name != None:
if site_name is not None:
if street_name is not None:
try:
if site_number == None and int(site_name):
return {'result': site_name + " " + street_name, 'data': data}
if site_number is None and int(site_name):
return {"result": site_name + " " + street_name, "data": data}
except ValueError:
pass
if street_name in site_name:
site_name_without_street_name = site_name.replace(street_name, "").strip()
site_name_without_street_name = site_name.replace(
street_name, ""
).strip()
try:
house_number = int(site_name_without_street_name)
_ = int(site_name_without_street_name)
# so it appears to be case like
# site_name: 5 Warwick Road
# street_name: Warwick Road
# no other info provided
# in such case just returning site_name will work fine...
return {'result': site_name, 'data': data}
return {"result": site_name, "data": data}
except ValueError:
pass
print('"site_name != None and street_name != None"')
show_data(site_name, site_number, street_name, secondary_street_name, site_name)
if site_number != None:
print('"site_name != None and site_number != None"')
show_data(site_name, site_number, street_name, secondary_street_name, site_name)
return {'result': site_name, 'data': data}
print('"site_name is not None and street_name is not None"')
show_data(
site_name, site_number, street_name, secondary_street_name, site_name
)
if site_number is not None:
print('"site_name is not None and site_number is not None"')
show_data(
site_name, site_number, street_name, secondary_street_name, site_name
)
return {"result": site_name, "data": data}
else:
if street_name != None:
if site_number != None:
return {'result': site_number + " " + street_name, 'data': data}
if street_name != None and site_number == None:
print('"street_name != None or site_number == None"')
if street_name is not None:
if site_number is not None:
return {"result": site_number + " " + street_name, "data": data}
if street_name is not None and site_number is None:
print('"street_name is not None or site_number is None"')
show_data(site_name, site_number, street_name, secondary_street_name, None)
return {'result': None, 'data': data}
if street_name == None and site_number != None:
print('"street_name == None or site_number != None"')
return {"result": None, "data": data}
if street_name is None and site_number is not None:
print('"street_name is None or site_number is not None"')
show_data(site_name, site_number, street_name, secondary_street_name, None)
return {'result': None, 'data': data}
return {'result': None, 'data': data}
return {"result": None, "data": data}
return {"result": None, "data": data}
def show_data(site_name, site_number, street_name, secondary_street_name, address):
@@ -100,4 +117,4 @@ def show_data(site_name, site_number, street_name, secondary_street_name, addres
print("secondary_street_name:", secondary_street_name)
print("address generated based on this data:", address)
print()
print()
print()
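
The sweep from "!= None" to "is not None" throughout this file follows PEP 8: equality can be overridden by a class's __eq__, identity cannot. A short demonstration of why the identity form is safer:

class AlwaysEqual:
    def __eq__(self, other):
        return True  # pathological, but legal

x = AlwaysEqual()
print(x == None)  # True - misleading
print(x is None)  # False - correct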

View File

@@ -5,6 +5,7 @@ import requests
import psycopg2
import address_data
def main():
connection = get_connection()
cursor = connection.cursor()
@@ -16,10 +17,12 @@ def main():
while True:
data = query(search_after).json()
load_data_into_database(cursor, data)
for entry in data['hits']['hits']:
for entry in data["hits"]["hits"]:
downloaded += 1
last_sort = entry['sort']
print("downloaded", downloaded, "last_sort", last_sort, "previous", search_after)
last_sort = entry["sort"]
print(
"downloaded", downloaded, "last_sort", last_sort, "previous", search_after
)
if search_after == last_sort:
break
search_after = last_sort
@@ -31,24 +34,30 @@ def load_data_into_database(cursor, data):
print(json.dumps(data, indent=4))
print("timed_out field missing in provided data")
else:
if data['timed_out']:
if data["timed_out"]:
raise Exception("query getting livestream data has failed")
for entry in data['hits']['hits']:
for entry in data["hits"]["hits"]:
try:
description = None
if entry['_source']['description'] != None:
description = entry['_source']['description'].strip()
application_id = entry['_source']['lpa_app_no']
application_id_with_borough_identifier = entry['_source']['id']
decision_date = parse_date_string_into_date_object(entry['_source']['decision_date'])
last_synced_date = parse_date_string_into_date_object(entry['_source']['last_synced'])
uprn = entry['_source']['uprn']
status_before_aliasing = entry['_source']['status']
if entry["_source"]["description"] is not None:
description = entry["_source"]["description"].strip()
application_id = entry["_source"]["lpa_app_no"]
application_id_with_borough_identifier = entry["_source"]["id"]
decision_date = parse_date_string_into_date_object(
entry["_source"]["decision_date"]
)
last_synced_date = parse_date_string_into_date_object(
entry["_source"]["last_synced"]
)
uprn = entry["_source"]["uprn"]
status_before_aliasing = entry["_source"]["status"]
status_info = process_status(status_before_aliasing, decision_date)
status = status_info["status"]
status_explanation_note = status_info["status_explanation_note"]
planning_url = obtain_entry_link(entry['_source']['url_planning_app'], application_id)
if uprn == None:
planning_url = obtain_entry_link(
entry["_source"]["url_planning_app"], application_id
)
if uprn is None:
continue
try:
uprn = int(uprn)
@@ -61,7 +70,9 @@ def load_data_into_database(cursor, data):
"last_synced_date": last_synced_date,
"application_id": application_id,
"application_url": planning_url,
"registered_with_local_authority_date": parse_date_string_into_date_object(entry['_source']['valid_date']),
"registered_with_local_authority_date": parse_date_string_into_date_object(
entry["_source"]["valid_date"]
),
"uprn": uprn,
"status": status,
"status_before_aliasing": status_before_aliasing,
@@ -70,13 +81,16 @@ def load_data_into_database(cursor, data):
"data_source_link": "https://www.london.gov.uk/programmes-strategies/planning/digital-planning/planning-london-datahub",
"address": address_data.planning_data_entry_to_address(entry),
}
if entry["address"] != None:
if entry["address"] is not None:
maximum_address_length = 300
if len(entry["address"]) > maximum_address_length:
print("address is too long, shortening", entry["address"])
entry["address"] = entry["address"][0:maximum_address_length]
if date_in_future(entry["registered_with_local_authority_date"]):
print("registered_with_local_authority_date is treated as invalid:", entry["registered_with_local_authority_date"])
print(
"registered_with_local_authority_date is treated as invalid:",
entry["registered_with_local_authority_date"],
)
# Brent-87_0946 has "valid_date": "23/04/9187"
entry["registered_with_local_authority_date"] = None
@@ -85,13 +99,17 @@ def load_data_into_database(cursor, data):
entry["decision_date"] = None
if date_in_future(entry["last_synced_date"]):
print("last_synced_date is treated as invalid:", entry["last_synced_date"])
print(
"last_synced_date is treated as invalid:", entry["last_synced_date"]
)
entry["last_synced_date"] = None
if "Hackney" in application_id_with_borough_identifier:
if entry["application_url"] != None:
if entry["application_url"] is not None:
if "https://" not in entry["application_url"]:
entry["application_url"] = "https://developmentandhousing.hackney.gov.uk" + entry["application_url"]
entry[
"application_url"
] = f"https://developmentandhousing.hackney.gov.uk{entry['application_url']}"
insert_entry(cursor, entry)
except TypeError as e:
print()
@@ -104,40 +122,40 @@ def load_data_into_database(cursor, data):
def date_in_future(date):
if date == None:
if date is None:
return False
return date > datetime.datetime.now()
def query(search_after):
headers = {
'X-API-AllowRequest': os.environ['PLANNNING_DATA_API_ALLOW_REQUEST_CODE'],
"X-API-AllowRequest": os.environ["PLANNNING_DATA_API_ALLOW_REQUEST_CODE"],
# Already added when you pass json= but not when you pass data=
# 'Content-Type': 'application/json',
}
json_data = {
'size': 10000,
'sort': [
"size": 10000,
"sort": [
{
'last_updated': {
'order': 'desc',
'unmapped_type': 'boolean',
"last_updated": {
"order": "desc",
"unmapped_type": "boolean",
},
},
],
'stored_fields': [
'*',
"stored_fields": [
"*",
],
'_source': {
'excludes': [],
"_source": {
"excludes": [],
},
'query': {
'bool': {
'must': [
"query": {
"bool": {
"must": [
{
'range': {
'valid_date': {
'gte': '01/01/1021',
"range": {
"valid_date": {
"gte": "01/01/1021",
},
},
},
@@ -147,18 +165,22 @@ def query(search_after):
}
if search_after != []:
json_data['search_after'] = search_after
json_data["search_after"] = search_after
print(json_data)
return requests.post('https://planningdata.london.gov.uk/api-guest/applications/_search', headers=headers, json=json_data)
return requests.post(
"https://planningdata.london.gov.uk/api-guest/applications/_search",
headers=headers,
json=json_data,
)
def get_connection():
return psycopg2.connect(
host=os.environ['PGHOST'],
dbname=os.environ['PGDATABASE'],
user=os.environ['PGUSER'],
password=os.environ['PGPASSWORD']
host=os.environ["PGHOST"],
dbname=os.environ["PGDATABASE"],
user=os.environ["PGUSER"],
password=os.environ["PGPASSWORD"],
)
@@ -170,28 +192,31 @@ def insert_entry(cursor, e):
try:
now = datetime.datetime.now()
application_url = None
if e["application_url"] != None:
if e["application_url"] is not None:
application_url = e["application_url"]
cursor.execute('''INSERT INTO
cursor.execute(
"""INSERT INTO
planning_data (planning_application_id, planning_application_link, description, registered_with_local_authority_date, days_since_registration_cached, decision_date, days_since_decision_date_cached, last_synced_date, status, status_before_aliasing, status_explanation_note, data_source, data_source_link, address, uprn)
VALUES
(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
''', (
e["application_id"],
application_url, e["description"],
date_object_into_date_string(e["registered_with_local_authority_date"]),
days_since(e["registered_with_local_authority_date"], now),
date_object_into_date_string(e["decision_date"]),
days_since(e["decision_date"], now),
date_object_into_date_string(e["last_synced_date"]),
e["status"],
e["status_before_aliasing"],
e["status_explanation_note"],
e["data_source"],
e["data_source_link"],
e["address"],
e["uprn"],
)
""",
(
e["application_id"],
application_url,
e["description"],
date_object_into_date_string(e["registered_with_local_authority_date"]),
days_since(e["registered_with_local_authority_date"], now),
date_object_into_date_string(e["decision_date"]),
days_since(e["decision_date"], now),
date_object_into_date_string(e["last_synced_date"]),
e["status"],
e["status_before_aliasing"],
e["status_explanation_note"],
e["data_source"],
e["data_source_link"],
e["address"],
e["uprn"],
),
)
except psycopg2.errors.Error as error:
show_dictionary(e)
@@ -204,30 +229,32 @@ def show_dictionary(data):
def days_since(date, now):
if(date == None):
if date is None:
return None
return (now - date).days
def date_object_into_date_string(date):
if(date == None):
if date is None:
return None
return datetime.datetime.strftime(date, "%Y-%m-%d")
def parse_date_string_into_date_object(incoming):
if incoming == None:
if incoming is None:
return None
date = None
try:
date = datetime.datetime.strptime(incoming, "%d/%m/%Y") # '21/07/2022'
except ValueError:
date = datetime.datetime.strptime(incoming, "%Y-%m-%dT%H:%M:%S.%fZ") # '2022-08-08T20:07:22.238Z'
date = datetime.datetime.strptime(
incoming, "%Y-%m-%dT%H:%M:%S.%fZ"
) # '2022-08-08T20:07:22.238Z'
return date
def obtain_entry_link(provided_link, application_id):
if provided_link != None:
if provided_link is not None:
if "Ealing" in application_id:
if ";" == provided_link[-1]:
return provided_link[:-1]
@@ -237,7 +264,7 @@ def obtain_entry_link(provided_link, application_id):
# Planning application ID: Hackney-2021_2491
# https://developmentandhousing.hackney.gov.uk/planning/index.html?fa=getApplication&reference=2021/2491
ref_for_link = application_id.replace("Hackney-", "").replace("_", "/")
return "https://developmentandhousing.hackney.gov.uk/planning/index.html?fa=getApplication&reference=" + ref_for_link
return f"https://developmentandhousing.hackney.gov.uk/planning/index.html?fa=getApplication&reference={ref_for_link}"
if "Lambeth" in application_id:
# sadly, specific links seems impossible
return "https://planning.lambeth.gov.uk/online-applications/refineSearch.do?action=refine"
@@ -282,9 +309,16 @@ def obtain_entry_link(provided_link, application_id):
def process_status(status, decision_date):
status_length_limit = 50 # see migrations/034.planning_livestream_data.up.sql
if status in ["Application Under Consideration", "Application Received"]:
if decision_date == None:
if decision_date is None:
status = "Submitted"
if status in ["Refused", "Refusal", "Refusal (P)", "Application Invalid", "Insufficient Fee", "Dismissed"]:
if status in [
"Refused",
"Refusal",
"Refusal (P)",
"Application Invalid",
"Insufficient Fee",
"Dismissed",
]:
status = "Rejected"
if status == "Appeal Received":
status = "Appeal In Progress"
@@ -296,16 +330,39 @@ def process_status(status, decision_date):
status = "Withdrawn"
if len(status) > status_length_limit:
print("Status was too long and was skipped:", status)
return {"status": "Processing failed", "status_explanation_note": "status was unusally long and it was imposible to save it"}
if (status in ["Submitted", "Approved", "Rejected", "Appeal In Progress", "Withdrawn", "Unknown"]):
return {
"status": "Processing failed",
"status_explanation_note": "status was unusally long and it was imposible to save it",
}
if status in [
"Submitted",
"Approved",
"Rejected",
"Appeal In Progress",
"Withdrawn",
"Unknown",
]:
return {"status": status, "status_explanation_note": None}
if status in ["No Objection to Proposal (OBS only)", "Objection Raised to Proposal (OBS only)"]:
return {"status": "Approved", "status_explanation_note": "preapproved application, local authority is unable to reject it"}
if status in [
"No Objection to Proposal (OBS only)",
"Objection Raised to Proposal (OBS only)",
]:
return {
"status": "Approved",
"status_explanation_note": "preapproved application, local authority is unable to reject it",
}
print("Unexpected status " + status)
if status not in ["Not Required", "SECS", "Comment Issued", "ALL DECISIONS ISSUED", "Closed", "Declined to Determine"]:
if status not in [
"Not Required",
"SECS",
"Comment Issued",
"ALL DECISIONS ISSUED",
"Closed",
"Declined to Determine",
]:
print("New unexpected status " + status)
return {"status": status, "status_explanation_note": None}
if __name__ == '__main__':
if __name__ == "__main__":
main()
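
For reference, parse_date_string_into_date_object accepts both of the date formats noted in the inline comments above; illustrative calls:

parse_date_string_into_date_object("21/07/2022")
# -> datetime.datetime(2022, 7, 21, 0, 0)
parse_date_string_into_date_object("2022-08-08T20:07:22.238Z")
# -> datetime.datetime(2022, 8, 8, 20, 7, 22, 238000)
parse_date_string_into_date_object(None)
# -> None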

View File

@@ -1,3 +1,3 @@
# Python packages for planning data import
psycopg2==2.8.6
psycopg2-binary==2.9.7
requests==2.31.0

View File

@@ -1,7 +1,7 @@
# Python packages for etl
fiona==1.7.13
osmnx==0.13
psycopg2==2.7.5
shapely==1.7
fiona
osmnx==1.6.0
psycopg2-binary==2.9.7
retrying==1.3.3
requests==2.31.0
shapely

View File

@@ -1,2 +1,2 @@
psycopg2==2.8.3
requests==2.31.0
psycopg2-binary==2.9.7
requests==2.31.0
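
All three requirements files swap psycopg2, which compiles against libpq at install time, for psycopg2-binary, which ships prebuilt wheels. The import name is unchanged, so no source edits are needed:

import psycopg2  # same module whether installed as psycopg2 or psycopg2-binary

conn = psycopg2.connect(host="localhost", dbname="colouring")  # hypothetical DSN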