"""Join csv data to buildings

Run `python load_csv.py -h` for full usage instructions and a list of all options.

Example usage (replace URL with test/staging/localhost as necessary, API key with real key for
the appropriate site):

    python load_csv.py \
        https://colouring.london \
        a0a00000-0a00-0aaa-a0a0-0000aaaa0000 \
        data.csv

The optional last argument specifies which columns should be parsed as JSON values.
This is required for example for columns of array type to be processed by the API correctly.
Otherwise, those values would be treated as a string and not an array.

An example usage with the json_columns argument (other values in the example are placeholders):

    python load_csv.py \
        https://colouring.london \
        a0a00000-0a00-0aaa-a0a0-0000aaaa0000 \
        data.csv \
        current_landuse_group,date_url

This script uses the HTTP API, and can process CSV files which identify buildings by id, TOID,
UPRN.

The process:
    - assume first line of the CSV is a header, where column names are either
        - building identifiers - one of:
            - building_id
            - toid
            - uprn
        - building data field names
    - read through lines of CSV:
        - use building id if provided
        - else lookup by toid
        - else lookup by uprn
        - else locate building by representative point
        - (optional) parse JSON column values
        - update building

TODO extend to allow latitude,longitude or easting,northing columns and lookup by location.
"""
|
|
|
|
import csv
|
|
|
|
import json
|
2020-06-16 08:27:08 -04:00
|
|
|
import argparse
|
2019-09-30 05:39:43 -04:00
|
|
|
|
|
|
|
import requests
|
2020-06-16 08:24:43 -04:00
|
|
|
from retrying import retry
|
2019-09-30 05:39:43 -04:00
|
|
|
|
|
|
|
|
2020-06-16 11:16:46 -04:00
|
|
|
def main(base_url, api_key, source_file, json_columns, no_overwrite=False, debug=False):
    """Read from file, update buildings.

    :param base_url: base URL of the app (e.g. https://colouring.london)
    :param api_key: API key for the user performing the updates
    :param source_file: path to the source CSV file
    :param json_columns: iterable of column names whose values should be parsed as JSON
    :param no_overwrite: if True, skip a building when any supplied field already has data
    :param debug: if True, also print the API response for successful updates
    """
    # newline="" is required when handing a file to the csv module so that
    # newlines embedded in quoted fields are handled correctly.
    with open(source_file, "r", newline="") as source:
        reader = csv.DictReader(source)
        for line in reader:
            building_id = find_building(line, base_url)
            line = parse_json_columns(line, json_columns)

            if building_id is None:
                continue

            # An empty sust_dec value would clear existing data, so drop it.
            if line.get("sust_dec") == "":
                del line["sust_dec"]

            if no_overwrite:
                try:
                    if check_data_present(building_id, line.keys(), base_url):
                        print(
                            f"Building {building_id}: Not updating to avoid overwriting existing data"
                        )
                        continue
                except ApiRequestError as e:
                    print(
                        f"Error checking existing data for building {building_id}: status {e.code}, data: {e.data}"
                    )
                    raise

            response_code, response_data = update_building(
                building_id, line, api_key, base_url
            )
            if response_code != 200:
                print("ERROR", building_id, response_code, response_data)
            elif debug:
                print("DEBUG", building_id, response_code, response_data)
|
|
|
|
|
2019-09-30 05:39:43 -04:00
|
|
|
|
2020-06-16 11:16:46 -04:00
|
|
|
class ApiRequestError(Exception):
    """Raised when an API request returns a non-success status.

    Carries the HTTP status code and the decoded response body so that
    callers can report the failure in detail.
    """

    def __init__(self, code, data, message=""):
        super().__init__(message)
        self.code = code
        self.data = data
|
|
|
|
|
2023-08-15 09:10:19 -04:00
|
|
|
|
2020-06-16 11:16:46 -04:00
|
|
|
def check_data_present(building_id, fields, base_url):
    """Check whether a building already has data in any of the given fields.

    Identifier columns (building_id, toid, uprn) are ignored, since they are
    used for matching rather than as data to be written.

    :param building_id: id of the building to inspect
    :param fields: iterable of CSV column names being loaded
    :param base_url: base URL of the app
    :raises ApiRequestError: if the building cannot be fetched
    :return: True if any non-identifier field is already set (non-null)
    """
    response_code, current_state = get_building(building_id, base_url)
    if response_code != 200:
        raise ApiRequestError(response_code, current_state)

    id_fields = {"building_id", "toid", "uprn"}
    # Generator expression short-circuits on the first populated field
    # instead of materializing a full list for any().
    return any(
        current_state.get(field) is not None
        for field in fields
        if field not in id_fields
    )
|
2020-06-16 11:16:46 -04:00
|
|
|
|
|
|
|
|
|
|
|
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def get_building(building_id, base_url):
    """Get data for a building.

    Retries with exponential backoff on failure.
    """
    url = f"{base_url}/api/buildings/{building_id}.json"
    response = requests.get(url)
    return response.status_code, response.json()
|
|
|
|
|
2019-09-30 05:39:43 -04:00
|
|
|
|
2020-06-16 08:24:43 -04:00
|
|
|
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def update_building(building_id, data, api_key, base_url):
    """Save data to a building.

    Retries with exponential backoff on failure.
    """
    url = f"{base_url}/api/buildings/{building_id}.json"
    response = requests.post(url, params={"api_key": api_key}, json=data)
    return response.status_code, response.json()
|
2019-09-30 05:39:43 -04:00
|
|
|
|
|
|
|
|
|
|
|
def find_building(data, base_url):
    """Find the building_id for a CSV row.

    Tries, in order: an explicit building_id column, a TOID lookup, then a
    UPRN lookup.

    :param data: dict of one CSV row (column name -> value)
    :param base_url: base URL of the app
    :return: the matched building_id, or None when no match is found
    """
    # CSV values are strings, so a missing value arrives as "" rather than
    # None - treat any falsy id as "no identifier supplied" (the original
    # `is not None` check let "" through as a bogus match).
    building_id = data.get("building_id")
    if building_id:
        print("match_by_building_id", building_id)
        return building_id

    if "toid" in data:
        building_id = find_by_reference(base_url, "toid", data["toid"])
        if building_id is not None:
            print("match_by_toid", data["toid"], building_id)
            return building_id

    if "uprn" in data:
        building_id = find_by_reference(base_url, "uprn", data["uprn"])
        if building_id is not None:
            print("match_by_uprn", data["uprn"], building_id)
            return building_id

    print("no_match", data)
    return None
|
|
|
|
|
|
|
|
|
2020-06-16 08:24:43 -04:00
|
|
|
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def find_by_reference(base_url, ref_key, ref_id):
    """Find building_id by TOID or UPRN"""
    response = requests.get(
        "{}/api/buildings/reference".format(base_url),
        params={"key": ref_key, "id": ref_id},
    )
    buildings = response.json()

    # A valid match is exactly one building record with no error reported.
    if not buildings or "error" in buildings or len(buildings) != 1:
        return None
    return buildings[0]["building_id"]
|
|
|
|
|
2023-08-15 09:10:19 -04:00
|
|
|
|
2019-12-10 12:17:53 -05:00
|
|
|
def parse_json_columns(row, json_columns):
    """Decode the named columns of a CSV row from JSON strings.

    Mutates and returns the row; columns not listed are left untouched.
    """
    row.update((col, json.loads(row[col])) for col in json_columns)
    return row
|
2019-09-30 05:39:43 -04:00
|
|
|
|
2020-06-16 08:27:08 -04:00
|
|
|
|
|
|
|
def list_str(values):
    """argparse type: split a comma-separated string into a list of strings."""
    parts = values.split(",")
    return parts
|
|
|
|
|
2020-06-16 08:27:08 -04:00
|
|
|
|
2023-08-15 09:10:19 -04:00
|
|
|
if __name__ == "__main__":
    # Positional arguments: app URL, API key, CSV path, optional JSON columns.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("url", help="URL for the app")
    arg_parser.add_argument("api_key", help="API key for the user")
    arg_parser.add_argument("path", help="Path to data CSV file")
    arg_parser.add_argument(
        "json_columns",
        nargs="?",
        type=list_str,
        default=[],
        help="A comma-separated list of columns which should be parsed as JSON",
    )
    arg_parser.add_argument(
        "--no-overwrite",
        "-n",
        action="store_true",
        dest="no_overwrite",
        help="Don't overwrite building data if any of the fields supplied is already set",
    )
    arg_parser.add_argument(
        "--debug", "-d", action="store_true", help="Print debug messages"
    )

    parsed = arg_parser.parse_args()

    main(
        parsed.url,
        parsed.api_key,
        parsed.path,
        parsed.json_columns,
        no_overwrite=parsed.no_overwrite,
        debug=parsed.debug,
    )
|