From 82a50d77d6102113a0feaaec4dac6ec357bbffad Mon Sep 17 00:00:00 2001
From: Maciej Ziarkowski
Date: Tue, 10 Dec 2019 17:17:53 +0000
Subject: [PATCH] Allow specifying JSON columns for CSV bulk import

---
 etl/join_building_data/load_csv.py | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/etl/join_building_data/load_csv.py b/etl/join_building_data/load_csv.py
index 3195dc47..d7927c34 100644
--- a/etl/join_building_data/load_csv.py
+++ b/etl/join_building_data/load_csv.py
@@ -8,6 +8,17 @@ the appropriate site):
         a0a00000-0a00-0aaa-a0a0-0000aaaa0000 \
         data.csv
 
+The optional last argument specifies which columns should be parsed as JSON values.
+This is required, for example, for columns of array type to be processed by the API correctly.
+Otherwise, those values would be treated as a string and not an array.
+
+An example usage with the json_columns argument (other values in the example are placeholders):
+    python load_csv.py \
+        https://colouring.london \
+        a0a00000-0a00-0aaa-a0a0-0000aaaa0000 \
+        data.csv \
+        current_landuse_group,date_url
+
 This script uses the HTTP API, and can process CSV files which identify buildings by id, TOID,
 UPRN.
 
@@ -23,6 +34,7 @@ The process:
 - else lookup by toid
 - else lookup by uprn
 - else locate building by representative point
+- (optional) parse JSON column values
 - update building
 
 TODO extend to allow latitude,longitude or easting,northing columns and lookup by location.
@@ -36,13 +48,14 @@ import sys
 import requests
 
 
-def main(base_url, api_key, source_file):
+def main(base_url, api_key, source_file, json_columns):
     """Read from file, update buildings
     """
     with open(source_file, 'r') as source:
        reader = csv.DictReader(source)
        for line in reader:
            building_id = find_building(line, base_url)
+            line = parse_json_columns(line, json_columns)
 
            if building_id is None:
                continue
@@ -101,15 +114,22 @@ def find_by_reference(base_url, ref_key, ref_id):
 
     return building_id
 
+def parse_json_columns(row, json_columns):
+    for col in json_columns:
+        row[col] = json.loads(row[col])
+
+    return row
 
 if __name__ == '__main__':
     try:
         url, api_key, filename = sys.argv[1], sys.argv[2], sys.argv[3]
     except IndexError:
         print(
-            "Usage: {} <url> <api_key> ./path/to/data.csv".format(
+            "Usage: {} <url> <api_key> ./path/to/data.csv [<json_columns>]".format(
                 os.path.basename(__file__)
             ))
         exit()
 
-    main(url, api_key, filename)
+    json_columns = sys.argv[4].split(',') if len(sys.argv) > 4 else []
+
+    main(url, api_key, filename, json_columns)
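
For illustration (not part of the patch): a minimal sketch of what the new json_columns handling does to a row produced by csv.DictReader. The column name current_landuse_group comes from the patch's own example; the row values and the other column name are placeholders.

    import json

    # Placeholder row as csv.DictReader would produce it: every value, including
    # the array-typed current_landuse_group column, arrives as a plain string.
    row = {
        "current_landuse_group": '["Residential", "Retail"]',  # placeholder values
        "building_name": "Example House",                      # hypothetical extra column
    }

    # Equivalent of the patch's parse_json_columns, applied to the columns
    # named in the optional command-line argument.
    json_columns = ["current_landuse_group"]
    for col in json_columns:
        row[col] = json.loads(row[col])

    print(row["current_landuse_group"])  # ['Residential', 'Retail'] -- now a list, not a string

Without this step the API would receive the literal string '["Residential", "Retail"]' rather than an array, which is the problem the docstring addition describes.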