# colouring-montreal/etl/join_building_data/load_csv.py

"""Join csv data to buildings

Run `python load_csv.py -h` for full usage instructions and a list of all options.

Example usage (replace URL with test/staging/localhost as necessary, API key with real key for
the appropriate site):

    python load_csv.py \
        https://colouring.london \
        a0a00000-0a00-0aaa-a0a0-0000aaaa0000 \
        data.csv

The optional last argument specifies which columns should be parsed as JSON values.
This is required for example for columns of array type to be processed by the API correctly.
Otherwise, those values would be treated as a string and not an array.

An example usage with the json_columns argument (other values in the example are placeholders):
    python load_csv.py \
        https://colouring.london \
        a0a00000-0a00-0aaa-a0a0-0000aaaa0000 \
        data.csv \
        current_landuse_group,date_url

This script uses the HTTP API, and can process CSV files which identify buildings by id, TOID,
or UPRN.

The process:
    - assume first line of the CSV is a header, where column names are either
        - building identifiers - one of:
            - building_id
            - toid
            - uprn
        - building data field names
    - read through lines of CSV:
        - use building id if provided
            - else lookup by toid
            - else lookup by uprn
            - else skip the row (lookup by representative point is not implemented yet)
        - (optional) parse JSON column values
        - update building

TODO extend to allow latitude,longitude or easting,northing columns and lookup by location.

"""
import csv
import json
import os
import sys
import argparse

import requests
from retrying import retry


def main(base_url, api_key, source_file, json_columns):
    """Read from file, update buildings

    Each CSV row is matched to a building via `find_building`; rows with no match are skipped.
    For matched rows the columns named in `json_columns` are decoded from JSON, and the whole
    row is then sent to the API with `update_building`. The outcome of every update is printed,
    prefixed with ERROR when the status code is not 200 and DEBUG otherwise.

    Arguments:
        base_url: root URL of the site whose API is called
        api_key: API key passed through to `update_building`
        source_file: path to the CSV file; the first line must be a header
        json_columns: iterable of column names whose values should be parsed as JSON
    """
    # newline='' is the mode the csv module asks for, so that line breaks embedded in quoted
    # fields are handled by the csv parser rather than by universal-newline translation.
    with open(source_file, 'r', newline='') as source:
        reader = csv.DictReader(source)
        for line in reader:
            building_id = find_building(line, base_url)

            # Skip unmatched rows before touching their content: a malformed JSON value in a
            # row that cannot be used anyway should not abort the whole import.
            if building_id is None:
                continue

            line = parse_json_columns(line, json_columns)

            # An empty string is not a valid value for the sust_dec enum, so the API would
            # reject the update; leave the field out instead.
            if line.get('sust_dec') == '':
                del line['sust_dec']

            response_code, response_data = update_building(building_id, line, api_key, base_url)
            if response_code != 200:
                print('ERROR', building_id, response_code, response_data)
            else:
                print('DEBUG', building_id, response_code, response_data)


@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def update_building(building_id, data, api_key, base_url):
    """Save data to a building

    POSTs `data` as a JSON body to `{base_url}/api/buildings/{building_id}.json`, with
    `api_key` sent as a query parameter.

    Returns:
        A `(status_code, body)` tuple, where `body` is the JSON-decoded response.

    Any exception raised here (connection error, timeout, response body that is not valid JSON)
    is handed to the `retry` decorator, which re-runs the call with exponential backoff.
    """
    r = requests.post(
        "{}/api/buildings/{}.json".format(base_url, building_id),
        params={'api_key': api_key},
        json=data,
        # requests waits indefinitely by default; a (connect, read) timeout turns a stalled
        # connection into an exception so that the retry decorator can take over.
        timeout=(10, 60)
    )
    return r.status_code, r.json()


def find_building(data, base_url):
    """Work out which building a CSV row refers to

    Identifiers are tried in order of preference:
        - `building_id` is used directly, without any API call
        - `toid`, then `uprn`, are resolved through `find_by_reference`

    Blank identifiers are ignored: csv.DictReader yields '' for an empty cell (and None for a
    cell missing from a short row), and neither can identify a building.

    Arguments:
        data: mapping of column name to value for one CSV row
        base_url: root URL of the site whose API is used for the lookups

    Returns:
        The building id, or None when no identifier in the row leads to a building.
    """
    building_id = data.get('building_id')
    if building_id:
        print("match_by_building_id", building_id)
        return building_id

    for ref_key in ('toid', 'uprn'):
        ref_id = data.get(ref_key)
        if not ref_id:
            # nothing to look up - avoid a pointless API round trip
            continue

        building_id = find_by_reference(base_url, ref_key, ref_id)
        if building_id is not None:
            print("match_by_" + ref_key, ref_id, building_id)
            return building_id

    print("no_match", data)
    return None


@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def find_by_reference(base_url, ref_key, ref_id):
    """Find building_id by TOID or UPRN

    Queries `{base_url}/api/buildings/reference` with `key=ref_key` and `id=ref_id`.

    Returns:
        The `building_id` of the match when the API returns a list holding exactly one
        building, otherwise None (error payload, no match, or an ambiguous multiple match).

    Any exception raised here (connection error, timeout, response body that is not valid JSON)
    is handed to the `retry` decorator, which re-runs the call with exponential backoff.
    """
    r = requests.get(
        "{}/api/buildings/reference".format(base_url),
        params={
            'key': ref_key,
            'id': ref_id
        },
        # requests waits indefinitely by default; a (connect, read) timeout turns a stalled
        # connection into an exception so that the retry decorator can take over.
        timeout=(10, 60)
    )
    buildings = r.json()

    # Only a list can be indexed by position below; anything else (such as an error object)
    # is treated as "no match" rather than raising.
    if not isinstance(buildings, list) or len(buildings) != 1:
        return None

    return buildings[0]['building_id']

def parse_json_columns(row, json_columns):
    """Decode the JSON text held in the given columns of a row

    The row is modified in place: for every name in `json_columns` the string value is replaced
    by the result of `json.loads`. The same row object is returned.
    """
    for column_name in json_columns:
        raw_value = row[column_name]
        row[column_name] = json.loads(raw_value)
    return row


def list_str(values):
    """Split a comma-separated string into a list of names

    Whitespace around each name is stripped and empty entries (from an empty string, a doubled
    comma or a trailing comma) are dropped, so that only usable column names are returned.
    """
    stripped_names = (name.strip() for name in values.split(','))
    return [name for name in stripped_names if name]

if __name__ == '__main__':
    # Command line entry point: three required positional arguments, followed by an optional
    # comma-separated list of columns to be parsed as JSON.
    arg_parser = argparse.ArgumentParser()
    for arg_name, arg_help in (
        ('url', 'URL for the app'),
        ('api_key', 'API key for the user'),
        ('path', 'Path to data CSV file'),
    ):
        arg_parser.add_argument(arg_name, help=arg_help)
    arg_parser.add_argument(
        'json_columns',
        nargs='?',
        type=list_str,
        default=[],
        help='A comma-separated list of columns which should be parsed as JSON',
    )

    cli_args = arg_parser.parse_args()

    main(cli_args.url, cli_args.api_key, cli_args.path, cli_args.json_columns)
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00			`"""Join csv data to buildings`

Move to argparse for command line options 2020-06-16 08:27:08 -04:00			Run `python load_csv.py -h` for full usage instructions and a list of all options.

Fix load_csv script - tested against localhost 2019-10-02 04:43:12 -04:00			`Example usage (replace URL with test/staging/localhost as necessary, API key with real key for`
			`the appropriate site):`

			`python load_csv.py \`
			`https://colouring.london \`
			`a0a00000-0a00-0aaa-a0a0-0000aaaa0000 \`
			`data.csv`

Allow specifying JSON columns for CSV bulk import 2019-12-10 12:17:53 -05:00			`The optional last argument specifies which columns should be parsed as JSON values.`
			`This is required for example for columns of array type to be processed by the API correctly.`
			`Otherwise, those values would be treated as a string and not an array.`

			`An example usage with the json_columns argument (other values in the example are placeholders):`
			`python load_csv.py \`
			`https://colouring.london \`
			`a0a00000-0a00-0aaa-a0a0-0000aaaa0000 \`
			`data.csv \`
			`current_landuse_group,date_url`

Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00			`This script uses the HTTP API, and can process CSV files which identify buildings by id, TOID,`
			`UPRN.`

			`The process:`
			`- assume first line of the CSV is a header, where column names are either`
			`- building identifiers - one of:`
			`- building_id`
			`- toid`
			`- uprn`
			`- building data field names`
			`- read through lines of CSV:`
			`- use building id if provided`
			`- else lookup by toid`
			`- else lookup by uprn`
			`- else locate building by representative point`
Allow specifying JSON columns for CSV bulk import 2019-12-10 12:17:53 -05:00			`- (optional) parse JSON column values`
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00			`- update building`

			`TODO extend to allow latitude,longitude or easting,northing columns and lookup by location.`

			`"""`
			`import csv`
			`import json`
			`import os`
			`import sys`
Move to argparse for command line options 2020-06-16 08:27:08 -04:00			`import argparse`
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00
			`import requests`
Add retrying logic 2020-06-16 08:24:43 -04:00			`from retrying import retry`
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00

Allow specifying JSON columns for CSV bulk import 2019-12-10 12:17:53 -05:00			`def main(base_url, api_key, source_file, json_columns):`
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00			`"""Read from file, update buildings`
			`"""`
			`with open(source_file, 'r') as source:`
Fix load_csv script - tested against localhost 2019-10-02 04:43:12 -04:00			`reader = csv.DictReader(source)`
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00			`for line in reader:`
			`building_id = find_building(line, base_url)`
Allow specifying JSON columns for CSV bulk import 2019-12-10 12:17:53 -05:00			`line = parse_json_columns(line, json_columns)`
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00
			`if building_id is None:`
			`continue`

Update load_csv python script - use building_id if present in CSV - print DEBUG even if response from API is 200 - handle empty-string sust_dec (which fails because empty-string isn't a valid enum value as defined in 011.sustainability.up.sql) by deleting from data if present 2019-11-21 08:17:17 -05:00			`if 'sust_dec' in line and line['sust_dec'] == '':`
			`del line['sust_dec']`

Fix load_csv script - tested against localhost 2019-10-02 04:43:12 -04:00			`response_code, response_data = update_building(building_id, line, api_key, base_url)`
			`if response_code != 200:`
			`print('ERROR', building_id, response_code, response_data)`
Update load_csv python script - use building_id if present in CSV - print DEBUG even if response from API is 200 - handle empty-string sust_dec (which fails because empty-string isn't a valid enum value as defined in 011.sustainability.up.sql) by deleting from data if present 2019-11-21 08:17:17 -05:00			`else:`
			`print('DEBUG', building_id, response_code, response_data)`
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00

Add retrying logic 2020-06-16 08:24:43 -04:00			`@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)`
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00			`def update_building(building_id, data, api_key, base_url):`
			`"""Save data to a building`
			`"""`
			`r = requests.post(`
Fix load_csv script - tested against localhost 2019-10-02 04:43:12 -04:00			`"{}/api/buildings/{}.json".format(base_url, building_id),`
			`params={'api_key': api_key},`
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00			`json=data`
			`)`
Fix load_csv script - tested against localhost 2019-10-02 04:43:12 -04:00			`return r.status_code, r.json()`
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00

			`def find_building(data, base_url):`
Update load_csv python script - use building_id if present in CSV - print DEBUG even if response from API is 200 - handle empty-string sust_dec (which fails because empty-string isn't a valid enum value as defined in 011.sustainability.up.sql) by deleting from data if present 2019-11-21 08:17:17 -05:00			`if 'building_id' in data:`
Accept CSV with building_id for API data import 2019-12-10 12:16:26 -05:00			`building_id = data['building_id']`
			`if building_id is not None:`
			`print("match_by_building_id", building_id)`
			`return building_id`
Update load_csv python script - use building_id if present in CSV - print DEBUG even if response from API is 200 - handle empty-string sust_dec (which fails because empty-string isn't a valid enum value as defined in 011.sustainability.up.sql) by deleting from data if present 2019-11-21 08:17:17 -05:00
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00			`if 'toid' in data:`
			`building_id = find_by_reference(base_url, 'toid', data['toid'])`
Fix load_csv script - tested against localhost 2019-10-02 04:43:12 -04:00			`if building_id is not None:`
			`print("match_by_toid", data['toid'], building_id)`
			`return building_id`
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00
			`if 'uprn' in data:`
			`building_id = find_by_reference(base_url, 'uprn', data['uprn'])`
Fix load_csv script - tested against localhost 2019-10-02 04:43:12 -04:00			`if building_id is not None:`
			`print("match_by_uprn", data['uprn'], building_id)`
			`return building_id`
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00
			`print("no_match", data)`
			`return None`


Add retrying logic 2020-06-16 08:24:43 -04:00			`@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)`
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00			`def find_by_reference(base_url, ref_key, ref_id):`
			`"""Find building_id by TOID or UPRN`
			`"""`
Fix load_csv script - tested against localhost 2019-10-02 04:43:12 -04:00			`r = requests.get("{}/api/buildings/reference".format(base_url), params={`
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00			`'key': ref_key,`
			`'id': ref_id`
			`})`
			`buildings = r.json()`

Fix load_csv script - tested against localhost 2019-10-02 04:43:12 -04:00			`if buildings and 'error' not in buildings and len(buildings) == 1:`
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00			`building_id = buildings[0]['building_id']`
			`else:`
			`building_id = None`

			`return building_id`

Allow specifying JSON columns for CSV bulk import 2019-12-10 12:17:53 -05:00			`def parse_json_columns(row, json_columns):`
			`for col in json_columns:`
			`row[col] = json.loads(row[col])`

			`return row`
Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00
Move to argparse for command line options 2020-06-16 08:27:08 -04:00
			`def list_str(values):`
			`return values.split(',')`

Add generic CSV upload script (rename load_data to load_shapefile) 2019-09-30 05:39:43 -04:00			`if __name__ == '__main__':`
Move to argparse for command line options 2020-06-16 08:27:08 -04:00			`parser = argparse.ArgumentParser()`
			`parser.add_argument('url', help='URL for the app')`
			`parser.add_argument('api_key', help='API key for the user')`
			`parser.add_argument('path', help='Path to data CSV file')`
			`parser.add_argument('json_columns',`
			`nargs='?',`
			`type=list_str,`
			`default=[],`
			`help='A comma-separated list of columns which should be parsed as JSON')`


			`args = parser.parse_args()`

			`main(args.url, args.api_key, args.path, args.json_columns)`