diff --git a/etl/extract_addressbase.py b/etl/extract_addressbase.py index 63ce53eb..5c8c603c 100644 --- a/etl/extract_addressbase.py +++ b/etl/extract_addressbase.py @@ -8,3 +8,47 @@ Relevant CSV columns:: 18 - Latitude 19 - Longitude """ +import csv +import glob +import io +import os +import sys +from zipfile import ZipFile + +def main(source_dir, output_file): + with open(output_file, 'w', encoding='utf8', newline='') as fh: + w = csv.writer(fh) + w.writerow(('UPRN', 'easting', 'northing', 'lat', 'lng')) + for address in read_addresses(source_dir): + w.writerow(address) + + +def read_addresses(source_dir): + zips = glob.glob(os.path.join(source_dir, '*.zip')) + n = len(zips) + for i, zipname in enumerate(zips): + with ZipFile(zipname) as zipfile: + names = zipfile.namelist() + csvname = names[0] + print("Processing {} ({} of {})".format(csvname, i+1, n)) + + with zipfile.open(csvname) as csvfile: + fh = io.TextIOWrapper(csvfile) + r = csv.reader(fh) + for line in r: + uprn = line[0] + toid = line[1] + easting = line[16] + northing = line[17] + lat = line[18] + lng = float(line[19]) + yield uprn, easting, northing, lat, lng + + +if __name__ == '__main__': + if len(sys.argv) != 3: + print("Usage: {} ./path/to/source/dir ./path/to/output/file".format( + os.path.basename(__file__) + )) + exit() + main(sys.argv[1], sys.argv[2])