From 0e35a7cca25b2276581935b1e16e19384e4e432e Mon Sep 17 00:00:00 2001 From: Ed Chalstrey Date: Thu, 17 Mar 2022 15:43:23 +0000 Subject: [PATCH] mastermap filtering without using addressbase --- etl/filter_mastermap.py | 46 ++++++++++--------- etl/filter_transform_mastermap_for_loading.sh | 8 ++-- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/etl/filter_mastermap.py b/etl/filter_mastermap.py index 1713d262..e291c7ad 100644 --- a/etl/filter_mastermap.py +++ b/etl/filter_mastermap.py @@ -1,7 +1,6 @@ -"""Filter MasterMap to buildings and addressbase-matches +"""Filter MasterMap to buildings - WHERE descriptiveGroup includes 'Building' -- OR toid in addressbase_toids """ import csv import glob @@ -13,25 +12,28 @@ from multiprocessing import Pool csv.field_size_limit(sys.maxsize) -def main(ab_path, mm_path): - mm_paths = sorted(glob.glob(os.path.join(mm_path, "*.gml.csv"))) - toid_paths = sorted(glob.glob(os.path.join(ab_path, "ab_toids_*.txt"))) +def main(mastermap_path): + mm_paths = sorted(glob.glob(os.path.join(mastermap_path, "*.gml.csv"))) + # toid_paths = sorted(glob.glob(os.path.join(ab_path, "ab_toids_*.txt"))) - try: - assert len(mm_paths) == len(toid_paths) - except AssertionError: - print(mm_paths) - print(toid_paths) - zipped_paths = zip(mm_paths, toid_paths) + # try: + # assert len(mm_paths) == len(toid_paths) + # except AssertionError: + # print(mm_paths) + # print(toid_paths) + # zipped_paths = zip(mm_paths, toid_paths) # parallel map over tiles - with Pool() as p: - p.starmap(filter, zipped_paths) + # with Pool() as p: + # p.starmap(filter, zipped_paths) + + for mm_path in mm_paths: + filter(mm_path) -def filter(mm_path, toid_path): - with open(toid_path, 'r') as fh: - r = csv.reader(fh) - toids = set(line[0] for line in r) +def filter(mm_path): + # with open(toid_path, 'r') as fh: + # r = csv.reader(fh) + # toids = set(line[0] for line in r) output_path = "{}.filtered.csv".format(str(mm_path).replace(".gml.csv", "")) alt_output_path = "{}.filtered_not_building.csv".format(str(mm_path).replace(".gml.csv", "")) @@ -48,13 +50,13 @@ def filter(mm_path, toid_path): if 'Building' in line['descriptiveGroup']: w.writerow(line) - elif line['fid'] in toids: - alt_w.writerow(line) + # elif line['fid'] in toids: + # alt_w.writerow(line) if __name__ == '__main__': - if len(sys.argv) != 3: - print("Usage: filter_mastermap.py ./path/to/addressbase/dir ./path/to/mastermap/dir") + if len(sys.argv) != 2: + print("Usage: filter_mastermap.py ./path/to/mastermap/dir") exit(-1) - main(sys.argv[1], sys.argv[2]) + main(sys.argv[1]) diff --git a/etl/filter_transform_mastermap_for_loading.sh b/etl/filter_transform_mastermap_for_loading.sh index 042a3441..45c62b2e 100755 --- a/etl/filter_transform_mastermap_for_loading.sh +++ b/etl/filter_transform_mastermap_for_loading.sh @@ -3,23 +3,21 @@ # # Filter and transform for loading # -: ${1?"Usage: $0 ./path/to/addressbase/dir ./path/to/mastermap/dir"} -# : ${2?"Usage: $0 ./path/to/addressbase/dir ./path/to/mastermap/dir"} +: ${1?"Usage: $0 ./path/to/mastermap/dir"} -# addressbase_dir=$1 mastermap_dir=$1 # # Check which TOIDs are matched against UPRNs # -colouringlondon/bin/python check_ab_mm_match.py $addressbase_dir $mastermap_dir +# colouringlondon/bin/python check_ab_mm_match.py $addressbase_dir $mastermap_dir # # Filter # - WHERE descriptiveGroup = '(1:Building)' # - OR toid in addressbase_toids # -colouringlondon/bin/python filter_mastermap.py $addressbase_dir $mastermap_dir +colouringlondon/bin/python filter_mastermap.py $mastermap_dir # # Transform to 3857 (web mercator)