61 lines
1.7 KiB
Python
61 lines
1.7 KiB
Python
"""Check if AddressBase TOIDs will match MasterMap
|
|
"""
|
|
import csv
|
|
import glob
|
|
import os
|
|
import sys
|
|
|
|
from multiprocessing import Pool
|
|
|
|
# Raise the csv field size limit as high as the platform allows so very large
# fields do not abort parsing. The limit is stored in a C long, so on platforms
# where that is narrower than sys.maxsize the call raises OverflowError; halve
# the candidate until it is accepted (first try succeeds where it fits).
_csv_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_csv_field_limit)
        break
    except OverflowError:
        _csv_field_limit //= 2
|
|
|
|
def main(ab_path, mm_path):
    """Run ``check`` over every AddressBase/MasterMap file pair in parallel.

    Files matching ``*.gml.csv.filtered.csv`` under ``ab_path`` and
    ``*.gml.csv`` under ``mm_path`` are each sorted by path and then paired
    positionally; every pair is passed to ``check`` in a process pool.

    Args:
        ab_path: Directory searched for the AddressBase CSV files.
        mm_path: Directory searched for the MasterMap CSV files.
    """
    ab_paths = sorted(glob.glob(os.path.join(ab_path, "*.gml.csv.filtered.csv")))
    mm_paths = sorted(glob.glob(os.path.join(mm_path, "*.gml.csv")))

    # Explicit comparison instead of `assert`: assertions are removed under
    # `python -O`, which would silently drop this diagnostic.
    if len(ab_paths) != len(mm_paths):
        print(ab_paths)
        print(mm_paths)
        # Processing still continues; zip() below stops at the shorter list,
        # so make the truncation visible to the operator.
        print(
            "Warning: found {} AddressBase files but {} MasterMap files; "
            "only the first {} positional pairs will be checked".format(
                len(ab_paths), len(mm_paths), min(len(ab_paths), len(mm_paths))
            ),
            file=sys.stderr,
        )

    zipped_paths = zip(ab_paths, mm_paths)

    # parallel map over tiles
    with Pool() as p:
        p.starmap(check, zipped_paths)
|
|
|
|
def _read_column(path, column):
    """Return the set of distinct values in ``column`` of the CSV at ``path``."""
    # newline='' hands line-ending handling to the csv module, which is needed
    # for quoted fields containing embedded newlines to be parsed correctly.
    with open(path, 'r', newline='') as fh:
        return {row[column] for row in csv.DictReader(fh)}


def _write_lines(path, values):
    """Write ``values`` to ``path`` one per line, in a stable sorted order."""
    with open(path, 'w') as fh:
        # key=str keeps sorting safe if a short row produced a None value.
        fh.writelines("{}\n".format(value) for value in sorted(values, key=str))


def check(ab_path, mm_path):
    """Compare the TOIDs of one AddressBase file against one MasterMap file.

    Reads the ``toid`` column of ``ab_path`` and the ``fid`` column of
    ``mm_path``, prints a one-line summary of the counts, and writes two text
    files next to ``ab_path``:

    * ``missing_toids_<tile>.txt`` - values present in AddressBase but not in
      MasterMap
    * ``ab_toids_<tile>.txt`` - every distinct AddressBase value

    ``<tile>`` is the part of ``ab_path``'s file name before the first dot.
    Both files are written in sorted order so repeated runs give identical
    output.

    Args:
        ab_path: Path to the AddressBase CSV (must have a ``toid`` column).
        mm_path: Path to the MasterMap CSV (must have a ``fid`` column).

    Raises:
        KeyError: If the expected column is absent from a file with data rows.
        OSError: If an input cannot be read or an output cannot be written.
    """
    tile = os.path.basename(ab_path).split(".")[0]
    output_base = os.path.dirname(ab_path)

    ab_toids = _read_column(ab_path, 'toid')
    mm_toids = _read_column(mm_path, 'fid')

    missing = ab_toids - mm_toids
    print(tile, "MasterMap:", len(mm_toids), "Addressbase:", len(ab_toids), "AB but not MM:", len(missing))

    _write_lines(os.path.join(output_base, 'missing_toids_{}.txt'.format(tile)), missing)
    _write_lines(os.path.join(output_base, 'ab_toids_{}.txt'.format(tile)), ab_toids)
|
|
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: expects exactly two positional arguments, the
    # AddressBase directory followed by the MasterMap directory.
    if len(sys.argv) != 3:
        print("Usage: check_ab_mm_match.py ./path/to/addressbase/dir ./path/to/mastermap/dir")
        # sys.exit rather than the bare exit() helper, which is injected by
        # the site module and is not guaranteed to exist (e.g. python -S).
        sys.exit(-1)
    main(sys.argv[1], sys.argv[2])
|