diff --git a/functions/__Init__.py b/functions/__Init__.py
index 1a55335..14a466f 100644
--- a/functions/__Init__.py
+++ b/functions/__Init__.py
@@ -1,12 +1,12 @@
-from .population import process_buffer_population, process_travels, push_to_db_coords,write_to_csv
-from .network import process_network, push_network, network_write
+from .population import process_buffer_population, process_travels
+from .network import process_nodes, process_links, process_links_attr
 from .metro import process_metro, push_metro, metro_write
 from .bus import process_bus, push_bus, bus_write
-from .helpers import buffer_creator
+from .helpers import buffer_creator, push_to_db_coords, write_to_csv
 
 __all__ = [
     'process_buffer_population', 'process_travels','push_to_db_coords', 'write_to_csv',
-    'process_network', 'push_network', 'network_write',
+    'process_nodes', 'process_links', 'process_links_attr',
     'process_metro', 'push_metro', 'metro_write',
     'process_bus', 'push_bus', 'bus_write',
     'buffer_creator'
diff --git a/functions/network.py b/functions/network.py
index 655eb7a..6666093 100644
--- a/functions/network.py
+++ b/functions/network.py
@@ -3,30 +3,32 @@ import pandas,pyproj, re
 from shapely.geometry import Point
 
 def process_nodes(data):
+    transformer = pyproj.Transformer.from_crs('EPSG:2950', 'EPSG:4326', always_xy=True)
     ELEMENT_LIST = []
-    elements = BeautifulSoup(data,'lxml-xml')
-    for element in elements.find_all("node"):
-        ELEMENT_LIST.append(dict(element.attrs))
+    for node in data:
+        attr = dict(node.attrs)
+        # always_xy=True: transform() takes (x, y) and returns (lon, lat) for EPSG:4326
+        lon, lat = transformer.transform(float(attr['x']), float(attr['y']))
+        attr['coordinates'] = Point(lon, lat)
+        ELEMENT_LIST.append(attr)
     return pandas.DataFrame(ELEMENT_LIST)
 
 def process_links(data):
     ELEMENT_LIST = []
-    elements = BeautifulSoup(data,'lxml-xml')
-    for element in elements.find_all("link"):
+    for element in data:
         ELEMENT_LIST.append(dict(element.attrs))
     return pandas.DataFrame(ELEMENT_LIST)
 
 def process_links_attr(data):
     ELEMENT_LIST = []
-    elements = BeautifulSoup(data,'lxml-xml')
-    for element in elements.find_all("link"):
-        ELEMENT_DICT = {}
-        if element.find_all("attribute"):
-            for attr in element.find_all("attribute"):
-                ELEMENT_DICT.update({attr["name"]: attr.get_text()})
-        else:
+    for element in data:
+        if not element.find("attributes"):
             continue
-        ELEMENT_DICT["id"]=element.getattr("id")
+        ELEMENT_DICT = {"id": element.get("id")}
+        for attr in element.find("attributes").find_all("attribute"):
+            attr_name = attr.get("name")
+            attr_value = attr.get_text()
+            # normalise names like "osm:way:highway" into valid column names
+            attr_name = attr_name.replace(":", "_")
+            ELEMENT_DICT[attr_name] = attr_value
+        ELEMENT_LIST.append(ELEMENT_DICT)
-    return pandas.DataFrame(ELEMENT_LIST)
\ No newline at end of file
+    return pandas.DataFrame(ELEMENT_LIST)
diff --git a/main.py b/main.py
index 1de72b6..1f240f7 100644
--- a/main.py
+++ b/main.py
@@ -1,4 +1,5 @@
-import os, typer
+import os, math, typer
+from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 from rich import print
 from typing_extensions import Annotated
@@ -14,6 +15,8 @@ from functions import buffer_creator, process_buffer_population,process_travels,
 from functions import process_nodes,process_links,process_links_attr
 from styles import print_help
 
+import xml.etree.ElementTree as ET
+
 called= "population"
 app = typer.Typer(rich_markup_mode="rich")
 load_dotenv()
@@ -94,7 +97,10 @@ def population(
         TaskProgressColumn(),
         console=console
     ) as progress:
-        task = progress.add_task("[cyan]Processing chunks...", total=max_chunk)
+        if max_chunk > 2:
+            task = progress.add_task("[cyan]Processing chunks...", total=max_chunk)
+        else:
+            task = progress.add_task("[cyan]Processing chunks...", total=max_chunk, visible=False)
         current_chunk = 0
         processed_line = 0
@@ -166,47 +172,91 @@ def network(
         error_printer("File empty")
         raise typer.Exit()
     else:
-        success_printer(f"{count + 1} lines read")
+        success_printer(f"{count + 1} lines found")
     f.close()
     BUFFER = []
     DEVIDER_COUNT = 0
-    with open(file,'r',encoding='utf-8') as f:
-        for line in f:
-            if line.strip() == os.getenv("DIVIDER"):
-                DEVIDER_COUNT = DEVIDER_COUNT + 1
-            if DEVIDER_COUNT == 2 and "nodes" in common_tables:
-                dataframe = process_nodes(BUFFER)
-                if cleandata:
-                    dataframe = dataframe.dropna()
-                if push:
-                    push_to_db_coords("nodes", dataframe, mode)
-                else:
-                    write_to_csv("nodes", dataframe,file)
-                BUFFER = []
-            if DEVIDER_COUNT == 3:
-                if "links" in common_tables:
-                    dataframe = process_links(BUFFER)
-                    if cleandata:
-                        dataframe = dataframe.dropna()
-                    if push:
-                        push_to_db_coords("links", dataframe, mode)
-                    else:
-                        write_to_csv("links", dataframe, file)
-                if "links_attr" in common_tables:
-                    dataframe = process_links_attr(BUFFER)
-                    if cleandata:
-                        dataframe = dataframe.dropna()
-                    if push:
-                        push_to_db_coords("links_attr", dataframe, mode)
-                    else:
-                        write_to_csv("links_attr", dataframe, file)
-                BUFFER = []
-            if DEVIDER_COUNT < 1:
-                continue
-            if DEVIDER_COUNT > 2:
-                continue
-            BUFFER.append(line)
-    console.print("[green]Processing complete![/green]")
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TaskProgressColumn(),
+        console=console
+    ) as progress:
+        with open(file,'r',encoding='utf-8') as f:
+            for line in f:
+                if line.strip() == os.getenv("DIVIDER"):
+                    DEVIDER_COUNT = DEVIDER_COUNT + 1
+                    if DEVIDER_COUNT == 1 or DEVIDER_COUNT > 3:
+                        continue
+                    if DEVIDER_COUNT == 2:
+                        if "nodes" not in common_tables:
+                            BUFFER.clear()
+                            continue
+                        try:
+                            element = BeautifulSoup(' '.join(BUFFER), 'lxml-xml')
+                            total_nodes = element.find_all("node")
+                        except Exception:
+                            error_printer("node process failed")
+                            continue
+                        total_chunks = math.ceil(len(total_nodes) / int(os.getenv("CHUNK_SIZE")))
+                        if total_chunks > 2:
+                            node_task = progress.add_task("[cyan]Processing [bold]nodes[/bold] chunks...", total=total_chunks)
+                        else:
+                            node_task = progress.add_task("[cyan]Processing [bold]nodes[/bold] chunks...", total=total_chunks, visible=False)
+                        success_printer(f"Chunk count: {total_chunks}")
+                        current_chunk = 0
+                        while current_chunk < total_chunks:
+                            size = int(os.getenv("CHUNK_SIZE"))
+                            new_chunk = total_nodes[current_chunk * size:min(len(total_nodes), (current_chunk + 1) * size)]
+                            dataframe = process_nodes(new_chunk)
+                            if cleandata:
+                                dataframe = dataframe.dropna()
+                            if push:
+                                push_to_db_coords("nodes", dataframe, mode)
+                            else:
+                                write_to_csv("nodes", dataframe, file)
+                            progress.update(node_task, advance=1)
+                            current_chunk += 1
+                        BUFFER.clear()
+                    if DEVIDER_COUNT == 3:
+                        try:
+                            element = BeautifulSoup(' '.join(BUFFER), 'lxml-xml')
+                            total_links = element.find_all("link")
+                        except Exception:
+                            error_printer("link process failed")
+                            continue
+                        total_chunks = math.ceil(len(total_links) / int(os.getenv("CHUNK_SIZE")))
+                        success_printer(f"Chunk count: {total_chunks}")
+                        if total_chunks > 2:
+                            link_task = progress.add_task("[cyan]Processing [bold]links[/bold] chunks...", total=total_chunks)
+                        else:
+                            link_task = progress.add_task("[cyan]Processing [bold]links[/bold] chunks...", total=total_chunks, visible=False)
+                        current_chunk = 0
+                        while current_chunk < total_chunks:
+                            size = int(os.getenv("CHUNK_SIZE"))
+                            new_chunk = total_links[current_chunk * size:min(len(total_links), (current_chunk + 1) * size)]
+                            if "links" in common_tables:
+                                dataframe = process_links(new_chunk)
+                                if cleandata:
+                                    dataframe = dataframe.dropna()
+                                if push:
+                                    push_to_db_coords("links", dataframe, mode)
+                                else:
+                                    write_to_csv("links", dataframe, file)
+                            if "links_attr" in common_tables:
+                                dataframe = process_links_attr(new_chunk)
+                                if cleandata:
+                                    dataframe = dataframe.dropna()
+                                if push:
+                                    push_to_db_coords("links_attr", dataframe, mode)
+                                else:
+                                    write_to_csv("links_attr", dataframe, file)
+                            progress.update(link_task, advance=1)
+                            current_chunk += 1
+                        BUFFER.clear()
+                        continue
+                    continue
+                BUFFER.append(line.strip())
+    console.print("[green]Processing complete![/green]")
@app.command()
def metro(
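
Note on the coordinate fix in process_nodes: with always_xy=True, pyproj's transform() accepts (x, y) and returns (lon, lat) for EPSG:4326, so the first unpacked value is the longitude. A minimal sketch to verify the axis order (the sample easting/northing values are illustrative, not taken from the dataset):

    from pyproj import Transformer

    # EPSG:2950 is NAD83(CSRS) / MTM zone 8 (Montreal area); always_xy=True
    # makes transform() take (x, y) and return (lon, lat) in EPSG:4326.
    transformer = Transformer.from_crs("EPSG:2950", "EPSG:4326", always_xy=True)
    lon, lat = transformer.transform(300000.0, 5040000.0)  # illustrative coordinates
    print(lon, lat)  # roughly -73.6 45.5: longitude comes first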
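
For context, a minimal sketch of how the reworked parsers are meant to be driven, mirroring the chunking in the network command (assuming a MATSim-style network file; the path and chunk size here are illustrative, since the CLI takes them from its options and the CHUNK_SIZE env var):

    import math
    from bs4 import BeautifulSoup
    from functions import process_nodes, process_links, process_links_attr

    CHUNK_SIZE = 5000  # illustrative; main.py reads os.getenv("CHUNK_SIZE")

    with open("network.xml", encoding="utf-8") as f:  # hypothetical input path
        soup = BeautifulSoup(f.read(), "lxml-xml")

    nodes = soup.find_all("node")  # the process_* functions now take lists of
    links = soup.find_all("link")  # parsed elements, not raw XML text

    for i in range(math.ceil(len(nodes) / CHUNK_SIZE)):
        chunk = nodes[i * CHUNK_SIZE:(i + 1) * CHUNK_SIZE]
        nodes_df = process_nodes(chunk)  # DataFrame with a shapely Point per node
        # ...then dropna()/push_to_db_coords()/write_to_csv() as in the command

    for i in range(math.ceil(len(links) / CHUNK_SIZE)):
        chunk = links[i * CHUNK_SIZE:(i + 1) * CHUNK_SIZE]
        links_df = process_links(chunk)       # one row per <link>, raw attributes
        attrs_df = process_links_attr(chunk)  # flattened <attributes> keyed by link id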