network functions added pt.1

2024-11-14 17:40:28 -05:00 · 2024-09-11 10:56:17 -04:00 · 2024-09-11 10:56:17 -04:00 · e8dda40c9f
commit e8dda40c9f
parent 3541f9f10e
5 changed files with 132 additions and 48 deletions
--- a/functions/Init.py
+++ b/functions/Init.py
@ -1,11 +1,11 @@
-from .population import process_buffer_population, process_travels, push_population,write_population
-from .network import process_network, push_network, network_write
+from .population import process_buffer_population, process_travels
+from .network import process_nodes, process_links, process_links_attr
 from .metro import process_metro, push_metro, metro_write
 from .bus import process_bus, push_bus, bus_write
-from .helpers import buffer_creator
+# The shared output helpers now live in .helpers, and .network exposes the
+# node/link parsers, so import each name from the module that defines it.
+from .helpers import buffer_creator, push_to_db_coords, write_to_csv

 __all__ = [
-    'process_buffer_population', 'process_travels','push_population', 'write_population', 
-    'process_network', 'push_network', 'network_write',
+    'process_buffer_population', 'process_travels','push_to_db_coords', 'write_to_csv', 
+    'process_nodes', 'process_links', 'process_links_attr',
     'process_metro', 'push_metro', 'metro_write',
     'process_bus', 'push_bus', 'bus_write',
--- a/functions/helpers.py
+++ b/functions/helpers.py
@ -1,3 +1,7 @@
+import geopandas, os, datetime
+from sqlalchemy import create_engine
+from geoalchemy2 import Geometry, WKTElement
+
 def buffer_creator(file,divider,start_line, chunk_size):
  buffer = []
  line_number = start_line
@ -13,4 +17,25 @@ def buffer_creator(file,divider,start_line, chunk_size):
        continue
      buffer.append(line.strip())
  return current_line,(' ').join(buffer)
-  
+  
+def push_to_db_coords(name,data,mode):
+    """Write ``data`` to the database table ``name`` with a point ``geom`` column.
+
+    Connection settings are read from the ``USER``, ``PASS``, ``HOST_NAME`` and
+    ``DATA_BASE`` environment variables; ``SRID`` and ``CHUNK_SIZE`` configure
+    the geometry column and the insert batch size.
+
+    Args:
+        name: Name of the target table.
+        data: Tabular data with a ``coordinates`` column whose values expose
+            ``.wkt`` (presumably shapely points -- confirm against callers).
+        mode: Forwarded to ``to_sql`` as ``if_exists``.
+
+    Raises:
+        ValueError: If ``SRID`` or ``CHUNK_SIZE`` is set but is not an integer.
+    """
+    # os.getenv() always returns strings, but the SRID and the chunk size are
+    # numeric settings, so convert them once up front. The SRID falls back to
+    # 4326 to match the CRS assigned to the GeoDataFrame below.
+    srid = int(os.getenv("SRID", "4326"))
+    raw_chunk_size = os.getenv("CHUNK_SIZE")
+    chunk_size = int(raw_chunk_size) if raw_chunk_size else None
+
+    gdf = geopandas.GeoDataFrame(data, crs='EPSG:4326')
+    gdf['geom'] = gdf['coordinates'].apply(lambda x: WKTElement(x.wkt, srid=srid))
+    engine = create_engine(f'postgresql://{os.getenv("USER")}:{os.getenv("PASS")}@{os.getenv("HOST_NAME")}/{os.getenv("DATA_BASE")}', echo=False)
+    try:
+        gdf.to_sql(
+            name=name,
+            con=engine,
+            if_exists=mode,
+            chunksize=chunk_size,
+            dtype={'geom': Geometry('Point', srid=srid)},
+            index=False
+        )
+    finally:
+        # A new engine is created on every call, so release its pooled
+        # connections instead of leaving them open.
+        engine.dispose()
+
+def write_to_csv(name,data, file):
+    """Write ``data`` to a dated CSV file next to ``file``.
+
+    The output path is ``<file.stem>-<name>-<YYYYMMDD>.csv`` in the directory
+    of ``file``. If that CSV already exists the rows are appended to it.
+
+    Args:
+        name: Table label embedded in the output file name.
+        data: Object providing ``to_csv`` (a pandas DataFrame in the callers
+            visible in this change).
+        file: ``pathlib.Path`` of the input file; only its parent and stem
+            are used.
+    """
+    stamp = datetime.datetime.now().strftime("%Y%m%d")
+    csv_path = file.parent / f"{file.stem}-{name}-{stamp}.csv"
+    appending = csv_path.exists()
+    # Only emit the header when the file is created; repeating it on every
+    # appended chunk would leave header rows in the middle of the data.
+    data.to_csv(
+        csv_path,
+        mode='a' if appending else 'w',
+        header=not appending,
+        index=False,
+    )
--- a/functions/network.py
+++ b/functions/network.py
@ -1,9 +1,32 @@
+from bs4 import BeautifulSoup
+import pandas,pyproj, re
+from shapely.geometry import Point

-def process_network(data, cleandata):
-    print(data, cleandata)
+def process_nodes(data):
+    """Return a DataFrame with one row per ``<node>`` element found in ``data``.
+
+    ``data`` is parsed as XML and the attributes of every ``node`` element
+    become the columns of that element's row.
+    """
+    soup = BeautifulSoup(data,'lxml-xml')
+    rows = [dict(node.attrs) for node in soup.find_all("node")]
+    return pandas.DataFrame(rows)

-def push_network(data,mode):
-    print(data,mode)
+def process_links(data):
+    """Return a DataFrame with one row per ``<link>`` element found in ``data``.
+
+    ``data`` is parsed as XML and the attributes of every ``link`` element
+    become the columns of that element's row.
+    """
+    soup = BeautifulSoup(data,'lxml-xml')
+    rows = [dict(link.attrs) for link in soup.find_all("link")]
+    return pandas.DataFrame(rows)

-def network_write(data):
-    print(data)
+def process_links_attr(data):
+    """Return a DataFrame of the nested ``<attribute>`` values of each link.
+
+    For every ``<link>`` element in ``data`` that contains at least one
+    ``<attribute>`` child, a row is built that maps each attribute's ``name``
+    to its text, plus an ``id`` column holding the link's own ``id``
+    attribute. Links without ``<attribute>`` children are skipped.
+    """
+    rows = []
+    soup = BeautifulSoup(data,'lxml-xml')
+    for link in soup.find_all("link"):
+        attributes = link.find_all("attribute")
+        if not attributes:
+            continue
+        row = {attr["name"]: attr.get_text() for attr in attributes}
+        # Tag.get() reads an XML attribute. ``link.getattr`` would be resolved
+        # as a search for a child tag named "getattr" and fail when called.
+        # Assigned last so the link id wins over an attribute named "id".
+        row["id"] = link.get("id")
+        rows.append(row)
+
+    return pandas.DataFrame(rows)
--- a/functions/population.py
+++ b/functions/population.py
@ -1,8 +1,6 @@
 from bs4 import BeautifulSoup
-import pandas,geopandas, pyproj, re, os, datetime
+import pandas,pyproj, re
 from shapely.geometry import Point
-from sqlalchemy import create_engine
-from geoalchemy2 import Geometry, WKTElement

 def camel_to_snake(name):
    return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()
@ -52,24 +50,3 @@ def process_travels(data):
    return pandas.DataFrame(activities_list)


-def push_population(name,data,mode):
-    GDF = geopandas.GeoDataFrame(data, crs='EPSG:4326')
-    GDF['geom'] = GDF['coordinates'].apply(lambda x: WKTElement(x.wkt, srid=os.getenv("SRID")))
-    engine = create_engine(f'postgresql://{os.getenv("USER")}:{os.getenv("PASS")}@{os.getenv("HOST_NAME")}/{os.getenv("DATA_BASE")}', echo=False)
-    GDF.to_sql(
-        name=name,
-        con=engine,
-        if_exists=mode,
-        chunksize=os.getenv("CHUNK_SIZE"),
-        dtype={'geom': Geometry('Point', srid=os.getenv("SRID"))},
-        index=False
-    )
-
-def write_population(name,data, file):
-    directory = file.parent
-    id = datetime.datetime.now().strftime("%Y%m%d")
-    csv = directory / (file.stem + f"-{name}-{id}.csv")
-    if csv.exists():
-         data.to_csv(csv, mode='a',index=False)
-    else:
-        data.to_csv(csv,index=False)
--- a/main.py
+++ b/main.py
@ -10,7 +10,8 @@ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskPr
 import time

 from classes import City, DBMode, RTMode
-from functions import buffer_creator, process_buffer_population,process_travels, push_population, write_population
+from functions import buffer_creator, process_buffer_population,process_travels, push_to_db_coords, write_to_csv
+from functions import process_nodes,process_links,process_links_attr
 from styles import print_help

 called= "population"
@ -112,16 +113,16 @@ def population(
                if cleandata:
                    dataframe = dataframe.dropna()
                if push:
-                    push_population("agents",dataframe, mode)
+                    push_to_db_coords("agents",dataframe, mode)
                else:
-                    write_population("agents",dataframe, file)
+                    write_to_csv("agents",dataframe, file)

            if "travels" in common_tables:
                dataframe_travels = process_travels(buffer)
                if push:
-                    push_population("travels",dataframe_travels, mode)
+                    push_to_db_coords("travels",dataframe_travels, mode)
                else:
-                    write_population("travels",dataframe_travels, file)
+                    write_to_csv("travels",dataframe_travels, file)
        
            if log:
                f = open(log_file, "a")
@ -135,19 +136,77 @@ def population(

@app.command()
 def network(
-    file: Annotated[Path, typer.Argument(help="Relative path to the file.", show_default=False)], 
-    cleandata: bool = typer.Option(False, "--cleandata", "-cd", help="Clean the data if this flag is used."),
-    push: bool = typer.Option(False, "--push", "-p", help="Push the data into Database.\nIf you want the output to be saved in [green bold].csv[/green bold] format, do not mention this flag."),
-    mode: Optional[DBMode] = typer.Option(None, help="Specify either 'amend' or 'drop' when pushing data", show_default=False),
+    file: Annotated[Path, typer.Argument(help="Provide the relative path to the [yellow bold underline]XML file[/yellow bold underline].", show_default=False)], 
+    tables: list[str] = typer.Argument(..., help="Tables to include: [underline bold]nodes[/underline bold], [underline bold]links[/underline bold], [underline bold]links_attr[/underline bold]. Use [underline bold]all[/underline bold] for everything.",show_default=False),
+    cleandata: bool = typer.Option(False, "--cleandata", "-cd", help="Drop the rows that have missing values."),
+    push: bool = typer.Option(False, "--push", "-p", help="Save the output directly to the database When mentioned. Otherwise, Saves as a [green bold]CSV file[/green bold] in the input directory]"),
+    mode: Optional[DBMode] = typer.Option(None, help="Specify either [underline]'append'[/underline] or [underline]'drop'[/underline] when pushing data", show_default=False),
     ):
-    if not file.exists():
-        error_parser("File did does not exist!")
+    # Documented with comments rather than a docstring because Typer would
+    # surface a docstring as this command's help text.
+    #
+    # Reads the network XML file, splits it at the lines equal to the DIVIDER
+    # environment variable and exports the requested tables (nodes, links,
+    # links_attr) either to the database (--push) or to CSV files.
+    #
+    # Assumes the layout is: divider, nodes section, divider, links section,
+    # divider -- TODO confirm against the input files.
+    # NOTE(review): push_to_db_coords reads a 'coordinates' column, which the
+    # node/link parsers do not appear to produce -- verify before using --push.
+    console = Console()
+    all_tables = ["nodes","links","links_attr"]
+    common_tables = [item for item in tables if item in ["all"] + all_tables]
+    if len(common_tables) == 0:
+        error_printer("Incorrect table input")
+        raise typer.Exit()
+    elif "all" in common_tables:
+        common_tables = all_tables
+    info_printer(f"Tables to inlude: {common_tables}")
+    if not file.exists():
+        error_printer("File not found")
+        raise typer.Exit()
+    divider = os.getenv("DIVIDER")
+    if not divider:
+        # Without a divider no section boundary can ever match, which would
+        # otherwise end in a silent "Processing complete!" with no output.
+        error_printer("DIVIDER environment variable is not set")
+        raise typer.Exit()
+    try:
+        with open(file, 'r', encoding='utf-8') as f:
+            count = sum(1 for _ in f)
+    except (OSError, UnicodeDecodeError):
+        error_printer("Unable to read file")
+        raise typer.Exit()
+    success_printer("File Opened")
+    if count == 0:
+        error_printer("File empty")
         raise typer.Exit()
-    data = process_network(file,cleandata)
-    if push:
-        push_network(data, mode)
     else:
-        network_write(data)
+        # ``count`` already is the number of lines; no +1 needed.
+        success_printer(f"{count} lines read")
+
+    def export(table, dataframe):
+        # Shared clean / push / write steps for one parsed table.
+        if cleandata:
+            dataframe = dataframe.dropna()
+        if push:
+            push_to_db_coords(table, dataframe, mode)
+        else:
+            write_to_csv(table, dataframe, file)
+
+    buffer = []
+    divider_count = 0
+    with open(file,'r',encoding='utf-8') as f:
+        for line in f:
+            if line.strip() == divider:
+                divider_count += 1
+                # A section is parsed exactly once, when the divider closing
+                # it is reached. The parsers expect a string, not a list.
+                markup = "".join(buffer)
+                if divider_count == 2:
+                    if "nodes" in common_tables:
+                        export("nodes", process_nodes(markup))
+                    # Start the links section with a clean buffer.
+                    buffer = []
+                elif divider_count == 3:
+                    if "links" in common_tables:
+                        export("links", process_links(markup))
+                    if "links_attr" in common_tables:
+                        export("links_attr", process_links_attr(markup))
+                    # Nothing after the third divider is used.
+                    break
+            if divider_count >= 1:
+                buffer.append(line)
+    console.print("[green]Processing complete![/green]")

@app.command()
 def metro(