population added + help improved

This commit is contained in:
Kian 2024-09-10 17:40:50 -04:00
parent 87b0db002a
commit 52be6c060e
7 changed files with 214 additions and 41 deletions

classes.py Normal file
View File

@ -0,0 +1,12 @@
from enum import Enum
class City(str, Enum):
    """Closed set of city identifiers.

    Mixing in ``str`` makes every member a real string, so a member compares
    equal to its plain-text value (``City.mtl == "mtl"``).
    """

    mtl = "mtl"  # NOTE(review): presumably Montreal - confirm
class DBMode(str, Enum):
    """Values accepted for the database ``mode`` option when pushing data.

    Mixing in ``str`` makes every member a real string, so a member compares
    equal to its plain-text value (``DBMode.drop == "drop"``).
    """

    drop = "drop"
    append = "append"
class RTMode(str, Enum):
    """Closed set with two members, ``online`` and ``offline``.

    Mixing in ``str`` makes every member a real string, so a member compares
    equal to its plain-text value (``RTMode.online == "online"``).
    """

    online = "online"
    offline = "offline"

View File

@ -1,11 +1,13 @@
from .population import process_population, push_population,population_write from .population import process_buffer_population, push_population,write_population
from .network import process_network, push_network, network_write from .network import process_network, push_network, network_write
from .metro import process_metro, push_metro, metro_write from .metro import process_metro, push_metro, metro_write
from .bus import process_bus, push_bus, bus_write from .bus import process_bus, push_bus, bus_write
from .helpers import buffer_creator
__all__ = [ __all__ = [
'process_population', 'push_population', 'population_write', 'process_buffer_population', 'push_population', 'write_population',
'process_network', 'push_network', 'network_write', 'process_network', 'push_network', 'network_write',
'process_metro', 'push_metro', 'metro_write', 'process_metro', 'push_metro', 'metro_write',
'process_bus', 'push_bus', 'bus_write' 'process_bus', 'push_bus', 'bus_write',
] ]

functions/helpers.py Normal file
View File

@ -0,0 +1,16 @@
def buffer_creator(file,divider,start_line, chunk_size):
    """Read the next chunk of lines from a text file.

    Lines up to and including ``start_line`` are skipped. Every following
    line is collected until ``chunk_size`` lines whose stripped content
    equals ``divider`` have been seen, or the file ends.

    Args:
        file: Path of the UTF-8 text file to read.
        divider: Text that marks the end of one record; compared against
            each line after ``strip()``.
        start_line: Number of leading lines to skip (lines already consumed
            by a previous call).
        chunk_size: Number of divider lines to collect before stopping.

    Returns:
        A tuple ``(current_line, text)``: the 1-based number of the last line
        read (pass it back as ``start_line`` to continue), and the collected
        lines joined with a single space. ``text`` is empty when nothing was
        left to read.
    """
    buffer = []
    current_line = 0
    divider_count = 0
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            current_line += 1
            if current_line <= start_line:
                continue
            # Collect the line itself (divider lines included); without this
            # the returned text is always empty.
            buffer.append(line)
            if line.strip() == divider:
                divider_count += 1
                if divider_count == chunk_size:
                    break
    return current_line, ' '.join(buffer)

View File

@ -1,9 +1,46 @@
from bs4 import BeautifulSoup
import pandas,geopandas, pyproj, re, os, datetime
from shapely.geometry import Point
from sqlalchemy import create_engine
from geoalchemy2 import Geometry, WKTElement
def process_population(data, cleandata): def camel_to_snake(name):
print(data, cleandata) return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()
def process_buffer_population(data):
    """Parse a chunk of population XML text into a DataFrame.

    One row is produced per ``<person>`` element found in ``data``.

    Args:
        data: Text containing ``<person>`` elements.

    Returns:
        A ``pandas.DataFrame`` with one row per person. Columns are ``id``,
        ``coordinates`` (a shapely ``Point`` built as ``Point(lon, lat)`` in
        EPSG:4326), ``time`` (the ``end_time`` of the first activity of the
        first plan), plus one column per ``<attribute>`` element, named with
        the snake_case form of its ``name``.
    """
    transformer = pyproj.Transformer.from_crs('EPSG:2950', 'EPSG:4326', always_xy=True)
    elements = BeautifulSoup(data,'html.parser')
    person_list = []
    for person in elements.find_all('person'):
        person_obj = {}
        person_obj['id'] = person['id']
        activity = person.find('plan').find('activity')
        # always_xy=True means (x, y) in and (lon, lat) out, in that order.
        # XML attribute values are strings, so convert them before projecting.
        lon, lat = transformer.transform(float(activity['x']), float(activity['y']))
        person_obj['coordinates'] = Point(lon, lat)
        person_obj['time'] = activity['end_time']
        for attr in person.find_all('attribute'):
            person_obj[camel_to_snake(attr['name'])] = attr.get_text()
        # Keep the finished record; previously nothing was ever collected.
        person_list.append(person_obj)
    return pandas.DataFrame(person_list)
def push_population(data,mode): def push_population(data,mode):
print(data,mode) GDF = geopandas.GeoDataFrame(data, crs='EPSG:4326')
GDF['geom'] = GDF['coordinates'].apply(lambda x: WKTElement(x.wkt, srid=os.getenv("SRID")))
engine = create_engine(f'postgresql://{os.getenv("USER")}:{os.getenv("PASS")}@{os.getenv("HOST_NAME")}/{os.getenv("DATA_BASE")}', echo=False)
dtype={'geom': Geometry('Point', srid=os.getenv("SRID"))},
def population_write(data): def write_population(data, file):
print(data) directory = file.parent
id = datetime.datetime.now().strftime("%Y%m%d")
csv = directory / (file.stem + id +".csv")
if csv.exists():
data.to_csv(csv, mode='a',index=False)

View File

@ -1,55 +1,118 @@
import typer, geopandas, requests, shapely import os, typer
from bs4 import BeautifulSoup from dotenv import load_dotenv
from zipfile import ZipFile from rich import print
from typing_extensions import Annotated from typing_extensions import Annotated
from typing import Optional from typing import Optional
from pathlib import Path from pathlib import Path
from enum import Enum from typing import Tuple
from functions import process_population, push_population, population_write from classes import City, DBMode, RTMode
from functions import process_network, push_network, network_write from functions import buffer_creator, process_buffer_population, push_population, write_population
from functions import process_metro, push_metro, metro_write from styles import print_help
from functions import process_bus, push_bus, bus_write
app = typer.Typer() called= "population"
app = typer.Typer(rich_markup_mode="rich")
class City(str, Enum): def error_printer(text):
mtl = "mtl" print(f'[bold red]ERROR:[/bold red] [bold]{text}[/bold]')
def success_printer(text):
print(f'[bold green]SUCCESS:[/bold green] [bold]{text}[/bold]')
def info_printer(text):
print(f'[bold blue]INFO:[/bold blue] [bold]{text}[/bold]')
def notice_printer(text):
print(f'[bold yellow]NOTICE:[/bold yellow] [bold]{text}[/bold]')
class DBMode(str, Enum): @app.command(print_help())
drop = "drop"
amend = "amend"
class RTMode(str, Enum):
online = "online"
offline = "offline"
def population( def population(
file: Annotated[Path, typer.Argument(help="Relative path to the file.", show_default=False)], file: Annotated[Path, typer.Argument(help="Provide the relative path to the [yellow bold underline]XML file[/yellow bold underline].", show_default=False)],
cleandata: bool = typer.Option(False, "--cleandata", "-cd", help="Clean the data if this flag is used."), tables: list[str] = typer.Argument(..., help="Tables to include: [underline bold]agents[/underline bold], [underline bold]travels[/underline bold]. Use [underline bold]all[/underline bold] for everything.",show_default=False),
push: bool = typer.Option(False, "--push", "-p", help="Push the data into Database."), range: Tuple[int, int] = typer.Option(None, "--range", "-r", help="Specify the start and end of the chunk range to be processed.", show_default=False),
mode: Optional[DBMode] = typer.Option(None, help="Specify either 'amend' or 'drop' when pushing data", show_default=False), log: bool = typer.Option(False, "--log", "-l", help="Creates a Log file in the same directory to track the progress. Useful for large files that might be intrupted."),
cleandata: bool = typer.Option(False, "--cleandata", "-cd", help="Drop the rows that have missing values."),
push: bool = typer.Option(False, "--push", "-p", help="Save the output directly to the database When mentioned. Otherwise, Saves as a [green bold]CSV file[/green bold] in the input directory]"),
mode: Optional[DBMode] = typer.Option(None, help="Specify either [underline]'append'[/underline] or [underline]'drop'[/underline] when pushing data", show_default=False),
): ):
if not file.exists(): all_tables = ["agents","travels"]
print("File did does not exist!") common_tables = [item for item in tables if item in ["all"] + all_tables]
if len(common_tables) == 0:
error_printer("Incorrect table input")
raise typer.Exit()
elif "all" in common_tables:
common_tables = all_tables
info_printer(f"Tables to inlude: {common_tables}")
if not file.exists():
error_printer("File not found")
raise typer.Exit()
f = open(file, 'r', encoding='utf-8')
success_printer("File Opened")
error_printer("Unable to read file")
raise typer.Exit()
count = sum(1 for _ in f)
if count == 0:
error_printer("File empty")
raise typer.Exit() raise typer.Exit()
data = process_population(file,cleandata)
if push:
push_population(data, mode)
else: else:
population_write(data) success_printer(f"{count + 1} lines read")
max_chunk = 0
with open(file,'r',encoding='utf-8') as f:
for line in f:
if line.strip() == os.getenv("DIVIDER"):
max_chunk = max_chunk + 1
if max_chunk > 0:
success_printer(f"{max_chunk} Chunks found")
elif max_chunk == 0:
error_printer("Unable to find Chunks")
raise typer.Exit()
if not range:
range = [0,max_chunk-2]
info_printer(f"Chunk Range: {range}")
directory = file.parent
log_file = directory / (file.stem + ".log")
if not log:
notice_printer("Log file not created")
if log_file.exists():
notice_printer(f"Log file {log_file} already exists")
info_printer(f"Log file {log_file} created")
current_chunk = 0
processed_line = 0
if log:
with open(log_file,'r',encoding='utf-8') as l:
log_list = l.read().splitlines()
while current_chunk < max_chunk:
if log and current_chunk in log_list: continue
processed_line, buffer = buffer_creator(file, os.getenv("DIVIDER"), processed_line, int(os.getenv("CHUNK_SIZE")))
dataframe = process_buffer_population(buffer)
if cleandata:
dataframe = dataframe.dropna()
if push:
push_population(dataframe, mode)
if log:
f = open(log_file, "a")
current_chunk += 1
@app.command() @app.command()
def network( def network(
file: Annotated[Path, typer.Argument(help="Relative path to the file.", show_default=False)], file: Annotated[Path, typer.Argument(help="Relative path to the file.", show_default=False)],
cleandata: bool = typer.Option(False, "--cleandata", "-cd", help="Clean the data if this flag is used."), cleandata: bool = typer.Option(False, "--cleandata", "-cd", help="Clean the data if this flag is used."),
push: bool = typer.Option(False, "--push", "-p", help="Push the data into Database."), push: bool = typer.Option(False, "--push", "-p", help="Push the data into Database.\nIf you want the output to be saved in [green bold].csv[/green bold] format, do not mention this flag."),
mode: Optional[DBMode] = typer.Option(None, help="Specify either 'amend' or 'drop' when pushing data", show_default=False), mode: Optional[DBMode] = typer.Option(None, help="Specify either 'amend' or 'drop' when pushing data", show_default=False),
): ):
if not file.exists(): if not file.exists():
print("File did does not exist!") error_parser("File did does not exist!")
raise typer.Exit() raise typer.Exit()
data = process_network(file,cleandata) data = process_network(file,cleandata)
if push: if push:

styles/__Init__.py Normal file
View File

@ -0,0 +1,5 @@
from .help import print_help
__all__ = [

styles/help.py Normal file
View File

@ -0,0 +1,38 @@
import sys
from rich.table import Table
from rich.console import Console
from rich.console import Group
from rich.padding import Padding
from rich.panel import Panel
def population_help():
    """Build the "About" help panel for the population command.

    Returns:
        A rich ``Padding`` wrapping a ``Panel`` titled "About". The panel
        contains three intro lines, a table built from the column headers
        below, and two closing notes.
    """
    line1 = "This CLI tool processes [yellow bold]MATSim population XML files[/yellow bold] and prepares the data for storage in either [green bold].csv[/green bold] file or in [blue bold]PostgreSQL[/blue bold] database with [blue bold]PostGIS[/blue bold] integration."
    line2 = "It extracts key data such as coordinates, converting them into a format ready for geospatial.\nUse the available [underline bold]options[/underline bold] to modify the behavior, such as [underline bold]cleaning[/underline bold] the data with missing values, or choosing to either [underline bold]replace[/underline bold] or [underline bold]append[/underline bold] data in the target table."
    line3 = "The resulting table structure includes columns such as the following:"
    line4 = "[red bold]NOTE:[/red bold] Ensure PostgreSQL connection details are provided via a [underline bold bright_cyan].env[/underline bold bright_cyan] file.\n[red bold]NOTE:[/red bold] By default if a [underline].log[/underline] exist with the same name in the same directory of file, It will use that to prcoess the file"
    table = Table("id","lon","lat","geom","time","age","sex","person_id","economic_sector","household_id","household_income")
    lines = f"{line1} \n{line2} \n{line3}"
    # Assemble inside-out: text + table + notes -> inner padding -> titled
    # panel -> outer padding.
    content = Group(lines, Padding(table, (1,0)), line4)
    about = Panel(Padding(content, (1,1)), title="About", title_align="left")
    return Padding(about, (1,0,0,0))
def network_help():
    """Return a rich ``Panel`` titled "About" wrapping the padded help text."""
    body = Padding("Hello network", (1,1))
    return Panel(body, title="About", title_align="left")
def metro_help():
    """Return a rich ``Panel`` titled "About" wrapping the padded help text."""
    # NOTE(review): the text still reads "Hello network"; this looks like a
    # copy-pasted placeholder - confirm the intended wording.
    body = Padding("Hello network", (1,1))
    return Panel(body, title="About", title_align="left")
def bus_help():
    """Return a rich ``Panel`` titled "About" wrapping the padded help text."""
    # NOTE(review): the text still reads "Hello network"; this looks like a
    # copy-pasted placeholder - confirm the intended wording.
    body = Padding("Hello network", (1,1))
    return Panel(body, title="About", title_align="left")
def print_help():
console = Console()
if "--help" in sys.argv or "-h" in sys.argv:
if (sys.argv[1] == "population"):
elif (sys.argv[1] == "network"):