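"""Typer CLI for converting population and network XML exports, along with
metro and bus shapefiles, into CSV files or database tables."""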
import math
import os
import time
from pathlib import Path
from typing import Optional, Tuple

import typer
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from rich import print
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
from typing_extensions import Annotated

from classes import City, DBMode, RTMode
from functions import (
    file_validator,
    buffer_creator,
    process_buffer_population,
    process_travels,
    push_to_db_coords,
    push_to_db_linestring,
    write_to_csv,
    process_nodes,
    process_links,
    process_links_attr,
    error_printer,
    success_printer,
    info_printer,
    notice_printer,
    metro_processing,
    bus_processing,
)
from styles import print_help

called = "population"
app = typer.Typer(rich_markup_mode="rich")
load_dotenv()
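
# Environment variables read by the commands below (loaded from .env):
#   DIVIDER    - the literal line that separates chunks/sections in the XML input
#   CHUNK_SIZE - the number of records handled per chunk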

# Assumes print_help() returns the command's rich help text; the original
# passed it positionally, which Typer would treat as the command name.
@app.command(help=print_help())
def population(
    file: Annotated[Path, typer.Argument(help="Provide the relative path to the [yellow bold underline]XML file[/yellow bold underline].", show_default=False)],
    tables: list[str] = typer.Argument(..., help="Tables to include: [underline bold]agents[/underline bold], [underline bold]travels[/underline bold]. Use [underline bold]all[/underline bold] for everything.", show_default=False),
    range: Tuple[int, int] = typer.Option(None, "--range", "-r", help="Specify the start and end of the chunk range to be processed.", show_default=False),
    log: bool = typer.Option(False, "--log", "-l", help="Create a log file in the same directory to track progress. Useful for large files that might be interrupted."),
    cleandata: bool = typer.Option(False, "--cleandata", "-cd", help="Drop the rows that have missing values."),
    push: bool = typer.Option(False, "--push", "-p", help="Save the output directly to the database. Otherwise, saves it as a [green bold]CSV file[/green bold] in the input directory."),
    mode: Optional[DBMode] = typer.Option(None, help="Specify either [underline]'append'[/underline] or [underline]'drop'[/underline] when pushing data", show_default=False),
):
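    """
    Parse a population XML file in chunks and export the selected tables
    (agents, travels) to CSV files or directly to the database.
    """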
    console = Console()
    all_tables = ["agents", "travels"]
    common_tables = [item for item in tables if item in ["all"] + all_tables]
    if len(common_tables) == 0:
        error_printer("Incorrect table input")
        raise typer.Exit()
    elif "all" in common_tables:
        common_tables = all_tables
    info_printer(f"Tables to include: {common_tables}")
    file_validator(file)
    # Count the chunks by scanning for the divider line.
    max_chunk = 0
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip() == os.getenv("DIVIDER"):
                max_chunk += 1
    if max_chunk > 0:
        success_printer(f"{max_chunk} chunks found")
    else:
        error_printer("Unable to find chunks")
        raise typer.Exit()
    if not range:
        range = [0, max_chunk - 2]
    info_printer(f"Chunk range: {range}")
    directory = file.parent
    log_file = directory / (file.stem + ".log")
    if not log:
        notice_printer("Log file not created")
    else:
        if log_file.exists():
            notice_printer(f"Log file {log_file} already exists")
        else:
            log_file.touch()
            info_printer(f"Log file {log_file} created")
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        console=console
    ) as progress:
        # Hide the progress bar for very small inputs.
        if max_chunk > 2:
            task = progress.add_task("[cyan]Processing chunks...", total=max_chunk)
        else:
            task = progress.add_task("[cyan]Processing chunks...", total=max_chunk, visible=False)
        current_chunk = 0
        processed_line = 0
        log_list = []
        if log:
            with open(log_file, 'r', encoding='utf-8') as l:
                log_list = l.read().splitlines()
        while current_chunk < max_chunk:
            # Chunks outside the requested range are read past but not processed.
            if current_chunk < range[0] or current_chunk > range[1]:
                processed_line, buffer = buffer_creator(file, os.getenv("DIVIDER"), processed_line, int(os.getenv("CHUNK_SIZE")))
                current_chunk += 1
                continue
            # Chunks already recorded in the log file were processed in a
            # previous run; skip them, still advancing through the input.
            if log and str(current_chunk) in log_list:
                processed_line, buffer = buffer_creator(file, os.getenv("DIVIDER"), processed_line, int(os.getenv("CHUNK_SIZE")))
                current_chunk += 1
                continue

            processed_line, buffer = buffer_creator(file, os.getenv("DIVIDER"), processed_line, int(os.getenv("CHUNK_SIZE")))
if " agents " in common_tables :
dataframe = process_buffer_population ( buffer )
if cleandata :
dataframe = dataframe . dropna ( )
if push :
2024-09-11 10:56:17 -04:00
push_to_db_coords ( " agents " , dataframe , mode )
2024-09-10 20:01:51 -04:00
else :
2024-09-11 10:56:17 -04:00
write_to_csv ( " agents " , dataframe , file )
2024-09-10 18:28:23 -04:00
2024-09-10 20:01:51 -04:00
if " travels " in common_tables :
dataframe_travels = process_travels ( buffer )
if push :
2024-09-11 10:56:17 -04:00
push_to_db_coords ( " travels " , dataframe_travels , mode )
2024-09-10 20:01:51 -04:00
else :
2024-09-11 10:56:17 -04:00
write_to_csv ( " travels " , dataframe_travels , file )
2024-09-10 18:28:23 -04:00
2024-09-10 20:01:51 -04:00
if log :
f = open ( log_file , " a " )
f . write ( f " \n { current_chunk } " )
f . close ( )
current_chunk + = 1
time . sleep ( 2 )
progress . update ( task , advance = 1 )
progress . update ( task , visible = False )
console . print ( " [green]Processing complete![/green] " )
@app.command()
def network(
    file: Annotated[Path, typer.Argument(help="Provide the relative path to the [yellow bold underline]XML file[/yellow bold underline].", show_default=False)],
    tables: list[str] = typer.Argument(..., help="Tables to include: [underline bold]nodes[/underline bold], [underline bold]links[/underline bold], [underline bold]links_attr[/underline bold]. Use [underline bold]all[/underline bold] for everything.", show_default=False),
    cleandata: bool = typer.Option(False, "--cleandata", "-cd", help="Drop the rows that have missing values."),
    push: bool = typer.Option(False, "--push", "-p", help="Save the output directly to the database. Otherwise, saves it as a [green bold]CSV file[/green bold] in the input directory."),
    mode: Optional[DBMode] = typer.Option(None, help="Specify either [underline]'append'[/underline] or [underline]'drop'[/underline] when pushing data", show_default=False),
):
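    """
    Parse a network XML file section by section and export the selected
    tables (nodes, links, links_attr) to CSV files or directly to the database.
    """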
    console = Console()
    all_tables = ["nodes", "links", "links_attr"]
    common_tables = [item for item in tables if item in ["all"] + all_tables]
    if len(common_tables) == 0:
        error_printer("Incorrect table input")
        raise typer.Exit()
    elif "all" in common_tables:
        common_tables = all_tables
    info_printer(f"Tables to include: {common_tables}")
    file_validator(file)
    BUFFER = []
    DIVIDER_COUNT = 0
    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        console=console
    ) as progress:
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip() == os.getenv("DIVIDER"):
                    DIVIDER_COUNT += 1
                    # Dividers split the file into sections: the nodes section
                    # ends at the second divider, the links section at the third.
                    if DIVIDER_COUNT == 1 or DIVIDER_COUNT > 3:
                        continue
                    if DIVIDER_COUNT == 2:
                        if "nodes" not in common_tables:
                            BUFFER.clear()
                            continue
                        else:
                            try:
                                element = BeautifulSoup(" ".join(BUFFER), 'lxml-xml')
                                total_nodes = element.find_all("node")
                            except Exception:
                                error_printer("Node processing failed")
                                raise typer.Exit()
                            total_chunks = math.ceil(len(total_nodes) / int(os.getenv("CHUNK_SIZE")))
                            if total_chunks > 2:
                                node_task = progress.add_task("[cyan]Processing [bold]nodes[/bold] chunks...", total=total_chunks)
                            else:
                                node_task = progress.add_task("[cyan]Processing [bold]nodes[/bold] chunks...", total=total_chunks, visible=False)
                            success_printer(f"Chunk count: {total_chunks}")
                            current_chunks = 0
                            while current_chunks < total_chunks:
                                size = int(os.getenv("CHUNK_SIZE"))
                                new_chunk = total_nodes[current_chunks * size : min(len(total_nodes), (current_chunks + 1) * size)]
                                dataframe = process_nodes(new_chunk)
                                if cleandata:
                                    dataframe = dataframe.dropna()
                                if push:
                                    push_to_db_coords("nodes", dataframe, mode)
                                else:
                                    write_to_csv("nodes", dataframe, file)
                                progress.update(node_task, advance=1)
                                current_chunks += 1
                            BUFFER.clear()
                    if DIVIDER_COUNT == 3:
                        try:
                            element = BeautifulSoup(" ".join(BUFFER), 'lxml-xml')
                            total_links = element.find_all("link")
                        except Exception:
                            error_printer("Link processing failed")
                            raise typer.Exit()
                        total_chunks = math.ceil(len(total_links) / int(os.getenv("CHUNK_SIZE")))
                        success_printer(f"Chunk count: {total_chunks}")
                        if total_chunks > 2:
                            link_task = progress.add_task("[cyan]Processing [bold]links[/bold] chunks...", total=total_chunks)
                        else:
                            link_task = progress.add_task("[cyan]Processing [bold]links[/bold] chunks...", total=total_chunks, visible=False)
                        current_chunks = 0
                        while current_chunks < total_chunks:
                            size = int(os.getenv("CHUNK_SIZE"))
                            new_chunk = total_links[current_chunks * size : min(len(total_links), (current_chunks + 1) * size)]
                            if "links" in common_tables:
                                dataframe = process_links(new_chunk)
                                if cleandata:
                                    dataframe = dataframe.dropna()
                                if push:
                                    push_to_db_coords("links", dataframe, mode)
                                else:
                                    write_to_csv("links", dataframe, file)
                            if "links_attr" in common_tables:
                                dataframe = process_links_attr(new_chunk)
                                if cleandata:
                                    dataframe = dataframe.dropna()
                                if push:
                                    push_to_db_coords("links_attr", dataframe, mode)
                                else:
                                    write_to_csv("links_attr", dataframe, file)
                            progress.update(link_task, advance=1)
                            current_chunks += 1
                        BUFFER.clear()
                        continue
                    continue
                BUFFER.append(line.strip())
    console.print("[green]Processing complete![/green]")
@app.command()
def metro(
    city: Annotated[City, typer.Argument(help="Choose a city", show_default=False)],
    files: list[Path] = typer.Option(None, "--files", "-f", help="Provide the relative path to [yellow bold underline]shape files[/yellow bold underline].", show_default=False),
    cleandata: bool = typer.Option(False, "--cleandata", "-cd", help="Drop the rows that have missing values."),
    push: bool = typer.Option(False, "--push", "-p", help="Save the output directly to the database. Otherwise, saves it as a [green bold]CSV file[/green bold] in the input directory."),
    pushmode: Optional[DBMode] = typer.Option(None, help="Specify either [underline]'append'[/underline] or [underline]'drop'[/underline] when pushing data", show_default=False),
):
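    """
    Process metro shapefiles for the chosen city and export the resulting
    station and line tables to CSV files or directly to the database.
    """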
    if not files:
        error_printer("No shapefiles provided.")
        raise typer.Exit()
    for file in files:
        if not file.exists():
            error_printer(f"Shapefile {file} does not exist.")
            raise typer.Exit()
        if file.suffix != '.shp':
            error_printer(f"File {file} is not a .shp file.")
            raise typer.Exit()
    success_printer("Shapefiles validated successfully.")
    metro_stations_df, metro_lines_df = metro_processing(city, files)
    if metro_stations_df is None and metro_lines_df is None:
        error_printer("No dataframes were processed successfully.")
        raise typer.Exit()
    if cleandata:
        if metro_stations_df is not None:
            metro_stations_df = metro_stations_df.dropna()
        if metro_lines_df is not None:
            metro_lines_df = metro_lines_df.dropna()
    if push:
        if metro_stations_df is not None:
            push_to_db_coords("metro-stations", metro_stations_df, pushmode)
        if metro_lines_df is not None:
            push_to_db_linestring("metro-lines", metro_lines_df, pushmode)
    else:
        if metro_stations_df is not None:
            write_to_csv("metro-stations", metro_stations_df, file)
        if metro_lines_df is not None:
            write_to_csv("metro-lines", metro_lines_df, file)
    success_printer("Processing complete.")
@app.command()
def bus(
    city: Annotated[City, typer.Argument(help="Choose a city", show_default=False)],
    files: list[Path] = typer.Option(None, "--files", "-f", help="Provide the relative path to [yellow bold underline]shape files[/yellow bold underline].", show_default=False),
    cleandata: bool = typer.Option(False, "--cleandata", "-cd", help="Drop the rows that have missing values."),
    push: bool = typer.Option(False, "--push", "-p", help="Save the output directly to the database. Otherwise, saves it as a [green bold]CSV file[/green bold] in the input directory."),
    pushmode: Optional[DBMode] = typer.Option(None, help="Specify either [underline]'append'[/underline] or [underline]'drop'[/underline] when pushing data", show_default=False),
):
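    """
    Process bus shapefiles for the chosen city and export the resulting
    station and line tables to CSV files or directly to the database.
    """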
    if not files:
        error_printer("No shapefiles provided.")
        raise typer.Exit()
    for file in files:
        if not file.exists():
            error_printer(f"Shapefile {file} does not exist.")
            raise typer.Exit()
        if file.suffix != '.shp':
            error_printer(f"File {file} is not a .shp file.")
            raise typer.Exit()
    success_printer("Shapefiles validated successfully.")
    bus_stations_df, bus_lines_df = bus_processing(city, files)
    if bus_stations_df is None and bus_lines_df is None:
        error_printer("No dataframes were processed successfully.")
        raise typer.Exit()
    if cleandata:
        if bus_stations_df is not None:
            bus_stations_df = bus_stations_df.dropna()
        if bus_lines_df is not None:
            bus_lines_df = bus_lines_df.dropna()
    if push:
        if bus_stations_df is not None:
            push_to_db_coords("bus-stations", bus_stations_df, pushmode)
        if bus_lines_df is not None:
            push_to_db_linestring("bus-lines", bus_lines_df, pushmode)
    else:
        if bus_stations_df is not None:
            write_to_csv("bus-stations", bus_stations_df, file)
        if bus_lines_df is not None:
            write_to_csv("bus-lines", bus_lines_df, file)
    success_printer("Processing complete.")
if __name__ == "__main__":
    app()
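
# Example invocations (illustrative only; the paths and city value are
# hypothetical, and the script is assumed to be saved as main.py):
#   python main.py population ./data/population.xml agents travels --log
#   python main.py network ./data/network.xml all --push --mode append
#   python main.py metro <city> -f ./shapefiles/stations.shp -f ./shapefiles/lines.shp --push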