Hello. This program was suggested by ChatGPT so I try to solve type errors here.
I am importing data from csv .txt like this:
route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color,route_sort_order
"-CISR-877516-2","JDFA-45192081-1","877516","Fryštát-Nové Město-Hranice-Mizerov-Ráj-Nové Město-Fryštát","","704","","","",""
"-CISR-877516-1","JDFA-45192081-1","877516","Fryštát-Nové Město-Hranice-Mizerov-Ráj-Nové Město-Fryštát","","704","","","",""
route_id is VARCHAR and I need to get rid of "-CISR-" on all rows in the source column. I cannot grasp how it works. It first suggested to replace this
routes_data['route_id'].apply(lambda x: x.replace('-CISR-', ''))
with this
route_id = routes_data['route_id'].replace('-CISR-', '')
last_route_id = route_id
because I want to save the last_route_id and pass it to log function... write_last_row...
import pandas as pd
import pymysql
import io
import zipfile
import csv
def write_last_row(filename, last_row_number, last_sid, last_route_id, error_message=None):
with open(filename, 'w') as file:
if error_message:
file.write(f"Error: {error_message}\n")
else:
file.write(f"Last row number: {last_row_number}\n")
file.write(f"Last sid inserted: {last_sid}\n")
file.write(f"Last route id: {last_route_id}\n")
logfile = "error_log.log"
mydb = pymysql.connect(
host="localhost",
user="admin",
password="",
database="",
charset='utf8mb4',
cursorclass=pymysql.cursors.DictCursor
)
mycursor = mydb.cursor()
mycursor.execute("""
CREATE TABLE IF NOT EXISTS routes (
route_id VARCHAR(255),
route_short_name MEDIUMINT,
route_long_name VARCHAR(255),
PRIMARY KEY (route_id)
)
""")
mydb.commit() # Commit po vytvoření tabulky
zip_path = "/mnt/ramdisk/JDF_merged_GTFS.zip"
chunksize = 10
last_row_number = 0
last_sid = 0 # sid = stop_name id (relation to `stops_names` table)
# sid is important to get stop_name from `stops_names`
last_route_id = 0
with zipfile.ZipFile(zip_path, 'r') as zip_file:
with zip_file.open('routes.txt', 'r') as routes_file:
for routes_data in csv.reader(io.TextIOWrapper(routes_file, 'utf-8')):
last_row_number += chunksize
try:
# route_id = routes_data['route_id'].replace('-CISR-', '')
# last_route_id = route_id
# problem here ... I need to replace the values in column:
routes_data['route_id'].apply(lambda x: x.replace('-CISR-', ''))
# get sid and stop_name from table stop_names for the stop using LEFT JOIN
routes_data['sid'] = 0 # default 0 if stop not found
routes_data['stop_name'] = ""
for index, row in routes_data.iterrows():
stop_name_query = "SELECT stop_names.sid, stop_names.stop_name FROM routes LEFT JOIN stop_names ON routes.sid = stop_names.sid WHERE routes.route_id = %s"
mycursor.execute(stop_name_query, (row['route_id'],))
result = mycursor.fetchone()
if result:
sid = result['sid']
stop_name = result['stop_name']
last_sid = sid
else:
error_message = f"sid unique identifier {row['route_id']} not found"
write_last_row(logfile, last_row_number, last_sid, last_route_id, error_message)
print(error_message)
continue
# to change the trip long description
needle = stop_name.split(',')[0] # Needle is city name
if needle in row['route_long_name']:
routes_data.at[index, 'route_long_name'] = row['route_long_name'].replace(needle, f"#{sid}:") # @WRONG - LLM didn't change the needle to stop_name!
else:
routes_data.at[index, 'route_long_name'] = "" # Stop name not found in stop description, remove it
sql = "INSERT INTO routes (route_id, route_short_name, route_long_name) VALUES (%s, %s, %s)"
val = (row['route_id'], row['route_short_name'], row['route_long_name'])
mycursor.execute(sql, val)
# COMMIT
mydb.commit()
except Exception as e:
print(f"Chyba při zpracování dat: {e}")
finally:
# log
write_last_row(logfile, last_row_number, last_sid, last_route_id)
mycursor.execute("SHOW TABLE STATUS LIKE 'routes'")
table_status = mycursor.fetchone()
print("Velikost tabulky routes na disku:", table_status['Data_length'] / 100000000, "stovek MB")
mycursor.close()
mydb.close()
First what type is this:
routes_data['route_id']
is it a bunch of lines?
Second. How to perform the replace(..,'') of the data here?
This was a test for 10 lines, but I plan to increase the chunks number to 1000. Because I am exporting this to mysql db, and I cannot make queries every one record. There is some jobs between the lines like LEFT JOIN on table with stop_names to retrieve arrays of sid,stop_name
PS:
This particular csv can have from 670 000 to more than million records.