This notebook extracts data from Kaggle and preprocesses it to contain only the relevant data, and get rid of missings.
Furthermore, simple overviews of the raw and filtered data are shown.
import pandas as pd
import numpy as np
from helper_functions import get_full_data, summarize_dfExtracting Data from Kaggle¶
# Download data into cache
data_path = get_full_data()# Create pandas dataframes of the datasets
airlines_df = pd.read_csv(data_path+"/airlines.csv")
airports_df = pd.read_csv(data_path+"/airports.csv")
flights_full_df = pd.read_csv(data_path+"/flights.csv", low_memory=False)Overviews and Summaries of the Different Raw Datasets¶
# Overview of the airlines
airlines_df.head()Loading...
# Overview of the airports
airports_df.head()Loading...
# Overview of the raw dataset containing the flights
flights_full_df.head()Loading...
# Summary of the raw dataset containing all the flights
summary_flights = summarize_df(flights_full_df,
"figures/df_summary_all_flights.png")
summary_flightsLoading...
Filtering the Raw Flights Dataset¶
filtered_flights_df = flights_full_df.copy()
# Filtering for flights with airports in California as origin airport
airports_CA = airports_df.loc[airports_df["STATE"] == "CA", "IATA_CODE"].tolist()
filtered_flights_df = filtered_flights_df[
# (filtered_flights_df["DESTINATION_AIRPORT"].isin(airports_CA)) |
(filtered_flights_df["ORIGIN_AIRPORT"].isin(airports_CA))
]# Assuming that nan values in AIRLINE_DELAY, AIR_SYSTEM_DELAY, LATE_AIRCRAFT_DELAY,
# SECURITY_DELAY and WEATHER_DELAY means that the flight was not delayed in those
# areas, setting the values of the variables equal to 0
cols_fill_nan = ["AIRLINE_DELAY",
"AIR_SYSTEM_DELAY",
"LATE_AIRCRAFT_DELAY",
"SECURITY_DELAY",
"WEATHER_DELAY"]
filtered_flights_df[cols_fill_nan] = filtered_flights_df[cols_fill_nan].fillna(0)# Dropping specific columns
cols_remove = ["CANCELLATION_REASON", # Because of too many missings
"YEAR", # Because all values are 2015
"DIVERTED" # Because all remaining values are 0
]
filtered_flights_df = filtered_flights_df.drop(cols_remove, axis=1)# As some of the flights are cancelled, some missings in the data are only missings
# because the plane did not take off, and should thus not be removed
# Removing remaining rows with true missings (they only make up < 0.5% of the total remaining data)
cols_remove_rows_false = ["AIR_TIME",
"ARRIVAL_DELAY",
"ARRIVAL_TIME",
"DEPARTURE_DELAY",
"DEPARTURE_TIME",
"ELAPSED_TIME",
"TAXI_IN",
"TAXI_OUT",
"WHEELS_OFF",
"WHEELS_ON"]
filtered_flights_df = filtered_flights_df[
(filtered_flights_df["CANCELLED"] == 1) |
(~filtered_flights_df[cols_remove_rows_false].isna().any(axis=1))
]
# Removing remaining rows with missings, but where data should be present no matter if
# the plane took off or not
cols_remove_rows_true = ["TAIL_NUMBER"]
filtered_flights_df = filtered_flights_df.dropna(subset=cols_remove_rows_true)
# Removing rows where the flight is cancelled, but the row still shows data for the trip
filtered_flights_df = filtered_flights_df[
~((filtered_flights_df["CANCELLED"] == 1) &
(filtered_flights_df[cols_remove_rows_false].notna().any(axis=1)))
]Overview of the Filtered Flights Dataset¶
# Overview of the filtered dataset containing the flights
summary_filtered_flights = summarize_df(filtered_flights_df,
"figures/df_summary_filtered_flights.png")
summary_filtered_flightsLoading...
Saving the Datasets¶
# Saving the datasets to CSV files in the project folder
airlines_df.to_csv("data/airlines.csv", index=False)
airports_df.to_csv("data/airports.csv", index=False)
filtered_flights_df.to_csv("data/filtered_flights.csv", index=False)