Demo entry 6348145

hi

   

Submitted by anonymous on Feb 19, 2017 at 06:11
Language: Python 3. Code size: 3.4 kB.

import pandas as pd
from math import exp
from sklearn import linear_model
import os

# Method for predicting cancellation
def predict_cancellation(DepartureTime, SchedElapsedTime, Distance, is_UA, is_DL, is_AA,
                            is_winter, is_fall, is_summer, is_spring):
    """Print and return the predicted probability that a flight is cancelled.

    The prediction uses hard-coded logistic-regression coefficients (taken
    from the model fitted in Part 1 of this script), so it does not depend
    on the fitted model object being available.

    Args:
        DepartureTime: Numeric departure-time feature.
        SchedElapsedTime: Numeric scheduled-elapsed-time feature.
        Distance: Numeric distance feature.
        is_UA, is_DL, is_AA: 0/1 airline indicators.
        is_winter, is_fall, is_summer, is_spring: 0/1 season indicators.

    Returns:
        The cancellation probability as a float in [0, 1]. The same value
        is also printed.
    """
    intercept = -0.41396187
    # (coefficient, feature value) pairs, in the model's feature order.
    # NOTE(review): the summer coefficient is negative. The previous
    # expression wrote it as "- -4.39301994e-02", which silently flipped its
    # sign; confirm against the fitted coefficients printed in Part 3.
    weighted_features = (
        (5.02636090e-05, DepartureTime),
        (1.59693780e-02, SchedElapsedTime),
        (-2.37240015e-03, Distance),
        (-3.06144997e-02, is_UA),
        (-6.25634661e-01, is_DL),
        (2.42287287e-01, is_AA),
        (4.47389644e-01, is_winter),
        (-1.01297294e+00, is_fall),
        (-4.39301994e-02, is_summer),
        (1.95551622e-01, is_spring),
    )

    log_odds = intercept
    for coefficient, feature in weighted_features:
        log_odds += coefficient * feature

    # Convert log-odds to a probability with a numerically stable sigmoid:
    # only ever exponentiate a non-positive number, so exp() cannot overflow
    # for extreme inputs (it underflows harmlessly to 0.0 instead).
    if log_odds >= 0:
        probability_cancelled = 1 / (1 + exp(-log_odds))
    else:
        odds = exp(log_odds)
        probability_cancelled = odds / (1 + odds)

    print("Probability flight is cancelled: ", probability_cancelled)
    return probability_cancelled


# PART 1 -- BUILDING A PREDICTIVE MODEL
# Importing dataset
# NOTE: hard-coded, machine-specific working directory. Both the CSV read
# below and the CSV written at the end of this part are resolved against it.
os.chdir("/Users/Fiona/Desktop")

# Using data with 50-50 cancelled/not cancelled split to adjust for rare cancelled events (17%)
data = pd.read_csv("Dataset_1.csv")

# Creating binary indicators for airlines (columns is_UA, is_DL, is_AA)
for carrier in ("UA", "DL", "AA"):
    data["is_" + carrier] = (data.UniqueCarrier == carrier).astype(int)

# Creating binary indicators for seasonality; each season is a set of
# month numbers matched against the Month column
season_months = {
    "is_winter": [12, 1, 2],
    "is_fall": [9, 10, 11],
    "is_summer": [8, 7, 6],
    "is_spring": [5, 4, 3],
}
for season_column, months in season_months.items():
    data[season_column] = data.Month.isin(months).astype(int)

# Preparing independent variables for prediction.
# Select the columns directly instead of building a frame from a list of
# Series and transposing it: that created one column per data row and
# upcast every feature to a common dtype.
predictor_columns = [
    "DepartureTime", "SchedElapsedTime", "Distance",
    "is_UA", "is_DL", "is_AA",
    "is_winter", "is_fall", "is_summer", "is_spring",
]
predictors = data[predictor_columns]

# Building logistic model
log_model = linear_model.LogisticRegression()
log_model.fit(X=predictors, y=data.Cancelled)
y_intercept = log_model.intercept_
coefficients = log_model.coef_
# Accuracy is scored on the same rows the model was fitted on
accuracy = log_model.score(X=predictors, y=data.Cancelled)

# Making predictions on existing data points
# Incorporating to existing data set and exporting to CSV
test_preds = log_model.predict(X=predictors)
pred_cancelled = pd.DataFrame({"pred_cancelled": test_preds})
new_data = pd.concat([data, pred_cancelled], axis=1)
new_data.to_csv("new_data.csv", index=False)

# PART 2 -- PREDICT WHETHER FLIGHT WILL BE CANCELLED
# Two example flights, spelled out by keyword so each value's role is visible.
predict_cancellation(
    DepartureTime=2200, SchedElapsedTime=60, Distance=400,
    is_UA=0, is_DL=0, is_AA=1,
    is_winter=1, is_fall=0, is_summer=0, is_spring=0,
)
predict_cancellation(
    DepartureTime=1725, SchedElapsedTime=118, Distance=533,
    is_UA=0, is_DL=1, is_AA=0,
    is_winter=0, is_fall=0, is_summer=1, is_spring=0,
)

# PART 3 -- MODEL OUTPUT
# Report the fitted model: feature names, coefficients, intercept, accuracy.
model_summary = (
    ("Predictors: ", list(predictors)),
    ("Coefficients: ", coefficients),
    ("Y-Intercept: ", y_intercept),
    ("Prediction accuracy: ", accuracy),
)
for label, value in model_summary:
    print(label, value)
# 22% of the time, the model predicts that flights will be cancelled vs.
# 17% in the data

# PART 4 -- AIRLINE RECOMMENDATION
# I would recommend traveling with airline DL. In the fitted model, flying
# with DL is associated with a lower chance of cancellation: its coefficient
# was -.6256, compared with -.0306 for airline UA and +.2423 for airline AA
# (the most unreliable airline).

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).