Demo entry 3522290

Uber problem

   

Submitted by anonymous on Jan 08, 2016 at 20:33
Language: Python. Code size: 9.0 kB.

"""
Title: Uber Take-home Problem: Forecasting
Author: Mayukh Samanta
"""

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame, Series
from sklearn.decomposition import PCA
from scipy import signal
from sklearn.svm import NuSVR
from sklearn.metrics import mean_absolute_error

def topNAutoCorrelations(timeSeries, n):
    """
    Return 'n' lags which have the highest autocorrelations. Note 
    that this is mostly diagnostic to get a sense of the relevant lags.
    """
    # De-trending time series
    timeSeries = Series(signal.detrend(timeSeries))
    autoCorrelations = np.array([timeSeries.autocorr(
                                lag = i) for i in range(1, len(timeSeries))])
    topIndexes = np.argsort(-autoCorrelations)[:n]
    topAutoCorrelations = autoCorrelations[topIndexes]
    return Series(topAutoCorrelations, index=topIndexes+1)

def createShiftedData(ts, lags, startIndex, endIndex):
    """
    Create lagged data set; use as input a vector of lag indexes and the day 
    ID.
    """
    X_shifted = pd.concat([ts.shift(i) for i in lags], axis=1).iloc[
                startIndex:endIndex]
    X_shifted.columns = lags
    X_shifted['Weekend indicator'] = [int(ts.index[i].weekday() >= 5) for i in 
                                        range(startIndex, endIndex)]
    return X_shifted
    
def timeSeriesCV(k, X, y, pcaList):
    """
    Perform a rolling-window version of k-fold cross validation so that future
    information is not used to make past predictions, per Hyndman (2015).
    """
    # Data further is split into: 75% train, 25% validation (60:20 of overall 
    # data set, with 80% of total data used for training + validation). This is 
    # just a heuristic that seems reasonable.
    trainingValidationSplit = 0.75     
    lengthCvSet = int(trainingValidationSplit * len(X))/k     
    # Rolling cross-validation: train on 1, validate on 2; train on 1+2, 
    # validate on 3,...
    error = np.zeros(len(pcaList))
    print '\nPerforming', k, 'fold CV'
    
    for i in range(k):
        print '\nEntering fold: ', i + 1
        X_train_fold = X[:(i + 1) * lengthCvSet]
        y_train_fold = y[:(i + 1) * lengthCvSet]
        X_val_fold = X[(i + 1) * lengthCvSet: (i + 2) * lengthCvSet]
        y_val_fold = y[(i + 1) * lengthCvSet: (i + 2) * lengthCvSet]
        print 'Length of training fold: ', len(y_train_fold)
        print 'Length of validation fold: ', len(y_val_fold)
        
        # In our case we simply consider models with a varying number of PCs. 
        # Each model is transformed by a PCA rotation, fit on the original 
        # training set. Default parameters are used for initial training.        
        modelList = [NuSVR().fit(pca.transform(
                    X_train_fold), y_train_fold) for pca in pcaList]
        
        # Find error for each model for fold 'i'           
        for j in range(len(modelList)):
            y_pred = modelList[j].predict(pcaList[j].transform(X_val_fold))
            error[j] += mean_absolute_error(y_val_fold, y_pred)
            print 'Error for model', j + 1, ' :', mean_absolute_error(
            y_val_fold, y_pred)
            
    error = error / (k - 1)        
    leastErrorIndex = error.argmin()
    print 'The best model is model: ', leastErrorIndex + 1    
    bestModel = modelList[leastErrorIndex]
    bestPCA = pcaList[leastErrorIndex]
    # Re-train best model on entire training set
    bestModel.fit(bestPCA.transform(X), y)
    return [bestModel, bestPCA]             
        
# Read json file of user logins
with open('logins.json') as data_file:    
    loginDataDict = json.load(data_file)
loginDF = DataFrame(loginDataDict)
timeStampSeries = pd.to_datetime(loginDF['login_time'])

# Each login is interpreted as a count of '1'
userCountsDF = DataFrame(np.ones(len(loginDF.index)))

# This contains the user login counts binned into 15 minute intervals
userCountsDF = userCountsDF.set_index(timeStampSeries).resample(
                                    '15Min', how='sum')
userCountsDF.columns = ['User Count']

# Assumption: missing values are forward filled (~4% in this data set)
userCountsDF = userCountsDF.ffill()
userCountsDF['Day ID'] = userCountsDF.index.weekday

plt.style.use('ggplot')

# The first objective is to figure out 'lags' that are good predictors of demand
# We don't want to discard too much data either, so there is a trade-off between
# how useful a large lag is vs how much data we can afford to discard.

# Compute autocorrelations of the de-trended time series at a) 15 min intervals
# b) 1 hr intervals, c) daily intervals to see how lags at different 
# granularities are important. Since we have only 4 months of data, I assume 
# that I do not want to discard more than ~2 weeks worth of data. 

ts15Min = userCountsDF['User Count']
ts1h = ts15Min.resample('1h', how='sum')
ts1d = ts15Min.resample('1d', how='sum')
ts1w = ts15Min.resample('1w', how='sum')

# Assumption: we're only interested in the top 25% of lags (from 0 to end of 
# the time series).
n = 0.25 
top15Min = topNAutoCorrelations(ts15Min, int(n * len(ts15Min)))
top1h = topNAutoCorrelations(ts1h, int(n * len(ts1h)))
top1d = topNAutoCorrelations(ts1d, int(n * len(ts1d)))
top1w = topNAutoCorrelations(ts1w, int(n * len(ts1w)))

# Construct a data set for some ML model                                                              
maxLag = 336 * 4 # Maximum lag we consider = 2 weeks prior to prediction time
trainingFraction = 0.8 # Fraction of data we want to use for training 
# (including validation)

# Lags were chosen approximately from autocorrelation plots
lagList = [[1, 2, 3, 4, 670, 671, 672, 673, 1342, 1343, 1344, 1345, 1346], 
           [2, 3, 4, 670, 671, 672, 673, 1342, 1343, 1344, 1345, 1346], 
            [3, 4, 670, 671, 672, 673, 1342, 1343, 1344, 1345, 1346],
            [4, 670, 671, 672, 673, 1342, 1343, 1344, 1345, 1346]]
          
# Naive prediction: we simply consider the lag that the autocorrelation
# numbers indicate is the best for the training set.

predictionWindowList = ['15 Min Ahead', '30 Min Ahead', '45 Min Ahead', 
                    '1 hr ahead']
# Keeps a track of error and means for various 'x'-min ahead models
errorList = []
meanList = []

for i in range(len(lagList)):
    lags = lagList[i]
    predictionWindow = predictionWindowList[i]
    print '\nPrediction: ', predictionWindow
    print 'Lags = ', lags
    startIndex = int(max(lags)) # Start index of 'order' max(lags)
    endIndex = int(trainingFraction * len(ts15Min)) # End index of training set
    X_train = createShiftedData(ts15Min, lags, startIndex, endIndex) 
    y_train = ts15Min[startIndex:endIndex]
    X_test = createShiftedData(ts15Min, lags, endIndex, len(ts15Min))
    y_test = ts15Min[endIndex:]
    
    # Run dimensionality reduction on training sets to determine no. of PCs 
    n_PCA = np.arange(1, 5) # Consider 1 to 4 principal components
    pcaList = []
    for n in n_PCA:
        pca = PCA(n, whiten=True)
        pca.fit(X_train)
        pcaList.append(pca)
    
    # Cross-validation parameters, and cross-validation itself for different 
    # number of principal components (from 1 to 4)
    k = 3
    clf_CV, pca = timeSeriesCV(k, X_train, y_train, pcaList)      
        
    # Both training and test sets are transformed using the same rotation.
    X_train_PCA = pca.transform(X_train)
    X_test_PCA = pca.transform(X_test)
    
    C_value = 20.0
    nu_value = 0.5
    print 'C = ', C_value, ' Nu = ', nu_value
    clf_CV.fit(X_train_PCA, y_train)
    
    # Test error
    y_pred = clf_CV.predict(X_test_PCA)
    mae = mean_absolute_error(y_test, y_pred)
    print 'MAE = ', mae
    print 'Mean of prediction, test: ', y_pred.mean(), y_test.mean()
    errorList.append(mae)
    meanList.append(y_pred.mean())
          
    # Predict on test set
    plt.figure(1)
    plt.plot(clf_CV.predict(X_test_PCA)[:400])
    plt.plot(y_test.values[:400])
    plt.title('Prediction on test set')             
    plt.show()

# From autocorrelation plots, the best predictors for the 45 min and 1 hr case
# are 2 weeks prior to the prediction point
naiveLagList = [1, 2, 1344, 1344]  
modelErrors = Series(errorList, index=predictionWindowList, name='MAE_SVR')
modelMean = Series(meanList, index=predictionWindowList, name='Mean_SVR')
naiveErrors = Series([mean_absolute_error(y_test, ts15Min[endIndex - i: -i]) 
                    for i in naiveLagList], index=predictionWindowList, name=
                    'MAE_naive')
naiveMean = Series([np.mean(ts15Min[endIndex - i: -i]) for i in naiveLagList],
                    index=predictionWindowList, name='Mean_naive')
errorDF = pd.concat([modelErrors, naiveErrors], axis=1)
print errorDF

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).