Demo entry 6348305

test3

   

Submitted by anonymous on Feb 20, 2017 at 00:18
Language: Python. Code size: 7.5 kB.

import numpy as np  
import pandas as pd  
  
# set working path  
path = 'C:/MyDocuments/JobHunting/Interview/DataXu/DataChallenge/Data_Classification/data_classification_small/'  
  
# ##########################################################################  
  
#                    Data preprocessing   
  
# ##########################################################################  
  
### Load data  
header = list(pd.read_csv(path + 'header.csv'))  
pos = pd.read_csv(path + '0CH9UhFrWY_pos.csv', sep='\t', names = header)  # dataframe  
neg = pd.read_csv(path + '0CH9UhFrWY_neg.csv', sep='\t', names = header)  # dataframe  
  
# a quick glance at the data  
pos.describe()  
pos.describe(include = ['O'])  
  
neg.describe()  
neg.describe(include = ['O'])  
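
# optional extra check (an addition, not part of the original analysis): count
# missing values per column before the dropna() below silently discards those rows
pos.isnull().sum()
neg.isnull().sum()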
  
# look at the gender  
pos[pos['user_gender'] == 1].shape  
pos[pos['user_gender'] == 0].shape  
neg[neg['user_gender'] == 1].shape  
neg[neg['user_gender'] == 0].shape  
  
# remove the gender  
pos = pos.drop('user_gender', axis = 1)  
neg = neg.drop('user_gender', axis = 1)  
  
# look at the top_category column  
unique_cat = list(pos.top_category.unique())    # unique categories; contains some NaN  
  
# remove all rows with missing values  
pos, neg = pos.dropna(), neg.dropna()  
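
# quick sanity check (an addition, not in the original script): how many rows survive
print('rows after dropna: pos = %d, neg = %d' % (pos.shape[0], neg.shape[0]))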
  
# check and remove duplicates  
pos.shape[0] == pos.drop_duplicates().shape[0]  
neg.shape[0] == neg.drop_duplicates().shape[0]  
  
pos, neg = pos.drop_duplicates(), neg.drop_duplicates()  
  
# combine pos and neg with label  
pos['label'] = 1  
neg['label'] = 0  
Alldata = pd.concat([pos, neg], axis = 0, ignore_index = True)  # reset the index so the axis=1 concatenations below align row-by-row  
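
# optional check (not in the original script): class balance of the combined data;
# a heavy imbalance would motivate stratified splitting or class weighting later on
Alldata['label'].value_counts()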
  
# ##########################################################################  
  
#                    Feature Engineering   
  
# ##########################################################################  
  
# ============= create dummy variables for creative_uid and exchange ========  
  
dummies_CUID = pd.get_dummies(Alldata['creative_uid']).rename(columns=lambda x: 'CUID_' + str(x))  
dummies_EXCH = pd.get_dummies(Alldata['exchange']).rename(columns=lambda x: 'EXCH_' + str(x))  
  
# ============= create dummy variables for top_category ====================  
dummies_TOPC = pd.get_dummies(Alldata['top_category']).rename(columns=lambda x: 'TOPC_' + str(x))  
  
# ============= create dummy variables for exchange_publisher ==============  
data_EXCHP = list(Alldata['exchange_publisher'])  
from collections import Counter  
EXCHP_cnt = Counter(data_EXCHP)  
  
# keep the exchange_publisher value only if it occurs more than 300 times; otherwise encode it as 'other_EXCHP'  
EXCHP_2 = ['other_EXCHP'] * Alldata.shape[0]  
for i in range(Alldata.shape[0]):  
    if EXCHP_cnt[Alldata.iloc[i]['exchange_publisher']] > 300:  
        EXCHP_2[i] = Alldata.iloc[i]['exchange_publisher']  
  
EXCHP_2_cnt = Counter(EXCHP_2)  
  
Alldata['EXCHP_2'] = EXCHP_2  
dummies_EXCHP = pd.get_dummies(Alldata['EXCHP_2']).rename(columns=lambda x: 'EXCHP_' + str(x))  
  
# ============== create dummy variables for sitename ======================  
data_SITE = list(Alldata['sitename'])  
SITE_cnt = Counter(data_SITE)  
  
# keep the sitename value only if it occurs more than 300 times; otherwise encode it as 'other_SITE'  
SITE_2 = ['other_SITE'] * Alldata.shape[0]  
for i in range(Alldata.shape[0]):  
    if SITE_cnt[Alldata.iloc[i]['sitename']] > 300:  
        SITE_2[i] = Alldata.iloc[i]['sitename']  
  
SITE_2_cnt = Counter(SITE_2)  
  
Alldata['SITE_2'] = SITE_2  
dummies_SITE = pd.get_dummies(Alldata['SITE_2']).rename(columns=lambda x: 'SITE_' + str(x))  
  
# =============== create dummy variables for categories_list ==============  
data_CLIST = list(Alldata['categories_list'])  
CLIST_cnt = Counter(data_CLIST)  
  
# keep the categories_list value only if it occurs more than 300 times; otherwise encode it as 'other_CLIST'  
CLIST_2 = ['other_CLIST'] * Alldata.shape[0]  
for i in range(Alldata.shape[0]):  
    if CLIST_cnt[Alldata.iloc[i]['categories_list']] > 300:  
        CLIST_2[i] = Alldata.iloc[i]['categories_list']  
  
CLIST_2_cnt = Counter(CLIST_2)  
  
Alldata['CLIST_2'] = CLIST_2  
dummies_CLIST = pd.get_dummies(Alldata['CLIST_2']).rename(columns=lambda x: 'CLIST_' + str(x))  
  
# ================ create dummy variables for user_agent_string ===========  
data_UAS = list(Alldata['user_agent_string'])  
data_UAS_2 = [x[:50] for x in data_UAS]    # truncated user-agent strings (computed here but not used below)  
UAS_cnt = Counter(data_UAS)  
  
# keep the user_agent_string value only if it occurs more than 300 times; otherwise encode it as 'other_UAS'  
UAS_2 = ['other_UAS'] * Alldata.shape[0]  
for i in range(Alldata.shape[0]):  
    if UAS_cnt[Alldata.iloc[i]['user_agent_string']] > 300:  
        UAS_2[i] = Alldata.iloc[i]['user_agent_string']  
  
UAS_2_cnt = Counter(UAS_2)  
  
Alldata['UAS_2'] = UAS_2  
dummies_UAS = pd.get_dummies(Alldata['UAS_2']).rename(columns=lambda x: 'UAS_' + str(x))  
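
# The four blocks above (EXCHP, SITE, CLIST, UAS) repeat the same pattern.
# A possible refactor, shown only as a sketch (the names it defines are not
# used by the rest of this script), collapses rare values with vectorized
# pandas operations instead of an iloc loop:
def collapse_rare(series, min_count=300, other_label='other'):
    """Replace values that occur at most min_count times with other_label."""
    counts = series.value_counts()
    return series.where(series.map(counts) > min_count, other_label)

# example usage, equivalent to the EXCHP_2 loop above:
EXCHP_2_alt = collapse_rare(Alldata['exchange_publisher'], 300, 'other_EXCHP')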
  
# ================= combine all the numeric and dummy variables ============  
  
X = Alldata[['user_local_hour', 'abovebelow_fold']]  
X = pd.concat([X, dummies_CUID, dummies_EXCH, dummies_EXCHP, dummies_SITE, dummies_TOPC, dummies_CLIST, dummies_UAS], axis = 1)  
featurenames = list(X)  
y = Alldata['label']  
  
X, y = np.asarray(X), np.asarray(y) # X is the feature matrix and y is the label  
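
# Memory note (a sketch, not used below): with this many one-hot columns the
# dense dummies can get large; pandas can also build sparse dummy columns,
# which is worth comparing if memory becomes a problem.
dummies_SITE_sparse = pd.get_dummies(Alldata['SITE_2'], sparse=True)
dummies_SITE_sparse.memory_usage(deep=True).sum()   # compare with the dense dummies_SITE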
  
# ##########################################################################  
  
#                Model training and testing  
  
# ##########################################################################   
# split the data  
from sklearn.model_selection import train_test_split  # sklearn >= 0.18; older releases used sklearn.cross_validation  
  
Xtr, Xts, ytr, yts = train_test_split(X, y, test_size=0.2, random_state=42)  
  
ntr, p = Xtr.shape  
nts, p = Xts.shape  
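
# Optional variant (not used by the rest of the script): if the labels are
# imbalanced, a stratified split keeps the positive/negative ratio the same in
# both folds. The *_s names below are illustrative only.
Xtr_s, Xts_s, ytr_s, yts_s = train_test_split(X, y, test_size=0.2,
                                              random_state=42, stratify=y)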
  
# ===========  fit the model with gradient boosting ========================  
from sklearn.ensemble import GradientBoostingClassifier  
  
# fit a gradient boosting classifier on the training data  
GBC = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=2)  
GBC.fit(Xtr, ytr)  
  
# make prediction  
pred_GB = GBC.predict(Xts)  
pred_GB_prob = GBC.predict_proba(Xts)  
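
# The settings above (500 trees, depth 2, learning rate 0.1) are hand-picked.
# One way to choose them is a cross-validated grid search; this is only a
# sketch and can be slow on data this wide, so the fit call is left commented.
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [100, 300, 500],
              'max_depth': [2, 3],
              'learning_rate': [0.05, 0.1]}
GB_search = GridSearchCV(GradientBoostingClassifier(), param_grid,
                         scoring='roc_auc', cv=3)
# GB_search.fit(Xtr, ytr)
# GB_search.best_params_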
  
# calculate accuracy and confusion matrix  
from sklearn.metrics import confusion_matrix  
acc_GB = np.sum(pred_GB == yts)/float(len(yts))  
cfm_GB = confusion_matrix(yts, pred_GB)  
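
# Equivalent accuracy via sklearn plus a per-class precision/recall summary
# (an optional check; this output was not part of the original write-up):
from sklearn.metrics import accuracy_score, classification_report
acc_GB_check = accuracy_score(yts, pred_GB)
print(classification_report(yts, pred_GB))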
  
# calculate and plot the ROC and AUC  
from sklearn.metrics import roc_curve, auc  
fpr, tpr, thresholds = roc_curve(yts, pred_GB_prob[:,1])  
AUC = auc(fpr, tpr)  
  
import matplotlib.pyplot as plt  
plt.figure()  
lw = 2  
plt.plot(fpr, tpr, color='darkorange',  
         lw=lw, label='ROC curve (area = %0.2f)' % AUC)  
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  
plt.xlim([0.0, 1.0])  
plt.ylim([0.0, 1.05])  
plt.xlabel('False Positive Rate')  
plt.ylabel('True Positive Rate')  
plt.title('ROC plot')  
plt.legend(loc="lower right")  
#plt.show()   
  
filename = 'ROC.png'  
plt.savefig(path + filename,  bbox_inches="tight", dpi = 200)  
  
# plot the feature importance  
importances = GBC.feature_importances_  
indices_imp = np.argsort(importances)[::-1]  
  
# plot the top 15 feature importances  
plt.figure()  
plt.title("Feature importances")  
plt.bar(range(15), importances[indices_imp[:15]], width = 0.8,  
       color="r",  align="center")  
plt.xticks(range(15), indices_imp[:15])  
plt.xlim([-1, 15])  
plt.xlabel('feature index')  
#plt.show()  
  
filename = 'FI.png'  
plt.savefig(path + filename,  bbox_inches="tight", dpi = 200)  
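
# The bar chart above labels bars by column index only. Mapping the indices back
# to the names stored in featurenames makes the ranking easier to read (an
# optional addition; this listing was not part of the original output):
for i in indices_imp[:15]:
    print('%-40s %.4f' % (featurenames[i], importances[i]))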
