Demo entry 6736296

Submitted by cjm on Apr 26, 2018 at 13:44
Language: Python 3. Code size: 6.5 kB.

# coding: utf-8

import warnings
warnings.filterwarnings("ignore")

# In[2]:

import pandas as pd
import numpy as np


#Load Data
def loadData(csv_file):
    data_read = pd.read_csv(csv_file)
    return data_read

# Preprocess the training set:
TrainData = loadData(r'C:\Users\StrongYuzi\Desktop\PY\train.csv')
#print(TrainData.describe())

'''
import matplotlib.pyplot as plt
# Use figures to show the relationships between the features
fig = plt.figure()
fig.set(alpha=0.2)  # set the alpha (transparency) parameter for the figure

plt.subplot2grid((2,3),(0,0))  # split into several sub-plots
TrainData.Survived.value_counts().plot(kind='bar')  # bar chart
plt.title(u"Survived (1 = survived)")  # title
plt.ylabel(u"Number of people")

plt.subplot2grid((2,3),(0,1))
TrainData.Pclass.value_counts().plot(kind='bar')
plt.title(u"Pclass (1 = first class)")  # title
plt.ylabel(u"Number of people per class")

plt.show()
'''

'''
Strategies for missing values:
- If the samples with missing values make up a very large share of the data, we may simply drop the
  feature; keeping it could introduce noise and hurt the final result.
- If a moderate share of samples is missing and the feature is not continuous (e.g. a categorical
  attribute), treat NaN as a new category and add it to the category set.
- If a moderate share of samples is missing and the feature is continuous, we can sometimes pick a
  step (e.g. for Age, every 2-3 years), discretize it, and then add NaN as one more category.
- If only a few values are missing, we can also try to fit a model on the existing values and use it
  to fill in the gaps.
'''
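
# A minimal sketch of the "discretize a continuous feature and treat NaN as its own bin" strategy
# described above, using pandas only. The bin step and the 'Age_bin' name are illustrative
# assumptions, not part of the original pipeline (which instead fills Age with a regressor below).
def discretize_with_nan_bin(df, column='Age', step=3):
    upper = df[column].max()
    bins = np.arange(0, upper + step, step)
    binned = pd.cut(df[column], bins=bins)             # continuous -> categorical intervals
    binned = binned.cat.add_categories(['Missing'])    # add an explicit category for NaN
    return binned.fillna('Missing')

# Example usage (not executed in the original script):
# TrainData['Age_bin'] = discretize_with_nan_bin(TrainData, 'Age', step=3)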

# Use a RandomForestRegressor to fill in the missing Age values
from sklearn.ensemble import RandomForestRegressor
def set_missing_ages(df):
    # Take the existing numeric features and feed them into a Random Forest Regressor
    age_df = df[['Age','Fare','Parch','SibSp','Pclass']]

    # Split the passengers into those with a known age and those without
    age_df_known = age_df[age_df.Age.notnull()].values
    age_df_unknown = age_df[age_df.Age.isnull()].values

    # Fit a model on the passengers with a known age, then predict the age of the others
    age_df_known_label = age_df_known[:,0]
    age_df_known_feature = age_df_known[:,1:]
    clf_rfr = RandomForestRegressor(n_estimators=200, n_jobs=-1)
    clf_rfr.fit(age_df_known_feature, age_df_known_label)

    PredictAge = clf_rfr.predict(age_df_unknown[:,1:])

    # Fill the missing values with the model's predictions
    df.loc[(df.Age.isnull()),'Age'] = PredictAge
    return df, clf_rfr

NewTrainData, clf_rfr = set_missing_ages(TrainData)


# In[3]:

'''
# Cabin has too many NaNs, so drop it entirely -- .drop(['Feature1','Feature2'],axis=1) removes those columns
NewTrainData, clf_rfr = set_missing_ages(TrainData)
NewTrainData_feature = NewTrainData.drop(['PassengerId','Survived','Name','Cabin'],axis=1)
NewTrainData_label = NewTrainData['Survived']
'''
# Try instead whether the presence or absence of Cabin affects the result:
# Replace Cabin with a 0/1 indicator for "present" vs "missing"
def set_missing_cabin(df):
    # mark the non-missing values first; otherwise the rows set to 0 below would no
    # longer be null and the second assignment would overwrite everything with 1
    df.loc[df.Cabin.notnull(),'Cabin'] = 1
    df.loc[df.Cabin.isnull(),'Cabin'] = 0
    return df

NewTrainData = set_missing_cabin(NewTrainData)
NewTrainData_feature = NewTrainData.drop(['PassengerId','Survived','Name'],axis=1)

NewTrainData_label = NewTrainData['Survived']


# In[4]:

# Use pandas get_dummies to one-hot encode the categorical attributes
Embarked_dummies = pd.get_dummies(NewTrainData_feature['Embarked'],prefix='Embarked')
Sex_dummies = pd.get_dummies(NewTrainData_feature['Sex'],prefix='Sex')
Pclass_dummies = pd.get_dummies(NewTrainData_feature['Pclass'],prefix='Pclass')
Cabin_dummies = pd.get_dummies(NewTrainData_feature['Cabin'],prefix='Cabin')

# Concatenate the expanded attributes and drop the originals
NewTrainData_feature = pd.concat([NewTrainData_feature, Embarked_dummies, Sex_dummies, Pclass_dummies, Cabin_dummies], axis=1)
NewTrainData_feature = NewTrainData_feature.drop(['Pclass','Cabin','Sex','Ticket','Embarked'], axis=1)


# In[10]:

# Standardize the Age and Fare attributes
from sklearn import preprocessing

# Fit one scaler per column on the training data so the same scaling can be
# re-applied to the test data with transform() later on.
age_scaler = preprocessing.StandardScaler()
NewTrainData_feature['Age_scale'] = age_scaler.fit_transform(NewTrainData_feature['Age'].values.reshape(-1,1)).ravel()

fare_scaler = preprocessing.StandardScaler()
NewTrainData_feature['Fare_scale'] = fare_scaler.fit_transform(NewTrainData_feature['Fare'].values.reshape(-1,1)).ravel()

NewTrainData_feature = NewTrainData_feature.drop(['Age','Fare'],axis=1)


# In[12]:

# Preprocess the test set:
TestData = loadData(r'C:\Users\StrongYuzi\Desktop\PY\test.csv')
# The test set has a missing Fare value; fill it with 0 for now
TestData.loc[(TestData.Fare.isnull()),'Fare'] = 0


# Fill in the missing ages in the test set using the regressor fitted on the training set
TestData_tmp = TestData[['Age','Fare','Parch','SibSp','Pclass']]
age_TestData_unknown = TestData_tmp[TestData.Age.isnull()].values
age_TestData_unknown_feature = age_TestData_unknown[:,1:]
PredictTestAge = clf_rfr.predict(age_TestData_unknown_feature)
TestData.loc[(TestData.Age.isnull()),'Age'] = PredictTestAge


# Drop unused columns from the test set and convert Cabin to the same 0/1 indicator
NewTestData_feature = TestData.drop(['PassengerId','Name'],axis=1)
NewTestData_feature = set_missing_cabin(NewTestData_feature)


# One-hot encode the categorical attributes
Embarked_dummies_test = pd.get_dummies(NewTestData_feature['Embarked'],prefix='Embarked')
Sex_dummies_test = pd.get_dummies(NewTestData_feature['Sex'],prefix='Sex')
Pclass_dummies_test = pd.get_dummies(NewTestData_feature['Pclass'],prefix='Pclass')
Cabin_dummies_test = pd.get_dummies(NewTestData_feature['Cabin'],prefix='Cabin')

# Concatenate the expanded attributes and drop the originals
NewTestData_feature = pd.concat([NewTestData_feature, Embarked_dummies_test, Sex_dummies_test, Pclass_dummies_test, Cabin_dummies_test], axis=1)
NewTestData_feature = NewTestData_feature.drop(['Pclass','Cabin','Sex','Ticket','Embarked'], axis=1)


# Standardize the test data with the scalers fitted on the training set
NewTestData_feature['Age_scale'] = age_scaler.transform(NewTestData_feature['Age'].values.reshape(-1,1)).ravel()

NewTestData_feature['Fare_scale'] = fare_scaler.transform(NewTestData_feature['Fare'].values.reshape(-1,1)).ravel()

NewTestData_feature = NewTestData_feature.drop(['Age','Fare'],axis=1)
#NewTestData_feature = NewTestData_feature.drop(['Fare'],axis=1)
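
# At this point both feature frames should have identical columns. If a category value were
# missing from the test set, its dummy column would be absent here; a minimal guard (an
# assumption, not part of the original script) would be to reindex against the training columns:
# NewTestData_feature = NewTestData_feature.reindex(columns=NewTrainData_feature.columns, fill_value=0)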

# Convert to plain NumPy arrays for the classifiers
NewTrainData_feature = NewTrainData_feature.values
NewTrainData_label = NewTrainData_label.values
NewTestData_feature = NewTestData_feature.values


# In[14]:

# Fit an SVM classifier
from sklearn import svm
clf_svm = svm.SVC(C=40,gamma=0.05)
clf_svm.fit(NewTrainData_feature,NewTrainData_label)


# In[15]:

# Fit a GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
clf_gbdt = GradientBoostingClassifier(learning_rate=0.1,n_estimators=100,max_features='sqrt',random_state=0)
clf_gbdt.fit(NewTrainData_feature,NewTrainData_label)


# In[16]:
'''
# Predictions on the test set
predictions = clf_gbdt.predict(NewTestData_feature)  # or clf_svm
'''
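
# A minimal sketch of turning the test-set predictions into a submission file. The
# 'submission.csv' filename and the choice of clf_gbdt are illustrative assumptions;
# the original script stops at the commented-out prediction above.
# submission = pd.DataFrame({
#     'PassengerId': TestData['PassengerId'],
#     'Survived': clf_gbdt.predict(NewTestData_feature).astype(int)
# })
# submission.to_csv('submission.csv', index=False)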

# In[31]:

# Training-set accuracy of both classifiers (predict once, then count correct labels;
# summing signed differences as before lets errors cancel out)
train_pred_svm = clf_svm.predict(NewTrainData_feature)
train_pred_gbdt = clf_gbdt.predict(NewTrainData_feature)
acc_svm = np.mean(train_pred_svm == NewTrainData_label)
acc_gbdt = np.mean(train_pred_gbdt == NewTrainData_label)


# In[32]:

print(acc_svm)
print(acc_gbdt)
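
# Training-set accuracy is an optimistic estimate. A minimal sketch of a fairer check with
# 5-fold cross-validation (not part of the original script; the fold count is an arbitrary choice):
# from sklearn.model_selection import cross_val_score
# print(cross_val_score(clf_svm, NewTrainData_feature, NewTrainData_label, cv=5).mean())
# print(cross_val_score(clf_gbdt, NewTrainData_feature, NewTrainData_label, cv=5).mean())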


# In[ ]:
