I am using KNN to predict survival for the Titanic competition on Kaggle, and I would like to know how to fix the problems in the code below.
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import seaborn as sns
import sys
import importlib
# NOTE(review): reloads the already-imported `sys` module. This looks like a
# leftover of the Python 2 `reload(sys)` / `sys.setdefaultencoding` idiom;
# nothing in the visible code uses `sys` afterwards - confirm before removing.
importlib.reload(sys)
# Identifier-like columns that are dropped before one-hot encoding.
_UNUSED_COLUMNS = ['PassengerId', 'Name', 'Ticket']


def _age_group(age):
    """Return the age-bracket label for *age*.

    A missing age (NaN) fails every comparison below and ends up as 'null'.
    """
    if age < 12:
        return 'child'
    if age < 30:
        return 'youth'
    if age < 60:
        # (sic) spelling kept so the encoded column name stays 'Age_adlut'.
        return 'adlut'
    if age < 75:
        return 'old'
    if age >= 75:
        return 'tooold'
    return 'null'


def _family_size_group(count, middle_limit):
    """Label a relative count as 'small' (0), 'middle' (< middle_limit) or 'large'."""
    if count < 1:
        return 'small'
    if count < middle_limit:
        return 'middle'
    return 'large'


def _categorize(frame):
    """Return a copy of *frame* with Age, SibSp, Parch, Fare and Cabin binned.

    The same binning is applied to the training and the test data so both
    end up with the same category labels.
    """
    out = frame.copy()
    out['Age'] = out['Age'].map(_age_group)
    out['SibSp'] = out['SibSp'].map(lambda n: _family_size_group(n, 3))
    # Three Parch groups, chosen from a violin plot of the data.
    out['Parch'] = out['Parch'].map(lambda n: _family_size_group(n, 4))
    # log(fare + 1): the +1 keeps a fare of 0 inside the domain of log.
    log_fare = np.log(out['Fare'] + 1)
    out['Fare'] = log_fare.map(lambda v: 'poor' if v < 2.5 else 'rich')
    # Only record whether a cabin value is present at all.
    out['Cabin'] = out['Cabin'].map(
        lambda cabin: 'yes' if isinstance(cabin, str) else 'no')
    return out


test = pd.read_csv("test.csv", encoding='utf_8')
train = pd.read_csv('train.csv', encoding='utf_8')

# --- Training data -------------------------------------------------------
train = _categorize(train)
# Drop the rows that still contain missing values after binning.
train = train.dropna(axis=0)
labels = train['Survived']
features = train.drop(['Survived'] + _UNUSED_COLUMNS, axis=1)
features = pd.get_dummies(features)
encoded = list(features.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))

# --- Test data -----------------------------------------------------------
# Fill missing fares with the mean before the log transform and binning.
# Assigning the result back (instead of a chained `inplace=True`) makes sure
# the fill really lands in the DataFrame.
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())
test = _categorize(test)
# NOTE: PassengerId is dropped here without being saved (the original author
# considered keeping it redundant).
test = test.drop(_UNUSED_COLUMNS, axis=1)
test = pd.get_dummies(test)
# Align the test columns with the training columns: a category that occurs in
# only one of the two files would otherwise give the matrices different
# shapes and break prediction with a model fitted on `features`.
test = test.reindex(columns=features.columns, fill_value=0)
encoded = list(test.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score,roc_auc_score
from time import time
from sklearn.neighbors import KNeighborsClassifier
def fit_model(alg, parameters, X=None, y=None):
    """Tune *alg* with a 5-fold cross-validated grid search.

    Args:
        alg: Unfitted scikit-learn estimator to tune.
        parameters: Parameter grid handed to ``GridSearchCV``.
        X: Feature matrix. Defaults to the module-level ``features``.
        y: Target vector. Defaults to the module-level ``labels``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    # The data set is small, so the whole training set is used for the search.
    if X is None:
        X = features
    if y is None:
        y = labels
    # The built-in "roc_auc" scorer evaluates predicted scores/probabilities.
    # Wrapping roc_auc_score in make_scorer() would score hard class labels
    # instead, which is not a real area-under-curve measurement.
    grid = GridSearchCV(alg, parameters, scoring="roc_auc", cv=5)
    start = time()  # time the search
    grid = grid.fit(X, y)
    elapsed = round(time() - start, 3)
    print(grid.best_params_)  # best parameter combination found
    print('searching time for {} is {} s'.format(alg.__class__.__name__, elapsed))
    return grid
# K-nearest-neighbours classifier; n_jobs=-1 is passed through to scikit-learn
# to parallelise the neighbour search.
alg5 = KNeighborsClassifier(n_jobs=-1)
# Search grid: n_neighbors 2..9 and leaf_size 10, 30, 50, 70.
parameters5 = {'n_neighbors': range(2, 10), 'leaf_size': range(10, 80, 20)}
clf5 = fit_model(alg5, parameters5)