# R语言检验交叉验证

# R: choosing the best model with cross-validation.
# Consider the following data-modelling problem: noisy samples of one full
# period of a sine wave on [0, 1].

x <- seq(0, 1, by = 0.01)
# The multiplication signs had been lost ("2pix"), which does not parse;
# the intended signal is sin(2 * pi * x) plus Gaussian noise (mean 0, sd 0.1).
y <- sin(2 * pi * x) + rnorm(length(x), mean = 0, sd = 0.1)
data1 <- data.frame(x, y)
# Install ggplot2 only when it is missing. The original call used typographic
# quotes, which R cannot parse, and would have reinstalled on every run.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

# Scatter plot of the raw data.
ggplot(data1, aes(x, y)) + geom_point()

# Baseline: a degree-1 polynomial (straight line) fitted by least squares.
poly1 <- lm(y ~ poly(x, degree = 1), data = data1)
summary(poly1)
#' Root-mean-square error between observed values and predictions.
#'
#' @param t Numeric vector of observed (true) values.
#' @param p Numeric vector of predicted values; a length-one value is
#'   recycled against `t` by R's usual arithmetic rules.
#' @return A single number, `sqrt(mean((t - p)^2))`.
RMSE <- function(t, p) {
  squared_error <- (t - p)^2
  sqrt(mean(squared_error))
}
# In-sample RMSE of the straight-line fit.
RMSE(data1$y, predict(poly1))
# The RMSE is computed, but is that value good or bad?
# Fit polynomials of degree 1..10 and compare their in-sample RMSE.

# RMSE of the constant model that always predicts mean(y). It does not depend
# on the degree, so it is computed once outside the loop.
mean.rmse <- RMSE(data1$y, mean(data1$y))

degrees <- 1:10
# One single-row data frame per degree, bound together after the loop instead
# of growing `Performance` with rbind() on every iteration.
perf_rows <- vector("list", length(degrees))
for (d in degrees) {
  polyfit <- lm(y ~ poly(x, degree = d), data = data1)
  model.rmse <- RMSE(data1$y, predict(polyfit))
  perf_rows[[d]] <- data.frame(
    Degree = d,
    model.rmse,
    # R^2 = 1 - SS_res / SS_tot, and SS_res / SS_tot equals the squared ratio
    # of the two RMSE values (the previous unsquared ratio was not R^2).
    Rsqr = 1 - (model.rmse / mean.rmse)^2
  )
}
Performance <- do.call(rbind, perf_rows)
Performance

# It looks as if the RMSE keeps shrinking as Degree grows.
ggplot(Performance, aes(Degree, model.rmse)) +
  geom_line() +
  geom_point()
# Is a larger degree always better? No. Overfitting is the concern: a fit that
# is "too good" has also fitted the random noise in the sample.
# Cross-validation
# Split the data set into two parts, training data and testing data. Build the
# model on the training data and check how well it generalises on the testing
# data, which guards against overfitting.
## 1. Split the data into training and testing sets.
set.seed(1)
index <- nrow(data1)
# Row numbers of a random half of the data.
index1 <- sample(index, size = round(index * 0.5))
testingdata <- data1[-index1, ]  # every row that was not sampled
trainingdata <- data1[index1, ]  # the sampled rows
## Fit polynomials of degree 1..15 on the training half and record the RMSE
## on the training half and on the held-out testing half.
cv_degrees <- 1:15
# Collect one two-row data frame (Train, Test) per degree and bind them once
# after the loop, instead of calling rbind() twice per iteration.
split_rows <- vector("list", length(cv_degrees))
for (d in cv_degrees) {
  polyfit <- lm(y ~ poly(x, degree = d), data = trainingdata)
  split_rows[[d]] <- data.frame(
    Degree = d,
    Data = c("Train", "Test"),
    rmse = c(
      RMSE(trainingdata$y, predict(polyfit)),
      RMSE(testingdata$y, predict(polyfit, newdata = testingdata))
    )
  )
}
Performance1 <- do.call(rbind, split_rows)

# The table was previously printed twice; once is enough.
Performance1
ggplot(Performance1, aes(Degree, rmse, linetype = Data)) +
  geom_point() +
  geom_line()
# Author's observation from their run: around degree = 12 the RMSE on the test
# data starts to rise sharply, i.e. the model fitted on the training data no
# longer generalises because it has fitted the noise (overfitting).
# Author's conclusion: the model is best at Degree = 3.
# NOTE(review): the noise in `y` is drawn before set.seed() is called, so the
# exact degrees quoted above can differ between runs; confirm against the plot.

# K-fold cross-validation
training <- iris
# Alternative: a single random split into roughly 70% training rows and 30%
# test rows (kept for reference, not run). The original snippet indexed with
# undefined `ind1` / `ind2`; selecting on the sampled group label is
# presumably what was meant.
# ind <- sample(2, nrow(training), replace = TRUE, prob = c(0.7, 0.3))
# traindata <- training[ind == 1, ]  # training set
# testdata <- training[ind == 2, ]   # test set
# 10-fold cross-validation
# A common way to estimate a model's accuracy: split the data into ten parts,
# take each part in turn as the test set while training on the other nine,
# and use the mean of the ten results as the accuracy estimate.
# Usually the whole procedure is repeated several times and averaged again
# (e.g. 10 x 10-fold cross-validation) for a more precise estimate.
# The folds are created with caret's splitting helper.

# Install caret only when it is missing. The original call used typographic
# quotes, which R cannot parse.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
library(caret)
library(lattice)
# NOTE(review): the original statements here were destroyed by a rendering
# error ("KaTeX parse error"). Only the createFolds() call, the tail of an
# accuracy expression using predict(rf), the closing brace and mean(re)
# survived. The loop below is a reconstruction; confirm it against the
# original article.
# NOTE(review): `rf` is presumably a random forest fitted with the
# randomForest package, which the visible script never loads; confirm.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)

# Split the row indices of `training` into 10 folds based on the Species label.
folds <- createFolds(y = training$Species, k = 10)

# One accuracy value per fold.
re <- numeric(length(folds))
for (i in seq_along(folds)) {
  testdata <- training[folds[[i]], ]    # the held-out fold
  traindata <- training[-folds[[i]], ]  # the remaining folds
  rf <- randomForest(Species ~ ., data = traindata)
  # Accuracy on the held-out fold. The surviving fragment compared
  # predict(rf) with the labels of the whole `training` set, which does not
  # score the held-out rows.
  re[i] <- mean(predict(rf, newdata = testdata) == testdata$Species)
}
mean(re)  # mean accuracy over the k folds is the model's estimated accuracy