R을 이용한 데이터마이닝 수업 중에 Kaggle Bike Sharing Demand 예측 과제를 진행 중인데요.
아래 코드가 중회귀분석(다중회귀분석)을 이용한 것은 알겠는데, 정확히 어떤 방식으로 동작하는지 해석이 잘 안 돼서요.
내용을 요약하자면 뭐라고 해야 되나요? ㅠㅠ
# ---- Setup and data loading ----
# NOTE(review): this working directory is machine-specific; every CSV below is
# read relative to it, so adjust the path before running elsewhere.
setwd("C:/Users/정보화29/Desktop/r data")

# Install MASS only when it is missing instead of re-installing on every run.
if (!requireNamespace("MASS", quietly = TRUE)) {
  install.packages("MASS")
}
library("MASS")

BIKE <- read.csv("train.csv")

# Log-transformed responses, each stored as an n x 1 numeric matrix.
# ACTUAL (log of the total count) is the response used for the fit further
# down; CASUAL and REGISTERED are computed but not used in the visible script.
# NOTE(review): log() yields -Inf wherever the source column contains 0.
ACTUAL <- log(as.matrix(BIKE[, "count"]))
CASUAL <- log(as.matrix(BIKE[, "casual"]))
REGISTERED <- log(as.matrix(BIKE[, "registered"]))

# SUB is loaded but not referenced again in the visible script.
SUB <- read.csv("sampleSubmission.csv")
TEST <- read.csv("test.csv")
# ---- Calendar fields (hour / day / month / year) ----
# Keep the raw test timestamps as text; they label the submission rows later.
TIMES <- paste(TEST[, "datetime"])

# Split "YYYY-mm-dd HH:MM" timestamps into zero-padded character fields.
# The timestamps are parsed once, in UTC: parsing in the session's local time
# zone can turn wall-clock times that fall into a daylight-saving gap into NA.
# Returns a data frame with character columns hour, day, month, year.
extract_time_parts <- function(datetime) {
  stamp <- as.POSIXct(
    as.character(datetime),
    format = "%Y-%m-%d %H:%M",
    tz = "UTC"
  )
  data.frame(
    hour = format(stamp, format = "%H"),
    day = format(stamp, format = "%d"),
    month = format(stamp, format = "%m"),
    year = format(stamp, format = "%Y"),
    stringsAsFactors = FALSE
  )
}

# Best-effort fallback for timestamps that still failed to parse: replace a
# missing field with the rounded mean of the adjacent rows' values, written
# with the same zero padding as the parsed fields so it does not create a new
# category. Rows without any usable neighbour stay NA.
# NOTE(review): averaging is only meaningful when the neighbours do not wrap
# around (e.g. hour 23 and hour 01).
fill_time_gaps <- function(parts) {
  for (field in names(parts)) {
    values <- parts[[field]]
    for (i in which(is.na(values))) {
      # Index 0 is dropped and an index past the end yields NA, so the first
      # and last rows simply use the single neighbour they have.
      neighbours <- as.numeric(values[c(i - 1, i + 1)])
      neighbours <- neighbours[!is.na(neighbours)]
      if (length(neighbours) > 0) {
        values[i] <- sprintf("%02d", as.integer(round(mean(neighbours))))
      }
    }
    parts[[field]] <- values
  }
  parts
}

BIKE <- cbind(BIKE, fill_time_gaps(extract_time_parts(BIKE[, "datetime"])))
TEST <- cbind(TEST, fill_time_gaps(extract_time_parts(TEST[, "datetime"])))
# ---- Elapsed time since the start of the training data ----
# Reference instant assembled from the field-wise minima of the training set
# (smallest year, month, day and hour, each taken independently).
START <- ISOdate(
  min(BIKE[, "year"]),
  min(BIKE[, "month"]),
  min(BIKE[, "day"]),
  min(BIKE[, "hour"])
)

# Hours elapsed between every row's timestamp and START, returned as an
# n x 1 numeric matrix. ISOdate() is vectorised, so the whole column is
# computed in one call instead of growing a matrix with rbind() row by row.
hours_since_start <- function(frame) {
  stamps <- ISOdate(
    frame[, "year"],
    frame[, "month"],
    frame[, "day"],
    frame[, "hour"]
  )
  elapsed <- difftime(stamps, START, units = "hours")
  matrix(as.numeric(elapsed, units = "hours"), ncol = 1)
}

BIKETIME <- hours_since_start(BIKE)
TESTTIME <- hours_since_start(TEST)
# Day of the week via Zeller's congruence (Gregorian calendar).
#
# X: data frame or matrix with columns "year", "month" and "day" whose values
#    can be converted with as.numeric() (e.g. "2011", "03", "15").
# Returns an n x 1 numeric matrix with codes 0 = Saturday, 1 = Sunday,
# 2 = Monday, ..., 6 = Friday. Rows with missing fields yield NA.
dayof <- function(X) {
  month <- as.numeric(X[, "month"])
  day <- as.numeric(X[, "day"])
  year <- as.numeric(X[, "year"])

  # Zeller treats January and February as months 13 and 14 of the previous
  # year.
  early <- !is.na(month) & month <= 2
  month[early] <- month[early] + 12
  year[early] <- year[early] - 1

  year_of_century <- year %% 100
  century <- year %/% 100

  # The Gregorian century correction is floor(J / 4) - 2 * J.
  DAY <- (day + floor(13 * (month + 1) / 5) + year_of_century +
    floor(year_of_century / 4) + floor(century / 4) - 2 * century) %% 7

  matrix(DAY, ncol = 1)
}
# Append the day-of-week code to both data sets.
BIKE <- cbind(BIKE, "dayof" = dayof(BIKE))
TEST <- cbind(TEST, "dayof" = dayof(TEST))

# Drop the columns that are not used as predictors from TEST, then restrict
# BIKE to exactly the same columns in the same order.
unused_columns <- c("datetime", "day", "year")
TEST <- TEST[, !(colnames(TEST) %in% unused_columns)]
BIKE <- BIKE[, colnames(TEST)]
# Hour x day-of-week interaction dummies.
# A and B hold the (hour, dayof) pair of every training / test row.
A <- cbind(BIKE[, "hour"], BIKE[, "dayof"])
B <- cbind(TEST[, "hour"], TEST[, "dayof"])

# One 0/1 column per (hour, dayof) combination, enumerated from the values
# that occur in TEST with dayof varying fastest; A2 and B2 share the same
# column order.
hour_levels <- unique(B[, 1])
dayof_levels <- unique(B[, 2])
pair_count <- length(hour_levels) * length(dayof_levels)

A2 <- matrix(data = 0, nrow = nrow(BIKE), ncol = pair_count)
B2 <- matrix(data = 0, nrow = nrow(TEST), ncol = pair_count)

slot <- 0
for (hour_value in hour_levels) {
  for (dayof_value in dayof_levels) {
    slot <- slot + 1
    A2[A[, 1] == hour_value & A[, 2] == dayof_value, slot] <- 1
    B2[B[, 1] == hour_value & B[, 2] == dayof_value, slot] <- 1
  }
}
# Disabled: an earlier attempt to drop month/hour/dayof by name. Those columns
# are instead removed by the one-hot encoding loop below.
# NOTE(review): `!=` against a length-3 vector recycles element-wise, so these
# lines would not have removed the three columns as written.
#TEST<-TEST[,colnames(TEST)!=c("month","hour","dayof")]
#BIKE<-BIKE[,colnames(BIKE)!=c("month","hour","dayof")]
# Append squared and cubed versions of the four continuous weather columns
# (temp, atemp, humidity, windspeed) to the training data; the new columns are
# unnamed arguments, so cbind() generates their column names.
BIKE<-cbind(BIKE,BIKE[,"temp"]^2,BIKE[,"atemp"]^2,BIKE[,"humidity"]^2,BIKE[,"windspeed"]^2,(BIKE[,"temp"])^3,(BIKE[,"atemp"])^3,(BIKE[,"humidity"])^3,(BIKE[,"windspeed"])^3)
# Same polynomial terms, in the same order, for the test data.
TEST<-cbind(TEST,TEST[,"temp"]^2,TEST[,"atemp"]^2,TEST[,"humidity"]^2,TEST[,"windspeed"]^2,(TEST[,"temp"])^3,(TEST[,"atemp"])^3,(TEST[,"humidity"])^3,(TEST[,"windspeed"])^3)
# Columns that are replaced by 0/1 indicator columns.
Categorical <- c("holiday", "workingday", "weather", "month", "hour", "dayof")

# Build one n x 1 indicator column per entry of `level_values` and bind them
# side by side. `column` is a one-column data frame; the result is NULL when
# `level_values` is empty.
one_hot <- function(column, level_values) {
  encoded <- c()
  for (level in level_values) {
    flags <- matrix(data = 0, nrow = nrow(column), ncol = 1)
    flags[column == level] <- 1
    encoded <- cbind(encoded, flags)
  }
  encoded
}

for (feature in Categorical) {
  # The levels come from the training data only, so BIKE and TEST receive the
  # same indicator columns in the same order; a value that occurs only in TEST
  # gets all-zero indicators.
  train_levels <- unique(BIKE[, feature])
  BT2 <- one_hot(BIKE[feature], train_levels)
  TT2 <- one_hot(TEST[feature], train_levels)

  # Swap the original column for its indicator columns.
  BIKE <- cbind(BIKE[, colnames(BIKE) != feature], BT2)
  TEST <- cbind(TEST[, colnames(TEST) != feature], TT2)
}
# Assemble the final design matrices. Every step is applied to BIKE and TEST
# with the same column order, because the fitted coefficients are matched to
# columns by position.

# Append the hour x day-of-week interaction dummies.
BIKE<-cbind(BIKE,A2)
TEST<-cbind(TEST,B2)
# Append cyclic time terms built from the elapsed hours: sine/cosine with a
# period of 365*24 hours and with a period of 24 hours, followed by the raw
# elapsed-hours value itself.
TEST<-cbind(TEST,sin(2*pi*TESTTIME/(365*24)),cos(2*pi*TESTTIME/(365*24)),sin(2*pi*TESTTIME/(24)),cos(2*pi*TESTTIME/(24)),TESTTIME)
BIKE<-cbind(BIKE,sin(2*pi*BIKETIME/(365*24)),cos(2*pi*BIKETIME/(365*24)),sin(2*pi*BIKETIME/(24)),cos(2*pi*BIKETIME/(24)),BIKETIME)
# Append a constant column of ones (the intercept term).
BIKE<-cbind(BIKE,matrix(data=1,nrow=nrow(BIKE),ncol=1))
TEST<-cbind(TEST,matrix(data=1,nrow=nrow(TEST),ncol=1))
# Convert both data frames to numeric matrices for the matrix algebra below.
BIKE<-data.matrix(BIKE)
TEST<-data.matrix(TEST)
# ---- Fit and predict ----
# Least-squares coefficients through the Moore-Penrose pseudoinverse of the
# training design matrix; the response is log(count).
BikeInv <- ginv(as.matrix(BIKE))
ACT <- BikeInv %*% ACTUAL

# Back-transform the linear predictions from the log scale. The values stay
# numeric here: binding them to the timestamps with cbind() would coerce them
# to character and turn the non-negativity clamp into a string comparison.
predicted_count <- as.vector(exp(TEST %*% ACT))
# exp() cannot return a negative number, so this clamp is only a safeguard.
predicted_count <- pmax(predicted_count, 0)

Predictions <- data.frame(
  datetime = TIMES,
  count = predicted_count,
  stringsAsFactors = FALSE
)

# Write the submission file with a header row and unquoted fields.
write.table(
  Predictions,
  file = "Sub.csv",
  row.names = FALSE,
  quote = FALSE,
  sep = ",",
  col.names = TRUE
)