Rev. | 7c9f09ae96b5624e827e40177bc05e5f87f8de96 |
---|---|
Tamanho | 3,409 bytes |
Hora | 2015-04-10 00:30:14 |
Autor | Lorenzo Isella |
Mensagem de Log | A code using caret+GBM. |
rm(list=ls())
library(caret)
library(doMC)
library(gbm)
topK <- function(x,k){
tbl <- tabulate(x)
names(tbl) <- levels(x)
x <- as.character(x)
levelsToKeep <- names(tail(sort(tbl),k))
x[!(x %in% levelsToKeep)] <- 'other'
factor(x)
}
keep_levels <- function(x,level_list){
## tbl <- tabulate(x)
## names(tbl) <- levels(x)
x <- as.character(x)
levelsToKeep <- level_list
x[!(x %in% levelsToKeep)] <- 'other'
factor(x)
}
keep_levels2 <- function(x,level_list, rare_level){
## tbl <- tabulate(x)
## names(tbl) <- levels(x)
x <- as.character(x)
levelsToKeep <- level_list
x[!(x %in% levelsToKeep)] <- rare_level
factor(x)
}
synchronize_factor <- function(few_fac, many_fac){
few_fac <- droplevels(few_fac) #make sure there are no unused levels!
few_lev <- levels(few_fac)
freq_few <- tabulate(few_fac)
sel_min <- which(freq_few==min(freq_few))[1]
rare_level <- few_lev[sel_min]
new_many_fac <- keep_levels2(many_fac,few_lev,rare_level)
return(new_many_fac)
}
########################################################
########################################################
########################################################
########################################################
########################################################
########################################################
ncores <- 2
x <- readRDS("train-test.RDS")
## x <- readRDS("train-test-one-hot.RDS")
## x <- as.data.frame(x)
y <- readRDS("train-revenue.RDS")
trind = 1:length(y) ## rows from the training dataset in x
teind = (length(y)+1):nrow(x) ## rows from the test dataset in x
## x <- subset(x, select=-c(City))
train <- x[trind, ]
train$revenue <- y
#now we synchronize the factor to avoid the problem that some of them do not appear in the test set
train$City <- droplevels(train$City)
train$City.Group <- droplevels(train$City.Group)
train$Type <- droplevels(train$Type)
x$City <- synchronize_factor(train$City, x$City)
x$City.Group <- synchronize_factor(train$City.Group, x$City.Group)
x$Type <- synchronize_factor(train$Type, x$Type)
set.seed(1234)
## shuffle train!
shuffle <- sample(length(y))
train <- train[shuffle,]
saveRDS(train, "train-fixed.RDS")
gbmGrid <- expand.grid(interaction.depth = seq(1,9),
n.trees = seq(1,151, by=10),
shrinkage = seq(0.01,0.1, by=0.01))
fitControl <- trainControl(## 10-fold CV
method = "repeatedcv",
## method = "cv",
number = 5,
## repeated ten times
repeats = 10 )
## fitControl2 <- trainControl(
## method = "boot632",
## number = 100)
registerDoMC(cores = ncores)
gbm_model <- train(revenue ~ ., data = train,
method = "gbm",
trControl = fitControl,
verbose = FALSE,
## Now specify the exact models
## to evaludate:
tuneGrid = gbmGrid)
saveRDS(gbm_model, "restaurant-gbm-caret.RDS")
print("OK with the training part")
# Make prediction
pred = predict(gbm_model,x[teind,])
pred = data.frame(0:(length(pred)-1),pred)
names(pred) = c('Id', "Prediction")
write.csv(pred,file='submission-restaurant-caret2.csv', quote=FALSE,row.names=FALSE)
print("So far so good")