OSDN > Developer > larry77 > Chamber > myprojects-hg-reborn > File Details

myprojects-hg-reborn
Fork

(Original repository, No fork origin)

File Info

Rev.	7c9f09ae96b5624e827e40177bc05e5f87f8de96
Tamanho	3,409 bytes
Hora	2015-04-10 00:30:14
Autor	Lorenzo Isella
Mensagem de Log	A code using caret+GBM.

Content

Export as raw format

rm(list=ls())

library(caret)

library(doMC)
library(gbm)

topK <- function(x,k){
    tbl <- tabulate(x)
    names(tbl) <- levels(x)
    x <- as.character(x)
    levelsToKeep <- names(tail(sort(tbl),k))
    x[!(x %in% levelsToKeep)] <- 'other'
    factor(x)
  }



keep_levels <- function(x,level_list){
    ## tbl <- tabulate(x)
    ## names(tbl) <- levels(x)
    x <- as.character(x)
    levelsToKeep <- level_list
    x[!(x %in% levelsToKeep)] <- 'other'
    factor(x)
  }


keep_levels2 <- function(x,level_list, rare_level){
    ## tbl <- tabulate(x)
    ## names(tbl) <- levels(x)
    x <- as.character(x)
    levelsToKeep <- level_list
    x[!(x %in% levelsToKeep)] <- rare_level
    factor(x)
  }

synchronize_factor <- function(few_fac, many_fac){

    few_fac <- droplevels(few_fac) #make sure there are no unused levels!
    few_lev <- levels(few_fac)
    freq_few <- tabulate(few_fac)
    sel_min <- which(freq_few==min(freq_few))[1] 
    rare_level <- few_lev[sel_min]
    new_many_fac <- keep_levels2(many_fac,few_lev,rare_level)
    return(new_many_fac)
}




########################################################
########################################################
########################################################
########################################################
########################################################
########################################################


ncores <- 2


x <- readRDS("train-test.RDS")

## x <- readRDS("train-test-one-hot.RDS")

## x <- as.data.frame(x)

y <- readRDS("train-revenue.RDS")

trind = 1:length(y) ## rows from the training dataset in x
teind = (length(y)+1):nrow(x) ## rows from the test dataset in x

## x <- subset(x, select=-c(City))



train <- x[trind, ]

train$revenue <- y

#now we synchronize the factor to avoid the problem that some of them do not appear in the test set

train$City <- droplevels(train$City)
train$City.Group <- droplevels(train$City.Group)
train$Type <- droplevels(train$Type)

x$City <- synchronize_factor(train$City, x$City)
x$City.Group <- synchronize_factor(train$City.Group, x$City.Group)
x$Type <- synchronize_factor(train$Type, x$Type)


set.seed(1234)

## shuffle train!

shuffle <- sample(length(y))

train <- train[shuffle,]

saveRDS(train, "train-fixed.RDS")

gbmGrid <-  expand.grid(interaction.depth = seq(1,9),
                        n.trees = seq(1,151, by=10),
                        shrinkage = seq(0.01,0.1, by=0.01))

fitControl <- trainControl(## 10-fold CV
    method = "repeatedcv",
    ## method = "cv",
                           number = 5,
                           ## repeated ten times
                            repeats = 10 )


## fitControl2 <- trainControl(
##     method = "boot632",
##         number = 100)


registerDoMC(cores = ncores)



gbm_model <- train(revenue ~ ., data = train,
                 method = "gbm",
                 trControl = fitControl,
                 verbose = FALSE,
                 ## Now specify the exact models 
                 ## to evaludate:
                 tuneGrid = gbmGrid)


saveRDS(gbm_model, "restaurant-gbm-caret.RDS")

print("OK with the training part")

# Make prediction
pred = predict(gbm_model,x[teind,])

pred = data.frame(0:(length(pred)-1),pred)
names(pred) = c('Id', "Prediction")

write.csv(pred,file='submission-restaurant-caret2.csv', quote=FALSE,row.names=FALSE)




print("So far so good")

myprojects-hg-reborn Fork

Tags

Frequently used words (click to add to your profile)

File Info

Content

myprojects-hg-reborn
Fork