I fitted a logistic regression model in 10-fold cv. I can use the pROC package to get the AUC but it seems the AUC is not for the 10-fold CV because the cvAUC library gave a different AUC. I suspect the AUC from pROC is for one fold. Please how can extract the joint AUC for the 10-fold using the pROC library?
data(iris)
data <- iris[which(iris$Species=="setosa" | iris$Species=="versicolor"),]
data$ID <- seq.int(nrow(data))
table(data$Species)
data$Species <-as.factor(data$Species)
confusion_matrices <- list()
accuracy <- c()
for (i in c(1:10)) {
set.seed(3456)
folds <- caret::createFolds(data$Species, k = 10)
test <- data[data$ID %in% folds[[i]], ]
train <- data[data$ID %in% unlist(folds[-i]), ]
model1 <- glm(as.factor(Species)~ ., family = binomial, data = train)
summary(model1)
pred <- predict(model1, newdata = test, type = "response")
predR <- as.factor( pred >= 0.5)
df <- data.frame(cbind(test$Species, predR))
df_list <- lapply(df, as.factor)
confusion_matrices[[i]] <- caret::confusionMatrix(df_list[[2]], df_list[[1]])
accuracy[[i]] <- confusion_matrices[[i]]$overall["Accuracy"]
}
library(pander)
library(dplyr)
names(accuracy) <- c("Fold 1",....,"Fold 10")
accuracy %>%
pander::pandoc.table()
mean(accuracy)
Related
I am running the following code, found in this appendix of a paper https://wiley.figshare.com/articles/Supplement_1_R_code_used_to_format_the_data_and_compute_the_SSAS_/3528698/1 to calculate the Sexual Segregation and Aggregation Statistic in R - but keep getting the following error - presumably there is an issue with calling a function from C, but I cannot resolve it.
# Main function, computes both the SSAS (Sexual Segregation and
# Aggregation Statistic) and the 95% limits of SSAS
# under the assumption of random association of both sexes
SSAS <- function(x, conf.int = 0.95, B = 10000)
{
x <- as.matrix(x)
nr <- nrow(x)
nc <- ncol(x)
sr <- rowSums(x)
sc <- colSums(x)
n <- sum(x)
E <- outer(sr, sc, "*")/n
dimnames(E) <- dimnames(x)
tmp <- .C("chisqsim", as.integer(nr), as.integer(nc),as.integer(sr),
as.integer(sc), as.integer(n), as.integer(B), as.double(E), integer(nr * nc),
double(n + 1), integer(nc), results = double(B), PACKAGE = "stats")
obs <- sum(sort((x - E)^2/E, decreasing = TRUE))/n
sim <- tmp$results/n
p0 <- (1 - conf.int)/2
return(c(obs, quantile(sim, p0), quantile(sim, 1 -p0)))
}
# This function formats data to be run with the SSAS function
splitmfd <- function(mfd) {
loca1 <- function(x) {
x <- t(x[, 1:2])
dimnames(x) <- list(c("mal", "fem"), as.character(1:ncol(x)))
x
}
l0 <- split(mfd, mfd$mon)
lapply(l0, loca1)
}
# Example 1: Isard
rup <- read.table("http://pbil.univ-lyon1.fr/R/donnees/mfdrupicapra.txt",
h = T)
# Load data from the web
plot1 <- function(w, titre = "") {
plot(1:12, w[, 1], ylim = range(w), ax = F, pch = 19,
type = "n", ylab = "IK", xlab = "")
title(main = titre)
box()
axis(1, 1:12, c("Jan", "Feb", "Mar", "Apr", "May",
"Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"))
axis(2, pretty(range(w)), tck = 1)
polyx <- c(1:12, 12:1)
polyy <- c(w[, 3], rev(w[, 2]))
polygon(polyx, polyy, col = grey(0.9))
points(w[, 1], pch = 19, type = "b")
}
# Function to plot data and 95\ levels of significance
l1 <- splitmfd(rup)
# Format data to be used with SSAS function
w <- matrix(unlist(lapply(l1, SSAS)), ncol = 3, byrow = T)
# "w" is a matrix having 3 columns and 12 rows. In the first columns are
# the SSAS estimates for each month, and the lower and upper limits in columns 2
# and 3 respectively.
plot1(w, "Isard")
# Plot figure 3a
# Example 2: Red deer
cer <- read.table("http://pbil.univ-lyon1.fr/R/donnees/mfdcervus.txt",
h = T)
l1 <- splitmfd(cer)
w <- matrix(unlist(lapply(l1, SSAS)), ncol = 3, byrow = T)
plot1(w, "Red deer")
# Plot figure 3c
# Example 3: Roe deer
cap <- read.table("http://pbil.univ-lyon1.fr/R/donnees/mfdcapreolus.txt",
h = T)
l1 <- splitmfd(cap)
w <- matrix(unlist(lapply(l1, SSAS)), ncol = 3, byrow = T)
plot1(w, "Roe deer")
# Plot figure 3e
Here is my error:
w <- matrix(unlist(lapply(l1, SSAS)), ncol = 3, byrow = T)
Error in .C("chisqsim", as.integer(nr), as.integer(nc), as.integer(sr), :
"chisqsim" not available for .C() for package "stats"
This is a prime-example why you should not call internal C functions in R user code. R internals can (and do) change. Here, these changes are relevant: "more use of .Call" by Prof. Ripley.
Thus, you can change the function to this:
SSAS <- function(x, conf.int = 0.95, B = 10000)
{
x <- as.matrix(x)
nr <- nrow(x)
nc <- ncol(x)
sr <- rowSums(x)
sc <- colSums(x)
n <- sum(x)
E <- outer(sr, sc, "*")/n
dimnames(E) <- dimnames(x)
tmp <- .Call(stats:::C_chisq_sim, sr, sc, B, E)
obs <- sum(sort((x - E)^2/E, decreasing = TRUE))/n
sim <- tmp/n
p0 <- (1 - conf.int)/2
return(c(obs, quantile(sim, p0), quantile(sim, 1 -p0)))
}
Then, the code runs, but I have not checked for correctness. So, no warranties (as usual on Stack Overflow).
This question, probably has a simple solution but I cannot think of how to do it...
So I have a script as follows:
# ------------------ MODEL SETUP ----------------------------------------# simulation length
t_max <- 50
# arena
arena_x <- 100
arena_y <- 100
# plant parameters
a <- 0.1
b <- 0.1
g <- 1
# list of plant locations and initial sizes
nplants <-dim(plantLocsX)[1]*dim(plantLocsX)[2]
iterations<-5
totalBiomass<-matrix(0,nrow=iterations,ncol=1)
# starting loop
sep <- 10
# Original matrix
plantLocsX <- matrix(rep(seq(0,arena_x,sep), arena_y/sep),
nrow=1+arena_x/sep,
ncol=1+arena_y/sep)
plantLocsY <- t(plantLocsX)
plantSizes <- matrix(1,nrow=nplants,ncol=1)
# Plot the plants
radius <- sqrt( plantSizes/ pi )
symbols(plantLocsX, plantLocsY, radius, xlim = c(0,100), ylim=c(0,100), inches=0.05, fg = "green",
xlab = "x domain (m)", ylab = "y domain (m)", main = "Random Plant Locations", col.main = 51)
# Calculate distances between EACH POSSIBLE PAIR of plants
distances <- matrix(0,nrow=nplants,ncol=nplants)
for (i in 1:nplants){
for (j in 1:nplants){
distances[i,j] <- sqrt( (plantLocsX[i]-plantLocsX[j])^2 + (plantLocsY[i]-plantLocsY[j])^2 )
}
}
# ------------------ MODEL RUNNING ---------------------------------------
I need to alter the arrays containing plant locations and plant sizes so that at each time step, entries are removed and added (simulating mortality/reproduction, respectively). The "distances" must be updated with plant locations and sizes after each iteration...I can only think of complex ways to do this: destructing and constructing new matrices at each time step to fit the new number of elements but there must be functions to make this simpler....any advice?
Many thanks!!
I have been trying to run logistic regression looping all columns in raw file (predictors) with binary outcome (outcome.csv file).
raw <- data.frame(matrix(nrow=500, ncol=10))
out <- read.csv(file="outcome.csv", header=T)
models <- list()
res <- list()
a <- colnames(raw)
for(i in 1:length(raw){
models[[i]] <- summary(glm(out$blue ~ raw[,i] + out$sex, data=raw , family= binomial ) )
res[[i]] <- paste("logistic", a[[i]], ".txt", sep="")
write.table(models, res, row.names=FALSE, quote=FALSE, sep="\t")
}
it kept saying this
Error in model.frame.default(formula = out$blue ~ raw[, i] + out$MALE, :
variable lengths differ (found for 'raw[, i]')
Any suggestions related for loop or apply would be very much appreciated.
Thanks.
I use a netCDF file which stores one variable and has following dimensions: lon, lat, time.
Generally speaking I wish to compare it against different data that I have already in R stored as dataframe - first two columns are coordinates in WGS84, and next are values for specific time.
So I wrote following code.
# since # ncFile$dim$time$units say: [1] "days since 1900-1-1"
daysFromDate <- function(data1, data2="1900-01-01")
{
round(as.numeric(difftime(data1,data2,units = "days")))
}
#study area:
lon <- c(40.25, 48)
lat <- c(16, 24.25)
myTime <- c(daysFromDate("2008-01-16"), daysFromDate("2011-12-31"))
varName <- "spei"
require(ncdf4)
require(RCurl)
x <- getBinaryURL("http://digital.csic.es/bitstream/10261/104742/3/SPEI_01.nc")
ncFile <- nc_open(x)
LonIdx <- which( ncFile$dim$lon$vals >= lon[1] | ncFile$dim$lon$vals <= lon[2])
LatIdx <- which( ncFile$dim$lat$vals >= lat[1] & ncFile$dim$lat$vals <= lat[2])
TimeIdx <- which( ncFile$dim$time$vals >= myTime[1] & ncFile$dim$time$vals <= myTime[2])
MyVariable <- ncvar_get( ncFile, varName)[ LonIdx, LatIdx, TimeIdx]
I thought that data frame will be returned so that I will be able to easily manipulate data (in example - check correlation or create a plot).
Unfortunately 3-dimensional list has been returned instead.
How can I reformat this to data frame with following columns X-Y-Time1-Time2-...
So, example data will looks as follows
X Y 2014-01-01 2014-01-02 2014-01-02
50 17 0.5 0.4 0.3
where 0.5, 0.4 and 0.3 are example variable values
Or maybe there is different solution?
Ok, try following code, but it assumes that ranges are dense filled. And I changed lon test from or to and
require(ncdf4)
nc <- nc_open("SPEI_01.nc")
print(nc)
lon <- ncvar_get(nc, "lon")
lat <- ncvar_get(nc, "lat")
time <- ncvar_get(nc, "time")
lonIdx <- which( lon >= 40.25 & lon <= 48.00)
latIdx <- which( lat >= 16.00 & lat <= 24.25)
myTime <- c(daysFromDate("2008-01-16"), daysFromDate("2011-12-31"))
timeIdx <- which(time >= myTime[1] & time <= myTime[2])
data <- ncvar_get(nc, "spei")[lonIdx, latIdx, timeIdx]
indices <- expand.grid(lon[lonIdx], lat[latIdx], time[timeIdx])
print(length(indices))
class(indices)
summary(indices)
str(indices)
df <- data.frame(cbind(indices, as.vector(data)))
summary(df)
str(df)
UPDATE
ok, looks like I got the idea what do you want, but have do direct solution. What I've got so far is this: split data frame using either split() function or data.table package. After splitting by X&Y, you'll get lists of small data frames where X&Y are a constant for a given frame. Probably is it possible to transpose and recombine them back, but I have no idea how. It might be a good idea to continue to work with data as columns, Lists are nested, could be flattened, and here is link for splitting in R: http://www.uni-kiel.de/psychologie/rexrepos/posts/dfSplitMerge.html
Code, as continued from previous example
require(data.table)
colnames(df) <- c("X","Y","Time","spei")
df$Time <- as.Date(df$Time, origin="1900-01-01")
dt <- as.data.table(df)
summary(dt)
# Taken from https://github.com/Rdatatable/data.table/issues/1389
# x data.table
# f use `by` argument instead - unlike data.frame
# drop logical default FALSE will include `by` columns in resulting data.tables - unlike data.frame
# by character column names on which split into lists
# flatten logical default FALSE will result in recursive nested list having data.table as leafs
# ... ignored
split.data.table <- function(x, f, drop = FALSE, by, flatten = FALSE, ...){
if(missing(by) && !missing(f)) by = f
stopifnot(!missing(by), is.character(by), is.logical(drop), is.logical(flatten), !".ll" %in% names(x), by %in% names(x), !"nm" %in% by)
if(!flatten){
.by = by[1L]
tmp = x[, list(.ll=list(.SD)), by = .by, .SDcols = if(drop) setdiff(names(x), .by) else names(x)]
setattr(ll <- tmp$.ll, "names", tmp[[.by]])
if(length(by) > 1L) return(lapply(ll, split.data.table, drop = drop, by = by[-1L])) else return(ll)
} else {
tmp = x[, list(.ll=list(.SD)), by=by, .SDcols = if(drop) setdiff(names(x), by) else names(x)]
setattr(ll <- tmp$.ll, 'names', tmp[, .(nm = paste(.SD, collapse = ".")), by = by, .SDcols = by]$nm)
return(ll)
}
}
# here is data.table split
q <- split.data.table(dt, by = c("X","Y"), drop=FALSE)
str(q)
# here is data frame split
qq <- split(df, list(df$X, df$Y))
str(qq)
My problem is that I want to compute some specific area inside of graph of time series.
This is the graph :
an this is the code how I generate my time series :
x1<-rnorm(250,0.4,0.9)
x <- as.matrix(x1)
t <- ts(x[,1], start=c(1,1), frequency=30)
plot(t,xlim=c(2,4),main="Daily closing price of Walterenergie",ylab="Adjusted close Returns",xlab="Times",col="blue")
abline(0,0)
So my question is how to compute the area in green and then how to compute the area in red separately.
If you need the cothe for the graph :
###### Graph of the performance of my portfolio
num.points <- 250
x.vals <- 1:num.points
values <- x1
plot(x.vals, values, t="l", pch=20, main="Daily closing price", xlab="Times", ylab="Adjusted close Returns", las=1)
abline(h=0, col="darkgray", lwd=2)
crossings <- values[-length(values)] * values[-1]
crossings <- which(crossings < 0)
# You can draw the points to check (uncomment following line)
# points(x.vals[crossings], values[crossings], col="red", pch="X")
intersections <- NULL
for (cr in crossings)
{
new.int <- cr + abs(values[cr])/(abs(values[cr])+abs(values[cr+1]))
intersections <- c(intersections, new.int)
}
# Again, let's check the intersections
#points(intersections, rep(0, length(intersections)), pch=20, col="red", cex=0.7)
last.intersection <- 0
for (i in intersections)
{
ids <- which(x.vals<=i & x.vals>last.intersection)
poly.x <- c(last.intersection, x.vals[ids], i)
poly.y <- c(0, values[ids], 0)
if (max(poly.y) > 0)
{
col="green"
}
else
{
col="red"
}
polygon(x=poly.x, y=poly.y, col=col)
last.intersection <- i
}