R codes to extract ROC from Logistic regression model in 10 CV - logistic-regression

I fitted a logistic regression model in 10-fold cv. I can use the pROC package to get the AUC but it seems the AUC is not for the 10-fold CV because the cvAUC library gave a different AUC. I suspect the AUC from pROC is for one fold. Please how can extract the joint AUC for the 10-fold using the pROC library?
data(iris)
data <- iris[which(iris$Species=="setosa" | iris$Species=="versicolor"),]
data$ID <- seq.int(nrow(data))
table(data$Species)
data$Species <-as.factor(data$Species)
confusion_matrices <- list()
accuracy <- c()
for (i in c(1:10)) {
set.seed(3456)
folds <- caret::createFolds(data$Species, k = 10)
test <- data[data$ID %in% folds[[i]], ]
train <- data[data$ID %in% unlist(folds[-i]), ]
model1 <- glm(as.factor(Species)~ ., family = binomial, data = train)
summary(model1)
pred <- predict(model1, newdata = test, type = "response")
predR <- as.factor( pred >= 0.5)
df <- data.frame(cbind(test$Species, predR))
df_list <- lapply(df, as.factor)
confusion_matrices[[i]] <- caret::confusionMatrix(df_list[[2]], df_list[[1]])
accuracy[[i]] <- confusion_matrices[[i]]$overall["Accuracy"]
}
library(pander)
library(dplyr)
names(accuracy) <- c("Fold 1",....,"Fold 10")
accuracy %>%
pander::pandoc.table()
mean(accuracy)

Related

SSAS (Sexual Segregation and Aggregation Statistic) in R - calling C

I am running the following code, found in this appendix of a paper https://wiley.figshare.com/articles/Supplement_1_R_code_used_to_format_the_data_and_compute_the_SSAS_/3528698/1 to calculate the Sexual Segregation and Aggregation Statistic in R - but keep getting the following error - presumably there is an issue with calling a function from C, but I cannot resolve it.
# Main function, computes both the SSAS (Sexual Segregation and
# Aggregation Statistic) and the 95% limits of SSAS
# under the assumption of random association of both sexes
SSAS <- function(x, conf.int = 0.95, B = 10000)
{
x <- as.matrix(x)
nr <- nrow(x)
nc <- ncol(x)
sr <- rowSums(x)
sc <- colSums(x)
n <- sum(x)
E <- outer(sr, sc, "*")/n
dimnames(E) <- dimnames(x)
tmp <- .C("chisqsim", as.integer(nr), as.integer(nc),as.integer(sr),
as.integer(sc), as.integer(n), as.integer(B), as.double(E), integer(nr * nc),
double(n + 1), integer(nc), results = double(B), PACKAGE = "stats")
obs <- sum(sort((x - E)^2/E, decreasing = TRUE))/n
sim <- tmp$results/n
p0 <- (1 - conf.int)/2
return(c(obs, quantile(sim, p0), quantile(sim, 1 -p0)))
}
# This function formats data to be run with the SSAS function
splitmfd <- function(mfd) {
loca1 <- function(x) {
x <- t(x[, 1:2])
dimnames(x) <- list(c("mal", "fem"), as.character(1:ncol(x)))
x
}
l0 <- split(mfd, mfd$mon)
lapply(l0, loca1)
}
# Example 1: Isard
rup <- read.table("http://pbil.univ-lyon1.fr/R/donnees/mfdrupicapra.txt",
h = T)
# Load data from the web
plot1 <- function(w, titre = "") {
plot(1:12, w[, 1], ylim = range(w), ax = F, pch = 19,
type = "n", ylab = "IK", xlab = "")
title(main = titre)
box()
axis(1, 1:12, c("Jan", "Feb", "Mar", "Apr", "May",
"Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"))
axis(2, pretty(range(w)), tck = 1)
polyx <- c(1:12, 12:1)
polyy <- c(w[, 3], rev(w[, 2]))
polygon(polyx, polyy, col = grey(0.9))
points(w[, 1], pch = 19, type = "b")
}
# Function to plot data and 95\ levels of significance
l1 <- splitmfd(rup)
# Format data to be used with SSAS function
w <- matrix(unlist(lapply(l1, SSAS)), ncol = 3, byrow = T)
# "w" is a matrix having 3 columns and 12 rows. In the first columns are
# the SSAS estimates for each month, and the lower and upper limits in columns 2
# and 3 respectively.
plot1(w, "Isard")
# Plot figure 3a
# Example 2: Red deer
cer <- read.table("http://pbil.univ-lyon1.fr/R/donnees/mfdcervus.txt",
h = T)
l1 <- splitmfd(cer)
w <- matrix(unlist(lapply(l1, SSAS)), ncol = 3, byrow = T)
plot1(w, "Red deer")
# Plot figure 3c
# Example 3: Roe deer
cap <- read.table("http://pbil.univ-lyon1.fr/R/donnees/mfdcapreolus.txt",
h = T)
l1 <- splitmfd(cap)
w <- matrix(unlist(lapply(l1, SSAS)), ncol = 3, byrow = T)
plot1(w, "Roe deer")
# Plot figure 3e
Here is my error:
w <- matrix(unlist(lapply(l1, SSAS)), ncol = 3, byrow = T)
Error in .C("chisqsim", as.integer(nr), as.integer(nc), as.integer(sr), :
"chisqsim" not available for .C() for package "stats"
This is a prime-example why you should not call internal C functions in R user code. R internals can (and do) change. Here, these changes are relevant: "more use of .Call" by Prof. Ripley.
Thus, you can change the function to this:
SSAS <- function(x, conf.int = 0.95, B = 10000)
{
x <- as.matrix(x)
nr <- nrow(x)
nc <- ncol(x)
sr <- rowSums(x)
sc <- colSums(x)
n <- sum(x)
E <- outer(sr, sc, "*")/n
dimnames(E) <- dimnames(x)
tmp <- .Call(stats:::C_chisq_sim, sr, sc, B, E)
obs <- sum(sort((x - E)^2/E, decreasing = TRUE))/n
sim <- tmp/n
p0 <- (1 - conf.int)/2
return(c(obs, quantile(sim, p0), quantile(sim, 1 -p0)))
}
Then, the code runs, but I have not checked for correctness. So, no warranties (as usual on Stack Overflow).

Altering arrays to add/remove entries at each time-step in R

This question, probably has a simple solution but I cannot think of how to do it...
So I have a script as follows:
# ------------------ MODEL SETUP ----------------------------------------# simulation length
t_max <- 50
# arena
arena_x <- 100
arena_y <- 100
# plant parameters
a <- 0.1
b <- 0.1
g <- 1
# list of plant locations and initial sizes
nplants <-dim(plantLocsX)[1]*dim(plantLocsX)[2]
iterations<-5
totalBiomass<-matrix(0,nrow=iterations,ncol=1)
# starting loop
sep <- 10
# Original matrix
plantLocsX <- matrix(rep(seq(0,arena_x,sep), arena_y/sep),
nrow=1+arena_x/sep,
ncol=1+arena_y/sep)
plantLocsY <- t(plantLocsX)
plantSizes <- matrix(1,nrow=nplants,ncol=1)
# Plot the plants
radius <- sqrt( plantSizes/ pi )
symbols(plantLocsX, plantLocsY, radius, xlim = c(0,100), ylim=c(0,100), inches=0.05, fg = "green",
xlab = "x domain (m)", ylab = "y domain (m)", main = "Random Plant Locations", col.main = 51)
# Calculate distances between EACH POSSIBLE PAIR of plants
distances <- matrix(0,nrow=nplants,ncol=nplants)
for (i in 1:nplants){
for (j in 1:nplants){
distances[i,j] <- sqrt( (plantLocsX[i]-plantLocsX[j])^2 + (plantLocsY[i]-plantLocsY[j])^2 )
}
}
# ------------------ MODEL RUNNING ---------------------------------------
I need to alter the arrays containing plant locations and plant sizes so that at each time step, entries are removed and added (simulating mortality/reproduction, respectively). The "distances" must be updated with plant locations and sizes after each iteration...I can only think of complex ways to do this: destructing and constructing new matrices at each time step to fit the new number of elements but there must be functions to make this simpler....any advice?
Many thanks!!

looping logistic regression in R

I have been trying to run logistic regression looping all columns in raw file (predictors) with binary outcome (outcome.csv file).
raw <- data.frame(matrix(nrow=500, ncol=10))
out <- read.csv(file="outcome.csv", header=T)
models <- list()
res <- list()
a <- colnames(raw)
for(i in 1:length(raw){
models[[i]] <- summary(glm(out$blue ~ raw[,i] + out$sex, data=raw , family= binomial ) )
res[[i]] <- paste("logistic", a[[i]], ".txt", sep="")
write.table(models, res, row.names=FALSE, quote=FALSE, sep="\t")
}
it kept saying this
Error in model.frame.default(formula = out$blue ~ raw[, i] + out$MALE, :
variable lengths differ (found for 'raw[, i]')
Any suggestions related for loop or apply would be very much appreciated.
Thanks.

Read multidimensional NetCDF as data frame in R

I use a netCDF file which stores one variable and has following dimensions: lon, lat, time.
Generally speaking I wish to compare it against different data that I have already in R stored as dataframe - first two columns are coordinates in WGS84, and next are values for specific time.
So I wrote following code.
# since # ncFile$dim$time$units say: [1] "days since 1900-1-1"
daysFromDate <- function(data1, data2="1900-01-01")
{
round(as.numeric(difftime(data1,data2,units = "days")))
}
#study area:
lon <- c(40.25, 48)
lat <- c(16, 24.25)
myTime <- c(daysFromDate("2008-01-16"), daysFromDate("2011-12-31"))
varName <- "spei"
require(ncdf4)
require(RCurl)
x <- getBinaryURL("http://digital.csic.es/bitstream/10261/104742/3/SPEI_01.nc")
ncFile <- nc_open(x)
LonIdx <- which( ncFile$dim$lon$vals >= lon[1] | ncFile$dim$lon$vals <= lon[2])
LatIdx <- which( ncFile$dim$lat$vals >= lat[1] & ncFile$dim$lat$vals <= lat[2])
TimeIdx <- which( ncFile$dim$time$vals >= myTime[1] & ncFile$dim$time$vals <= myTime[2])
MyVariable <- ncvar_get( ncFile, varName)[ LonIdx, LatIdx, TimeIdx]
I thought that data frame will be returned so that I will be able to easily manipulate data (in example - check correlation or create a plot).
Unfortunately 3-dimensional list has been returned instead.
How can I reformat this to data frame with following columns X-Y-Time1-Time2-...
So, example data will looks as follows
X Y 2014-01-01 2014-01-02 2014-01-02
50 17 0.5 0.4 0.3
where 0.5, 0.4 and 0.3 are example variable values
Or maybe there is different solution?
Ok, try following code, but it assumes that ranges are dense filled. And I changed lon test from or to and
require(ncdf4)
nc <- nc_open("SPEI_01.nc")
print(nc)
lon <- ncvar_get(nc, "lon")
lat <- ncvar_get(nc, "lat")
time <- ncvar_get(nc, "time")
lonIdx <- which( lon >= 40.25 & lon <= 48.00)
latIdx <- which( lat >= 16.00 & lat <= 24.25)
myTime <- c(daysFromDate("2008-01-16"), daysFromDate("2011-12-31"))
timeIdx <- which(time >= myTime[1] & time <= myTime[2])
data <- ncvar_get(nc, "spei")[lonIdx, latIdx, timeIdx]
indices <- expand.grid(lon[lonIdx], lat[latIdx], time[timeIdx])
print(length(indices))
class(indices)
summary(indices)
str(indices)
df <- data.frame(cbind(indices, as.vector(data)))
summary(df)
str(df)
UPDATE
ok, looks like I got the idea what do you want, but have do direct solution. What I've got so far is this: split data frame using either split() function or data.table package. After splitting by X&Y, you'll get lists of small data frames where X&Y are a constant for a given frame. Probably is it possible to transpose and recombine them back, but I have no idea how. It might be a good idea to continue to work with data as columns, Lists are nested, could be flattened, and here is link for splitting in R: http://www.uni-kiel.de/psychologie/rexrepos/posts/dfSplitMerge.html
Code, as continued from previous example
require(data.table)
colnames(df) <- c("X","Y","Time","spei")
df$Time <- as.Date(df$Time, origin="1900-01-01")
dt <- as.data.table(df)
summary(dt)
# Taken from https://github.com/Rdatatable/data.table/issues/1389
# x data.table
# f use `by` argument instead - unlike data.frame
# drop logical default FALSE will include `by` columns in resulting data.tables - unlike data.frame
# by character column names on which split into lists
# flatten logical default FALSE will result in recursive nested list having data.table as leafs
# ... ignored
split.data.table <- function(x, f, drop = FALSE, by, flatten = FALSE, ...){
if(missing(by) && !missing(f)) by = f
stopifnot(!missing(by), is.character(by), is.logical(drop), is.logical(flatten), !".ll" %in% names(x), by %in% names(x), !"nm" %in% by)
if(!flatten){
.by = by[1L]
tmp = x[, list(.ll=list(.SD)), by = .by, .SDcols = if(drop) setdiff(names(x), .by) else names(x)]
setattr(ll <- tmp$.ll, "names", tmp[[.by]])
if(length(by) > 1L) return(lapply(ll, split.data.table, drop = drop, by = by[-1L])) else return(ll)
} else {
tmp = x[, list(.ll=list(.SD)), by=by, .SDcols = if(drop) setdiff(names(x), by) else names(x)]
setattr(ll <- tmp$.ll, 'names', tmp[, .(nm = paste(.SD, collapse = ".")), by = by, .SDcols = by]$nm)
return(ll)
}
}
# here is data.table split
q <- split.data.table(dt, by = c("X","Y"), drop=FALSE)
str(q)
# here is data frame split
qq <- split(df, list(df$X, df$Y))
str(qq)

Compute the area on graph of time series

My problem is that I want to compute some specific area inside of graph of time series.
This is the graph :
an this is the code how I generate my time series :
x1<-rnorm(250,0.4,0.9)
x <- as.matrix(x1)
t <- ts(x[,1], start=c(1,1), frequency=30)
plot(t,xlim=c(2,4),main="Daily closing price of Walterenergie",ylab="Adjusted close Returns",xlab="Times",col="blue")
abline(0,0)
So my question is how to compute the area in green and then how to compute the area in red separately.
If you need the cothe for the graph :
###### Graph of the performance of my portfolio
num.points <- 250
x.vals <- 1:num.points
values <- x1
plot(x.vals, values, t="l", pch=20, main="Daily closing price", xlab="Times", ylab="Adjusted close Returns", las=1)
abline(h=0, col="darkgray", lwd=2)
crossings <- values[-length(values)] * values[-1]
crossings <- which(crossings < 0)
# You can draw the points to check (uncomment following line)
# points(x.vals[crossings], values[crossings], col="red", pch="X")
intersections <- NULL
for (cr in crossings)
{
new.int <- cr + abs(values[cr])/(abs(values[cr])+abs(values[cr+1]))
intersections <- c(intersections, new.int)
}
# Again, let's check the intersections
#points(intersections, rep(0, length(intersections)), pch=20, col="red", cex=0.7)
last.intersection <- 0
for (i in intersections)
{
ids <- which(x.vals<=i & x.vals>last.intersection)
poly.x <- c(last.intersection, x.vals[ids], i)
poly.y <- c(0, values[ids], 0)
if (max(poly.y) > 0)
{
col="green"
}
else
{
col="red"
}
polygon(x=poly.x, y=poly.y, col=col)
last.intersection <- i
}

Resources