Read multidimensional NetCDF as data frame in R - arrays

I use a netCDF file which stores one variable and has the following dimensions: lon, lat, time.
Generally speaking, I wish to compare it against different data that I already have in R stored as a data frame - the first two columns are coordinates in WGS84, and the following columns are values for specific times.
So I wrote the following code.
# since # ncFile$dim$time$units say: [1] "days since 1900-1-1"
daysFromDate <- function(data1, data2="1900-01-01")
{
  # Number of whole days from the reference date `data2` to `data1`.
  # The elapsed time is measured in days and then rounded to the nearest
  # integer.
  elapsed <- difftime(data1, data2, units = "days")
  round(as.numeric(elapsed))
}
# Study area: longitude and latitude bounds, each given as c(min, max).
lon <- c(40.25, 48)
lat <- c(16, 24.25)
# Time window expressed as day counts, using daysFromDate() with its default
# reference date.
myTime <- c(daysFromDate("2008-01-16"), daysFromDate("2011-12-31"))
# Name of the variable to read from the file.
varName <- "spei"
require(ncdf4)
require(RCurl)
# Download the file contents into memory.
x <- getBinaryURL("http://digital.csic.es/bitstream/10261/104742/3/SPEI_01.nc")
# NOTE(review): x holds the downloaded bytes; nc_open() presumably expects a
# file path -- confirm.
ncFile <- nc_open(x)
# Positions of the dimension values that fall inside the requested bounds.
# NOTE(review): the longitude test combines its two conditions with `|`, so it
# is TRUE for every value (any number is >= 40.25 or <= 48); the latitude and
# time tests use `&`.
LonIdx <- which( ncFile$dim$lon$vals >= lon[1] | ncFile$dim$lon$vals <= lon[2])
LatIdx <- which( ncFile$dim$lat$vals >= lat[1] & ncFile$dim$lat$vals <= lat[2])
TimeIdx <- which( ncFile$dim$time$vals >= myTime[1] & ncFile$dim$time$vals <= myTime[2])
# Read the whole variable, then keep only the selected lon/lat/time positions.
MyVariable <- ncvar_get( ncFile, varName)[ LonIdx, LatIdx, TimeIdx]
I thought that a data frame would be returned so that I would be able to easily manipulate the data (for example, check correlation or create a plot).
Unfortunately, a 3-dimensional array has been returned instead.
How can I reformat this to a data frame with the following columns: X-Y-Time1-Time2-...
So, example data will look as follows
X Y 2014-01-01 2014-01-02 2014-01-03
50 17 0.5 0.4 0.3
where 0.5, 0.4 and 0.3 are example variable values
Or maybe there is different solution?

OK, try the following code, but note that it assumes the ranges are densely filled. I also changed the lon test from `|` (or) to `&` (and).
# Read the "spei" values for the study area and time window from SPEI_01.nc
# and flatten them into a long data frame with one row per
# (lon, lat, time) combination.
# daysFromDate() is the helper defined in the question's code.
# library() stops with an error if ncdf4 is missing, instead of returning FALSE.
library(ncdf4)
nc <- nc_open("SPEI_01.nc")
print(nc)
# Coordinate vectors of the three dimensions.
lon <- ncvar_get(nc, "lon")
lat <- ncvar_get(nc, "lat")
time <- ncvar_get(nc, "time")
# Positions of the coordinates inside the study area (both bounds must hold).
lonIdx <- which(lon >= 40.25 & lon <= 48.00)
latIdx <- which(lat >= 16.00 & lat <= 24.25)
# Time window as day counts relative to the default reference date.
myTime <- c(daysFromDate("2008-01-16"), daysFromDate("2011-12-31"))
timeIdx <- which(time >= myTime[1] & time <= myTime[2])
# Read the whole variable, then keep only the selected positions.
data <- ncvar_get(nc, "spei")[lonIdx, latIdx, timeIdx]
# Everything needed is in memory now, so release the file handle.
nc_close(nc)
# expand.grid() varies its first argument fastest, which matches the
# column-major order that as.vector() uses for the [lon, lat, time] array,
# so row k of `indices` describes element k of as.vector(data).
indices <- expand.grid(lon[lonIdx], lat[latIdx], time[timeIdx])
# length() of a data frame is its number of columns (three coordinates here).
print(length(indices))
class(indices)
summary(indices)
str(indices)
# Long table: the three coordinate columns plus the variable's values.
df <- data.frame(cbind(indices, as.vector(data)))
summary(df)
str(df)
UPDATE
OK, it looks like I understand what you want, but I have no direct solution. What I've got so far is this: split the data frame using either the split() function or the data.table package. After splitting by X and Y, you'll get lists of small data frames where X and Y are constant within a given frame. It is probably possible to transpose and recombine them, but I have no idea how. It might be a good idea to continue working with the data as columns. The lists are nested but could be flattened; here is a link about splitting in R: http://www.uni-kiel.de/psychologie/rexrepos/posts/dfSplitMerge.html
Code, as continued from previous example
require(data.table)
# Give the four columns of the long table readable names.
colnames(df) <- c("X","Y","Time","spei")
# Convert the numeric day counts to Date values, counting from 1900-01-01.
df$Time <- as.Date(df$Time, origin="1900-01-01")
# data.table copy of the same data, used by the data.table-based split.
dt <- as.data.table(df)
summary(dt)
# Taken from https://github.com/Rdatatable/data.table/issues/1389
# x data.table
# f use `by` argument instead - unlike data.frame
# drop logical default FALSE will include `by` columns in resulting data.tables - unlike data.frame
# by character column names on which split into lists
# flatten logical default FALSE will result in recursive nested list having data.table as leafs
# ... ignored
# Split a data.table into a list of data.tables, one per group of the `by`
# columns. With flatten = FALSE the result is nested one level per `by`
# column; with flatten = TRUE it is a single flat list.
split.data.table <- function(x, f, drop = FALSE, by, flatten = FALSE, ...){
# `f` is accepted as a fallback for `by` when only `f` is supplied.
if(missing(by) && !missing(f)) by = f
# Validate arguments; ".ll" and "nm" are used as temporary column names below,
# so they must not clash with columns of x / entries of `by`.
stopifnot(!missing(by), is.character(by), is.logical(drop), is.logical(flatten), !".ll" %in% names(x), by %in% names(x), !"nm" %in% by)
if(!flatten){
# Nested result: split on the first `by` column only.
.by = by[1L]
# One row per group; each group's subset is stored in the list column `.ll`.
# With drop = TRUE the grouping column is left out of the subsets.
tmp = x[, list(.ll=list(.SD)), by = .by, .SDcols = if(drop) setdiff(names(x), .by) else names(x)]
# Name each list element after its group value.
setattr(ll <- tmp$.ll, "names", tmp[[.by]])
# Recurse over the remaining `by` columns; `flatten` is not forwarded, so the
# recursive calls use its default (FALSE).
if(length(by) > 1L) return(lapply(ll, split.data.table, drop = drop, by = by[-1L])) else return(ll)
} else {
# Flat result: group on all `by` columns at once.
tmp = x[, list(.ll=list(.SD)), by=by, .SDcols = if(drop) setdiff(names(x), by) else names(x)]
# Element names are the group's `by` values pasted together with ".".
setattr(ll <- tmp$.ll, 'names', tmp[, .(nm = paste(.SD, collapse = ".")), by = by, .SDcols = by]$nm)
return(ll)
}
}
# data.table split: nested list, first level keyed by X, second level by Y
# (flatten is left at its default, FALSE).
q <- split.data.table(dt, by = c("X","Y"), drop=FALSE)
str(q)
# data frame split: base split() over the combinations of X and Y.
qq <- split(df, list(df$X, df$Y))
str(qq)

Related

R: Adding columns from one data frame to another, non-matching number of rows

I have a .txt file with millions of rows of data - DateTime (1-min intervals) and Precipitation.
I have a .csv file with thousands of rows of data - DateTime (daily intervals), MaxTemp, MinTemp, WindSpd, WindDir.
I import the .txt file as a data frame and do a few transformations. I then move this into a new data frame.
I import the .csv file as a data frame and do a few transformations. I then want to add the columns from this data frame into the new data frame (total of 7 columns). However, R throws an error: "Error in data.frame(..., check.names = FALSE) : arguments imply differing number of rows: 10382384, 32868, 1"
I know the number of rows is different, however, this is the format I need for the next step in processing. This could be easily done in Excel were it not for the crazy amount of rows.
Simulated code is below, which produces the same error:
# Frame1: 10 rows (a label column and one random numeric column).
a <- as.character(c(1,2,3,4,5,6,7,8,9,10))
b <- c(paste("Date", a))
c <- c(rnorm(10, mean = 5, sd = 2.1))
Frame1 <- data.frame(b,c)
# Frame2: 3 rows (a label column and three random numeric columns).
d <- as.character(c(1,2,3))
e <- c(paste("Date", d))
f <- c(rnorm(3, mean = 1, sd = 0.7))
g <- c(rnorm(3, mean = 3, sd = 2))
h <- c(rnorm(3, mean = 8, sd = 1))
Frame2 <- data.frame(e,f,g,h)
# Start the combined frame from Frame1 alone.
NewFrame <- cbind(Frame1)
# This is the call that reproduces the error: the frames have 10 and 3 rows.
NewFrame <- cbind(NewFrame, Frame2)
I have tried a *_join, but it throws an error: "Error: `by` must be supplied when `x` and `y` have no common variables.
Use `by = character()` to perform a cross-join." To me this reads like it wants to match things up, which I don't need. I really just need to plop these two datasets side-by-side for the next processing step. Help?
The data frames MUST have an equal number of rows. To compensate then, I just added a bunch of rows to the smaller dataset so that it contains the same amount of rows as the larger dataset (in my case, it will always be the .csv file) and filled it with "NA" values. The following application I use for downstream processing knows how to handle the "NA" values so this works well for me.
I've run the solution with a representative dataset and I am able to cbind the two data frames together.
Sample code with the simulated dataset:
# Create data frame 1 (10 rows).
a <- as.character(1:10)
b <- paste("Date", a)
c <- rnorm(10, mean = 5, sd = 2.1)
Frame1 <- data.frame(b, c)
# Create data frame 2 (3 rows).
d <- as.character(1:3)
e <- paste("Date", d)
f <- rnorm(3, mean = 1, sd = 0.7)
g <- rnorm(3, mean = 3, sd = 2)
h <- rnorm(3, mean = 8, sd = 1)
Frame2 <- data.frame(e, f, g, h)
# Row count of the larger frame; the shorter frame is padded up to it.
maxlen <- max(nrow(Frame1), nrow(Frame2))
# Assigning NA to row `maxlen` of a shorter data frame extends it to `maxlen`
# rows and fills every added row with NA. The guards make sure the frame that
# is already `maxlen` rows long does not get its last real row overwritten.
if (nrow(Frame1) < maxlen) Frame1[maxlen, ] <- NA
if (nrow(Frame2) < maxlen) Frame2[maxlen, ] <- NA
# Both frames now have the same number of rows, so they can be placed side by
# side. Bind the two frames directly rather than relying on a `NewFrame`
# object left over from an earlier session.
NewFrame <- cbind(Frame1, Frame2)

Is there any function that calculate correlation between a set of matrices included in an array in R?

I have a list that includes 20 matrices. I want to calculate Pearson's correlation between all pairs of matrices, but I cannot find any suitable code or function. Could you please give some tips for doing so?
something like:
# Three example matrices with 8100 values and 90 columns each (90 x 90);
# the third one is filled in random order.
a=matrix(1:8100, ncol = 90)
b=matrix(8100:16199, ncol = 90)
c=matrix(sample(16200:24299),ncol = 90)
# Collect the matrices in an unnamed list.
z=list(a,b,c)
I find this:
https://rdrr.io/cran/lineup/man/corbetw2mat.html and try it:
library(lineup)
# NOTE(review): z[a] and z[b] use single-bracket subsetting on a list, which
# returns a list rather than a matrix; this matches the "(list) object cannot
# be coerced to type 'double'" error reported. z[[1]] and z[[2]] would pass
# the matrices themselves.
corbetw2mat(z[a], z[b], what = "all")
I've got the following error:
Error in corbetw2mat(z[a], z[b], what = "all") :
(list) object cannot be coerced to type 'double'
I want a list like this for the result:
a & b
correlations
a & c
correlations
b & c
correlations
Thanks
I will create a smaller data set to illustrate the solution below.
To get pairwise combinations the best option is to compute a matrix of combinations with combn and then loop through it, in this case a lapply loop.
# Fix the RNG state so the example is reproducible.
set.seed(1234)
a <- matrix(1:9, ncol = 3)
b <- matrix(rnorm(9), ncol = 3)
c <- matrix(sample(1:9), ncol = 3)
sample_list <- list(a, b, c)
# Each column of cmb holds the list positions of one pair of matrices.
cmb <- combn(length(sample_list), 2)
# Loop over the pairs with lapply, correlating the columns of the two matrices.
res <- lapply(seq_len(ncol(cmb)), function(k) {
  pair <- cmb[, k]
  cor(sample_list[[pair[1]]], sample_list[[pair[2]]])
})
The results are in the list res.
Note that sample is a base r function, so I changed the name to sample_list.

Altering arrays to add/remove entries at each time-step in R

This question probably has a simple solution, but I cannot think of how to do it...
So I have a script as follows:
# ------------------ MODEL SETUP ----------------------------------------
# simulation length
t_max <- 50
# arena
arena_x <- 100
arena_y <- 100
# plant parameters
a <- 0.1
b <- 0.1
g <- 1
iterations <- 5
totalBiomass <- matrix(0, nrow = iterations, ncol = 1)
# spacing between neighbouring plants on the regular grid
sep <- 10
# Grid of plant x-coordinates: every column repeats seq(0, arena_x, sep).
# The sequence is repeated once per column (1 + arena_y/sep times) so the
# data fills the matrix exactly instead of relying on recycling.
plantLocsX <- matrix(rep(seq(0, arena_x, sep), 1 + arena_y/sep),
                     nrow = 1 + arena_x/sep,
                     ncol = 1 + arena_y/sep)
# y-coordinates are the transpose, so every row repeats the same sequence.
plantLocsY <- t(plantLocsX)
# Number of plants; computed only after the location matrix exists.
nplants <- length(plantLocsX)
# Initial sizes: one row per plant, all starting at 1.
plantSizes <- matrix(1, nrow = nplants, ncol = 1)
# Plot the plants as circles whose radius follows from the plant size.
radius <- sqrt(plantSizes / pi)
symbols(plantLocsX, plantLocsY, radius, xlim = c(0,100), ylim=c(0,100), inches=0.05, fg = "green",
        xlab = "x domain (m)", ylab = "y domain (m)", main = "Random Plant Locations", col.main = 51)
# Calculate distances between EACH POSSIBLE PAIR of plants.
# outer() builds all pairwise coordinate differences in one vectorised step;
# entry [i, j] is the Euclidean distance between plant i and plant j.
dx <- outer(c(plantLocsX), c(plantLocsX), "-")
dy <- outer(c(plantLocsY), c(plantLocsY), "-")
distances <- sqrt(dx^2 + dy^2)
# ------------------ MODEL RUNNING ---------------------------------------
I need to alter the arrays containing plant locations and plant sizes so that at each time step, entries are removed and added (simulating mortality/reproduction, respectively). The "distances" must be updated with plant locations and sizes after each iteration...I can only think of complex ways to do this: destructing and constructing new matrices at each time step to fit the new number of elements but there must be functions to make this simpler....any advice?
Many thanks!!

How do you dynamically create difference- or delta- columns in a data.frame?

My dataframe has column names of outstanding balance from Balance, Balance1, Balance2,...,Balance36.
I want to add a column for the delta between each month, i.e. Delta2 = Balance2 - Balance1
How can I simplify my method below?
# One delta column per month, computed row by row with apply().
# NOTE(review): with columns selected as c("Balance1","Balance"), x[1] is
# Balance1 and x[2] is Balance, so x[2]-x[1] is Balance - Balance1, i.e. the
# opposite sign of the "Delta2 = Balance2 - Balance1" definition -- confirm
# which direction is intended.
dataset$delta1 = apply(dataset[, c("Balance1","Balance")], 1, function(x){x[2]-x[1]})
dataset$delta2 = apply(dataset[, c("Balance2","Balance1")], 1, function(x){x[2]-x[1]})
...
dataset$delta35 = apply(dataset[, c("Balance35","Balance34")], 1, function(x){x[2]-x[1]})
dataset$delta36 = apply(dataset[, c("Balance36","Balance35")], 1, function(x){x[2]-x[1]})
It boils down to a one-liner. First, name your dataset something short, df is the usual name. Then, use direct subtraction; there's zero need to call apply() to subtract one column from another:
# Month-over-month change: each balance column minus the previous one.
# Column subtraction is vectorised, so no apply() is needed.
df$delta1 <- df[,"Balance1"] - df[,"Balance"]
df$delta2 <- df[,"Balance2"] - df[,"Balance1"]
...
df$delta35 <- df[,"Balance35"] - df[,"Balance34"]
df$delta36 <- df[,"Balance36"] - df[,"Balance35"]
But since the whole computation has a regular structure, we're really only talking about generating an Nx36 array of differences, so use numeric column indices. Say your 37 "Balance*" columns (Balance, Balance1, ..., Balance36) have indices (50:86) and your delta_cols are 100:135, or whatever. Then the indices of the earlier-month columns being subtracted are balance_lhs <- (50:85), and the indices of the later-month columns are (51:86), or just ((50:85)+1) (remember that most operators like addition vectorize in R)
So your Nx36 array can be generated by just the one-liner:
df[,delta_cols] <- df[,(balance_lhs+1)] - df[,balance_lhs]
And you can compute delta_cols <- which(colnames(df) %in% paste0("delta", 1:36)) programmatically, to avoid magic-number column indices in your code.
Use lapply to calculate delta for all 36 comparisons in one line.
# Sample data: 100 rows x 37 columns named Balance, Balance1, ..., Balance36
set.seed(2017)
df <- as.data.frame(matrix(runif(37 * 100), ncol = 37))
colnames(df) <- paste0("Balance", c("", 1:36))
# One difference vector per pair of adjacent columns (later minus earlier),
# built with lapply and labelled delta1, ..., delta36
lst <- lapply(seq_len(ncol(df) - 1), function(k) df[[k + 1]] - df[[k]])
names(lst) <- paste0("delta", 1:36)
# Append the 36 delta columns to the original data frame
df <- cbind(df, as.data.frame(lst))

in R, apply function to multiple inputs to return array of POSIXlt

I have a function to create a timestamp from multiple string inputs:
# Create timestamp from date, time, and timezone strings
str_to_dt <- function(date_str, time_str, tz_str){
  # Join date and time with a space, then parse the result as
  # month/day/year hour:minute in the requested time zone.
  stamp <- paste(date_str, time_str)
  as.POSIXlt(x = stamp, format = "%m/%d/%Y %H:%M", tz = tz_str)
}
I have arrays of input values:
dates <- c("01/23/1945", "11/11/1911")
times <- c("12:12", "13:13")
tzones <- c("Pacific/Guam", "America/New_York")
Since as.POSIXlt takes only a constant tz, not a vector, I have to loop through these. The following for loop produces the output I would like, but I would prefer to understand how to do it in plyr.
# Preallocate a POSIXlt vector of NAs, one slot per input date.
ts <- as.POSIXlt(rep(NA,length(dates)))
# Convert element by element so each timestamp is parsed with its own
# time zone string.
for(i in 1:length(dates)){
ts[i] <- str_to_dt(date_str = dates[i],
time_str = times[i],
tz_str = tzones[i])
}
ts
[1] "1945-01-23 11:11:00 GST" "1911-11-11 13:13:00 EST"
When I use mapply I get a list of the component vectors of the datetime, which is not what I want.
mapply(FUN=str_to_dt, date_str=dates, time_str=times, tz_str=tzones) # decomposed
When I use mlply I can get a LIST of the correct 2 POSIXlt results:
library(plyr)
mlply(.data=data.frame(date_str = dates, time_str = times,
tz_str = tzones, stringsAsFactors=FALSE),
.fun=str_to_dt)
But I am at a loss getting them back into an array, since unlist totally decomposes the POSIXlt. The analogous function maply is giving me an error:
maply(.data=data.frame(date_str = dates, time_str = times, tz_str = tzones,
stringsAsFactors=FALSE),
.fun=str_to_dt) # Error: Results must have one or more dimensions.
How do I fix the error so that maply works, or is there a better way?
c() will convert the list to a typed vector, but only if you call it indirectly.
> d <- as.POSIXlt(Sys.Date())
> dlist <- list(d,d)
> do.call(c, dlist)
[1] "2014-09-17 17:00:00 PDT" "2014-09-17 17:00:00 PDT"
> str(do.call(c, dlist))
POSIXlt[1:2], format: "2014-09-17 17:00:00" "2014-09-17 17:00:00"

Resources