How to loop through an array in pairs? - arrays

I have a 1000x1 vector (1000 rows and 1 column). I want to get elements in pairs (row 1 and row 2, row 3 and row 4, row 5 and row 6, etc.)
Here's what I have so far
for (j in 1:ncol(total_loci)) {
  for (i in 1:sample_size) {
    # a pair
    genotype[i] <- paste(total_loci[i, j], total_loci[i+1, j], sep="")
  }
}
Genotype should thus be a 500x1 vector (500 rows and 1 column) containing the genotypes. Assume that my for-loops are otherwise correct. I think my i needs to skip every other index -- so i should go 1, 3, 5, 7, 9, etc. The variable total_loci is a data frame.
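For reference, here is a minimal sketch of the loop with the index stepping by 2, for a single 1000-row column (assuming the asker's total_loci data frame and a pre-allocated genotype of length 500):
genotype <- character(500)
for (i in seq(1, 1000, by = 2)) {
  # rows i and i+1 form one pair; pair number is (i + 1) / 2
  genotype[(i + 1) / 2] <- paste(total_loci[i, 1], total_loci[i + 1, 1], sep = "")
}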

You should try to use vectorized solutions where possible. They're usually more memory efficient and faster than loops.
In this case, you can use seq to generate an index vector for every other element. Then you can use that index vector to subset the original vector in pairs.
# sample data
x <- replicate(5, sample(LETTERS, 1000, replace=TRUE), simplify=FALSE)
x <- as.data.frame(x, stringsAsFactors=FALSE)
names(x) <- paste("V",1:NCOL(x), sep="")
# function to concatenate every other observation as a pair
f <- function(x) {
  s <- seq(2, length(x), 2)
  paste(x[s-1], x[s], sep="")
}
# run algorithm for each column
y <- as.data.frame(lapply(x, f), stringsAsFactors=FALSE)
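As a quick illustrative check (not part of the original answer), the result should have half as many rows as the input:
dim(x)
# [1] 1000    5
dim(y)
# [1]  500    5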

Here is a general approach for processing an array in consecutive chunks of n elements. You can set n = 2 to process it by pairs.
First, here is a function that splits a vector n-by-n, returning a list of n elements:
n.by.n <- function(x, n) split(x, 1 + (seq_along(x) - 1) %% n)
n.by.n(x = 1:24, n = 2)
# $`1`
# [1] 1 3 5 7 9 11 13 15 17 19 21 23
#
# $`2`
# [1] 2 4 6 8 10 12 14 16 18 20 22 24
Then you can run any function on the slices with mapply, invoked via do.call:
do.call(mapply, c(FUN = paste, n.by.n(x = 1:24, n = 2), sep = "_"))
# [1] "1_2" "3_4" "5_6" "7_8" "9_10" "11_12" "13_14" "15_16"
# [9] "17_18" "19_20" "21_22" "23_24"
do.call(mapply, c(FUN = paste, n.by.n(x = 1:24, n = 6), sep = "_"))
# [1] "1_2_3_4_5_6" "7_8_9_10_11_12" "13_14_15_16_17_18"
# [4] "19_20_21_22_23_24"

Here is a way to do it without any apply family calls or loops:
# Generate some sample data.
total_loci<-data.frame(genotype=sample(LETTERS,500,replace=TRUE))
# Paste each odd row with the following even row
paste0(total_loci[c(TRUE, FALSE), ],
       total_loci[c(FALSE, TRUE), ])
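A quick sanity check of the indexing pattern on a small single-column data frame (illustrative only):
v <- data.frame(g = letters[1:6])
paste0(v[c(TRUE, FALSE), ], v[c(FALSE, TRUE), ])
# [1] "ab" "cd" "ef"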

Related

Fastest way to fill a 3D array from another 3D array in R

I am using the code below to fill a 3D array from another 3D array. I have used the sapply function to apply the code to each individual (the 3rd dimension), as in Efficient way to fill a 3D array.
Here is my code.
ind <- 1000
individuals <- as.character(seq(1, ind, by = 1))
maxCol <- 7
col <- 4
line <- 0
a <- 0
b <- 0
c <- 0
col_array <- c("year","time", "ID", "age", as.vector(outer(c(paste(seq(0, 1, by = 1), "year", sep="_"), paste(seq(2, maxCol, by = 1), "years", sep="_")), c("S_F", "I_F", "R_F"), paste, sep="_")))
array1 <- array(sample(1:100, length(col_array), replace = T), dim=c(2, length(col_array), ind), dimnames=list(NULL, col_array, individuals)) ## 3rd dimension = individual ID
## print(array1)
col_array <- c("year","time", "ID", "age", as.vector(outer(c(paste(seq(0, 1, by = 1), "year", sep="_"), paste(seq(2, maxCol, by = 1), "years", sep="_")), c("S_M", "I_M", "R_M"), paste, sep="_")))
array2 <- array(NA, dim=c(2, length(col_array), ind), dimnames=list(NULL, col_array, individuals)) ## 3rd dimension = individual ID
## print(array2)
tic("array2")
array2 <- sapply(individuals, function(i){
  ## Fill the first columns
  array2[line + 1, c("year", "time", "ID", "age"), i] <- c(a, b, i, c)
  ## Define column indexes for individuals S
  col_start_S_F <- which(colnames(array1[,,i])=="0_year_S_F")
  col_end_S_F <- which(colnames(array1[,,i])==paste(maxCol,"years_S_F", sep="_"))
  col_start_S_M <- which(colnames(array2[,,i])=="0_year_S_M")
  col_end_S_M <- which(colnames(array2[,,i])==paste(maxCol,"years_S_M", sep="_"))
  ## Fill the columns for individuals S
  p_S_M <- sapply(0:maxCol, function(x){pnorm(x, 4, 1)})
  array2[line + 1, col_start_S_M:col_end_S_M, i] <- round(as.numeric(as.vector(array1[line + 1, col_start_S_F:col_end_S_F, i]))*p_S_M)
  ## Define column indexes for individuals I
  col_start_I_F <- which(colnames(array1[,,i])=="0_year_I_F")
  col_end_I_F <- which(colnames(array1[,,i])==paste(maxCol,"years_I_F", sep="_"))
  col_start_I_M <- which(colnames(array2[,,i])=="0_year_I_M")
  col_end_I_M <- which(colnames(array2[,,i])==paste(maxCol,"years_I_M", sep="_"))
  ## Fill the columns for individuals I
  p_I_M <- sapply(0:maxCol, function(x){pnorm(x, 2, 1)})
  array2[line + 1, col_start_I_M:col_end_I_M, i] <- round(as.numeric(as.vector(array1[line + 1, col_start_I_F:col_end_I_F, i]))*p_I_M)
  ## Define column indexes for individuals R
  col_start_R_M <- which(colnames(array2[,,i])=="0_year_R_M")
  col_end_R_M <- which(colnames(array2[,,i])==paste(maxCol,"years_R_M", sep="_"))
  ## Fill the columns for individuals R
  array2[line + 1, col_start_R_M:col_end_R_M, i] <- as.numeric(as.vector(array2[line + 1, col_start_S_M:col_end_S_M, i])) +
    as.numeric(as.vector(array2[line + 1, col_start_I_M:col_end_I_M, i]))
  return(array2[,,i])
  ## print(array2[,,i])
}, simplify = "array")
## print(array2)
toc()
Is there a way to increase the performance/speed of my code (i.e., < 1 sec)? There are 500000 observations for the 3rd dimension. Any suggestions?
TL;DR: Here's a tidyverse solution that transforms the sample array into a dataframe and applies the requested changes. EDIT: I've added steps 1+2 to transform the original post's sample data into the format I used in step 3. The actual calculation in Step 3 is very fast (<0.1 sec), but the bottleneck is step 2, which takes 10 seconds for 500k rows.
Step 0: Create sample data for 500k individuals
ind <- 500000
individuals <- as.character(seq(1, ind, by = 1))
maxCol <- 7
col <- 4
line <- 0
a <- 0
b <- 0
c <- 0
col_array <- c("year","time", "ID", "age", as.vector(outer(c(paste(seq(0, 1, by = 1), "year", sep="_"), paste(seq(2, maxCol, by = 1), "years", sep="_")), c("S_F", "I_F", "R_F"), paste, sep="_")))
array1 <- array(sample(1:100, length(col_array), replace = T), dim=c(2, length(col_array), ind), dimnames=list(NULL, col_array, individuals)) ## 3rd dimension = individual ID
dim(array1)
# [1] 2 28 500000 # Two rows x 28 measures x 500k individuals
Step 1: Subset array and convert to data frame.
library(tidyverse)
# OP only uses first line of array1. If other rows needed, replace with "array1 %>%"
# and adjust renaming below to account for different Var1.
array1_dt <- array1[1,,] %>%
  as.data.frame.table(stringsAsFactors = FALSE)
Step 2: Break out the stats into different columns, with one row for each individual-year. This is the slowest step (especially the spread line), and takes 0.05 sec for 1000 individuals but 10 seconds for 500k. I expect a data.table solution could make it much faster, if needed.
array1_dt_reshape <- array1_dt %>%
  rename(stat = Var1, ID = Var2) %>%
  filter(!stat %in% c("year", "time", "ID", "age")) %>%
  mutate(year = stat %>% str_sub(end = 1),
         col = stat %>% str_sub(start = -3)) %>%
  select(-stat) %>%
  spread(col, Freq) %>%
  arrange(ID)
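If Step 2 does turn out to be the bottleneck, a data.table version along these lines should be equivalent (a sketch, not benchmarked here; it reuses array1_dt from Step 1):
library(data.table)
dt <- as.data.table(array1_dt)
setnames(dt, c("Var1", "Var2"), c("stat", "ID"))
dt <- dt[!stat %in% c("year", "time", "ID", "age")]
# year is the first character of the stat name, col its last three characters
dt[, `:=`(year = substr(stat, 1, 1),
          col  = substr(stat, nchar(stat) - 2, nchar(stat)))]
array1_dt_reshape_dt <- dcast(dt, ID + year ~ col, value.var = "Freq")[order(ID)]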
Step 3: Apply requested transformation. This function calculates the distribution function values with two sets of parameters and uses them to scale the input table's columns. It takes 0.03 sec for 500k individuals.
array_transform <- function(input_data = array1_dt_reshape,
                            max_yr = 7, S_M_mean = 4, I_M_mean = 2) {
  tictoc::tic()
  # First calculate the distribution function values to apply to all individuals,
  # depending on year.
  p_S_M_vals <- sapply(0:max_yr, function(x){pnorm(x, S_M_mean, 1)})
  p_I_M_vals <- sapply(0:max_yr, function(x){pnorm(x, I_M_mean, 1)})
  # For each year, scale S_M + I_M by the respective distribution functions.
  # This solution relies on the fact that each ID has 8 rows every time,
  # so we can recycle the 8 values in the distribution functions.
  output <- input_data %>%
    # group_by(ID) %>% <-- Not needed
    mutate(S_M = S_F * p_S_M_vals,
           I_M = I_F * p_I_M_vals,
           R_M = S_M + I_M) # %>% ungroup <-- Not needed
  tictoc::toc()
  return(output)
}
array1_output <- array_transform(array1_dt_reshape)
Results
head(array1_output)
ID year I_F R_F S_F S_M I_M R_M
1 1 0 16 76 23 7.284386e-04 0.3640021 0.3647305
2 1 1 46 96 80 1.079918e-01 7.2981417 7.4061335
3 1 2 27 57 76 1.729010e+00 13.5000000 15.2290100
4 1 3 42 64 96 1.523090e+01 35.3364793 50.5673837
5 1 4 74 44 57 2.850000e+01 72.3164902 100.8164902
6 1 5 89 90 64 5.384606e+01 88.8798591 142.7259228
7 1 6 23 16 44 4.299899e+01 22.9992716 65.9982658
8 1 7 80 46 90 8.987851e+01 79.9999771 169.8784862
9 2 0 16 76 23 7.284386e-04 0.3640021 0.3647305
10 2 1 46 96 80 1.079918e-01 7.2981417 7.406133
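One illustrative way to confirm the recycling assumption the function relies on (every ID contributing exactly 8 rows, years 0-7) is to tabulate rows per ID; this check is not part of the original answer:
table(table(array1_dt_reshape$ID))
#      8
# 500000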

Randomly split array using split() and sample() in R

I have created an array consisting of N values, say
a <- array(dim = c(x, y)).
I want to partition the array into K groups, where the array values are randomly assigned to one of the K groups. For example, if K = 2 and N = 10, one subarray could have 7 values while the other has 3. The subarray sizes must sum to N (e.g., 7 + 3 = 10).
I know using split() and sample() is probably the easiest route. I have tried
split(a, sample(a, 2))
but this does not work as it should.
Any ideas?
a <- array(c(1,2,3,4,5,6))
k <- 3
split(sample(a), 1:k)
# $`1`
# [1] 2 3
#
# $`2`
# [1] 4 5
#
# $`3`
# [1] 6 1
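Note that split(sample(a), 1:k) produces groups of (nearly) equal size. If the group sizes themselves should be random, as in the 7 + 3 example, one illustrative variation (not from the answer above) is to sample the group labels instead:
a <- array(c(1, 2, 3, 4, 5, 6))
k <- 3
# assign each element a random group label, so group sizes vary from run to run
split(a, sample(1:k, length(a), replace = TRUE))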

How to find Consecutive Numbers Among multiple Arrays?

I'll give an example right away;
now suppose I have 3 arrays a,b,c such as
a = c(3,5)
b = c(6,1,8,7)
c = c(4,2,9)
I must be able to extract consecutive triplets among them, i.e.,
c(1,2,3),c(4,5,6)
But this is just an example; I will have a larger data set with more than 10 arrays, so I must be able to find consecutive series of length ten.
So could anyone provide an algorithm to find, in general, a consecutive series of length 'n' among 'n' arrays?
I am doing this in R, so it's preferable if you give your code in R, but an algorithm in any language is more than welcome.
Reorganize the data first into a list containing value and array number.
Sort the list; you'd have something like:
1-2
2-3
3-1 (i.e. " there' s a three in array 1" )
4-3
5-1
6-2
7-2
8-2
9-3
Then loop through the list, check whether there are actually n consecutive numbers, and then check whether these come from different array numbers.
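A minimal R sketch of this approach (the helper name and details are illustrative, not from the answer):
find_consecutive <- function(vecs, n = length(vecs)) {
  # tag each value with the index of the vector it came from, then sort by value
  tagged <- data.frame(value = unlist(vecs),
                       src   = rep(seq_along(vecs), lengths(vecs)))
  tagged <- tagged[order(tagged$value), ]
  hits <- list()
  # slide a window of size n over the sorted values
  for (i in seq_len(nrow(tagged) - n + 1)) {
    win <- tagged[i:(i + n - 1), ]
    # keep windows of consecutive values drawn from n different vectors
    if (all(diff(win$value) == 1) && length(unique(win$src)) == n)
      hits[[length(hits) + 1]] <- win$value
  }
  hits
}
find_consecutive(list(a = c(3, 5), b = c(6, 1, 8, 7), c = c(4, 2, 9)))
# [[1]]
# [1] 1 2 3
#
# [[2]]
# [1] 4 5 6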
Here's one approach. This assumes there are no breaks in the sequence of observations in the number of groups. Here the data.
N <- 3
a <- c(3,5)
b <- c(6,1,8,7)
c <- c(4,2,9)
Then I combine them and order by the observations:
dd <- lattice::make.groups(a,b,c)
dd <- dd[order(dd$data),]
Now I look for rows in this table where all three groups are represented
idx <- apply(embed(as.numeric(dd$which), N), 1, function(x) {
  length(unique(x)) == N
})
Then we can see the triplets with
lapply(which(idx), function(i) {
  dd[i:(i+N-1), ]
})
# [[1]]
# data which
# b2 1 b
# c2 2 c
# a1 3 a
#
# [[2]]
# data which
# c1 4 c
# a2 5 a
# b1 6 b
Here is a brute force method with expand.grid and three vectors as in the example
# get all combinations
df <- expand.grid(a,b,c)
Using combn to calculate difference for each pairwise combination.
# get all parwise differences
myDiffs <- combn(names(df), 2, FUN=function(x) abs(df[[x[1]]] - df[[x[2]]]))
# subset data using `rowSums` and `which`
df[which(rowSums(myDiffs == 1) == ncol(myDiffs) - 1), ]
Var1 Var2 Var3
2 5 6 4
11 3 1 2
I have hacked together a little recursive function that will find all the consecutive triplets amongst as many vectors as you pass it (need to pass at least three). It is probably a little crude, but seems to work.
The function uses the ellipsis, ..., for passing arguments. Hence it will take however many arguments (i.e. numeric vectors) you provide and put them in the list items. Then the smallest value in each passed vector is located, along with its index.
Then the indices of the vectors corresponding to the smallest triplet are created and iterated through using a for() loop, where the output values are passed to the output vector out. The input vectors in items are pruned and passed again into the function in a recursive fashion.
Only when all vectors are NA, i.e. there are no values left in them, does the function return the final result.
library(magrittr)
# define function to find the triplets
tripl <- function(...){
  items <- list(...)
  # find the smallest number in each passed vector, along with its index
  # output is a matrix of n-by-2, where n is the number of passed arguments
  triplet.id <- lapply(items, function(x){
    if(is.na(x) %>% prod) id <- c(NA, NA)
    else id <- c(which(x == min(x)), x[which(x == min(x))])
  }) %>% unlist %>% matrix(., ncol=2, byrow=T)
  # find the smallest triplet from the passed vectors
  index <- order(triplet.id[,2])[1:3]
  # create empty vector for output
  out <- vector()
  # go through the smallest triplet's indices
  for(i in index){
    # .. append the corresponding item from the input vector to the out vector
    # .. and remove the value from the input vector
    if(length(items[[i]]) == 1) {
      out <- append(out, items[[i]])
      # .. if the input vector has no value left fill with NA
      items[[i]] <- NA
    }
    else {
      out <- append(out, items[[i]][triplet.id[i,1]])
      items[[i]] <- items[[i]][-triplet.id[i,1]]
    }
  }
  # recurse until all vectors are empty (NA)
  if(!prod(unlist(is.na(items)))) out <- append(list(out),
                                                do.call("tripl", c(items), quote = F))
  else(out <- list(out))
  # return result
  return(out)
}
The function can be called by passing the input vectors as arguments.
# input vectors
a = c(3,5)
b = c(6,1,8,7)
c = c(4,2,9)
# find all the triplets using our function
y <- tripl(a,b,c)
The result is a list, which contains all the necessary information, albeit unordered.
print(y)
# [[1]]
# [1] 1 2 3
#
# [[2]]
# [1] 4 5 6
#
# [[3]]
# [1] 7 9 NA
#
# [[4]]
# [1] 8 NA NA
Ordering everything can be done using sapply():
# put everything in order
sapply(y, function(x){x[order(x)]}) %>% t
# [,1] [,2] [,3]
# [1,] 1 2 3
# [2,] 4 5 6
# [3,] 7 9 NA
# [4,] 8 NA NA
The thing is that it will use only one value per vector to find triplets.
It will therefore not find the consecutive triplet c(6,7,8) among e.g. c(6,7,11), c(8,9,13) and c(10,12,14).
In this instance it would return c(6,8,10) (see below).
a<-c(6,7,11)
b<-c(8,9,13)
c<-c(10,12,14)
y <- tripl(a,b,c)
sapply(y, function(x){x[order(x)]}) %>% t
# [,1] [,2] [,3]
# [1,] 6 8 10
# [2,] 7 9 12
# [3,] 11 13 14

List of n first Neighbors from a 3d Array R

Let's say we have a 3d array:
my.array <- array(1:27, dim=c(3,3,3))
I would like to create a list of the n first neighbors.
Example: let's take my.array[2,2,2] = 14, so the first neighbors of 14 are:
list[14] = [1 to 27] - 14
I also would like to do the same for second, third, n closest neighbors using R, C or Matlab.
Thanks
Based on the comments, I assume you are defining "first nearest neighbors" as all cells with a Euclidean distance of 1 or less (excluding self), "second nearest neighbors" as those within a distance of 2, etc. Given your assertion in a comment on @evan058's answer that "for (1,1,1) the first level neighbors is 2,4,5,10,11,13", I'm interpreting this to include the immediate diagonals (at a distance of 1.414) but not further diagonals (in your example, 14 would be a further diagonal, at a distance of 1.732).
This function accepts either a pre-defined array (ary) or the dimensions to make one (dims).
nearestNeighbors(dims = c(3,3,3), elem = c(1,1,1), dist = 1)
# dim1 dim2 dim3
# [1,] 2 1 1
# [2,] 1 2 1
# [3,] 1 1 2
nearestNeighbors(dims = c(3,3,3), elem = c(1,1,1), dist = 1,
return_indices = FALSE)
# [1] 2 4 10
nearestNeighbors(dims = c(3,3,3), elem = c(1,1,1), dist = 2,
return_indices = FALSE)
# [1] 2 3 4 5 7 10 11 13 14 19
nearestNeighbors(ary = array(27:1, dim = c(3,3,3)), elem = c(1,1,1), dist = 2)
# dim1 dim2 dim3
# [1,] 2 1 1
# [2,] 3 1 1
# [3,] 1 2 1
# [4,] 2 2 1
# [5,] 1 3 1
# [6,] 1 1 2
# [7,] 2 1 2
# [8,] 1 2 2
# [9,] 2 2 2
# [10,] 1 1 3
nearestNeighbors(ary = array(27:1, dim = c(3,3,3)), elem = c(1,1,1), dist = 2,
return_indices = FALSE)
# [1] 26 25 24 23 21 18 17 15 14 9
The function:
#' Find nearest neighbors.
#'
#' @param ary array
#' @param elem integer vector indicating the indices on array from
#'   which all nearest neighbors will be found; must be the same
#'   length as \code{dims} (or \code{dim(ary)}). Only one of
#'   \code{ary} and \code{dims} needs to be provided.
#' @param dist numeric, the max distance from \code{elem}, not
#'   including the 'self' point.
#' @param dims integer vector indicating the dimensions of the array.
#'   Only one of \code{ary} and \code{dims} needs to be provided.
#' @param return_indices logical, whether to return a matrix of
#'   indices (as many columns as dimensions) or the values from
#'   \code{ary} of the nearest neighbors
#' @return either a matrix of indices (one column per dimension) if
#'   \code{return_indices == TRUE}, or the appropriate values in
#'   \code{ary} otherwise.
nearestNeighbors <- function(ary, elem, dist, dims, return_indices = TRUE) {
  if (missing(dims)) dims <- dim(ary)
  tmpary <- array(1:prod(dims), dim = dims)
  if (missing(ary)) ary <- tmpary
  if (length(elem) != length(dims))
    stop("'elem' needs to have the same dimensions as 'ary'")
  # work on a subset of the whole matrix
  usedims <- mapply(function(el, d) {
    seq(max(1, el - dist), min(d, el + dist))
  }, elem, dims, SIMPLIFY = FALSE)
  df <- as.matrix(do.call('expand.grid', usedims))
  # now, df is only as big as we need to possibly satisfy `dist`
  ndist <- sqrt(apply(df, 1, function(x) sum((x - elem)^2)))
  ret <- df[which(ndist > 0 & ndist <= dist), , drop = FALSE]
  if (return_indices) {
    return(ret)
  } else {
    return(ary[ret])
  }
}
Edit: changed the code for a "slight" speed improvement: using a 256x256x256 array and a distance of 2 previously took ~90 seconds on my machine. Now it takes less than 1 second. Even a distance of 5 (same array) takes less than a second. Not fully tested, please verify it is correct.
Edit: Removed the extra { on the fifth line of the function.
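As an illustrative way to reproduce the timing claim above (object names and sizes here are assumptions; results will vary by machine):
big_dims <- c(256L, 256L, 256L)
system.time(
  nn <- nearestNeighbors(dims = big_dims, elem = c(128, 128, 128), dist = 2)
)
nrow(nn)  # number of neighboring cells within Euclidean distance 2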
I think something along these lines will do the trick:
library(plyr)  # for adply() and rbind.fill()

nClosest <- function(pts, pt, n)
{
  # Get the target value
  val <- pts[pt[1], pt[2], pt[3]]
  # Turn the array into a data frame (one row per cell, with its indices)
  ptsDF <- adply(pts, 1:3)
  # Create Dist column for distance to val
  ptsDF$Dist <- abs(ptsDF$V1 - val)
  # Order by the distance to val
  ptsDF <- ptsDF[with(ptsDF, order(Dist)), ]
  # Split into groups:
  sp <- split(ptsDF, ptsDF$Dist)
  # Get max index
  topInd <- min(n + 1, length(sp))
  # Agg the split dfs into a single df
  rbind.fill(sp[2:topInd])
}
Output:
> nClosest(my.array, c(1,2,2), 3)
X1 X2 X3 V1 Dist
1 3 1 2 12 1
2 2 2 2 14 1
3 2 1 2 11 2
4 3 2 2 15 2
5 1 1 2 10 3
6 1 3 2 16 3

How can I scale an array to another length, saving its approximate values, in R

I have two arrays with different lengths
value <- c(1,1,1,4,4,4,1,1,1)
time <- c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)
How can I resize the value array to make it the same length as the time array while keeping its approximate values?
The approx() function complains that the lengths differ.
I want to get value array to be like
value <- c(1,1,1,1,1,4,4,4,4,4,4,1,1,1,1)
time <- c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)
so lengths are equal
UPD
Okay, the main goal is to calculate the correlation of v1 with v2, where
v1 is inside the data frame (v1, t1), and v2 is inside the data frame (v2, t2).
The (v1, t1) and (v2, t2) data frames have different lengths, but we know that t1 and t2 cover the same time period, so we can overlay them.
For t1 we have 1,3,5,7,9 and for t2 we have 1,2,3,4,5,6,7,8,9,10.
The problem is that the two data frames are recorded separately but simultaneously, so I need to rescale one of them to overlay the other. Then I can calculate the correlation of how v1 affects v2.
That's why I need to scale v1 to the length of t2.
I'm sorry, I don't know how to phrase the goal correctly in English.
You may use the xout argument in approx
"xout: an optional set of numeric values specifying where interpolation is to take place.".
# create some fake data, which I _think_ may resemble the data you described in edit.
set.seed(123)
# "for t1 we have 1,3,5,7,9"
df1 <- data.frame(time = c(1, 3, 5, 7, 9), value = sample(1:10, 5))
df1
# "for t2 we have 1,2,3,4,5,6,7,8,9,10", the 'full time series'.
df2 <- data.frame(time = 1:10, value = sample(1:10))
# interpolate using approx and the xout argument
# The time values for 'full time series', df2$time, is used as `xout`.
# default values of arguments (e.g. linear interpolation, no extrapolation)
interpol1 <- with(df1, approx(x = time, y = value, xout = df2$time))
# some arguments you may wish to check
# extrapolation rules
interpol2 <- with(df1, approx(x = time, y = value, xout = df2$time,
                              rule = 2))
# interpolation method ('last observation carried forward")
interpol3 <- with(df1, approx(x = time, y = value, xout = df2$time,
                              rule = 2, method = "constant"))
df1
# time value
# 1 1 3
# 2 3 8
# 3 5 4
# 4 7 7
# 5 9 6
interpol1
# $x
# [1] 1 2 3 4 5 6 7 8 9 10
#
# $y
# [1] 3.0 5.5 8.0 6.0 4.0 5.5 7.0 6.5 6.0 NA
interpol3
# $x
# [1] 1 2 3 4 5 6 7 8 9 10
#
# $y
# [1] 3 3 8 8 4 4 7 7 6 6
# correlation between a vector of inter-(extra-)polated values
# and the 'full' time series
cor.test(interpol3$y, df2$value)
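Applied directly to the vectors from the question, the same idea looks roughly like this (a sketch; it assumes the 9 values were sampled evenly across the same 15-step period, so the exact boundary positions may differ slightly from the vector written out in the question):
value <- c(1, 1, 1, 4, 4, 4, 1, 1, 1)
time  <- 1:15
# place the 9 values on an evenly spaced grid spanning the full time range,
# then carry each value forward onto the 15 time points
stretched <- approx(x = seq(min(time), max(time), length.out = length(value)),
                    y = value, xout = time, method = "constant", rule = 2)$y
stretched
# [1] 1 1 1 1 1 1 4 4 4 4 4 1 1 1 1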
This little function tries to pad the values in the shorter vector out as evenly as possible and is generalisable. Haven't thought too much about edge cases, and I am sure there are many that break it. Plus it seems like it could be simplified, but is this what you are looking to do...
pad <- function(x, y){
  fill <- length(y) - length(x)
  run <- rle(x)
  add <- fill %/% length(run$lengths)
  pad <- diff(c(0, as.integer(seq(add, fill, length.out = length(run$lengths)))))
  rep(run$values, times = run$lengths + pad)
}
pad(value,time)
[1] 1 1 1 1 1 4 4 4 4 4 1 1 1 1 1
Or e.g.
value <- 1:2
time <- 1:10
pad(value,time)
[1] 1 1 1 1 1 2 2 2 2 2
