How to replace reshape2::melt for an array with tidyr? - arrays

I would like to convert a matrix/array (with dimnames) into a data frame. This can be done very easily using reshape2::melt but seems harder with tidyr, and in fact not really possible in the case of an array. Am I missing something? (In particular since reshape2 describes itself as being retired; see https://github.com/hadley/reshape).
For example, given the following matrix
MyScores <- matrix(runif(2*3), nrow = 2, ncol = 3,
dimnames = list(Month = month.name[1:2], Class = LETTERS[1:3]))
we can turn it into a data frame as follows
reshape2::melt(MyScores, value.name = 'Score') # perfect
or, using tidyr as follows:
as_tibble(MyScores, rownames = 'Month') %>%
gather(Class, Score, -Month)
In this case reshape2 and tidyr seem similar (although reshape2 is shorter if you are looking for a long-format data frame).
However for arrays, it seems harder. Given
EverybodyScores <- array(runif(2*3*5), dim = c(2,3,5),
dimnames = list(Month = month.name[1:2], Class = LETTERS[1:3], StudentID = 1:5))
we can turn it into a data frame as follows:
reshape2::melt(EverybodyScores, value.name = 'Score') # perfect
but using tidyr it's not clear how to do it:
as_tibble(EverybodyScores, rownames = 'Month') # loses the Month information, and Class and StudentID still need to be disentangled
Is this a situation where the right solution is to stick to using reshape2?

One way I just found by playing around is to coerce via tbl_cube. I have never really used the class but it seems to do the trick in this instance.
EverybodyScores <- array(
runif(2 * 3 * 5),
dim = c(2, 3, 5),
dimnames = list(Month = month.name[1:2], Class = LETTERS[1:3], StudentID = 1:5)
)
library(tidyverse)
library(cubelyr)
EverybodyScores %>%
as.tbl_cube(met_name = "Score") %>%
as_tibble
#> # A tibble: 30 x 4
#> Month Class StudentID Score
#> <chr> <chr> <int> <dbl>
#> 1 January A 1 0.366
#> 2 February A 1 0.254
#> 3 January B 1 0.441
#> 4 February B 1 0.562
#> 5 January C 1 0.313
#> 6 February C 1 0.192
#> 7 January A 2 0.799
#> 8 February A 2 0.277
#> 9 January B 2 0.631
#> 10 February B 2 0.101
#> # ... with 20 more rows
Created on 2018-08-15 by the reprex package (v0.2.0).

Making a tibble drops the row names, but instead of going straight into a tibble, you can make the array into a base R data.frame, then use tibble::rownames_to_column to make a column for months. Notice that converting to a data frame creates columns with names like A.1, sticking the class and ID together; you can separate these again with tidyr::separate. Calling as_tibble is optional (it is only needed if you want the result to be a tibble) and can come at any point in the workflow once you've made a column from the row names.
library(tidyverse)
EverybodyScores <- array(runif(2*3*5), dim = c(2,3,5),
dimnames = list(Month = month.name[1:2], Class = LETTERS[1:3], StudentID = 1:5))
EverybodyScores %>%
as.data.frame() %>%
rownames_to_column("Month") %>%
gather(key = class_id, value = value, -Month) %>%
separate(class_id, into = c("Class", "StudentID"), sep = "\\.") %>%
as_tibble()
#> # A tibble: 30 x 4
#> Month Class StudentID value
#> <chr> <chr> <chr> <dbl>
#> 1 January A 1 0.576
#> 2 February A 1 0.229
#> 3 January B 1 0.930
#> 4 February B 1 0.547
#> 5 January C 1 0.761
#> 6 February C 1 0.468
#> 7 January A 2 0.631
#> 8 February A 2 0.893
#> 9 January B 2 0.638
#> 10 February B 2 0.735
#> # ... with 20 more rows
Created on 2018-08-15 by the reprex package (v0.2.0).

Here is the new tidyr way to do the same:
library(tidyr)
EverybodyScores <- array(
runif(2 * 3 * 5),
dim = c(2, 3, 5),
dimnames = list(Month = month.name[1:2], Class = LETTERS[1:3], StudentID = 1:5)
)
as_tibble(EverybodyScores, rownames = "Month") %>%
pivot_longer(
cols = matches("^A|^B|^C"),
names_sep = "\\.",
names_to = c("Class", "StudentID")
)
#> # A tibble: 30 x 4
#> Month Class StudentID value
#> <chr> <chr> <chr> <dbl>
#> 1 January A 1 0.0325
#> 2 January B 1 0.959
#> 3 January C 1 0.593
#> 4 January A 2 0.0702
#> 5 January B 2 0.882
#> 6 January C 2 0.918
#> 7 January A 3 0.459
#> 8 January B 3 0.849
#> 9 January C 3 0.901
#> 10 January A 4 0.328
#> # … with 20 more rows
Created on 2021-02-23 by the reprex package (v1.0.0)

Related

loop Tukey post hoc letters extraction

I need to apply a tukey post hoc test to a dataset with 80 columns/variables based on 3 groups/treatments. Is there any way to get a table with all variables in which common characters identify levels or groups that are not significantly different (based on p-values) in an automated way with a loop function?
Does this work for you?
# packages and function conflicts
library(conflicted)
library(emmeans)
library(multcomp)
library(multcompView)
library(tidyverse)
conflict_prefer("select", winner = "dplyr")
#> [conflicted] Will prefer dplyr::select over any other package
# Create example data
dat <- PlantGrowth %>%
transmute(
group = group,
y1 = weight,
y2 = weight * runif(30, 0.8, 1.2),
y3 = weight * runif(30, 0.8, 1.2)
) %>%
as_tibble()
dat
#> # A tibble: 30 x 4
#> group y1 y2 y3
#> <fct> <dbl> <dbl> <dbl>
#> 1 ctrl 4.17 4.22 3.53
#> 2 ctrl 5.58 6.19 6.46
#> 3 ctrl 5.18 5.95 5.66
#> 4 ctrl 6.11 5.36 7.19
#> 5 ctrl 4.5 4.41 5.11
#> 6 ctrl 4.61 3.93 4.89
#> 7 ctrl 5.17 4.36 4.67
#> 8 ctrl 4.53 4.53 4.72
#> 9 ctrl 5.33 4.64 5.86
#> 10 ctrl 5.14 5.34 4.89
#> # ... with 20 more rows
# Loop setup
var_names <- names(dat)[-1]
loop_out <- list()
# Loop: fit one model per response variable and keep its group means and letters
for (var_i in var_names) {
# Rename the current response column to the fixed name y_i so the model
# formula below can stay the same on every iteration
dat_i <- dat %>%
rename(y_i = !!var_i) %>%
select(group, y_i)
# Linear model of the current response on the grouping factor
mod_i <- lm(y_i ~ group, data = dat_i)
# Estimated marginal means for each group, passed to cld() with lowercase letters
emm_i <- emmeans(mod_i, "group") %>%
cld(Letters = letters)
# Keep group, emmean and .group; suffix the two non-group columns with the
# variable name so the per-variable results can be joined side by side later
loop_out[[var_i]] <- emm_i %>%
as_tibble() %>%
select(group, emmean, .group) %>%
rename_with(.cols = -group,
.fn = ~ paste(., var_i, sep = "_"))
}
# Join loop results
loop_out %>% reduce(full_join, by='group')
#> # A tibble: 3 x 7
#> group emmean_y1 .group_y1 emmean_y2 .group_y2 emmean_y3 .group_y3
#> <fct> <dbl> <chr> <dbl> <chr> <dbl> <chr>
#> 1 trt1 4.66 " a " 4.46 " a " 4.64 " a"
#> 2 ctrl 5.03 " ab" 4.89 " ab" 5.30 " a"
#> 3 trt2 5.53 " b" 5.74 " b" 5.39 " a"
Created on 2022-08-08 by the reprex package (v2.0.1)
Check out my summary on the compact letter display for more background.

Converting a list of data into table format in R [duplicate]

I'm having trouble rearranging the following data frame:
set.seed(45)
dat1 <- data.frame(
name = rep(c("firstName", "secondName"), each=4),
numbers = rep(1:4, 2),
value = rnorm(8)
)
dat1
name numbers value
1 firstName 1 0.3407997
2 firstName 2 -0.7033403
3 firstName 3 -0.3795377
4 firstName 4 -0.7460474
5 secondName 1 -0.8981073
6 secondName 2 -0.3347941
7 secondName 3 -0.5013782
8 secondName 4 -0.1745357
I want to reshape it so that each unique "name" variable is a rowname, with the "values" as observations along that row and the "numbers" as colnames. Sort of like this:
name 1 2 3 4
1 firstName 0.3407997 -0.7033403 -0.3795377 -0.7460474
5 secondName -0.8981073 -0.3347941 -0.5013782 -0.1745357
I've looked at melt and cast and a few other things, but none seem to do the job.
Using reshape function:
reshape(dat1, idvar = "name", timevar = "numbers", direction = "wide")
The new (in 2014) tidyr package also does this simply, with gather()/spread() being the terms for melt/cast.
Edit: Now, in 2019, tidyr v 1.0 has launched and set spread and gather on a deprecation path, preferring instead pivot_wider and pivot_longer, which you can find described in this answer. Read on if you want a brief glimpse into the brief life of spread/gather.
library(tidyr)
spread(dat1, key = numbers, value = value)
From github,
tidyr is a reframing of reshape2 designed to accompany the tidy data framework, and to work hand-in-hand with magrittr and dplyr to build a solid pipeline for data analysis.
Just as reshape2 did less than reshape, tidyr does less than reshape2. It's designed specifically for tidying data, not the general reshaping that reshape2 does, or the general aggregation that reshape did. In particular, built-in methods only work for data frames, and tidyr provides no margins or aggregation.
You can do this with the reshape() function, or with the melt() / cast() functions in the reshape package. For the second option, example code is
library(reshape)
cast(dat1, name ~ numbers)
Or using reshape2
library(reshape2)
dcast(dat1, name ~ numbers)
Another option if performance is a concern is to use data.table's extension of reshape2's melt & dcast functions
(Reference: Efficient reshaping using data.tables)
library(data.table)
setDT(dat1)
dcast(dat1, name ~ numbers, value.var = "value")
# name 1 2 3 4
# 1: firstName 0.1836433 -0.8356286 1.5952808 0.3295078
# 2: secondName -0.8204684 0.4874291 0.7383247 0.5757814
And, as of data.table v1.9.6 we can cast on multiple columns
## add an extra column
dat1[, value2 := value * 2]
## cast multiple value columns
dcast(dat1, name ~ numbers, value.var = c("value", "value2"))
# name value_1 value_2 value_3 value_4 value2_1 value2_2 value2_3 value2_4
# 1: firstName 0.1836433 -0.8356286 1.5952808 0.3295078 0.3672866 -1.6712572 3.190562 0.6590155
# 2: secondName -0.8204684 0.4874291 0.7383247 0.5757814 -1.6409368 0.9748581 1.476649 1.1515627
With tidyr, there is pivot_wider() and pivot_longer() which are generalized to do reshaping from long -> wide or wide -> long, respectively. Using the OP's data:
single column long -> wide
library(tidyr)
dat1 %>%
pivot_wider(names_from = numbers, values_from = value)
# # A tibble: 2 x 5
# name `1` `2` `3` `4`
# <fct> <dbl> <dbl> <dbl> <dbl>
# 1 firstName 0.341 -0.703 -0.380 -0.746
# 2 secondName -0.898 -0.335 -0.501 -0.175
multiple columns long -> wide
pivot_wider() is also capable of more complex pivot operations. For example, you can pivot multiple columns simultaneously:
# create another column for showing the functionality
dat2 <- dat1 %>%
dplyr::rename(valA = value) %>%
dplyr::mutate(valB = valA * 2)
dat2 %>%
pivot_wider(names_from = numbers, values_from = c(valA, valB))
# # A tibble: 2 × 9
# name valA_1 valA_2 valA_3 valA_4 valB_1 valB_2 valB_3 valB_4
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 firstName 0.341 -0.703 -0.380 -0.746 0.682 -1.41 -0.759 -1.49
# 2 secondName -0.898 -0.335 -0.501 -0.175 -1.80 -0.670 -1.00 -0.349
There is much more functionality to be found in the docs.
Using your example dataframe, we could:
xtabs(value ~ name + numbers, data = dat1)
Other two options:
Base package:
df <- unstack(dat1, form = value ~ numbers)
rownames(df) <- unique(dat1$name)
df
sqldf package:
library(sqldf)
sqldf('SELECT name,
MAX(CASE WHEN numbers = 1 THEN value ELSE NULL END) x1,
MAX(CASE WHEN numbers = 2 THEN value ELSE NULL END) x2,
MAX(CASE WHEN numbers = 3 THEN value ELSE NULL END) x3,
MAX(CASE WHEN numbers = 4 THEN value ELSE NULL END) x4
FROM dat1
GROUP BY name')
Using base R aggregate function:
aggregate(value ~ name, dat1, I)
# name value.1 value.2 value.3 value.4
#1 firstName 0.4145 -0.4747 0.0659 -0.5024
#2 secondName -0.8259 0.1669 -0.8962 0.1681
The base reshape function works perfectly fine:
df <- data.frame(
year = c(rep(2000, 12), rep(2001, 12)),
month = rep(1:12, 2),
values = rnorm(24)
)
df_wide <- reshape(df, idvar="year", timevar="month", v.names="values", direction="wide", sep="_")
df_wide
Where
idvar is the column of classes that separates rows
timevar is the column of classes to cast wide
v.names is the column containing numeric values
direction specifies wide or long format
the optional sep argument is the separator used in between timevar class names and v.names in the output data.frame.
If no idvar exists, create one before using the reshape() function:
df$id <- c(rep("year1", 12), rep("year2", 12))
df_wide <- reshape(df, idvar="id", timevar="month", v.names="values", direction="wide", sep="_")
df_wide
Just remember that idvar is required! The timevar and v.names part is easy. The output of this function is more predictable than some of the others, as everything is explicitly defined.
There's a very powerful new package from genius data scientists at Win-Vector (folks that made vtreat, seplyr and replyr) called cdata. It implements "coordinated data" principles described in this document and also in this blog post. The idea is that regardless of how you organize your data, it should be possible to identify individual data points using a system of "data coordinates". Here's an excerpt from the recent blog post by John Mount:
The whole system is based on two primitives or operators
cdata::moveValuesToRowsD() and cdata::moveValuesToColumnsD(). These
operators have pivot, un-pivot, one-hot encode, transpose, moving
multiple rows and columns, and many other transforms as simple special
cases.
It is easy to write many different operations in terms of the
cdata primitives. These operators can work-in memory or at big data
scale (with databases and Apache Spark; for big data use the
cdata::moveValuesToRowsN() and cdata::moveValuesToColumnsN()
variants). The transforms are controlled by a control table that
itself is a diagram of (or picture of) the transform.
We will first build the control table (see blog post for details) and then perform the move of data from rows to columns.
library(cdata)
# first build the control table
pivotControlTable <- buildPivotControlTableD(table = dat1, # reference to dataset
columnToTakeKeysFrom = 'numbers', # this will become column headers
columnToTakeValuesFrom = 'value', # this contains data
sep="_") # optional for making column names
# perform the move of data to columns
dat_wide <- moveValuesToColumnsD(tallTable = dat1, # reference to dataset
keyColumns = c('name'), # this(these) column(s) should stay untouched
controlTable = pivotControlTable# control table above
)
dat_wide
#> name numbers_1 numbers_2 numbers_3 numbers_4
#> 1 firstName 0.3407997 -0.7033403 -0.3795377 -0.7460474
#> 2 secondName -0.8981073 -0.3347941 -0.5013782 -0.1745357
much easier way!
devtools::install_github("yikeshu0611/onetree") #install onetree package
library(onetree)
widedata=reshape_toWide(data = dat1,id = "name",j = "numbers",value.var.prefix = "value")
widedata
name value1 value2 value3 value4
firstName 0.3407997 -0.7033403 -0.3795377 -0.7460474
secondName -0.8981073 -0.3347941 -0.5013782 -0.1745357
if you want to go back from wide to long, only change Wide to Long, and no changes in objects.
reshape_toLong(data = widedata,id = "name",j = "numbers",value.var.prefix = "value")
name numbers value
firstName 1 0.3407997
secondName 1 -0.8981073
firstName 2 -0.7033403
secondName 2 -0.3347941
firstName 3 -0.3795377
secondName 3 -0.5013782
firstName 4 -0.7460474
secondName 4 -0.1745357
This works even if you have missing pairs and it doesn't require sorting (as.matrix(dat1)[,1:2] can be replaced with cbind(dat1[,1],dat1[,2])):
> set.seed(45);dat1=data.frame(name=rep(c("firstName","secondName"),each=4),numbers=rep(1:4,2),value=rnorm(8))
> u1=unique(dat1[,1]);u2=unique(dat1[,2])
> m=matrix(nrow=length(u1),ncol=length(u2),dimnames=list(u1,u2))
> m[as.matrix(dat1)[,1:2]]=dat1[,3]
> m
1 2 3 4
firstName 0.3407997 -0.7033403 -0.3795377 -0.7460474
secondName -0.8981073 -0.3347941 -0.5013782 -0.1745357
This doesn't work if you have missing pairs and it requires sorting, but it's a bit shorter in case the pairs are already sorted:
> u1=unique(dat1[,1]);u2=unique(dat1[,2])
> dat1=dat1[order(dat1[,1],dat1[,2]),] # not actually needed in this case
> matrix(dat1[,3],length(u1),,T,list(u1,u2))
1 2 3 4
firstName 0.3407997 -0.7033403 -0.3795377 -0.7460474
secondName -0.8981073 -0.3347941 -0.5013782 -0.1745357
Here's a function version of the first approach (add as.data.frame to make it work with tibbles):
# Convert long-format data (row key, column key, value) into a wide matrix.
#
# x:    data frame, tibble or matrix holding the long data
# row:  column of x (index or name) whose values become the row names
# col:  column of x (index or name) whose values become the column names
# val:  column of x (index or name) holding the cell values
# sort: if TRUE, sort the row and column keys; otherwise keep them in order
#       of first appearance
#
# Returns a matrix with one row per unique `row` key and one column per unique
# `col` key. Key pairs that never occur in x are left as NA.
l2w <- function(x, row = 1, col = 2, val = 3, sort = FALSE) {
  # Coerce once so tibbles and matrices can all be read with `[[`
  x <- as.data.frame(x)
  row_keys <- x[[row]]
  col_keys <- x[[col]]
  u1 <- unique(row_keys)
  u2 <- unique(col_keys)
  if (sort) {
    u1 <- sort(u1)
    u2 <- sort(u2)
  }
  out <- matrix(
    nrow = length(u1), ncol = length(u2),
    dimnames = list(as.character(u1), as.character(u2))
  )
  # Index by name rather than position: without as.character(), two factor
  # (or two numeric) key columns would cbind() into a numeric matrix and be
  # treated as positions, putting values in the wrong cells.
  out[cbind(as.character(row_keys), as.character(col_keys))] <- x[[val]]
  out
}
Or if you only have the values of the lower triangle, you can do this:
> euro=as.matrix(eurodist)[1:3,1:3]
> lower=data.frame(V1=rownames(euro)[row(euro)[lower.tri(euro)]],V2=colnames(euro)[col(euro)[lower.tri(euro)]],V3=euro[lower.tri(euro)])
> lower
V1 V2 V3
1 Barcelona Athens 3313
2 Brussels Athens 2963
3 Brussels Barcelona 1318
> n=unique(c(lower[,1],lower[,2]))
> full=rbind(lower,setNames(lower[,c(2,1,3)],names(lower)),data.frame(V1=n,V2=n,V3=0))
> full
V1 V2 V3
1 Barcelona Athens 3313
2 Brussels Athens 2963
3 Brussels Barcelona 1318
4 Athens Barcelona 3313
5 Athens Brussels 2963
6 Barcelona Brussels 1318
7 Athens Athens 0
8 Barcelona Barcelona 0
9 Brussels Brussels 0
> l2w(full,sort=T)
Athens Barcelona Brussels
Athens 0 3313 2963
Barcelona 3313 0 1318
Brussels 2963 1318 0
Or here's another approach:
> rc=as.matrix(lower[-3])
> n=sort(unique(c(rc)))
> m=matrix(0,length(n),length(n),,list(n,n))
> m[rc]=lower[,3]
> m[rc[,2:1]]=lower[,3]
> m
Athens Barcelona Brussels
Athens 0 3313 2963
Barcelona 3313 0 1318
Brussels 2963 1318 0
Another simple method in base R is to use xtabs. The result of xtabs is basically just a matrix with a fancy class name, but you can make it look like a regular matrix with class(x)=NULL;attr(x,"call")=NULL;dimnames(x)=unname(dimnames(x)):
> x=xtabs(value~name+numbers,dat1);x
numbers
name 1 2 3 4
firstName 0.3407997 -0.7033403 -0.3795377 -0.7460474
secondName -0.8981073 -0.3347941 -0.5013782 -0.1745357
> str(x)
'xtabs' num [1:2, 1:4] 0.341 -0.898 -0.703 -0.335 -0.38 ...
- attr(*, "dimnames")=List of 2
..$ name : chr [1:2] "firstName" "secondName"
..$ numbers: chr [1:4] "1" "2" "3" "4"
- attr(*, "call")= language xtabs(formula = value ~ name + numbers, data = dat1)
> class(x)
[1] "xtabs" "table"
> class(as.matrix(x)) # `as.matrix` has no effect because `x` is already a matrix
[1] "xtabs" "table"
> class(x)=NULL;class(x)
[1] "matrix" "array"
> attr(x,"call")=NULL;dimnames(x)=unname(dimnames(x))
> x # now it looks like a regular matrix
1 2 3 4
firstName 0.3407997 -0.7033403 -0.3795377 -0.7460474
secondName -0.8981073 -0.3347941 -0.5013782 -0.1745357
> str(x)
num [1:2, 1:4] 0.341 -0.898 -0.703 -0.335 -0.38 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:2] "firstName" "secondName"
..$ : chr [1:4] "1" "2" "3" "4"
Normally as.data.frame(x) converts the result of xtabs back to long format, but you can avoid it with class(x)=NULL:
> x=xtabs(value~name+numbers,dat1);as.data.frame(x)
name numbers Freq
1 firstName 1 0.3407997
2 secondName 1 -0.8981073
3 firstName 2 -0.7033403
4 secondName 2 -0.3347941
5 firstName 3 -0.3795377
6 secondName 3 -0.5013782
7 firstName 4 -0.7460474
8 secondName 4 -0.1745357
> class(x)=NULL;as.data.frame(x)
1 2 3 4
firstName 0.3407997 -0.7033403 -0.3795377 -0.7460474
secondName -0.8981073 -0.3347941 -0.5013782 -0.1745357
This converts data in wide format to long format (unlist converts a dataframe to a vector and c converts a matrix to a vector):
# Wide-to-long: emit one row per cell of x, walking the cells column by column.
# V1 is the cell's row name, V2 its column name and V3 its value.
w2l <- function(x) {
  row_labels <- rownames(x)[row(x)]
  col_labels <- colnames(x)[col(x)]
  # unlist() flattens a data frame and c() flattens a matrix; unname() drops
  # the element names that unlist() generates
  cell_values <- unname(c(unlist(x)))
  data.frame(V1 = row_labels, V2 = col_labels, V3 = cell_values)
}
Came here via a linked question Reshape three column data frame to matrix ("long" to "wide" format). That question is closed, so I am writing an alternative solution here.
I found an alternative solution, perhaps useful for someone looking to convert three columns to a matrix. I am referring to the decoupleR (2.3.2) package. The following is copied from their site:
Generates a kind of table where the rows come from id_cols, the columns from names_from and the values from values_from.
Usage
pivot_wider_profile(
data,
id_cols,
names_from,
values_from,
values_fill = NA,
to_matrix = FALSE,
to_sparse = FALSE,
...
)
Using only dplyr and map.
library(dplyr)
library(purrr)
set.seed(45)
dat1 <- data.frame(
name = rep(c("firstName", "secondName"), each=4),
numbers = rep(1:4, 2), value = rnorm(8)
)
# Reshape a three-column long data frame to wide format with dplyr and purrr.
#
# data:       long data frame; besides `name_from` and `value_from` it must
#             contain exactly one more column, which supplies the new headers
# name_from:  name (string) of the column identifying the output rows
# value_from: name (string) of the column holding the cell values
#
# Returns a data frame with one row per unique `name_from` value and one
# column per distinct value of the remaining column.
# NOTE(review): values are placed by position, so each header value is assumed
# to list the `name_from` entries in the same order -- confirm for your data.
longer_to_wider <- function(data, name_from, value_from) {
  group <- colnames(data)[!(colnames(data) %in% c(name_from, value_from))]
  # `.data[[group]]` below needs exactly one column name
  stopifnot(length(group) == 1)
  data %>%
    group_by(.data[[group]]) %>%
    # Name the summary after `name_from` (not a hard-coded "name") so the
    # lookup below also works when the id column is called something else
    summarise("{name_from}" := list(.data[[name_from]])) %>%
    {
      # Row identifiers, in order of first appearance across the groups
      d <- setNames(
        data.frame(unique(unlist(.[[name_from]]))),
        name_from
      )
      # One single-column tibble per header value, bound side by side
      e <- map_dfc(.[[group]], function(x) {
        y <- tibble(
          x = data %>% filter(.data[[group]] == x) %>% pull(value_from)
        )
        colnames(y) <- x
        y
      })
      cbind(d, e)
    }
}
longer_to_wider(dat1, "name", "value")
# name 1 2 3 4
# 1 firstName 0.3407997 -0.7033403 -0.3795377 -0.7460474
# 2 secondName -0.8981073 -0.3347941 -0.5013782 -0.1745357

Convert API JSON format to data frame

I'm trying to create a new subset data frame from this site using R.
#load libraries
library(dplyr)
library(jsonlite)
library(tidyr)
#source file
url = "http://api.us.socrata.com/api/catalog/v1 q=nasa&domains=data.nasa.gov&offset=0&limit=500"
metadata <- fromJSON(url)
#Create a new data frame
nasa_api <- data.frame(id = metadata$results$resource$id,
title = metadata$results$resource$name,
description = metadata$results$resource$description,
download_count = metadata$results$resource$download_count,
domain_category = metadata$results$classification$domain_category,
link = metadata$results$link,
permlink = metadata$results$permalink)
I notice that metadata object contains nested lists. I need to create a new dataset for classifications which is a data frame nested inside metadata. So ideally I want this new data frame to contain "id" so that I can join these 2 datasets later.
I think it will be an easy task but I am new to R. Please can you help?
I noticed there is a problem in your URL (v1 q=nasa should be v1?q=nasa). As such, I have illustrated how you might solve this problem with the tidyjson package. It can be a lot of typing, but it gives you a solid tidy data_frame afterwards. I recommend the development version from devtools::install_github('jeremystan/tidyjson'), which has some features not yet on CRAN.
In any case, since you did not articulate which nested arrays you are interested in, I just picked one (classification/domain_metadata).
## devtools::install_github('jeremystan/tidyjson')
library(dplyr)
library(tidyjson)
j <- as.tbl_json("http://api.us.socrata.com/api/catalog/v1?q=nasa&domains=data.nasa.gov&offset=0&limit=500")
base <- j %>% enter_object(results) %>% gather_array()
nasa_api <- base %>% spread_values(id = jstring(resource, id), title = jstring(resource,
name), description = jstring(resource, description), download_count = jstring(resource,
download_count), domain_category = jstring(classification, domain_category),
link = jstring(link), permlink = jstring(permlink))
print(nasa_api)
#> # A tbl_json: 500 x 9 tibble with a "JSON" attribute
#> `attr(., "JSON")` document.id array.index id
#> <chr> <int> <int> <chr>
#> 1 "{\"resource\":{\"d..." 1 1 gvk9-iz74
#> 2 "{\"resource\":{\"d..." 1 2 scmi-np9r
#> 3 "{\"resource\":{\"d..." 1 3 gquh-watm
#> 4 "{\"resource\":{\"d..." 1 4 dtgb-tk9p
#> 5 "{\"resource\":{\"d..." 1 5 j6wr-4xhn
#> 6 "{\"resource\":{\"d..." 1 6 357b-ra7j
#> 7 "{\"resource\":{\"d..." 1 7 e2ud-kf5m
#> 8 "{\"resource\":{\"d..." 1 8 uwnx-gns8
#> 9 "{\"resource\":{\"d..." 1 9 fzmj-dfnj
#> 10 "{\"resource\":{\"d..." 1 10 szzb-kefa
#> # ... with 490 more rows, and 6 more variables: title <chr>,
#> # description <chr>, download_count <chr>, domain_category <chr>,
#> # link <chr>, permlink <chr>
## explore the json_types of one of the objects
base %>% enter_object("classification") %>% .[1, ] %>% gather_object() %>% json_types()
#> # A tbl_json: 5 x 4 tibble with a "JSON" attribute
#> `attr(., "JSON")` document.id array.index name
#> <chr> <int> <int> <chr>
#> 1 [] 1 1 categories
#> 2 [] 1 1 tags
#> 3 "\"Management/Ope..." 1 1 domain_category
#> 4 [] 1 1 domain_tags
#> 5 "[{\"value\":\"\",\"k..." 1 1 domain_metadata
#> # ... with 1 more variables: type <fctr>
## example of an ancillary table
base %>% spread_values(id = jstring(resource, id)) %>% enter_object("classification") %>%
enter_object("domain_metadata") %>% gather_array("domain_metadata_id") %>%
spread_values(key = jstring(key), value = jstring(value)) %>% select(document.id,
array.index, id, key, value) %>% as_data_frame()
#> # A tibble: 6,343 x 5
#> document.id array.index id key
#> * <int> <int> <chr> <chr>
#> 1 1 1 gvk9-iz74 Common-Core_Contact-Email
#> 2 1 1 gvk9-iz74 Common-Core_License
#> 3 1 1 gvk9-iz74 Common-Core_System-of-Records
#> 4 1 1 gvk9-iz74 Common-Core_Program-Code
#> 5 1 1 gvk9-iz74 Common-Core_Described-By
#> 6 1 1 gvk9-iz74 Common-Core_Public-Access-Level
#> 7 1 1 gvk9-iz74 Common-Core_Temporal-Applicability
#> 8 1 1 gvk9-iz74 Common-Core_Is-Quality-Data
#> 9 1 1 gvk9-iz74 Common-Core_Language
#> 10 1 1 gvk9-iz74 Common-Core_References
#> # ... with 6,333 more rows, and 1 more variables: value <chr>

Reordering column names in R with the use of a two dimensional array

I could not find another question about my problem although a lot with similar title exist. So, I have a matrix 1000x200 with chemical compounds as column names. For some reason I want to reorder my compounds based on a two dimensional array that matches compound names with Compound IDs. For example:
Compound CID
Ramipril 5362129
Eliprodil 60703
artesunate 5464098
benzonatate 7699
But in my data set is:
benzonatate Ramipril Eliprodil Artesunate
1 0.453 0.332 0.897 0.123
The desired output should be a matrix with columns ordered as in the two dimensional vector:
Ramipril Eliprodil Artesunate benzonatate
1 0.332 0.897 0.123 0.453
Is there a way that I can match the names with CIDs and, at the same time, reorder the matrix columns?
We could use match to get the index for reordering the second dataset ('df2') based on the 'Compound' column of the first dataset ('df1'). Even though the OP mentioned a matrix as the first dataset, it might be better to use a data.frame for columns that have mixed classes.
df2[,match(toupper(df1[, 'Compound']), toupper(colnames(df2)), nomatch=0)]
# Ramipril Eliprodil Artesunate benzonatate
#1 0.332 0.897 0.123 0.453
Using another example, where I created 'm2' as a matrix with column names as chemical compounds,
m2N <- m2[,match(toupper(df1N[,'Compound']), toupper(colnames(m2)), nomatch=0)]
m2N
# Ramipril Eliprodil artesunate benzonatate
#[1,] 6 1 3 3
#[2,] 4 6 2 2
#[3,] 7 7 4 7
#[4,] 7 1 1 5
#[5,] 3 2 10 7
#[6,] 9 7 2 10
#[7,] 2 0 8 3
#[8,] 0 6 6 8
#[9,] 5 6 7 8
#[10,] 1 0 10 2
data
df1 <- structure(list(Compound = c("Ramipril", "Eliprodil", "artesunate",
"benzonatate"), CID = c(5362129L, 60703L, 5464098L, 7699L)),
.Names = c("Compound",
"CID"), class = "data.frame", row.names = c(NA, -4L))
df2 <- structure(list(benzonatate = 0.453, Ramipril = 0.332,
Eliprodil = 0.897,
Artesunate = 0.123), .Names = c("benzonatate", "Ramipril",
"Eliprodil", "Artesunate"), class = "data.frame", row.names = "1")
newdata
df1N <- structure(list(Compound = c("Ramipril", "Eliprodil", "Stargazer",
"artesunate", "benzonatate", "Ronipril"), CID = c(5362129L, 60703L,
7859L, 5464098L, 7699L, 7892L)), .Names = c("Compound", "CID"
), class = "data.frame", row.names = c(NA, -6L))
set.seed(24)
m2 <- matrix(sample(0:10, 4*10, replace=TRUE), ncol=4,
dimnames=list(NULL, c('benzonatate', 'Ramipril', 'Eliprodil', 'artesunate')))

storing value against variable name "QW1I5K20" in an array element Q[1,5,20] using R

I have an excel file (.csv) with a sorted column of variable names such as "QW1I1K5" and numerical values against them.
this list goes on for
W from 1 to 15
I from 1 to 4
K from 1 to 30
total elements = 15*4*30 = 1800
I want to store the numerical values against these variables in an array whose indices are derived from the variable name .
for example QW1I1K5 has a value 11 . this must be stored in an array element Q[1,1,5] = 11 ( index set of [1,1,5] corresponds to W1 , I1 , K5)
Maybe this helps
Q <- array(dat$Col2, dim=c(15,4,30))
dat$Col2[dat$Col1=='QW1I1K5']
#[1] 34
Q[1,1,5]
#[1] 34
dat$Col2[dat$Col1=='QW4I3K8']
#[1] 38
Q[4,3,8]
#[1] 38
If you want the index along with the values
library(reshape2)
d1 <- melt(Q)
head(d1,3)
# Var1 Var2 Var3 value
#1 1 1 1 12
#2 2 1 1 9
#3 3 1 1 29
Q[1,1,1]
#[1] 12
Q[3,1,1]
#[1] 29
Update
Suppose, your data is in the order as you described in the comments, which will be dat1
# Replace every run of non-digit characters in the names with a space and let
# read.table() parse what is left into numeric columns (one per index)
indx <- read.table(text=gsub('[^0-9]+', ' ', dat1$Col1), header=FALSE)
# Sort the rows by the third parsed index, then the second, then the first, so
# the first index varies fastest -- the order in which array() fills its cells
dat2 <- dat1[do.call(order, indx[,3:1]),]
# Fill a 15 x 4 x 30 array from the reordered values
Q1 <- array(dat2$Col2,dim=c(15,4,30))
Q1[1,1,2]
#[1] 20
dat2$Col2[dat2$Col1=='QW1I1K2']
#[1] 20
data
Col1 <- do.call(paste,c(expand.grid('QW', 1:15, 'I', 1:4, 'K',1:30),
list(sep='')))
set.seed(24)
dat <- data.frame(Col1, Col2=sample(1:40, 1800,replace=TRUE))
dat1 <- dat[order(as.numeric(gsub('[^0-9]+', '', dat$Col1))),]
row.names(dat1) <- NULL
I would suggest looking at using "data.table" and setting your key to the split columns. You can use cSplit from my "splitstackshape" package to easily split the column.
Sample Data:
df <- data.frame(
V1 = c("QW1I1K1", "QW1I1K2", "QW1I1K3",
"QW1I1K4", "QW2I1K5", "QW2I3K2"),
V2 = c(15, 20, 5, 6, 7, 9))
df
# V1 V2
# 1 QW1I1K1 15
# 2 QW1I1K2 20
# 3 QW1I1K3 5
# 4 QW1I1K4 6
# 5 QW2I1K5 7
# 6 QW2I3K2 9
Splitting the column:
library(splitstackshape)
out <- cSplit(df, "V1", "[A-Z]+", fixed = FALSE)
setnames(out, c("V2", "W", "I", "K"))
setcolorder(out, c("W", "I", "K", "V2"))
setkey(out, W, I, K)
out
# W I K V2
# 1: 1 1 1 15
# 2: 1 1 2 20
# 3: 1 1 3 5
# 4: 1 1 4 6
# 5: 2 1 5 7
# 6: 2 3 2 9
Extracting rows:
out[J(1, 1, 4)]
# W I K V2
# 1: 1 1 4 6
out[J(2, 3, 2)]
# W I K V2
# 1: 2 3 2 9

Resources