I am trying to create a series of tables by group, and I would like each table to be generated iteratively for a different variable. I would also like to add the p-value from an ANOVA to the bottom of each table. I can create a single table easily using the aov function and the kable function. I thought a for loop might work, as follows:
#list of column names
varlist <- c("var1", "var2", "var3", "var4", "var5", "var6")
for (var in 1:6){
#anova
aov(varlist[[var]] ~ group, data=dc3)
#pull out pvalue for anova as string
pval <-paste("ANOVA P-Value:", round(summary(fit)[[1]][["Pr(>F)"]][[1]], 3))
# Create Table
Table <- dc3 %>% group_by(group) %>%
summarise(Mean = round(mean(varlist[[var]], na.rm = TRUE),2), SD = round(sd(varlist[[var]], na.rm = TRUE),2))
# Add Pvalue to bottom on table
kable(worst_arr_delays, "html") %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
add_footnote(c(pval2), notation = "symbol")
}
Ideally, this would give me six tables, one per variable, each showing the mean and SD by group with the ANOVA p-value as a footnote.
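For reference, the single-table version the question describes as already working might look roughly like this (a sketch only; it assumes the dc3 data frame with a group column and a var1 column as named in the question):
library(dplyr)
library(knitr)
library(kableExtra)
fit <- aov(var1 ~ group, data = dc3)
pval <- paste("ANOVA P-Value:", round(summary(fit)[[1]][["Pr(>F)"]][[1]], 3))
tab <- dc3 %>%
  group_by(group) %>%
  summarise(Mean = round(mean(var1, na.rm = TRUE), 2),
            SD = round(sd(var1, na.rm = TRUE), 2))
kable(tab, "html") %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  add_footnote(pval, notation = "symbol")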
If I understand what you want to do, I think this code will serve you.
library(dplyr)
library(knitr)
library(kableExtra)
dc3 = data.frame(var1=rnorm(40,25,5),var2=rnorm(40,25,5),var3=rnorm(40,25,5),
var4=rnorm(40,25,5),var5=rnorm(40,25,5),var6=rnorm(40,25,5),
group=rep(c("gr.2","gr.3","gr.4","veh"),each=10))
res = NULL
for(i in 1:6){
fit <- aov(dc3[,i]~group,dc3)
pval <-paste("ANOVA P-Value:", round(summary(fit)[[1]][["Pr(>F)"]][[1]], 3))
# Create Table (use the column name via the .data pronoun so the mean/SD are computed per group; dc3[,i] inside summarise would use the whole column and ignore the grouping)
Table <- dc3 %>% group_by(group) %>%
summarise(Mean = round(mean(.data[[names(dc3)[i]]], na.rm = TRUE), 2), SD = round(sd(.data[[names(dc3)[i]]], na.rm = TRUE), 2))
# Add p-value to the bottom of the table
res <- kable(Table,"html") %>%
kable_styling(bootstrap_options = c("striped", "hover")) %>%
add_footnote(c(pval), notation = "symbol")
print(res)
}
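If you would rather drive the loop off the variable names from the question than off column positions, a minimal sketch (assuming the varlist vector and the dc3 data frame defined above) could be:
tables <- lapply(varlist, function(v) {
  # build the formula from the variable name, e.g. var1 ~ group
  fit <- aov(reformulate("group", response = v), data = dc3)
  pval <- paste("ANOVA P-Value:", round(summary(fit)[[1]][["Pr(>F)"]][[1]], 3))
  dc3 %>%
    group_by(group) %>%
    summarise(Mean = round(mean(.data[[v]], na.rm = TRUE), 2),
              SD = round(sd(.data[[v]], na.rm = TRUE), 2)) %>%
    kable("html") %>%
    kable_styling(bootstrap_options = c("striped", "hover")) %>%
    add_footnote(pval, notation = "symbol")
})
# in an R Markdown chunk with results = "asis", print each table
for (t in tables) print(t)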
I have integrated an R model into a stored procedure using the R Tools for Visual Studio guidance. The syntax is as follows:
ALTER PROCEDURE [dbo].[spRegressionPeak]
AS
BEGIN
EXEC sp_execute_external_script @language = N'R'
, @script = N'
# @InputDataSet: input data frame, result of SQL query execution
# @OutputDataSet: data frame to pass back to SQL
# Test code
#library(RODBC)
# channel <- odbcDriverConnect(dbConnection)
# InputDataSet <- sqlQuery(channel,iconv(paste(readLines(''~/visual studio 2017/prod360/regressionpeak.query.sql'', encoding = ''UTF-8'', warn = FALSE), collapse=''\n''), from = ''UTF-8'', to = ''ASCII'', sub = '''') )
# odbcClose(channel)
#'' Regression Peaks
#''
#'' Runs polynomial regressions on a data table with one model for each
#'' user ID - independent variable pair. Note that independent variables
#'' are identified as all columns matching the following pattern: the
#'' letter "c" followed by a one-or-more digit number. The dependent
#'' variable is identified by its name "dv". The user ID is identified by
#'' its name "id". Also note that the regressors are the means of the
#'' original observations, grouped by \code{code}.
#''
#'' @param x Table to run regressions on
#'' @param c_means Code means table
#'' @importFrom rlang .data
#''
#'' @return Summary table where each distinct \code{code} value is
#'' represented by one row with columns for the respective standard
#'' deviations of each independent variable.
#'' @export
regression_peak <- function(x, c_means) {
df <-
dplyr::select(x, .data$id, .data$code, .data$dv) %>%
dplyr::left_join(c_means, by = "code")
id <- unique(df$id)
iv <- names(df)[stringr::str_detect(names(df), "c\\d+")]
grid <- tidyr::crossing(id, iv)
peaks <- purrr::map2_df(grid$id, grid$iv, function(i_id, i_iv) {
x <-
dplyr::filter(df, .data$id == i_id) %>%
dplyr::select_at(dplyr::vars(.data$dv, i_iv)) %>%
dplyr::rename(iv = !!i_iv)
fit <- stats::lm(dv ~ iv + I(iv ^ 2), data = x)
coef_a <- stats::coef(fit)["iv"]
coef_b <- stats::coef(fit)["I(iv^2)"]
extr <-
tibble::tibble(
type = c(2, 1, 1),
iv = c(unname(-coef_a / (2 * coef_b)), min(x$iv), max(x$iv))
) %>%
dplyr::mutate(y = stats::predict(fit, newdata = ., type = "response"))
t_max <- extr$iv[extr$type == 2]
tibble::tibble(
id = i_id,
iv = i_iv,
max = dplyr::case_when(
min(x$iv) < t_max & t_max < max(x$iv) ~ extr$iv[which.max(extr$y)],
TRUE ~ extr$iv[which.max(extr$y[extr$type == 1])]
) # should be x value
)
})
tidyr::spread(peaks, .data$iv, .data$max) %>%
dplyr::select(.data$id, iv) %>%
dplyr::filter_at(dplyr::vars(dplyr::matches("c\\d+")),
dplyr::any_vars(!is.na(.)))
}
OutputDataSet <- InputDataSet
'
, @input_data_1 = N'-- Place SQL query retrieving data for the R stored procedure here
DECLARE @cols AS NVARCHAR(MAX),
@query AS NVARCHAR(MAX),
@StudyID int,
@sStudyID VARCHAR(50)
Select a.StudyId, a.RespID, p.ProductNumber, p.ProductSequence,
CONVERT(varchar(50),a.DateAdded,101) as StudyDate,
CONVERT(VARCHAR(15),CAST((a.DateAdded)AS TIME),100) as
StudyTime,DATENAME(dw,a.DateAdded) as [DayOfWeek],
p.A_Value as A,p.B_Value as B,p.C_Value as C,p.D_Value AS D,p.E_Value AS
E,p.F_Value AS F, q.QuestionNumber
from answers a
inner join Products p on a.ProductID = p.ProductID
inner join Questions q on a.QuestionID = q.QuestionID
where a.StudyID = @sStudyID'
--- Edit this line to handle the output data frame.
WITH RESULT SETS (([StudyID] int, [RespID] int, [ProductNumber] int,
[ProductSequence] int, [StudyDate] date, [StudyTime] time, [DayOfWeek]
VARCHAR(10),[QuestionNumber] int, [A] int, [B] int, [C] int, [D] int, [E]
int, [F] INT));
END;
When I execute the stored procedure in SQL Server Management Studio, the Return Value = 0 and no data is output. I'm not sure that the variables are being appropriately declared as I'm not prompted for them when I execute the stored procedure.
How do I modify the stored procedure to return the intended data? Can I call this from ASP.NET by providing the appropriate study ID?
Your code is very hard to read, but to me, it looks like you are not declaring the parameters you want to get out. Below is an example of how you can do it:
DECLARE @out_val float;
exec sp_execute_external_script
@language = N'R',
@script = N'
iris_dataset <- iris
setosa <- iris[iris$Species == "setosa",]
menSepWidth <- mean(setosa$Sepal.Width)
iris_dataset$Sepal.Length <- iris_dataset$Sepal.Length * multiplier
OutputDataSet <- data.frame(iris_dataset$Sepal.Length)
',
@params = N'@multiplier float, @menSepWidth float OUTPUT',
@multiplier = 5,
@menSepWidth = @out_val OUTPUT
WITH RESULT SETS ((SepalLength float));
SELECT @out_val AS MeanSepWidth
Have a look at this blog post where I talk about how to handle parameters etc. when you use sp_execute_external_script.
Hope this helps!
I'm trying to write Unicode strings from R to SQL, and then use that SQL table to power a Power BI dashboard. Unfortunately, the Unicode characters only seem to work when I load the table back into R, and not when I view the table in SSMS or Power BI.
require(odbc)
require(DBI)
require(dplyr)
con <- DBI::dbConnect(odbc::odbc(),
.connection_string = "DRIVER={ODBC Driver 13 for SQL Server};SERVER=R9-0KY02L01\\SQLEXPRESS;Database=Test;trusted_connection=yes;")
testData <- data_frame(Characters = "❤")
dbWriteTable(con,"TestUnicode",testData,overwrite=TRUE)
result <- dbReadTable(con, "TestUnicode")
result$Characters
Successfully yields:
> result$Characters
[1] "❤"
However, when I pull that table in SSMS:
SELECT * FROM TestUnicode
I get two different characters:
Characters
~~~~~~~~~~
â¤
Those characters are also what appear in Power BI. How do I correctly pull the heart character outside of R?
It turns out this is a bug somewhere in R/DBI/the ODBC driver. The issue is that R stores strings as UTF-8 encoded, while SQL Server stores them as UTF-16LE encoded. Also, when dbWriteTable creates a table, it by default creates a VARCHAR column for strings which can't even hold Unicode characters. Thus, you need to both:
Change the column in the R data frame from being a string column to a list column of UTF-16LE raw bytes.
When using dbWriteTable, specify the field type as being NVARCHAR(MAX)
This seems like something that should still be handled by either DBI or ODBC or something though.
require(odbc)
require(DBI)
# This function takes a string vector and turns it into a list of raw UTF-16LE bytes.
# These will be needed to load into SQL Server
convertToUTF16 <- function(s){
lapply(s, function(x) unlist(iconv(x,from="UTF-8",to="UTF-16LE",toRaw=TRUE)))
}
# create a connection to a sql table
connectionString <- "[YOUR CONNECTION STRING]"
con <- DBI::dbConnect(odbc::odbc(),
.connection_string = connectionString)
# our example data
testData <- data.frame(ID = c(1,2,3), Char = c("I", "❤","Apples"), stringsAsFactors=FALSE)
# we adjust the column with the UTF-8 strings to instead be a list column of UTF-16LE bytes
testData$Char <- convertToUTF16(testData$Char)
# write the table to the database, specifying the field type
dbWriteTable(con,
"UnicodeExample",
testData,
append=TRUE,
field.types = c(Char = "NVARCHAR(MAX)"))
dbDisconnect(con)
Inspired by the last answer and the GitHub issue r-dbi/DBI#215 (Storing unicode characters in SQL Server).
This follows the field.types = c(Char = "NVARCHAR(MAX)") approach, but builds a vector of field types and computes the maximum length, because NVARCHAR(MAX) produced the error "dbReadTable/dbGetQuery returns Invalid Descriptor Index":
vector_nvarchar<-c(Filter(Negate(is.null),
(
lapply(testData,function(x){
if (is.character(x) ) c(
names(x),
paste0("NVARCHAR(",
max(
# nvarchar(max) gave error dbReadTable/dbGetQuery returns Invalid Descriptor Index error on SQL server
# https://github.com/r-dbi/odbc/issues/112
# so we compute the max
nchar(
iconv( #nchar doesn't work for UTF-8 : help (nchar)
Filter(Negate(is.null),x)
,"UTF-8","ASCII",sub ="x"
)
)
,na.rm = TRUE)
,")"
)
)
})
)
))
con= DBI::dbConnect(odbc::odbc(),.connection_string=xxxxt, encoding = 'UTF-8')
DBI::dbWriteTable(con,"UnicodeExample",testData, overwrite= TRUE, append=FALSE, field.types= vector_nvarchar)
DBI::dbGetQuery(con,iconv('select * from UnicodeExample'))
Inspired by the last answer, I also tried to find an automated way of writing data frames to SQL Server. I cannot confirm the nvarchar(max) errors, so I ended up with these functions:
library(rlist) # provides list.cbind used below
convertToUTF16_df <- function(df){
output <- cbind(df[sapply(df, typeof) != "character"]
, list.cbind(apply(df[sapply(df, typeof) == "character"], 2, function(x){
return(lapply(x, function(y) unlist(iconv(y, from = "UTF-8", to = "UTF-16LE", toRaw = TRUE))))
}))
)[colnames(df)]
return(output)
}
field_types <- function(df){
output <- list()
output[colnames(df)[sapply(df, typeof) == "character"]] <- "nvarchar(max)"
return(output)
}
DBI::dbWriteTable(odbc_connect
, name = SQL("database.schema.table")
, value = convertToUTF16_df(df)
, overwrite = TRUE
, row.names = FALSE
, field.types = field_types(df)
)
I found the previous answer very useful but ran into problems with character vectors that had another encoding such as 'latin1' instead of UTF-8. This resulted in random NULLs in the database column due to special characters such as non-breaking spaces.
In order to avoid these encoding issues, I've made the following modifications to detect the character vector encoding or otherwise default back to UTF-8 before conversion to UTF-16LE:
library(rlist)
convertToUTF16_df <- function(df){
output <- cbind(df[sapply(df, typeof) != "character"]
, list.cbind(apply(df[sapply(df, typeof) == "character"], 2, function(x){
return(lapply(x, function(y) {
if (Encoding(y)=="unknown") {
unlist(iconv(enc2utf8(y), from = "UTF-8", to = "UTF-16LE", toRaw = TRUE))
} else {
unlist(iconv(y, from = Encoding(y), to = "UTF-16LE", toRaw = TRUE))
}
}))
}))
)[colnames(df)]
return(output)
}
field_types <- function(df){
output <- list()
output[colnames(df)[sapply(df, typeof) == "character"]] <- "nvarchar(max)"
return(output)
}
DBI::dbWriteTable(odbc_connect
, name = SQL("database.schema.table")
, value = convertToUTF16_df(df)
, overwrite = TRUE
, row.names = FALSE
, field.types = field_types(df)
)
Ideally, I'd still modify this to remove the rlist dependency, but it seems to work for now.
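If you do want to drop the rlist dependency, here is a hedged base-R sketch of the same idea: convert each character column in place to a list column of UTF-16LE raw vectors. It uses enc2utf8() so that strings declared as latin1 or UTF-8 are normalised to UTF-8 first (strings with unknown encoding are assumed to be in the native encoding):
convertToUTF16_df_base <- function(df) {
  char_cols <- names(df)[vapply(df, is.character, logical(1))]
  for (col in char_cols) {
    # normalise to UTF-8, then convert to raw UTF-16LE bytes for NVARCHAR columns
    df[[col]] <- lapply(df[[col]], function(y) {
      unlist(iconv(enc2utf8(y), from = "UTF-8", to = "UTF-16LE", toRaw = TRUE))
    })
  }
  df
}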
You could consider using the package RODBC instead of odbc/DBI. I have used RODBC with SQL Server and with Microsoft Access as permanent data storage systems, and I never had trouble with German umlauts (e.g. Ä, ä, ..., ß).
I wonder whether using iconv is an appealing alternative, as there seem to be some '\x00' issues (e.g. https://www.r-bloggers.com/2010/06/more-powerful-iconv-in-r/).
I am posting this answer as an extension to the top answer, because some people might find it useful.
If you need Unicode strings in SQL statements such as INSERT or UPDATE where you cannot use dbWriteTable(), you can construct your query with dbBind() like this:
x <- "äöü"
x <- iconv(x, from="UTF-8", to="UTF-16LE", toRaw = TRUE)
q <-
"
update foobar
set umlauts = ?
where id = 1
")
query <- DBI::dbSendStatement(con, q)
DBI::dbBind(query, list(x))
DBI::dbClearResult(query)
The two loops behave completely differently, and I'm not sure whether it is because of the googleAnalyticsR package, since very little differs between the two versions of the code.
This version DOESN'T work; the difference in the print statement output shows that the results are not coming out correctly:
library(googleAnalyticsR)
library(tidyverse)
#settings
start_dat <- as.character(Sys.Date()-31)
end_dat <- as.character(Sys.Date()-1)
#Authorize Google Analytics R- this will open a webpage
#You must be logged into your Google Analytics account on your web browser
ga_auth()
account_sum <- ga_account_list()
#Add the start and end date to the data frame, as well as some columns to use to populate the metrics
account_sum$start_dat <- start_dat
account_sum$end_dat <- end_dat
## choose the v3 segment
segment_for_call <- "gaid::-5"
## make the v3 segment object in the v4 segment object:
seg_ob <- segment_ga4("OrganicTraffic", segment_id = segment_for_call)
# cycle through the list of views, pull the data, and add it to the
#account_summary
for (i in 1:5){
view_id <- (Book1CSV[[1]][i])
views=view_id
ga_dat <- google_analytics_4(views,
date_range = c(start_dat, end_dat),
segments = seg_ob,
metrics = c("sessions", "pageviews"),
dimensions = c("year","segment"))
ga_dat <- summarise(ga_dat,
sessions = sum(sessions),
pageviews = sum(pageviews))
account_sum$sessions[i] <- ga_data$sessions
account_sum$pageviews[i] <- ga_data$pageviews
print(account_summary)
}
clean_sum <- select(account_sum,
ID = webPropertyId ,
Account = accountName,
Views = views,
Type = type,
Level = level,
'Start Date' = start_dat,
'End Date' = end_dat,
Sessions = sessions,
Pageviews = pageviews)
write.csv (ga_dat, "doesntwork.csv", row.names = TRUE)
THIS WORKS! The print statement prints the expected output:
library(googleAnalyticsR)
library(tidyverse)
#settings
start_date <- as.character(Sys.Date()-31)
end_date <- as.character(Sys.Date()-1)
metrics <- c("sessions", "pageviews")
dimensions <- "year"
#Authorize Google Analytics R- this will open a webpage
#You must be logged into your Google Analytics account on your web browser
ga_auth()
account_summary <- ga_account_list()
#Add the start and end date to the data frame, as well as some columns to use to populate the metrics
account_summary$start_date <- start_date
account_summary$end_date <- end_date
# cycle through the list of views, pull the data, and add it to the
#account_summary
for (i in 1:6){
view_id <- (Book1CSV[[1]][i])
ga_data <- google_analytics_4(viewId = view_id,
date_range = c(start_date,end_date),
metrics = metrics,
dimensions = dimensions)
# This query might return multiple rows (if it spans a year boundary), so
#collapse and clean up
ga_data <- summarise(ga_data,
sessions = sum(sessions),
pageviews = sum(pageviews))
#add the totals to the account summary
account_summary$sessions[i] <- ga_data$sessions
account_summary$pageviews[i] <- ga_data$pageviews
print(account_summary)
}
# Make a more compact set of data
clean_summary <- select(account_summary,
Account = accountName,
View = viewId,
Type = type,
Level = level,
'Start Date' = start_date,
'End Date' = end_date,
Sessions = sessions,
Pageviews = pageviews,
ID = webPropertyId)
write.csv (clean_summary, "worksfine.csv", row.names = FALSE)
I can't really understand what's going wrong here and would like some detailed help. I have uploaded a file called Book1CSV, as I couldn't get a tryCatch statement to work to catch the error. A Google Merchant Store beta account was causing the crash.
This is an anti-pattern in R, modifying an object in a loop. It's better to return a list of data.frames, then merge them at the end and create your summary.
So something like:
my_view_ids <- c(12345,233445,232434)
## fetch data, creating a list of data.frames (one per view ID)
all_data <- lapply(my_view_ids, function(id){
one_data <- tryCatch(google_analytics_4(viewId = id,
date_range = c(start_date,end_date),
metrics = metrics,
dimensions = dimensions),
error = function(ex){message(ex); NULL})
one_data$viewId <- id
one_data
})
## Reduce the list of data.frames to one data.frame
all_dataframe <- Reduce(rbind, all_data)
### make your summary etc.
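To round the idea off, a minimal sketch of the summary step (assuming the sessions and pageviews metrics from the question, and that ga_account_list() returns a character viewId column to join on):
library(dplyr)
totals <- all_dataframe %>%
  group_by(viewId) %>%
  summarise(sessions = sum(sessions),
            pageviews = sum(pageviews)) %>%
  mutate(viewId = as.character(viewId)) # align types with ga_account_list()'s viewId
# join the per-view totals back onto the account summary
account_summary <- ga_account_list() %>%
  left_join(totals, by = "viewId")
write.csv(account_summary, "summary.csv", row.names = FALSE)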
I am having a JDBC connection with Apache Spark and PostgreSQL and I want to insert some data into my database. When I use append mode I need to specify id for each DataFrame.Row. Is there any way for Spark to create primary keys?
Scala:
If all you need is unique numbers you can use zipWithUniqueId and recreate DataFrame. First some imports and dummy data:
import sqlContext.implicits._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, StructField, LongType}
val df = sc.parallelize(Seq(
("a", -1.0), ("b", -2.0), ("c", -3.0))).toDF("foo", "bar")
Extract schema for further usage:
val schema = df.schema
Add id field:
val rows = df.rdd.zipWithUniqueId.map{
case (r: Row, id: Long) => Row.fromSeq(id +: r.toSeq)}
Create DataFrame:
val dfWithPK = sqlContext.createDataFrame(
rows, StructType(StructField("id", LongType, false) +: schema.fields))
The same thing in Python:
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, LongType
row = Row("foo", "bar")
row_with_index = Row(*["id"] + df.columns)
df = sc.parallelize([row("a", -1.0), row("b", -2.0), row("c", -3.0)]).toDF()
def make_row(columns):
def _make_row(row, uid):
row_dict = row.asDict()
return row_with_index(*[uid] + [row_dict.get(c) for c in columns])
return _make_row
f = make_row(df.columns)
df_with_pk = (df.rdd
.zipWithUniqueId()
.map(lambda x: f(*x))
.toDF(StructType([StructField("id", LongType(), False)] + df.schema.fields)))
If you prefer consecutive numbers you can replace zipWithUniqueId with zipWithIndex, but it is a little bit more expensive.
Directly with DataFrame API:
(universal Scala, Python, Java, R with pretty much the same syntax)
Previously I had missed the monotonicallyIncreasingId function, which should work just fine as long as you don't require consecutive numbers:
import org.apache.spark.sql.functions.monotonicallyIncreasingId
df.withColumn("id", monotonicallyIncreasingId).show()
// +---+----+-----------+
// |foo| bar| id|
// +---+----+-----------+
// | a|-1.0|17179869184|
// | b|-2.0|42949672960|
// | c|-3.0|60129542144|
// +---+----+-----------+
While useful, monotonicallyIncreasingId is non-deterministic. Not only may the ids differ from execution to execution, but without additional tricks they cannot be used to identify rows when subsequent operations contain filters.
Note:
It is also possible to use rowNumber window function:
from pyspark.sql.window import Window
from pyspark.sql.functions import rowNumber
w = Window().orderBy()
df.withColumn("id", rowNumber().over(w)).show()
Unfortunately:
WARN Window: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
So unless you have a natural way to partition your data and ensure uniqueness, it is not particularly useful at this moment.
from pyspark.sql.functions import monotonically_increasing_id
df.withColumn("id", monotonically_increasing_id()).show()
Note that the second argument of df.withColumn is monotonically_increasing_id(), not monotonically_increasing_id.
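Since the DataFrame API is described above as being essentially the same across Scala, Python, Java, and R, here is a hedged SparkR sketch of the same approach (it assumes a SparkR version that exports monotonically_increasing_id(); sparklyr users could reach for sdf_with_unique_id() instead):
library(SparkR)
sdf <- createDataFrame(data.frame(foo = c("a", "b", "c"), bar = c(-1, -2, -3)))
# same caveats as above: the ids are unique but non-consecutive and non-deterministic
sdf <- withColumn(sdf, "id", monotonically_increasing_id())
head(sdf)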
I found the following solution to be relatively straightforward for the case where zipWithIndex() is the desired behavior, i.e. for those desiring consecutive integers.
In this case, we're using pyspark and relying on dictionary comprehension to map the original row object to a new dictionary which fits a new schema including the unique index.
# read the initial dataframe without index
dfNoIndex = sqlContext.read.parquet(dataframePath)
# Need to zip together with a unique integer
# First create a new schema with uuid field appended
newSchema = StructType([StructField("uuid", IntegerType(), False)]
+ dfNoIndex.schema.fields)
# zip with the index, map it to a dictionary which includes new field
df = dfNoIndex.rdd.zipWithIndex()\
.map(lambda (row, id): {k:v
for k, v
in row.asDict().items() + [("uuid", id)]})\
.toDF(newSchema)
For anyone else who doesn't require integer types, concatenating the values of several columns whose combinations are unique across the data can be a simple alternative. You have to handle nulls since concat/concat_ws won't do that for you. You can also hash the output if the concatenated values are long:
import pyspark.sql.functions as sf
unique_id_sub_cols = ["a", "b", "c"]
df = df.withColumn(
"UniqueId",
sf.md5(
sf.concat_ws(
"-",
*[
sf.when(sf.col(sub_col).isNull(), sf.lit("Missing")).otherwise(
sf.col(sub_col)
)
for sub_col in unique_id_sub_cols
]
)
),
)
With RODBC, there were functions like sqlUpdate(channel, dat, ...) that allowed you pass dat = data.frame(...) instead of having to construct your own SQL string.
However, with R's DBI, all I see are functions like dbSendQuery(conn, statement, ...), which only take a string statement and give no opportunity to specify a data.frame directly.
So how to UPDATE using a data.frame with DBI?
My answer is really late, but maybe it is still helpful...
There is no single function for this (that I know of) in the DBI/odbc package, but you can replicate the update behavior using a prepared update statement (which should work faster than RODBC's sqlUpdate, since it sends the parameter values as a batch to the SQL server):
library(DBI)
library(odbc)
con <- dbConnect(odbc::odbc(), driver="{SQL Server Native Client 11.0}", server="dbserver.domain.com\\default,1234", Trusted_Connection = "yes", database = "test") # assumes Microsoft SQL Server
dbWriteTable(con, "iris", iris, row.names = TRUE) # create and populate a table (adding the row names as a separate columns used as row ID)
update <- dbSendQuery(con, 'update iris set "Sepal.Length"=?, "Sepal.Width"=?, "Petal.Length"=?, "Petal.Width"=?, "Species"=? WHERE row_names=?')
# create a modified version of `iris`
iris2 <- iris
iris2$Sepal.Length <- 5
iris2$Petal.Width[2] <- 1
iris2$row_names <- rownames(iris) # use the row names as unique row ID
dbBind(update, iris2) # send the updated data
dbClearResult(update) # release the prepared statement
# now read the modified data - you will see the updates did work
data1 <- dbReadTable(con, "iris")
dbDisconnect(con)
This works only if you have a primary key, which I created in the above example by using the row names (a unique number, increased by one for each row)...
For more information about the odbc package I have used in the DBI dbConnect statement see: https://github.com/rstats-db/odbc
Building on R Yoda's answer, I made myself the helper function below. This allows using a dataframe to specify update conditions.
While I built this to run transaction updates (i.e. single rows), it can in theory update multiple rows passing a condition. However, that's not the same as updating multiple rows using an input dataframe. Maybe somebody else can build on this...
dbUpdateCustom = function(x, key_cols, con, schema_name, table_name) {
if (nrow(x) != 1) stop("Input dataframe must be exactly 1 row")
if (!all(key_cols %in% colnames(x))) stop("All columns specified in 'key_cols' must be present in 'x'")
# Build the update string --------------------------------------------------
df_key <- dplyr::select(x, one_of(key_cols))
df_upt <- dplyr::select(x, -one_of(key_cols))
set_str <- purrr::map_chr(colnames(df_upt), ~glue::glue_sql('{`.x`} = {x[[.x]]}', .con = con))
set_str <- paste(set_str, collapse = ", ")
where_str <- purrr::map_chr(colnames(df_key), ~glue::glue_sql("{`.x`} = {x[[.x]]}", .con = con))
where_str <- paste(where_str, collapse = " AND ")
update_str <- glue::glue('UPDATE {schema_name}.{table_name} SET {set_str} WHERE {where_str}')
# Execute ------------------------------------------------------------------
query_res <- DBI::dbSendQuery(con, update_str)
DBI::dbClearResult(query_res)
return (invisible(TRUE))
}
Where
x: 1-row dataframe that contains 1+ key columns, and 1+ update columns.
key_cols: character vector, of 1 or more column names that are the keys (i.e. used in the WHERE clause)
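A hypothetical usage sketch (the connection, schema, table, and column names below are invented for illustration):
# update the row whose id is 42, setting the name and amount columns
new_row <- data.frame(id = 42, name = "Jane", amount = 99.5)
dbUpdateCustom(new_row,
               key_cols = "id",
               con = con,
               schema_name = "dbo",
               table_name = "customers")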
Here is a little helper function I put together using REPLACE INTO to update a table using DBI, replacing old duplicate entries with the new values. It's basic and for my own needs, but should be easy to modify. All you need to pass to the function is the connection, table name, and dataframe. Note that the table must have a PRIMARY KEY column. I've also included a simple working example.
row_to_list <- function(Y) suppressWarnings(split(Y, f = row(Y)))
sql_val <- function(y){
if(!is.numeric(y)){
return(paste0("'",y,"'"))
}else{
if(is.na(y)){
return("NULL")
}else{
return(as.character(y))
}
}
}
to_sql_row <- function(x) paste0("(",paste(do.call("c", lapply(x, FUN = sql_val)), collapse = ", "),")")
bracket <- function(x) paste0("`",x,"`")
to_sql_string <- function(x) paste0("(",paste(sapply(x, FUN = bracket), collapse = ", "),")")
replace_into_table <- function(con, table_name, new_data){
#new_data <- data.table(new_data)
cols <- to_sql_string(names(new_data))
vals <- paste(lapply(row_to_list(new_data), FUN = to_sql_row), collapse = ", ")
query <- paste("REPLACE INTO", table_name, cols, "VALUES", vals)
rs <- dbExecute(con, query)
return(rs)
}
tb <- data.frame("id" = letters[1:20], "A" = 1:20, "B" = seq(.1,2,.1)) # sample data
dbWriteTable(con, "test_table", tb) # create table
dbExecute(con, "ALTER TABLE test_table ADD PRIMARY KEY (id)") # set primary key
new_data <- data.frame("id" = letters[19:23], "A" = 1:5, "B" = seq(101,105)) # new data
new_data[4,2] <- NA # add some NA values
new_data[5,3] <- NA
table_name <- "test_table"
replace_into_table(con, "test_table", new_data)
result <- dbReadTable(con, "test_table")