RSQLite RS-DBI driver: (error in statement: no such table: test) - database
I have just started using RSQLite for analysis of a very large survey data set using R and the survey package by Thomas Lumley. I am getting an error message that has been asked about before on Stack Overflow and the R help archive, but the solutions don't apply to my data (one solution was that the original poster was using the POSIX data type, but my data doesn't have that). I don't think it is a problem with the survey package; rather, I think I am doing something wrong with the database/table creation. One thing that may help: when I use the sample from my data that I posted below, I don't get an error with a SELECT query, but when I do the same thing with my full data set, I do get the same error. Here is a sample of my data and some reproducible code:
# Sample data frame `test`: 5 rows by 28 columns, written as a structure()
# literal (looks like dput() output -- TODO confirm how it was generated).
# X_PSU, X_STSTR and X_FINALWT are the columns later passed to svydesign() as
# id, strata and weights. Many columns are factors whose labels are numeric
# codes stored as strings. CHILDREN, pov.limit, cutoff, elig and bcccp_elig
# are NA in every row of this sample.
test=structure(list(household = c(0, 0, 0, 0, 0), NUMADULT = c(2L,
1L, 2L, 1L, 1L), CHILDREN = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), SEX = c(1L, 2L, 1L, 2L, 2L), X_STATE = c(36L, 5L,
53L, 41L, 10L), X_FINALWT = c(665.97647582, 53.293518032, 72.60538811,
61.223634396, 5.5921160216), AGE = c(30L, 65L, 9L, 49L, 48L),
X_INCOMG = structure(c(6L, 6L, 6L, 6L, 6L), .Label = c("1",
"2", "3", "4", "5", "9"), class = "factor"), X_MAM502Y = structure(c(NA,
1L, NA, NA, NA), .Label = c("1", "2", "9"), class = "factor"),
HLTHPLAN = structure(c(2L, 1L, 1L, 1L, 1L), .Label = c("1",
"2"), class = "factor"), MEDCOST = structure(c(1L, 2L, 2L,
2L, 2L), .Label = c("1", "2"), class = "factor"), QLACTLM2 = c(2L,
2L, 2L, 2L, 2L), CTYCODE = structure(c(30L, 53L, 33L, 26L,
1L), .Label = c("1", "3", "5", "6", "7", "9", "10", "11",
"13", "14", "15", "17", "19", "20", "21", "23", "25", "27",
"28", "29", "30", "31", "33", "35", "37", "39", "41", "43",
"45", "47", "49", "51", "53", "55", "57", "59", "61", "63",
"65", "67", "69", "71", "73", "75", "77", "79", "81", "83",
"85", "86", "87", "89", "91", "93", "95", "97", "99", "101",
"103", "105", "107", "109", "111", "113", "115", "117", "119",
"121", "123", "125", "127", "129", "131", "133", "135", "137",
"139", "141", "143", "145", "147", "149", "151", "153", "155",
"157", "159", "161", "163", "165", "167", "169", "171", "173",
"175", "177", "179", "181", "183", "185", "187", "189", "191",
"193", "195", "197", "199", "201", "205", "209", "215", "227",
"235", "245", "297", "303", "309", "339", "355", "439", "453",
"491", "510", "550", "590", "650", "700", "710", "740", "760",
"770", "777", "800", "810", "999", "203", "207", "217", "221",
"223", "275", "277", "295", "313", "381", "423", "680", "12",
"54", "186", "211", "213", "219", "225", "229", "231", "233",
"237", "239", "241", "247", "249", "251", "253", "255", "257",
"259", "261", "265", "267", "271", "273", "279", "281", "285",
"287", "289", "291", "293", "299", "305", "311", "321", "323",
"325", "329", "331", "337", "341", "343", "347", "349", "351",
"353", "361", "363", "365", "367", "371", "373", "375", "387",
"395", "397", "401", "407", "409", "415", "419", "427", "441",
"449", "451", "455", "457", "459", "463", "465", "467", "469",
"471", "473", "477", "479", "481", "485", "487", "489", "493",
"497", "499", "503", "520", "540", "570", "600", "630", "660",
"670", "683", "690", "730", "750", "775", "820", "830", "840",
"790"), class = "factor"), X_RACEGR2 = structure(c(1L, 1L,
NA, 1L, NA), .Label = c("1", "2", "3", "4", "5"), class = "factor"),
PERSDOC2 = structure(c(3L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3"), class = "factor"), POORHLTH = c(0, NA, NA, 0,
0), X_EDUCAG = structure(c(3L, 2L, 4L, 4L, 4L), .Label = c("1",
"2", "3", "4"), class = "factor"), X_PSU = c(2004006698L,
2004014294L, 2004100796L, 2004024220L, 2004005537L), X_STSTR = c(36011L,
5012L, 53271L, 41012L, 10011L), X_RFMAM2Y = structure(c(NA,
1L, NA, 1L, 1L), .Label = c("1", "2", "9"), class = "factor"),
X_RFSMOK3 = structure(c(2L, 1L, 1L, 2L, 1L), .Label = c("1",
"2"), class = "factor"), X_RFHLTH = structure(c(1L, 1L, 1L,
1L, 1L), .Label = c("1", "2", "3"), class = "factor"), YEAR = c(2004,
2004, 2004, 2004, 2004), bcccp = structure(c(2L, 2L, 2L,
2L, 1L), .Label = c("0", "1"), class = "factor"), pov.limit = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), cutoff = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), elig = c(NA, NA,
NA, NA, NA), bcccp_elig = c(NA, NA, NA, NA, NA)), .Names = c("household",
"NUMADULT", "CHILDREN", "SEX", "X_STATE", "X_FINALWT", "AGE",
"X_INCOMG", "X_MAM502Y", "HLTHPLAN", "MEDCOST", "QLACTLM2", "CTYCODE",
"X_RACEGR2", "PERSDOC2", "POORHLTH", "X_EDUCAG", "X_PSU", "X_STSTR",
"X_RFMAM2Y", "X_RFSMOK3", "X_RFHLTH", "YEAR", "bcccp", "pov.limit",
"cutoff", "elig", "bcccp_elig"), row.names = c(NA, 5L), class = "data.frame")
library(survey)
library(sqldf)
library(RSQLite)
# Question script: connect to the SQLite file 'brfsagg.db' (relative path),
# write the sample data frame into it, then try to build a database-backed
# survey design from it.
drv=dbDriver('SQLite')
con=dbConnect(drv,'brfsagg.db')
# The data frame `test` is stored under the table name 'brfs0210', not 'test'.
dbWriteTable(con,'brfs0210',test)
dbListFields(con,'brfs0210') #This function works
sqldf("select SEX from brfs0210") #This works with my sample data but I get the same error message when I use the full data set.
# NOTE(review): this checks for a table named 'test', but the only table
# written through `con` above is 'brfs0210' -- confirm what this call returns.
dbExistsTable(con,'test') #This proves that the table exists
# NOTE(review): svydesign() is asked to read a table named 'test', and dbname
# is built with system.file(), which looks for 'brfsagg.db' inside the
# installed 'survey' package rather than using the file opened via `con`.
# Both differ from what was written above and look like the cause of the
# "no such table: test" error -- verify.
brfsvy=svydesign(id=~X_PSU, strata=~X_STSTR, weights=~X_FINALWT,nest=TRUE,
data='test',dbtype='SQLite',dbname=system.file('brfsagg.db',package='survey')) #This always generates the error message, regardless of whether I am using the test sample data or my full data set.
The R code that you are trying to write has already been written here, with an accompanying blog post here. Why would you bother reinventing the wheel? Googling "r brfss" or "import brfss into r" gets you to those posts.
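Is there a reason you want to rewrite everything from scratch yourself? There is lots of example syntax using SQLite with the survey package here. Here's how to fix this particular issue: write the table under the same name that you pass as data, and point dbname at the database file you actually created instead of at system.file(), which looks inside the installed survey package. :)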
library(survey)
library(RSQLite)
# Keep the database path in a single variable so that the table is written
# to, and the survey design reads from, the same SQLite file.
db_path <- "brfsagg.db"
con <- dbConnect(SQLite(), db_path)

# Store the data frame under the table name handed to svydesign() below.
dbWriteTable(con, "test", test)

# Database-backed design: `data` is the table name, `dbname` the file path.
brfsvy <- svydesign(
  id = ~X_PSU,
  strata = ~X_STSTR,
  weights = ~X_FINALWT,
  nest = TRUE,
  data = "test",
  dbtype = "SQLite",
  dbname = db_path
)

# The same estimate is requested before and after switching the
# survey.lonely.psu option to "adjust".
svymean(~SEX, brfsvy)
options(survey.lonely.psu = "adjust")
svymean(~SEX, brfsvy)

# SEX wrapped in factor() so it is summarised as a categorical variable.
svymean(~factor(SEX), brfsvy)
Related
Ruby set hash inside the Hash for the Array of Hashes
I am working on Rails 6 API. This is what I get "data": [ { "invoice_details": { "customer_name": "Dylan Sollfrank", "invoice_number": "1060", "invoice_status": "paid" } }, { "transaction_number": "QB1589148496", "customer_name": "Freeman Sporting Goods:55 Twin Lane", "amount": { "amount_to_pay": 86.4, "payment_fee": 0.0 }, "created_time": "03:38 AM", "created_date": "May 11, 2020", "payment_method": "qb_payment", "payment_status": "completed" }, Following is my code def get_payment_report_activity(invoice_transactions, timezone = Time.zone.name) invoice_details = [] transaction_details = {} amount = {} invoice_transactions.group_by(&:paymentable_id).each do |key, transactions| invoice = Invoice.find key invoice_details.push(invoice_details:{ customer_name: invoice&.customer&.fully_qualified_name&.strip, invoice_number: invoice&.doc_number, invoice_status: invoice&.invoice_status }) transactions.each do |transaction| customer = transaction&.paymentable&.customer amount[:amount_to_pay] = transaction&.amount_to_pay.to_f amount[:payment_fee] = transaction&.payment_fee.to_f transaction_details[:transaction_number] = transaction&.transaction_number transaction_details[:customer_name] = customer&.fully_qualified_name&.strip transaction_details[:amount] = amount transaction_details[:created_time] = Customer.time_format(transaction.created_at.in_time_zone(timezone)) transaction_details[:created_date] = Customer.date_format(transaction.created_at.in_time_zone(timezone)) transaction_details[:payment_method] = transaction&.payment_method transaction_details[:payment_status] = transaction&.payment_status end invoice_details << transaction_details end invoice_details end Now I need the hash transaction details inside the invoice_details hash label as transaction_details and there can be multiple transaction details inside the invoice_details "data": [ { "invoice_details": { "customer_name": "Dylan Sollfrank", "invoice_number": "1060", "invoice_status": "paid", 
"transaction_details: [{ "transaction_number": "QB1589148496", "customer_name": "Freeman Sporting Goods:55 Twin Lane", "amount": { "amount_to_pay": 86.4, "payment_fee": 0.0 }, "created_time": "03:38 AM", "created_date": "May 11, 2020", "payment_method": "qb_payment", "payment_status": "completed" }, { "transaction_number": "QB1589148496", "customer_name": "Freeman Sporting Goods:55 Twin Lane", "amount": { "amount_to_pay": 86.4, "payment_fee": 0.0 }, "created_time": "03:38 AM", "created_date": "May 11, 2020", "payment_method": "qb_payment", "payment_status": "completed" }] }, "invoice_details": { "customer_name": "Dylan Sollfrank", "invoice_number": "1060", "invoice_status": "paid", "transaction_details : { "transaction_number": "QB1589148496", "customer_name": "Freeman Sporting Goods:55 Twin Lane", "amount": { "amount_to_pay": 86.4, "payment_fee": 0.0 }, "created_time": "03:38 AM", "created_date": "May 11, 2020", "payment_method": "qb_payment", "payment_status": "completed" } }, }
you can try like this: def get_payment_report_activity(invoice_transactions, timezone = Time.zone.name) invoice_details = [] invoice_transactions.group_by(&:paymentable_id).each do |key, transactions| invoice = Invoice.find key transaction_details = [] transactions.each do |transaction| transaction_hash = {} amount_hash = {} customer = transaction&.paymentable&.customer amount_hash[:amount_to_pay] = transaction&.amount_to_pay.to_f amount_hash[:payment_fee] = transaction&.payment_fee.to_f transaction_hash[:transaction_number] = transaction&.transaction_number transaction_hash[:customer_name] = customer&.fully_qualified_name&.strip transaction_hash[:created_time] = Customer.time_format(transaction.created_at.in_time_zone(timezone)) transaction_hash[:created_date] = Customer.date_format(transaction.created_at.in_time_zone(timezone)) transaction_hash[:payment_method] = transaction&.payment_method transaction_hash[:payment_status] = transaction&.payment_status transaction_hash[:amount] = amount_hash transaction_details << transaction_hash end invoice_details.push(invoice_details: { customer_name: invoice&.customer&.fully_qualified_name&.strip, invoice_number: invoice&.doc_number, invoice_status: invoice&.invoice_status, transaction_details: transaction_details }) end invoice_details end
How to split a data.frame into an array by a factor?
If we want to split a data.frame by a "factor" f we usually do: split(df1, df1$f) But how do we do that when we want to split the data.frame into an array? I find my code a little awkward because of the two t, also the max could be a bit unreliable: A <- array(t(as.matrix(df1)), dim=c(ncol(df1), max(df1$id), max(df1$f))) apply(A, c(1, 3), t) Is there a less complicated base R solution? The following won't give me what I want, since str still yields lists. as.array(split(df1, df1$f)) as.array(lapply(split(df1, df1$f), as.matrix)) as.array(lapply(split(df1, df1$f), function(x) matrix(unlist(x), nrow(x)))) Data df1 <- structure(list(id = c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), f = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), x1 = c(14L, 15L, 4L, 10L, 8L, 6L, 7L, 2L, 5L, 9L, 3L, 12L, 13L, 1L, 11L), x2 = c(12L, 15L, 6L, 9L, 1L, 14L, 11L, 2L, 7L, 4L, 8L, 5L, 10L, 13L, 3L)), class = "data.frame", row.names = c(NA, -15L))
mylist = split(df1, df1$f) dims = dim(mylist[[1]]) array(sapply(mylist, function(x){ m = as.matrix(x) array(m, dim = dims) }), dim = c(dims, length(mylist)))
Record Linkage with multiple datasets
The problem fastLink and RecordLinkage packages do extremely well in matching records (rows) from database A to database B and vice-versa. The developers are working on extending from matching only 2 databases to multiple databases. A simple example of both I gave here. In the meantime, how would we go about matching multiple data frames? For example, I have multiple medical records of patients from clinic A, B, C, D, E, F, and I want to merge them into a single one. A reproducible example: dfA <- structure(list(fname = c("Jafar", "Nemo", "Simba", "Belle", "Nala", "Jasmine"), lname = c("Evil", "Water", "King", "Beauty", "Princess", "Princess"), gender = c("M", "M", "M", "F", "F", "F"), dob = c(1987, 2000, 2011, 1989, 1970, 1989), city = c("Arabtown", "Atlantic", "Sahara", "Nice", "Sahara", "Arabtown")), row.names = c(NA, -6L ), class = c("tbl_df", "tbl", "data.frame")) dfB <- structure(list(fname = c("Jafar Jr", "Nemo", "Simba", "Belle", "Nala", "Jasmine"), lname = c("Evil", "Waterson", "King", "Beauty", "Princess", "Princess of Arabtown"), gender = c("M", "M", "M", "F", "F", "F"), dob = c(NA, 2000, 2011, NA, NA, 1989), city = c("Arabtown", "Atlantica", "Sahara", "Nice-France", "Sahara", "Arabia")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) dfC <- structure(list(fname = c("Jafar Jr", "Fishy", "Lion", "Belle", "Sarabi", "Jasmine"), lname = c("Evil", "Waterpal", "King", "Beauty", "Queen", NA), gender = c("M", "M", NA, "F", "F", "F"), dob = c(NA, 2000, 2011, NA, 1940, 1989), city = c("Arabia", NA, "Sahara", "France", "Sahara", NA)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) dfD <- structure(list(fname = c("Jafar Jr", "Nemo", "Simba", "Belle", "Sarabi", "Jasmine"), lname = c("Evil", "Waterson", "King", "Beast", "Queen", "Evil"), gender = c("M", "M", "M", "F", "F", "M"), dob = c(NA, 2000, 2011, 1989, NA, 1989), city = c("Arabtown", "Atlantica", "Sahara", NA, "Sahara", "Arabtown")), row.names = c(NA, -6L), class = 
c("tbl_df", "tbl", "data.frame")) dfE <- structure(list(fname = c("Jafar Jr", "Nemo", "Simba", "Belle", "Nala", "Aladdin"), lname = c("Evil", "Pateron", NA, "Gaston", NA, "Streetrat"), gender = c("M", NA, "M", "F", "F", "M"), dob = c(1987, NA, NA, NA, 1970, 1989), city = c("Arabtown", "Atlantica", "Sahara", "France", "Sahara", "Arabia")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) dfF <- structure(list(fname = c("Jafar Jr", "Nemo", "Simba", "Belle", "Nala", "Al"), lname = c("Evil", "Waterson", "Dead", "Beauty", "Princess", "Streetrat"), gender = c("M", "M", NA, "F", "F", "M"), dob = c(1987, 2000, 2011, NA, NA, 1989), city = c("Arabia", "Atlantic", "Sahara", "Nice-France", "Sahara", "Arabia")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) Expected result : In the end I want unique identified records : 1 Jafar Evil M 1987 Arabtown 2 Nemo Water M 2000 Atlantic 3 Simba King M 2011 Sahara 4 Belle Beauty F 1989 Nice 5 Nala Princess F 1970 Sahara 6 Jasmine Princess F 1989 Arabtown 7 Sarabi Queen F 1940 Sahara 8 Aladdin Streetrat M 1989 Arabia Even if the result isn't as clean as above, it's alright. The goal is to find a unified record from all 6 records and belong to the same entity. Both fastLink & RecordLinkage take care of deduping (removing duplicates). How can I develop an approach to deal with more than two databases in this scenario?
Excel macro help: creating a new column based on arrays
I am trying to write a macro in Excel to create a new column with these states divided into these regions. I keep getting runtime error 13. Here is the code I have so far: Sub Region () Dim Pacific As Variant Pacific = Array("WA", "OR", "ID", "CA", "NV", "AZ", "NM", "HI", "AK") Dim Continental As Variant Continental = Array("AR", "IA", "CO", "KS", "LA", "MS", "MT", "ND", "NE", "OK", "SD", "UT", "WY") Dim SouthEast As Variant SouthEast = Array("GA", "AL", "FL", "SC", "KY", "TN") Dim Midwest As Variant Midwest = Array("MN", "WI", "IL", "IN", "MI", "OH") Dim NorthAtlantic As Variant NorthAtlantic = Array("ME", "NH", "MA", "RI", "CT", "VT", "NY", "PA", "NJ", "DE", "MD", "WV", "VA", "NC") Dim Texas As Variant Texas = Array("TX”) Dim state As String , result As String score = Range("F1").Value If state = Pacific Then result = "PACIFIC" ElseIf state = Continental Then result = "Continental" ElseIf state = SouthEast Then result = "SouthEast" ElseIf state = Midwest Then result = "Midwest" ElseIf state = NorthAtlantic Then result = "North Atlantic" ElseIf state = Texas Then result = "Texas" Else result = "fail" End If Range("Z1").Value = result End Sub
AFAIK, to search for the occurrence of a string within an array isn't a simple matter within VBA. You either have to use a loop, or possibly use WorksheetFunction.Match. A simpler way may be to avoid arrays altogether - your code could be easily refactored to use a Select Case statement: Sub Region () Dim state As String , result As String state = Range("F1").Value Select Case state Case "WA", "OR", "ID", "CA", "NV", "AZ", "NM", "HI", "AK" result = "PACIFIC" Case "AR", "IA", "CO", "KS", "LA", "MS", "MT", "ND", "NE", "OK", "SD", "UT", "WY" result = "Continental" Case "GA", "AL", "FL", "SC", "KY", "TN" result = "SouthEast" Case "MN", "WI", "IL", "IN", "MI", "OH" result = "Midwest" Case "ME", "NH", "MA", "RI", "CT", "VT", "NY", "PA", "NJ", "DE", "MD", "WV", "VA", "NC" result = "North Atlantic" Case "TX" result = "Texas" Case Else result = "fail" End Select Range("Z1").Value = result End Sub Note: You also had two code problems. You had score = Range("F1").Value when I think you meant state = Range("F1").Value You had "TX” instead of "TX" - I'm not sure whether the ” causes a problem in your version of Excel, but it does in mine. 
To extend this function so that it applies to all cells in column F, you will need to loop through each row: Sub Region () Dim state As String , result As String Dim lastRow As Long Dim r As Long With ActiveSheet lastRow = .Cells(.Rows.Count, "F").End(xlUp).Row For r = 1 to lastRow state = .Cells(r, "F").Value Select Case state Case "WA", "OR", "ID", "CA", "NV", "AZ", "NM", "HI", "AK" result = "PACIFIC" Case "AR", "IA", "CO", "KS", "LA", "MS", "MT", "ND", "NE", "OK", "SD", "UT", "WY" result = "Continental" Case "GA", "AL", "FL", "SC", "KY", "TN" result = "SouthEast" Case "MN", "WI", "IL", "IN", "MI", "OH" result = "Midwest" Case "ME", "NH", "MA", "RI", "CT", "VT", "NY", "PA", "NJ", "DE", "MD", "WV", "VA", "NC" result = "North Atlantic" Case "TX" result = "Texas" Case Else result = "fail" End Select .Cells(r, "Z").Value = result Next End With End Sub
Um, why don't you use Access? Create the tables as you did and then link them to the further logical tables you are going to create (I presume there is some practical use for the code you wrote). That is why Access was created in the first place...
Complex data transformation
I need to transform following (simplified) dataset, created by following code: structure(list(W1.1 = structure(c(1L, NA, NA), .Names = c("case1", "case2", "case3"), .Label = "1", class = "factor"), R1.1 = structure(c(1L, NA, NA), .Names = c("case1", "case2", "case3"), .Label = "2", class = "factor"), W1.2 = structure(c(NA, 1L, NA), .Names = c("case1", "case2", "case3"), .Label = "1", class = "factor"), R1.2 = structure(c(NA, 1L, NA), .Names = c("case1", "case2", "case3"), .Label = "1", class = "factor"), W2.1 = structure(c(NA, 1L, NA), .Names = c("case1", "case2", "case3"), .Label = "1", class = "factor"), R2.1 = structure(c(NA, 1L, NA), .Names = c("case1", "case2", "case3"), .Label = "1", class = "factor"), W2.2 = structure(c(1L, NA, NA), .Names = c("case1", "case2", "case3"), .Label = "2", class = "factor"), R2.2 = structure(c(1L, NA, NA), .Names = c("case1", "case2", "case3"), .Label = "1", class = "factor"), W3.1 = structure(c(1L, NA, NA), .Names = c("case1", "case2", "case3"), .Label = "1", class = "factor"), R3.1 = structure(c(1L, NA, NA), .Names = c("case1", "case2", "case3"), .Label = "1", class = "factor"), W3.2 = structure(c(1L, 1L, NA), .Names = c("case1", "case2", "case3"), .Label = "1", class = "factor"), R3.2 = structure(c(1L, 1L, NA), .Names = c("case1", "case2", "case3"), .Label = "1", class = "factor"), age = structure(c(3L, 1L, 2L), .Names = c("case1", "case2", "case3"), .Label = c("20", "48", "56"), class = "factor"), gender = structure(c(2L, 1L, 2L), .Names = c("case1", "case2", "case3"), .Label = c("female", "male"), class = "factor")), .Names = c("W1.1", "R1.1", "W1.2", "R1.2", "W2.1", "R2.1", "W2.2", "R2.2", "W3.1", "R3.1", "W3.2", "R3.2", "age", "gender"), row.names = c(NA, 3L ), class = "data.frame") For the new data I want: - a row dedicated to every x.x, with info on the Rx.x value, age and gender. - only have a row returned when Wx.x was 1. When 2 or NA, I don't need it. 
For my example this dataset should look something like this: incident type Where Reported age gender 1 1 1.1 1 2 56 male 2 2 3.1 1 1 56 male 3 3 3.2 1 1 56 male 4 4 1.2 1 1 20 female 5 5 2.1 1 1 20 female 6 6 3.2 1 1 20 female Note: the "Where" column can even be omitted since it should be a constant vector of 1, and I don't need it for the analysis.
This is (mostly) a problem to be tackled by reshape(). Assuming your original dataset is called "temp": First, reshape it from a wide format to a long format. temp.long <- reshape(temp, direction = "long", idvar=c("age", "gender"), varying = which(!names(temp) %in% c("age", "gender")), sep = "") temp.long # age gender time W R # 56.male.1.1 56 male 1.1 1 2 # 20.female.1.1 20 female 1.1 <NA> <NA> # 48.male.1.1 48 male 1.1 <NA> <NA> # 56.male.1.2 56 male 1.2 <NA> <NA> # 20.female.1.2 20 female 1.2 1 1 # 48.male.1.2 48 male 1.2 <NA> <NA> # 56.male.2.1 56 male 2.1 <NA> <NA> # 20.female.2.1 20 female 2.1 1 1 # 48.male.2.1 48 male 2.1 <NA> <NA> # 56.male.2.2 56 male 2.2 2 1 # 20.female.2.2 20 female 2.2 <NA> <NA> # 48.male.2.2 48 male 2.2 <NA> <NA> # 56.male.3.1 56 male 3.1 1 1 # 20.female.3.1 20 female 3.1 <NA> <NA> # 48.male.3.1 48 male 3.1 <NA> <NA> # 56.male.3.2 56 male 3.2 1 1 # 20.female.3.2 20 female 3.2 1 1 # 48.male.3.2 48 male 3.2 <NA> <NA> Second, do some cleanup. temp.long <- na.omit(temp.long) temp.long <- temp.long[-which(temp.long$W == 2), ] temp.long <- temp.long[order(rev(temp.long$gender), temp.long$time), ] rownames(temp.long) <- NULL temp.long$incident <- seq(nrow(temp.long)) temp.long # age gender time W R incident # 1 56 male 1.1 1 2 1 # 2 56 male 3.1 1 1 2 # 3 56 male 3.2 1 1 3 # 4 20 female 1.2 1 1 4 # 5 20 female 2.1 1 1 5 # 6 20 female 3.2 1 1 6 You can do further cleanup to change your column names and column order if it's important.