Add New Records Within By Groups Using Aggregation Functions

It is not uncommon to have an analysis need whereby one needs to derive an analysis value (AVAL) from multiple records. The ADaM basic dataset structure variable DTYPE is available to indicate when a new derived records has been added to a dataset.

Usage

derive_summary_records(
  dataset,
  by_vars,
  filter = NULL,
  analysis_var,
  summary_fun,
  set_values_to = NULL
)

Arguments

dataset

A data frame.

by_vars

Variables to consider for generation of groupwise summary records. Providing the names of variables in exprs() will create a groupwise summary and generate summary records for the specified groups.

filter

Filter condition as logical expression to apply during summary calculation. By default, filtering expressions are computed within by_vars as this will help when an aggregating, lagging, or ranking function is involved.

For example,

filter = (AVAL > mean(AVAL, na.rm = TRUE)) will filter all AVAL values greater than mean of AVAL with in by_vars.
filter = (dplyr::n() > 2) will filter n count of by_vars greater than 2.

analysis_var

Analysis variable.

summary_fun

Function that takes as an input the analysis_var and performs the calculation. This can include built-in functions as well as user defined functions, for example mean or function(x) mean(x, na.rm = TRUE).

set_values_to

Variables to be set

The specified variables are set to the specified values for the new observations.

A list of variable name-value pairs is expected.

LHS refers to a variable.
RHS refers to the values to set to the variable. This can be a string, a symbol, a numeric value or NA, e.g., exprs(PARAMCD = "TDOSE", PARCAT1 = "OVERALL"). More general expression are not allowed.

Value

A data frame with derived records appended to original dataset.

Details

When all records have same values within by_vars then this function will retain those common values in the newly derived records. Otherwise new value will be set to NA.

Examples

library(tibble)
library(dplyr, warn.conflicts = TRUE)

adeg <- tribble(
  ~USUBJID, ~EGSEQ, ~PARAM, ~AVISIT, ~EGDTC, ~AVAL, ~TRTA,
  "XYZ-1001", 1, "QTcF Int. (msec)", "Baseline", "2016-02-24T07:50", 385, "",
  "XYZ-1001", 2, "QTcF Int. (msec)", "Baseline", "2016-02-24T07:52", 399, "",
  "XYZ-1001", 3, "QTcF Int. (msec)", "Baseline", "2016-02-24T07:56", 396, "",
  "XYZ-1001", 4, "QTcF Int. (msec)", "Visit 2", "2016-03-08T09:45", 384, "Placebo",
  "XYZ-1001", 5, "QTcF Int. (msec)", "Visit 2", "2016-03-08T09:48", 393, "Placebo",
  "XYZ-1001", 6, "QTcF Int. (msec)", "Visit 2", "2016-03-08T09:51", 388, "Placebo",
  "XYZ-1001", 7, "QTcF Int. (msec)", "Visit 3", "2016-03-22T10:45", 385, "Placebo",
  "XYZ-1001", 8, "QTcF Int. (msec)", "Visit 3", "2016-03-22T10:48", 394, "Placebo",
  "XYZ-1001", 9, "QTcF Int. (msec)", "Visit 3", "2016-03-22T10:51", 402, "Placebo",
  "XYZ-1002", 1, "QTcF Int. (msec)", "Baseline", "2016-02-22T07:58", 399, "",
  "XYZ-1002", 2, "QTcF Int. (msec)", "Baseline", "2016-02-22T07:58", 410, "",
  "XYZ-1002", 3, "QTcF Int. (msec)", "Baseline", "2016-02-22T08:01", 392, "",
  "XYZ-1002", 4, "QTcF Int. (msec)", "Visit 2", "2016-03-06T09:50", 401, "Active 20mg",
  "XYZ-1002", 5, "QTcF Int. (msec)", "Visit 2", "2016-03-06T09:53", 407, "Active 20mg",
  "XYZ-1002", 6, "QTcF Int. (msec)", "Visit 2", "2016-03-06T09:56", 400, "Active 20mg",
  "XYZ-1002", 7, "QTcF Int. (msec)", "Visit 3", "2016-03-24T10:50", 412, "Active 20mg",
  "XYZ-1002", 8, "QTcF Int. (msec)", "Visit 3", "2016-03-24T10:53", 414, "Active 20mg",
  "XYZ-1002", 9, "QTcF Int. (msec)", "Visit 3", "2016-03-24T10:56", 402, "Active 20mg",
)

# Summarize the average of the triplicate ECG interval values (AVAL)
derive_summary_records(
  adeg,
  by_vars = exprs(USUBJID, PARAM, AVISIT),
  analysis_var = AVAL,
  summary_fun = function(x) mean(x, na.rm = TRUE),
  set_values_to = exprs(DTYPE = "AVERAGE")
)
#> # A tibble: 24 x 8
#>    USUBJID  EGSEQ PARAM            AVISIT   EGDTC            AVAL TRTA     DTYPE
#>    <chr>    <dbl> <chr>            <chr>    <chr>           <dbl> <chr>    <chr>
#>  1 XYZ-1001     1 QTcF Int. (msec) Baseline 2016-02-24T07:…   385 ""       NA   
#>  2 XYZ-1001     2 QTcF Int. (msec) Baseline 2016-02-24T07:…   399 ""       NA   
#>  3 XYZ-1001     3 QTcF Int. (msec) Baseline 2016-02-24T07:…   396 ""       NA   
#>  4 XYZ-1001     4 QTcF Int. (msec) Visit 2  2016-03-08T09:…   384 "Placeb… NA   
#>  5 XYZ-1001     5 QTcF Int. (msec) Visit 2  2016-03-08T09:…   393 "Placeb… NA   
#>  6 XYZ-1001     6 QTcF Int. (msec) Visit 2  2016-03-08T09:…   388 "Placeb… NA   
#>  7 XYZ-1001     7 QTcF Int. (msec) Visit 3  2016-03-22T10:…   385 "Placeb… NA   
#>  8 XYZ-1001     8 QTcF Int. (msec) Visit 3  2016-03-22T10:…   394 "Placeb… NA   
#>  9 XYZ-1001     9 QTcF Int. (msec) Visit 3  2016-03-22T10:…   402 "Placeb… NA   
#> 10 XYZ-1002     1 QTcF Int. (msec) Baseline 2016-02-22T07:…   399 ""       NA   
#> # … with 14 more rows

advs <- tribble(
  ~USUBJID, ~VSSEQ, ~PARAM, ~AVAL, ~VSSTRESU, ~VISIT, ~VSDTC,
  "XYZ-001-001", 1164, "Weight", 99, "kg", "Screening", "2018-03-19",
  "XYZ-001-001", 1165, "Weight", 101, "kg", "Run-In", "2018-03-26",
  "XYZ-001-001", 1166, "Weight", 100, "kg", "Baseline", "2018-04-16",
  "XYZ-001-001", 1167, "Weight", 94, "kg", "Week 24", "2018-09-30",
  "XYZ-001-001", 1168, "Weight", 92, "kg", "Week 48", "2019-03-17",
  "XYZ-001-001", 1169, "Weight", 95, "kg", "Week 52", "2019-04-14",
)

# Set new values to any variable. Here, `DTYPE = MAXIMUM` refers to `max()` records
# and `DTYPE = AVERAGE` refers to `mean()` records.
derive_summary_records(
  advs,
  by_vars = exprs(USUBJID, PARAM),
  analysis_var = AVAL,
  summary_fun = max,
  set_values_to = exprs(DTYPE = "MAXIMUM")
) %>%
  derive_summary_records(
    by_vars = exprs(USUBJID, PARAM),
    analysis_var = AVAL,
    summary_fun = mean,
    set_values_to = exprs(DTYPE = "AVERAGE")
  )
#> # A tibble: 8 x 8
#>   USUBJID     VSSEQ PARAM   AVAL VSSTRESU VISIT     VSDTC      DTYPE  
#>   <chr>       <dbl> <chr>  <dbl> <chr>    <chr>     <chr>      <chr>  
#> 1 XYZ-001-001  1164 Weight  99   kg       Screening 2018-03-19 NA     
#> 2 XYZ-001-001  1165 Weight 101   kg       Run-In    2018-03-26 NA     
#> 3 XYZ-001-001  1166 Weight 100   kg       Baseline  2018-04-16 NA     
#> 4 XYZ-001-001  1167 Weight  94   kg       Week 24   2018-09-30 NA     
#> 5 XYZ-001-001  1168 Weight  92   kg       Week 48   2019-03-17 NA     
#> 6 XYZ-001-001  1169 Weight  95   kg       Week 52   2019-04-14 NA     
#> 7 XYZ-001-001    NA Weight 101   NA       NA        NA         MAXIMUM
#> 8 XYZ-001-001    NA Weight  97.4 NA       NA        NA         AVERAGE

# Sample ADEG dataset with triplicate record for only AVISIT = 'Baseline'
adeg <- tribble(
  ~USUBJID, ~EGSEQ, ~PARAM, ~AVISIT, ~EGDTC, ~AVAL, ~TRTA,
  "XYZ-1001", 1, "QTcF Int. (msec)", "Baseline", "2016-02-24T07:50", 385, "",
  "XYZ-1001", 2, "QTcF Int. (msec)", "Baseline", "2016-02-24T07:52", 399, "",
  "XYZ-1001", 3, "QTcF Int. (msec)", "Baseline", "2016-02-24T07:56", 396, "",
  "XYZ-1001", 4, "QTcF Int. (msec)", "Visit 2", "2016-03-08T09:48", 393, "Placebo",
  "XYZ-1001", 5, "QTcF Int. (msec)", "Visit 2", "2016-03-08T09:51", 388, "Placebo",
  "XYZ-1001", 6, "QTcF Int. (msec)", "Visit 3", "2016-03-22T10:48", 394, "Placebo",
  "XYZ-1001", 7, "QTcF Int. (msec)", "Visit 3", "2016-03-22T10:51", 402, "Placebo",
  "XYZ-1002", 1, "QTcF Int. (msec)", "Baseline", "2016-02-22T07:58", 399, "",
  "XYZ-1002", 2, "QTcF Int. (msec)", "Baseline", "2016-02-22T07:58", 410, "",
  "XYZ-1002", 3, "QTcF Int. (msec)", "Baseline", "2016-02-22T08:01", 392, "",
  "XYZ-1002", 4, "QTcF Int. (msec)", "Visit 2", "2016-03-06T09:53", 407, "Active 20mg",
  "XYZ-1002", 5, "QTcF Int. (msec)", "Visit 2", "2016-03-06T09:56", 400, "Active 20mg",
  "XYZ-1002", 6, "QTcF Int. (msec)", "Visit 3", "2016-03-24T10:53", 414, "Active 20mg",
  "XYZ-1002", 7, "QTcF Int. (msec)", "Visit 3", "2016-03-24T10:56", 402, "Active 20mg",
)

# Compute the average of AVAL only if there are more than 2 records within the
# by group
derive_summary_records(
  adeg,
  by_vars = exprs(USUBJID, PARAM, AVISIT),
  filter = n() > 2,
  analysis_var = AVAL,
  summary_fun = function(x) mean(x, na.rm = TRUE),
  set_values_to = exprs(DTYPE = "AVERAGE")
)
#> # A tibble: 16 x 8
#>    USUBJID  EGSEQ PARAM          AVISIT   EGDTC          AVAL TRTA        DTYPE 
#>    <chr>    <dbl> <chr>          <chr>    <chr>         <dbl> <chr>       <chr> 
#>  1 XYZ-1001     1 QTcF Int. (ms… Baseline 2016-02-24T0…  385  ""          NA    
#>  2 XYZ-1001     2 QTcF Int. (ms… Baseline 2016-02-24T0…  399  ""          NA    
#>  3 XYZ-1001     3 QTcF Int. (ms… Baseline 2016-02-24T0…  396  ""          NA    
#>  4 XYZ-1001     4 QTcF Int. (ms… Visit 2  2016-03-08T0…  393  "Placebo"   NA    
#>  5 XYZ-1001     5 QTcF Int. (ms… Visit 2  2016-03-08T0…  388  "Placebo"   NA    
#>  6 XYZ-1001     6 QTcF Int. (ms… Visit 3  2016-03-22T1…  394  "Placebo"   NA    
#>  7 XYZ-1001     7 QTcF Int. (ms… Visit 3  2016-03-22T1…  402  "Placebo"   NA    
#>  8 XYZ-1002     1 QTcF Int. (ms… Baseline 2016-02-22T0…  399  ""          NA    
#>  9 XYZ-1002     2 QTcF Int. (ms… Baseline 2016-02-22T0…  410  ""          NA    
#> 10 XYZ-1002     3 QTcF Int. (ms… Baseline 2016-02-22T0…  392  ""          NA    
#> 11 XYZ-1002     4 QTcF Int. (ms… Visit 2  2016-03-06T0…  407  "Active 20… NA    
#> 12 XYZ-1002     5 QTcF Int. (ms… Visit 2  2016-03-06T0…  400  "Active 20… NA    
#> 13 XYZ-1002     6 QTcF Int. (ms… Visit 3  2016-03-24T1…  414  "Active 20… NA    
#> 14 XYZ-1002     7 QTcF Int. (ms… Visit 3  2016-03-24T1…  402  "Active 20… NA    
#> 15 XYZ-1001    NA QTcF Int. (ms… Baseline NA             393.  NA         AVERA…
#> 16 XYZ-1002    NA QTcF Int. (ms… Baseline NA             400.  NA         AVERA…