Compute association statistics between columns of a data frame

Evaluate a list of scalar functions on any number of "response" columns by any number of "predictor" columns.

univariate_associations(
    data,
    f,
    responses,
    predictors
)

Arguments

data: A data.frame.
f: A function or a list of functions (preferably named), each of which takes a vector as each of its first two arguments and returns a scalar.
responses: A vector of quoted/unquoted columns, positions, and/or tidyselect::select_helpers to be evaluated as the first argument. See the left argument in dish.
predictors: A vector of quoted/unquoted columns, positions, and/or tidyselect::select_helpers to be evaluated as the second argument. See the right argument in dish.

Value

A tibble::tibble with one row per response/predictor pair and the results of the functions in f across the columns. The names of the result columns will be the names provided in f.

Author

Alex Zajichek

Examples

#Make a list of functions to evaluate. Each function is called with the
#response vector as y (first argument) and the predictor vector as x (second
#argument) and must return a single value.
f <-
  list(
    
    #Compute a univariate p-value, formatted as text
    `P-value` =
      function(y, x) {
        
        #When some_type() reports x as a factor/character, use Fisher's exact
        #test with a simulated p-value; otherwise compare x across the levels
        #of y with a Kruskal-Wallis test
        if(some_type(x, c("factor", "character"))) {
          
          p <- fisher.test(factor(y), factor(x), simulate.p.value = TRUE)$p.value
          
        } else {
          
          p <- kruskal.test(x, factor(y))$p.value
          
        }
        
        #p is a scalar, so branch with if/else instead of the vectorised
        #ifelse(). A missing p-value (e.g. a constant predictor) is returned
        #as NA_character_ so the result is always a character value.
        if(is.na(p)) {
          
          NA_character_
          
        } else if(p < 0.001) {
          
          "<0.001"
          
        } else {
          
          as.character(round(p, 2))
          
        }
        
      },
    
    #Compute the difference in AIC between the null (intercept-only) model and
    #the one-predictor model; a positive value means the predictor lowers AIC
    `AIC Difference` =
      function(y, x) {
        
        glm(factor(y)~1, family = "binomial")$aic -
          glm(factor(y)~x, family = "binomial")$aic
        
      }
  )

#Choose a couple of binary outcomes as the responses; the predictors argument
#is not supplied in this call
heart_disease %>% 
  univariate_associations(
    f = f,
    responses = c(ExerciseInducedAngina, HeartDisease)
  )
#> # A tibble: 14 × 4
#>    response              predictor   `P-value` `AIC Difference`
#>    <chr>                 <chr>       <chr>                <dbl>
#>  1 ExerciseInducedAngina Age         0.14                 0.567
#>  2 ExerciseInducedAngina Sex         0.01                 4.72 
#>  3 ExerciseInducedAngina ChestPain   <0.001              64.1  
#>  4 ExerciseInducedAngina BP          0.38                -0.740
#>  5 ExerciseInducedAngina Cholesterol 0.14                -0.875
#>  6 ExerciseInducedAngina BloodSugar  0.66                -1.80 
#>  7 ExerciseInducedAngina MaximumHR   <0.001              42.5  
#>  8 HeartDisease          Age         <0.001              13.4  
#>  9 HeartDisease          Sex         <0.001              22.0  
#> 10 HeartDisease          ChestPain   <0.001              80.1  
#> 11 HeartDisease          BP          0.03                 4.95 
#> 12 HeartDisease          Cholesterol 0.04                 0.206
#> 13 HeartDisease          BloodSugar  0.66                -1.81 
#> 14 HeartDisease          MaximumHR   <0.001              55.1  

#Restrict the predictors to a subset of columns (here Age and BP)
heart_disease %>% 
  univariate_associations(
    f = f,
    responses = c(ExerciseInducedAngina, HeartDisease),
    predictors = c(Age, BP)
  )
#> # A tibble: 4 × 4
#>   response              predictor `P-value` `AIC Difference`
#>   <chr>                 <chr>     <chr>                <dbl>
#> 1 ExerciseInducedAngina Age       0.14                 0.567
#> 2 ExerciseInducedAngina BP        0.38                -0.740
#> 3 HeartDisease          Age       <0.001              13.4  
#> 4 HeartDisease          BP        0.03                 4.95 

#Numeric predictors only. A bare predicate function is passed to predictors
#here, which produces the tidyselect deprecation warning printed below; that
#warning recommends wrapping the predicate as where(is.numeric) instead.
heart_disease %>% 
  univariate_associations(
    f = f,
    responses = c(ExerciseInducedAngina, HeartDisease),
    predictors = is.numeric
  )
#> Warning: Use of bare predicate functions was deprecated in tidyselect 1.1.0.
#> ℹ Please use wrap predicates in `where()` instead.
#>   # Was:
#>   data %>% select(is.numeric)
#> 
#>   # Now:
#>   data %>% select(where(is.numeric))
#> # A tibble: 8 × 4
#>   response              predictor   `P-value` `AIC Difference`
#>   <chr>                 <chr>       <chr>                <dbl>
#> 1 ExerciseInducedAngina Age         0.14                 0.567
#> 2 ExerciseInducedAngina BP          0.38                -0.740
#> 3 ExerciseInducedAngina Cholesterol 0.14                -0.875
#> 4 ExerciseInducedAngina MaximumHR   <0.001              42.5  
#> 5 HeartDisease          Age         <0.001              13.4  
#> 6 HeartDisease          BP          0.03                 4.95 
#> 7 HeartDisease          Cholesterol 0.04                 0.206
#> 8 HeartDisease          MaximumHR   <0.001              55.1