Compute descriptive statistics on columns of a data frame

The user can specify any number of functions to evaluate, as well as the data types that each set of functions (including the default set; see "Details") is applied to.

descriptives(
    data,
    f_all = NULL,
    f_numeric = NULL,
    numeric_types = "numeric",
    f_categorical = NULL,
    categorical_types = "factor",
    f_other = NULL,
    useNA = c("ifany", "no", "always"),
    round = 2,
    na_string = "(missing)"
)

Arguments

data: A data.frame.
f_all: A list of functions to evaluate on all columns.
f_numeric: A list of functions to evaluate on numeric_types columns.
numeric_types: Character vector of data types that should be evaluated by f_numeric.
f_categorical: A list of functions to evaluate on categorical_types columns.
categorical_types: Character vector of data types that should be evaluated by f_categorical.
f_other: A list of functions to evaluate on remaining columns.
useNA: How missing values are handled when tabulating; see the useNA argument of base::table() for details. Defaults to "ifany".
round: Number of digits to round numeric results to. Defaults to 2.
na_string: String used to replace names that are NA. Defaults to "(missing)".

Details

The following fun_key values are available by default for the specified types:

ALL: length, missing, available, class, unique
Numeric: mean, sd, min, q1, median, q3, max, iqr, range
Categorical: count, proportion, percent

Value

A tibble::tibble with the following columns:

fun_eval: Group of column types the function was applied to (e.g. "all", "numeric", "categorical")
fun_key: Name of function that was evaluated
col_ind: Index of the column in the input dataset
col_lab: Label of the column
val_ind: Index of the value within the function result
val_lab: Label taken from the names of the function result (NA if the result is unnamed)
val_dbl: Numeric result
val_chr: Non-numeric result
val_cbn: Combination of (rounded) numeric and non-numeric values

Author

Alex Zajichek

Examples

#Default
heart_disease %>%
    descriptives()
#> # A tibble: 111 × 9
#>    fun_eval fun_key   col_ind col_lab    val_ind val_lab val_dbl val_chr val_cbn
#>    <chr>    <chr>       <int> <chr>        <int> <chr>     <dbl> <chr>   <chr>  
#>  1 all      available       1 Age              1 NA          303 NA      303    
#>  2 all      available       2 Sex              1 NA          303 NA      303    
#>  3 all      available       3 ChestPain        1 NA          303 NA      303    
#>  4 all      available       4 BP               1 NA          303 NA      303    
#>  5 all      available       5 Cholester…       1 NA          303 NA      303    
#>  6 all      available       6 BloodSugar       1 NA          303 NA      303    
#>  7 all      available       7 MaximumHR        1 NA          303 NA      303    
#>  8 all      available       8 ExerciseI…       1 NA          303 NA      303    
#>  9 all      available       9 HeartDise…       1 NA          303 NA      303    
#> 10 all      class           1 Age              1 NA           NA numeric numeric
#> # ℹ 101 more rows

#Allow logicals as categorical
heart_disease %>%
    descriptives(
        categorical_types = c("logical", "factor")
    ) %>%
    
    #Extract info from the column
    dplyr::filter(
        col_lab == "BloodSugar"
    ) 
#> # A tibble: 11 × 9
#>    fun_eval    fun_key   col_ind col_lab val_ind val_lab val_dbl val_chr val_cbn
#>    <chr>       <chr>       <int> <chr>     <int> <chr>     <dbl> <chr>   <chr>  
#>  1 all         available       6 BloodS…       1 NA      303     NA      303    
#>  2 all         class           6 BloodS…       1 NA       NA     logical logical
#>  3 all         length          6 BloodS…       1 NA      303     NA      303    
#>  4 all         missing         6 BloodS…       1 NA        0     NA      0      
#>  5 all         unique          6 BloodS…       1 NA        2     NA      2      
#>  6 categorical count           6 BloodS…       1 FALSE   258     NA      258    
#>  7 categorical count           6 BloodS…       2 TRUE     45     NA      45     
#>  8 categorical percent         6 BloodS…       1 FALSE    85.1   NA      85.15  
#>  9 categorical percent         6 BloodS…       2 TRUE     14.9   NA      14.85  
#> 10 categorical proporti…       6 BloodS…       1 FALSE     0.851 NA      0.85   
#> 11 categorical proporti…       6 BloodS…       2 TRUE      0.149 NA      0.15   

#Nothing treated as numeric
heart_disease %>%
    descriptives(
        numeric_types = NULL
    )
#> # A tibble: 75 × 9
#>    fun_eval fun_key   col_ind col_lab    val_ind val_lab val_dbl val_chr val_cbn
#>    <chr>    <chr>       <int> <chr>        <int> <chr>     <dbl> <chr>   <chr>  
#>  1 all      available       1 Age              1 NA          303 NA      303    
#>  2 all      available       2 Sex              1 NA          303 NA      303    
#>  3 all      available       3 ChestPain        1 NA          303 NA      303    
#>  4 all      available       4 BP               1 NA          303 NA      303    
#>  5 all      available       5 Cholester…       1 NA          303 NA      303    
#>  6 all      available       6 BloodSugar       1 NA          303 NA      303    
#>  7 all      available       7 MaximumHR        1 NA          303 NA      303    
#>  8 all      available       8 ExerciseI…       1 NA          303 NA      303    
#>  9 all      available       9 HeartDise…       1 NA          303 NA      303    
#> 10 all      class           1 Age              1 NA           NA numeric numeric
#> # ℹ 65 more rows

#Evaluate a custom function
heart_disease %>%
    descriptives(
        f_numeric = 
            list(
                cv = function(x) sd(x, na.rm = TRUE)/mean(x, na.rm = TRUE)
            )
    ) %>%
    
    #Extract info from the custom function
    dplyr::filter(
        fun_key == "cv"
    ) 
#> # A tibble: 4 × 9
#>   fun_eval fun_key col_ind col_lab     val_ind val_lab val_dbl val_chr val_cbn
#>   <chr>    <chr>     <int> <chr>         <int> <chr>     <dbl> <chr>   <chr>  
#> 1 numeric  cv            1 Age               1 NA        0.166 NA      0.17   
#> 2 numeric  cv            4 BP                1 NA        0.134 NA      0.13   
#> 3 numeric  cv            5 Cholesterol       1 NA        0.210 NA      0.21   
#> 4 numeric  cv            7 MaximumHR         1 NA        0.153 NA      0.15