name <- function(arguments) {
  body
}
functions
Chapter 15
General format for writing a function
We’ll discuss 3 types of functions:
Vector
Data
Plotting
Vector Functions
Motivation for functions
- Create 4 vectors, each containing 5 random numbers with mean = 2, and standard deviation = 3.
- Normalize each vector to have min = 0 and max = 1.
- Notice any inefficiency?
- Write a function to normalize a vector and use it.
- Use the function inside mutate like this:
# Normalize a numeric vector to the interval [0, 1].
#
# a: numeric vector; missing values are ignored when finding the range
#    and stay NA in the result.
# Returns a numeric vector the same length as `a`, where the smallest
# value maps to 0 and the largest to 1.
n <- function(a) {
  # The two statements must be on separate lines (or separated by `;`):
  # written back to back, R reads `range(...) (...)` as a function call.
  rng <- range(a, na.rm = TRUE)
  (a - rng[1]) / (rng[2] - rng[1])
}
# using a function within mutate:
# each of the columns a-d is replaced by its rescaled version
df |>
  mutate(
    a = rescale01(a),
    b = rescale01(b),
    c = rescale01(c),
    d = rescale01(d),
  )
We can use across() to make the above even more efficient.
How to compute the mean of many columns
library(tidyverse)
# get the mean of columns a - d.
# Build a 10-row tibble with four columns of standard-normal draws.
df <- tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)
print(df)
# A tibble: 10 × 4
a b c d
<dbl> <dbl> <dbl> <dbl>
1 -1.00 1.30 0.867 -0.964
2 -0.458 -1.27 -0.0498 -0.636
3 0.984 -1.13 -0.583 -3.57
4 1.46 -0.172 0.496 2.31
5 0.0556 0.767 0.385 1.27
6 -1.95 -1.22 0.249 0.842
7 -2.08 -0.0279 1.60 0.358
8 -0.652 0.442 0.159 0.616
9 -1.02 1.39 -0.125 0.929
10 -0.794 1.14 -0.193 -0.104
# across(a:d, mean) applies mean() to every column from a through d
df |> summarize(across(a:d, mean))
# A tibble: 1 × 4
a b c d
<dbl> <dbl> <dbl> <dbl>
1 -0.545 0.121 0.280 0.105
How to compute the median of numeric columns
glimpse(diamonds)
Rows: 53,940
Columns: 10
$ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
$ cut <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
$ color <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
$ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
$ depth <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
$ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
$ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
$ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
$ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
$ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
# where(is.numeric) selects only the numeric columns, so the ordered
# factors (cut, color, clarity) are skipped
diamonds |> summarize(across(where(is.numeric), median))
# A tibble: 1 × 7
carat depth table price x y z
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.7 61.8 57 2401 5.7 5.71 3.53
The range function
Use the range of a vector to minimize redundancy
# Rescale a numeric vector so its values lie between 0 and 1.
#
# x: numeric vector; NAs are ignored when computing the range and
#    remain NA in the output.
# Returns a numeric vector the same length as `x`.
rescale01 <- function(x) {
  # range() returns c(min, max) in one call, so the minimum is not
  # computed twice.
  rng <- range(x, na.rm = TRUE)
  (x - rng[1]) / (rng[2] - rng[1])
}
The z-score
# Standardize a numeric vector: subtract the mean and divide by the
# standard deviation.
#
# x: numeric vector; NAs are dropped when computing the mean and sd and
#    remain NA in the output.
# Returns a numeric vector the same length as `x`.
z_score <- function(x) {
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}
Clamp
What happens if you remove the .default line?
# Restrict the values of a vector to the interval [min, max].
#
# x:   vector of values to clamp.
# min: lower bound; values below it are replaced by `min`.
# max: upper bound; values above it are replaced by `max`.
# Returns a vector the same length as `x`.
clamp <- function(x, min, max) {
  case_when(
    x < min ~ min,
    x > max ~ max,
    # values matched by neither condition pass through unchanged
    .default = x
  )
}
clamp(1:10, min = 3, max = 7)
[1] 3 3 3 4 5 6 7 7 7 7
A function on strings
# Capitalize the first character of each string.
#
# x: character vector.
# Returns `x` with the first character of every element upper-cased.
first_upper <- function(x) {
  # notice what is getting assigned TO: str_sub() appears on the
  # left-hand side, so only the first character of x is replaced
  str_sub(x, 1, 1) <- str_to_upper(str_sub(x, 1, 1))
  x
}
first_upper("hello")
[1] "Hello"
A string stripper
# the order matters in the str_remove_all's below
# Convert formatted number strings such as "$12,300" or "45%" to numeric.
#
# x: character vector.
# Returns a numeric vector; elements that contained "%" are divided by 100.
clean_number <- function(x) {
  # record which elements are percentages before the "%" is stripped
  is_pct <- str_detect(x, "%")
  num <- x |>
    str_remove_all("%") |>
    str_remove_all(",") |>
    # fixed() matches a literal "$" rather than the regex end-of-string anchor
    str_remove_all(fixed("$")) |>
    as.numeric()
  if_else(is_pct, num / 100, num)
}
clean_number("$12,300")
[1] 12300
clean_number("45%")
[1] 0.45
Summarize functions
Here’s a somewhat advanced version. Observe the importance of the :=
# Mean of one column within the groups defined by another column.
#
# x:       data frame.
# cat_var: unquoted name of the grouping column.
# num_var: unquoted name of the column to average.
# Returns one row per group, with the mean stored in a column named
# after `num_var`.
f <- function(x, cat_var, num_var) {
  x |>
    group_by({{ cat_var }}) |>
    # a glue-style name on the left-hand side requires `:=` instead of `=`
    summarize("{{num_var}}" := mean({{ num_var }}))
}