name <- function(arguments) {
  body
}
functions
Chapter 15
General format for writing a function
We’ll discuss 3 types of functions:
Vector
Data
Plotting
Vector Functions
Motivation for functions
- Create 4 vectors, each containing 5 random numbers with mean = 2, and standard deviation = 3.
- Normalize each vector to have min = 0 and max = 1.
- Notice any inefficiency?
- Write a function to normalize a vector and use it.
# Normalize a vector linearly using its observed range.
#
# a: vector to normalize. NA values are ignored when finding the range
#    (na.rm = TRUE) and stay NA in the result.
# Returns (a - min) / (max - min): the smallest non-missing value maps to 0
# and the largest to 1.
n <- function(a) {
  # range() returns c(min, max); computing it once avoids calling min(a)
  # and max(a) repeatedly in the formula below.
  rng <- range(a, na.rm = TRUE)
  (a - rng[1]) / (rng[2] - rng[1])
}
- Use the function inside `mutate` like this:
library(tidyverse)
# using a function within mutate
# Replace each of the columns a-d with its rescaled version; mutate()
# returns the modified data frame (df itself is not reassigned here).
df |> mutate(
  a = rescale01(a),
  b = rescale01(b),
  c = rescale01(c),
  d = rescale01(d))
We can use `across()` to make the above even more efficient.
### How to compute the mean of many columns
library(tidyverse)
# get the mean of columns a - d.
# Example data: a tibble with four columns (a-d), each holding 10 draws
# from rnorm().
df <- tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)
print(df)
# A tibble: 10 × 4
a b c d
<dbl> <dbl> <dbl> <dbl>
1 0.949 1.10 -1.36 0.623
2 -1.26 -1.39 1.56 0.0212
3 0.421 0.856 0.130 -0.135
4 -0.813 2.15 1.01 -1.34
5 -1.63 -2.86 -0.0441 -1.12
6 -0.431 0.409 -1.06 1.57
7 0.510 0.775 -0.436 0.113
8 1.93 -0.164 0.220 0.414
9 -0.0219 -1.22 1.21 -0.254
10 0.769 -0.0388 0.368 0.536
# across(a:d, mean) applies mean() to every column from a through d,
# producing a one-row summary with one mean per column.
df |> summarize(across(a:d, mean))
# A tibble: 1 × 4
a b c d
<dbl> <dbl> <dbl> <dbl>
1 0.0412 -0.0376 0.160 0.0433
### How to compute the median of numeric columns
glimpse(diamonds)
Rows: 53,940
Columns: 10
$ carat <dbl> 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, 0.23, 0.…
$ cut <ord> Ideal, Premium, Good, Premium, Good, Very Good, Very Good, Ver…
$ color <ord> E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J, J, J, I,…
$ clarity <ord> SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, SI1, VS1, …
$ depth <dbl> 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, 59.4, 64…
$ table <dbl> 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54, 62, 58…
$ price <int> 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339, 340, 34…
$ x <dbl> 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, 4.00, 4.…
$ y <dbl> 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, 4.05, 4.…
$ z <dbl> 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, 2.39, 2.…
# where(is.numeric) selects only the numeric columns, so median() is never
# applied to the ordered-factor columns (cut, color, clarity).
diamonds |> summarize(across(where(is.numeric), median))
# A tibble: 1 × 7
carat depth table price x y z
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.7 61.8 57 2401 5.7 5.71 3.53
## The `range` function
Use the range of a vector to minimize redundancy
# Rescale a vector linearly using its observed range.
#
# x: vector to rescale. NA values are ignored when finding the range
#    (na.rm = TRUE) and stay NA in the result.
# Returns (x - min) / (max - min): the smallest non-missing value maps to 0
# and the largest to 1.
rescale01 <- function(x) {
  # One range() call yields c(min, max), replacing separate min(x) and
  # max(x) computations.
  rng <- range(x, na.rm = TRUE)
  (x - rng[1]) / (rng[2] - rng[1])
}
### The z-score
# Standardize a vector to z-scores.
#
# x: numeric vector. NA values are dropped when computing the mean and the
#    standard deviation (na.rm = TRUE) and stay NA in the result.
# Returns (x - mean(x)) / sd(x), i.e. each value's distance from the mean
# measured in standard deviations.
z_score <- function(x) {
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}
### Clamp
What happens if you remove the `.default` line?
# Clamp the values of x to the interval [min, max].
#
# x:   vector of values to clamp.
# min: lower bound; any element of x below it is replaced by min.
# max: upper bound; any element of x above it is replaced by max.
# Returns a vector the same length as x.
clamp <- function(x, min, max) {
  case_when(
    x < min ~ min,
    x > max ~ max,
    # Elements that match neither condition keep their original value;
    # without .default, case_when() returns NA for them.
    .default = x
  )
}
clamp(1:10, min = 3, max = 7)
[1] 3 3 3 4 5 6 7 7 7 7
### A function on strings
# notice what is getting assigned TO
# Upper-case the first character of each string in x.
#
# x: character vector.
# Returns x with only its first character converted by str_to_upper().
first_upper <- function(x) {
  # str_sub() is used in its replacement form here: the assignment target
  # is the first character of x, not a new variable.
  str_sub(x, 1, 1) <- str_to_upper(str_sub(x, 1, 1))
  x
}
first_upper("hello")
[1] "Hello"
### A string stripper
# the order matters in the str_remove_all's below
# Convert formatted number strings such as "$12,300" or "45%" to numeric.
#
# x: character vector.
# Returns a numeric vector; inputs that contained "%" are divided by 100.
clean_number <- function(x) {
  # Record which inputs are percentages before the "%" is stripped.
  is_pct <- str_detect(x, "%")
  num <- x |>
    str_remove_all("%") |>
    str_remove_all(",") |>
    # fixed() makes "$" match literally rather than as the regex
    # end-of-string anchor.
    str_remove_all(fixed("$")) |>
    as.numeric()
  if_else(is_pct, num / 100, num)
}
clean_number("$12,300")
[1] 12300
clean_number("45%")
[1] 0.45
### Summarize functions
Here’s a somewhat advanced version. Observe the importance of the :=
# Compute the mean of one column within the groups of another.
#
# x:       data frame.
# cat_var: unquoted name of the grouping column.
# num_var: unquoted name of the column to average.
# Returns one row per value of cat_var, with the mean stored in a column
# named after num_var.
f <- function(x, cat_var, num_var) {
  x |>
    group_by({{ cat_var }}) |>
    # The output name is built from the argument with a glue string, so
    # `:=` is required: `=` does not accept a string expression on its
    # left-hand side.
    summarize("{{num_var}}" := mean({{ num_var }}))
}