Background
The function makes use of cut
function offered in R's base package in order to "bin" a numeric vector into provided categories and apply, meaningful user, friendly labels.
Example
For vector:
set.seed(1); x <- runif(10)
[1] 0.26550866 0.37212390 ...
and brackets c(0.1, 0.3)
The function would return (for the two values above):
0.1 >= your_value <= 0.3
your_value >= 0.3
Implementation
cut_into_bins <- function(x, bin_groups, value_name = "your_value") {
# Sort vector
bin_groups <- sort(bin_groups)
# Ensure infinity at the ends
if (head(bin_groups, 1) != Inf) {
bin_groups <- append(bin_groups, -Inf, 0)
}
if (tail(bin_groups, 1) != Inf) {
bin_groups <- append(bin_groups, Inf)
}
# Create labels
lbls <- NULL
i <- 1
while (i < length(bin_groups)) {
lbls[i] <- paste(bin_groups[i], bin_groups[i + 1])
i <- i + 1
}
lbls <- sapply(
X = lbls,
FUN = function(x) {
if (grepl("-Inf", x, fixed = TRUE)) {
gsub("-Inf", paste(value_name, "<="), x)
} else if (grepl("Inf", x, fixed = TRUE)) {
x <- gsub("Inf", "", x)
paste(value_name, ">=", x)
} else {
gsub("(\\d+\\.\\d+)(\\s)(\\d+\\.\\d+)", paste("\1円 <=", value_name ,"<= \3円"), x)
}
}
)
# Cut and return simple character vector
res <-
cut.default(
x = x,
breaks = bin_groups,
include.lowest = TRUE,
right = TRUE,
labels = lbls
)
as.character(trimws(res))
}
Testing
sample_vec <-
c(
-198,-19292.221,-0.5,
0.1,
0.8,
0.3,
0.11,
0.5,
0.55,
0.6,
0.72,
-0.72,
0.95,
1,
1.2,
9829082,
2092
)
custom_bands <- c(0.1, 0.5, 0.6, 0.75, 0.9)
# Run function
res <- cut_into_bins(x = sample_vec, bin_groups = custom_bands)
# print(matrix(data = c(sample_vec, res), ncol = 2))
Results
# [,1] [,2]
# [1,] "-198" "your_value <= 0.1"
# [2,] "-19292.221" "your_value <= 0.1"
# [3,] "-0.5" "your_value <= 0.1"
# [4,] "0.1" "your_value <= 0.1"
# [5,] "0.8" "0.75 <= your_value <= 0.9"
# [6,] "0.3" "0.1 <= your_value <= 0.5"
# [7,] "0.11" "0.1 <= your_value <= 0.5"
# [8,] "0.5" "0.1 <= your_value <= 0.5"
# [9,] "0.55" "0.5 <= your_value <= 0.6"
# [10,] "0.6" "0.5 <= your_value <= 0.6"
# [11,] "0.72" "0.6 <= your_value <= 0.75"
# [12,] "-0.72" "your_value <= 0.1"
# [13,] "0.95" "your_value >= 0.9"
# [14,] "1" "your_value >= 0.9"
# [15,] "1.2" "your_value >= 0.9"
# [16,] "9829082" "your_value >= 0.9"
# [17,] "2092" "your_value >= 0.9"
Sought feedback
In particular, I'm interested in comments addressing the following:
- The way object
lols
is constructed is inelegant. In particular, I don't appreciate reliance ongsub
; what would be wiser approach to this challenge? - Are there any edge cases that function may not capture?
- In the actual implementation I'm also testing for correct types of passed vectors:
x
andbin_groups
so there is no risk of strings being passed instead of numeric vectors, etc.
- In the actual implementation I'm also testing for correct types of passed vectors:
Some afterthoughs ...
Following @minem's reply, I've run some benchmarking tests on different approaches to label creation:
# Functions ---------------------------------------------------------------
unique_sort <- function(x) {
x <- c(Inf, -Inf, x)
x <- unique(x)
sort(x)
}
sort_unique <- function(x) {
x <- c(Inf, -Inf, x)
x <- sort(x)
unique(x)
}
if_logic <- function(x) {
if (head(x, 1) != Inf) {
x <- append(x, -Inf, 0)
}
if (tail(x, 1) != Inf) {
x <- append(x, Inf)
}
}
# Benchmark ---------------------------------------------------------------
bands <- c(0.1, 0.5, 0.6, 0.75, 0.9)
bench::mark(
unique_sort(x = bands),
sort_unique(x = bands),
if_logic(x = bands)
)
Results
It would appear that clunky if
approach performs better; although, this is not something that is relevant to this function as labels are created only once...
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:t> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 unique_sort(x = bands) 30.01μs 33.7μs 27365. 0B 13.7 9995 5 365ms <dbl [... <Rprofm... <bch:t... <tibbl...
2 sort_unique(x = bands) 30.38μs 61.2μs 14340. 0B 8.87 6466 4 451ms <dbl [... <Rprofm... <bch:t... <tibbl...
3 if_logic(x = bands) 9.32μs 11.6μs 84078. 0B 16.8 9998 2 119ms <dbl [... <Rprofm... <bch:t... <tibbl...
1 Answer 1
I would adjust the function like:
cut_into_bins2 <- function(x, bin_groups, value_name = "your_value") {
# Ensure infinity at the ends
bin_groups <- c(-Inf, Inf, bin_groups)
bin_groups <- unique(bin_groups)
bin_groups <- sort(bin_groups)
# Create labels
bin_groups2 <- bin_groups[-length(bin_groups)][-1]
n2 <- length(bin_groups2)
lbls <- c(
sprintf("%s <= %s", value_name, bin_groups2[1]),
sprintf("%s < %s <= %s", bin_groups2[-n2], value_name, bin_groups2[-1]),
sprintf("%s < %s", bin_groups2[n2], value_name)
)
# Cut and return simple character vector
res <-
cut.default(
x = x,
breaks = bin_groups,
include.lowest = TRUE,
right = TRUE,
labels = lbls
)
res
return(as.character(res))
}
- shorter addition of Inf values. We add them, take unique values and then sort.
- rewrote creation of labels. As we know all values are unique and sorted we can create the labels like this. + adjusted the labels to match the results ('<' instead of '<=' for interval matching)