The following code computes three different features over the same dataset. I'm not sure if the filter_by_day_segment
function can be made tidy or there's a more efficient/short but still readable way of refactor my code.
library(dplyr)
filter_by_day_segment <- function(data, day_segment){
if(day_segment == "daily"){
return(data %>% group_by(local_date))
} else {
return(data %>% filter(day_segment == local_day_segment) %>% group_by(local_date))
}
}
compute_metric <- function(data, metric, day_segment){
if(metric == "countscans"){
data <- filter_by_day_segment(data, day_segment)
return(data %>% summarise(!!paste("sensor", day_segment, metric, sep = "_") := n()))
}else if(metric == "uniquedevices"){
data <- filter_by_day_segment(data, day_segment)
return(data %>% summarise(!!paste("sensor", day_segment, metric, sep = "_") := n_distinct(value)))
}
else if(metric == "countscansmostuniquedevice"){
data <- data %>% group_by(value) %>%
mutate(N=n()) %>%
ungroup() %>%
filter(N == max(N))
data <- filter_by_day_segment(data, day_segment)
return(data %>% summarise(!!paste("sensor", day_segment, metric, sep = "_") := n()))
}
}
data <- read.csv("test.csv")
day_segment <- "daily"
metrics <- c("countscans", "uniquedevices", "countscansmostuniquedevice")
features = data.frame()
for(metric in metrics){
feature <- compute_metric(data, metric, day_segment)
if(nrow(features) == 0){
features <- feature
} else{
features <- merge(features, feature, by="local_date", all = TRUE)
}
}
print(features)
A test CSV file
"local_date","value"
"2018-05-21","FC:44"
"2018-05-21","FC:58"
"2018-05-21","FF:7E"
"2018-05-21","F8:77"
"2018-05-21","F8:77"
"2018-05-22","FB:F1"
"2018-05-22","FC:62"
"2018-05-22","FE:D4"
"2018-05-22","FE:D4"
"2018-05-22","FC:F1"
"2018-05-23","F8:77"
"2018-05-23","F8:77"
"2018-05-23","FF:13"
"2018-05-23","F8:3F"
"2018-05-23","F8:3F"
"2018-05-23","F8:3F"
"2018-05-23","FC:B6"
"2018-05-24","FC:0D"
"2018-05-24","F8:3F"
"2018-05-24","F7:B6"
"2018-05-24","F6:96"
"2018-05-24","F6:96"
"2018-05-24","F6:96"
"2018-05-24","F6:96"
"2018-05-24","F6:96"
"2018-05-24","F6:96"
"2018-05-24","F6:96"
"2018-05-25","FC:A8"
"2018-05-25","FC:44"
"2018-05-25","FC:44"
"2018-05-25","FC:44"
"2018-05-25","FC:44"
"2018-05-25","FC:44"
"2018-05-25","FC:44"
"2018-05-25","FC:44"
"2018-05-25","FC:44"
"2018-05-26","FC:F1"
"2018-05-26","FC:A8"
"2018-05-26","FF:89"
"2018-05-26","FF:89"
"2018-05-26","FF:89"
1 Answer 1
If I was reviewing this code professionally, my first comment would be that you should stick to a style guide. This will govern things like spacing between statements, brackets, operators etc. It can be seen as a kind of nitpicky remark (I certainly did at first) but having a consistent style massively aids readability for you and for others.
The second (style) comment would be that your code is halfway between very pipe-based code and "standard" R style (lots of assignment). This makes it difficult to read. If you're going to go with pipes, stick with it.
Also, when using an ifelse with more than 2 conditions, it's often clearer to use a switch block. This reduces the potential for massive nested ifs, and should also discourage you from doing much branching within the if.
I don't really like how similar the summarise calls are in each case. Ideally I would refactor that, but I think that is somewhat tricky due to how dplyr handles n
and n_distinct
.
This is how I would rewrite your code, though I would also consult the tidyverse style guide to see their recommendations -- I generally try to avoid programming with dplyr so I'm not that familiar with the style.
filter_by_day_segment_refactor <- function(data, day_segment) {
## Minimise the amount done within the if/else clause (also reduces duplication)
if (day_segment == "daily") {
fun <- identity
} else {
fun <- function(x) filter(x, day_segment == local_day_segment)
}
data %>% fun() %>% group_by(local_date)
}
compute_metric_refactor <- function(data, metric, day_segment) {
## 3 case switch block with just 1 pipe each,
## rather than 3 conditionals with a mix of
## pipes and assignment
switch(metric,
"countscans" = {
data %>%
filter_by_day_segment(day_segment) %>%
summarise(!!paste("sensor", day_segment, metric, sep = "_") := n())
},
"uniquedevices" = {
data %>%
filter_by_day_segment(day_segment) %>%
summarise(!!paste("sensor", day_segment, metric, sep = "_") := n_distinct(value))
},
"countscansmostuniquedevice" = {
data %>% group_by(value) %>%
mutate(N = n()) %>%
ungroup() %>%
filter(N == max(N)) %>%
filter_by_day_segment(day_segment) %>%
summarise(!!paste("sensor", day_segment, metric, sep = "_") := n())
}
)
}