user_sample_3

plot of bias comparison
Load data for multiple stations and run counterfactual.
Investigate effect of 9 Euro ticket for station Köln, Bottrop, Lünen

Prerequisites

Installation for user as described in README

Steps

adapt the data_dir (in windows e.g. //de-inf-008/$/Eigene Dateien/ubair-master/data/)

 library(ubair)
 library(ggplot2)
 # set data_dir where the data is stored
data_dir <- "../../Daten/user_sample_data/"
 # This might take a few seconds for large files
data <- load_uba_data_from_dir(data_dir = data_dir)

Set sample variables.

target <- "NO2"
stations <- list(
 Lünen = "DENW006", Bottrop = "DENW021",
 Köln = "DENW212"
)
meteo_variables <- c("TMP", "RFE", "WIG", "WIR", "LDR")
 
 # dates for 9 Euro ticket effect
application_start <- lubridate::ymd("20220301") # = start reference time
date_effect_start <- lubridate::ymd_hm("20220601 00:00")
application_end <- lubridate::ymd("20220831") # = end effect time
 
buffer <- 0 # number of data points to be ignored before effect
 
trend <- "linear"
model_type <- "lightgbm"
 
window_size <- 14 # days of data to calculate the mean in prediction results

create a params.yaml. Either

copy the default params.yaml to your file system, adapt it and load it or
load the default and adapt the params programmatically

params <- load_params()
 # adapt params programatically
params$target <- target
params$meteo_variables <- meteo_variables

Prepare data of each station for training and run counterfactual. See function documentation for further details.

results_all <- data.table::data.table()
 for (station_name in names(stations)) {
 env_data <- clean_data(data, station = stations[station_name])
 dt_prepared <- prepare_data_for_modelling(env_data, params)
 dt_prepared <- dt_prepared[complete.cases(dt_prepared)]
 split_data <- split_data_counterfactual(
 dt_prepared,
 application_start = application_start,
 application_end = application_end
 )
 res <- run_counterfactual(split_data,
 params,
 detrending_function = trend,
 model_type = model_type,
 alpha = 0.9,
 log_transform = TRUE
 )
 predictions <- res$prediction
 predictions[, station_name := station_name]
 predictions[, station := stations[station_name]]
 predictions[, model := model_type]
 predictions[, trend := trend]
 results_all <- rbind(results_all, predictions)
}
 #> Warning in log(dt_train_new$value): NaNs wurden erzeugt
 #> [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000316 seconds.
 #> You can set `force_row_wise=true` to remove the overhead.
 #> And if memory is not enough, you can set `force_col_wise=true`.
 #> [LightGBM] [Info] Total Bins 1549
 #> [LightGBM] [Info] Number of data points in the train set: 60472, number of used features: 8
 #> [LightGBM] [Info] Start training from score -0.000000
 #> Warning in log(dt_train_new$value): NaNs wurden erzeugt
 #> [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000346 seconds.
 #> You can set `force_row_wise=true` to remove the overhead.
 #> And if memory is not enough, you can set `force_col_wise=true`.
 #> [LightGBM] [Info] Total Bins 1550
 #> [LightGBM] [Info] Number of data points in the train set: 60133, number of used features: 8
 #> [LightGBM] [Info] Start training from score -0.000000
 #> [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000315 seconds.
 #> You can set `force_row_wise=true` to remove the overhead.
 #> And if memory is not enough, you can set `force_col_wise=true`.
 #> [LightGBM] [Info] Total Bins 1552
 #> [LightGBM] [Info] Number of data points in the train set: 60494, number of used features: 8
 #> [LightGBM] [Info] Start training from score 0.000000

Plot bias (prediction - observation) to compare multiple stations

results_all[, bias := prediction - value]
results_all[, station_label := paste0(station_name, " (", station, ")")]
results_all[, station := as.character(unlist(station))]
results_all[, smoothed_bias := data.table::frollmean(bias,
 n = 24 * window_size,
 align = "center"
),
by = station
]
bias_plot <- ggplot2::ggplot(
 results_all,
 ggplot2::aes(
 x = date,
 y = smoothed_bias,
 color = station_label,
 group = station_label
 )
) +
 geom_line() +
 labs(
 title = paste("NO2 Bias Over Time by Station (", window_size, "-day mean)"),
 x = "Date",
 y = "NO2 Bias \n(prediction - observation) [μg/m3]",
 color = "Station"
 ) +
 ggplot2::geom_vline(
 xintercept = date_effect_start, linetype = 4,
 colour = "black"
 ) +
 ggplot2::theme_bw() +
 ggplot2::scale_x_datetime(
 date_minor_breaks = "1 month",
 date_breaks = "2 month"
 ) +
 ggplot2::theme(legend.position = "bottom")
bias_plot
 #> Warning: Removed 1005 rows containing missing values or values outside the scale range
 #> (`geom_line()`).

plot of chunk bias_plot

Plot counterfactual for Köln.

station_name_plot <- "Köln"
counterfactual_plot <- plot_counterfactual(
 dplyr::filter(results_all, station_name == station_name_plot), params,
 window_size = window_size,
 date_effect_start,
 buffer = buffer
)
counterfactual_plot

plot of chunk counterfactual_plot_3 ```