user_sample_3

Prerequisites

Installation for user as described in README

Steps

  1. adapt the data_dir (in windows e.g. //de-inf-008/$/Eigene Dateien/ubair-master/data/)
 library(ubair)
 library(ggplot2)
 # set data_dir where the data is stored
data_dir <- "../../Daten/user_sample_data/"
 # This might take a few seconds for large files
data <- load_uba_data_from_dir(data_dir = data_dir)
  1. Set sample variables.
target <- "NO2"
stations <- list(
 Lünen = "DENW006", Bottrop = "DENW021",
 Köln = "DENW212"
)
meteo_variables <- c("TMP", "RFE", "WIG", "WIR", "LDR")
 
 # dates for 9 Euro ticket effect
application_start <- lubridate::ymd("20220301") # = start reference time
date_effect_start <- lubridate::ymd_hm("20220601 00:00")
application_end <- lubridate::ymd("20220831") # = end effect time
 
buffer <- 0 # number of data points to be ignored before effect
 
trend <- "linear"
model_type <- "lightgbm"
 
window_size <- 14 # days of data to calculate the mean in prediction results
  1. create a params.yaml. Either
params <- load_params()
 # adapt params programatically
params$target <- target
params$meteo_variables <- meteo_variables
  1. Prepare data of each station for training and run counterfactual. See function documentation for further details.
results_all <- data.table::data.table()
 for (station_name in names(stations)) {
 env_data <- clean_data(data, station = stations[station_name])
 dt_prepared <- prepare_data_for_modelling(env_data, params)
 dt_prepared <- dt_prepared[complete.cases(dt_prepared)]
 split_data <- split_data_counterfactual(
 dt_prepared,
 application_start = application_start,
 application_end = application_end
 )
 res <- run_counterfactual(split_data,
 params,
 detrending_function = trend,
 model_type = model_type,
 alpha = 0.9,
 log_transform = TRUE
 )
 predictions <- res$prediction
 predictions[, station_name := station_name]
 predictions[, station := stations[station_name]]
 predictions[, model := model_type]
 predictions[, trend := trend]
 results_all <- rbind(results_all, predictions)
}
 #> Warning in log(dt_train_new$value): NaNs wurden erzeugt
 #> [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000316 seconds.
 #> You can set `force_row_wise=true` to remove the overhead.
 #> And if memory is not enough, you can set `force_col_wise=true`.
 #> [LightGBM] [Info] Total Bins 1549
 #> [LightGBM] [Info] Number of data points in the train set: 60472, number of used features: 8
 #> [LightGBM] [Info] Start training from score -0.000000
 #> Warning in log(dt_train_new$value): NaNs wurden erzeugt
 #> [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000346 seconds.
 #> You can set `force_row_wise=true` to remove the overhead.
 #> And if memory is not enough, you can set `force_col_wise=true`.
 #> [LightGBM] [Info] Total Bins 1550
 #> [LightGBM] [Info] Number of data points in the train set: 60133, number of used features: 8
 #> [LightGBM] [Info] Start training from score -0.000000
 #> [LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000315 seconds.
 #> You can set `force_row_wise=true` to remove the overhead.
 #> And if memory is not enough, you can set `force_col_wise=true`.
 #> [LightGBM] [Info] Total Bins 1552
 #> [LightGBM] [Info] Number of data points in the train set: 60494, number of used features: 8
 #> [LightGBM] [Info] Start training from score 0.000000
  1. Plot bias (prediction - observation) to compare multiple stations
results_all[, bias := prediction - value]
results_all[, station_label := paste0(station_name, " (", station, ")")]
results_all[, station := as.character(unlist(station))]
results_all[, smoothed_bias := data.table::frollmean(bias,
 n = 24 * window_size,
 align = "center"
),
by = station
]
bias_plot <- ggplot2::ggplot(
 results_all,
 ggplot2::aes(
 x = date,
 y = smoothed_bias,
 color = station_label,
 group = station_label
 )
) +
 geom_line() +
 labs(
 title = paste("NO2 Bias Over Time by Station (", window_size, "-day mean)"),
 x = "Date",
 y = "NO2 Bias \n(prediction - observation) [μg/m3]",
 color = "Station"
 ) +
 ggplot2::geom_vline(
 xintercept = date_effect_start, linetype = 4,
 colour = "black"
 ) +
 ggplot2::theme_bw() +
 ggplot2::scale_x_datetime(
 date_minor_breaks = "1 month",
 date_breaks = "2 month"
 ) +
 ggplot2::theme(legend.position = "bottom")
bias_plot
 #> Warning: Removed 1005 rows containing missing values or values outside the scale range
 #> (`geom_line()`).
plot of chunk bias_plot
plot of chunk bias_plot
  1. Plot counterfactual for Köln.
station_name_plot <- "Köln"
counterfactual_plot <- plot_counterfactual(
 dplyr::filter(results_all, station_name == station_name_plot), params,
 window_size = window_size,
 date_effect_start,
 buffer = buffer
)
counterfactual_plot

plot of chunk counterfactual_plot_3 ```

AltStyle によって変換されたページ (->オリジナル) /