library(harp)
library(here)
library(dplyr)
library(patchwork)
library(tidyr)
library(forcats)
When we compare verification scores for different forecast models, it can be important to understand whether the differences between the scores are statistically significant. In harp this is done by bootstrapping.
Bootstrapping is a process whereby statistics are calculated from repeated random draws, with replacement, from the same data - we call these draws replicates. The statistic is calculated for each replicate, and the variability between the replicates tells us how confident we can be in that statistic.
For forecast verification, this means calculating the scores for many replicates. We can then estimate confidence intervals for the scores and/or the confidence we have in the differences between scores for different forecast models.
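To make the idea concrete, here is a minimal sketch in base R of a bootstrap for the mean of some synthetic forecast errors (purely illustrative - it says nothing about how harp implements its bootstrapping):
# Illustrative only: bootstrap the mean of synthetic forecast errors
set.seed(42)
errors <- rnorm(500, mean = 0.3, sd = 1.5)

n_boot <- 1000
replicates <- replicate(
  n_boot,
  mean(sample(errors, length(errors), replace = TRUE)) # draw with replacement
)

# The spread of the replicates gives an approximate 95% confidence interval
quantile(replicates, probs = c(0.025, 0.975))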
Here we will once again make a comparison between REF and IFSENS for 2m temperature. As usual we will read the forecasts in, select the common cases and join them together. Since it’s temperature we will also convert K to degrees C.
t2m <- read_point_forecast(
  dttm       = seq_dttm(2018030800, 2018033100, "1d"),
  fcst_model = c("REF", "IFSENS"),
  fcst_type  = "eps",
  parameter  = "T2m",
  file_path  = here("data/FCTABLE"),
  file_template = list(
    REF    = "fctable",
    IFSENS = "fctable_eps"
  )
) %>%
  scale_param(-273.15, "degC") %>%
  common_cases() %>%
  join_to_fcst(
    read_point_obs(
      unique_valid_dttm(.),
      parameter = "T2m",
      stations  = unique_stations(.),
      obs_path  = here("data/OBSTABLE")
    ) %>%
      scale_param(-273.15, "degC", col = T2m)
  )
We’re going to do an extra check on the observations using check_obs_against_fcst(), which only allows observations that are within a certain number of standard deviations of the forecast.
t2m <- check_obs_against_fcst(t2m, T2m, num_sd_allowed = 6)
And now we’ll just do a verification as normal (switching off deterministic verification for members for speed).
verif <- ens_verify(t2m, T2m, verify_members = FALSE)
## ::Computing verification for fcst_model `REF`::
## Warning: `ens_mean_and_var()` was deprecated in harpIO 0.1.0.
## ℹ Please use `ens_stats()` instead.
## ℹ The deprecated feature was likely used in the harpPoint package.
## Please report the issue at <https://github.com/harphub/harpPoint/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Spread; Skill for lead_time✔
## Hexbin for lead_time✔
## Rank histogram for lead_time✔
## CRPS for lead_time✔
## ::Computing verification for fcst_model `IFSENS`::
## Spread; Skill for lead_time✔
## Hexbin for lead_time✔
## Rank histogram for lead_time✔
## CRPS for lead_time✔
We’re going to concentrate on spread-skill, bias of the ensemble mean and CRPS for this analysis, and we’ll use patchwork to arrange the plots.
my_cols <- data.frame(
  mname  = names(t2m),
  colour = c("#DD5555", "#5555DD")
)
plot_point_verif(
  verif, spread_skill, plot_num_cases = FALSE, plot_caption = "", colour_table = my_cols
) +
  plot_point_verif(
    verif, mean_bias, plot_num_cases = FALSE, plot_caption = "", colour_table = my_cols
  ) /
  plot_point_verif(verif, crps, plot_num_cases = FALSE, colour_table = my_cols) +
  plot_layout(guides = "collect") &
  theme(legend.position = "bottom")
If we want to bootstrap the verification we use bootstrap_verify(), which takes the forecast data, the verification function, the parameter and the number of bootstrap replicates as its arguments, along with any other arguments to the verification function. You can also run the bootstrapping in parallel, if you have access to more than one core, by setting parallel = TRUE.
bootstrap_verify(
  t2m,
  ens_verify,
  T2m,
  n = 10,
  show_progress = TRUE
)
## ::ens_summary_scores:: # A tibble: 272 × 17
## fcst_model ref_model lead_time score ref_score_mean ref_score_median
## <chr> <chr> <int> <chr> <dbl> <dbl>
## 1 IFSENS REF 0 crps 1.97 1.97
## 2 IFSENS REF 0 crps_potential 1.43 1.43
## 3 IFSENS REF 0 crps_reliabil… 0.539 0.540
## 4 IFSENS REF 0 mean_bias 0.501 0.507
## 5 IFSENS REF 0 rmse 3.22 3.21
## 6 IFSENS REF 0 spread 0.849 0.846
## 7 IFSENS REF 0 spread_skill_… 0.264 0.262
## 8 IFSENS REF 0 stde 3.18 3.17
## 9 IFSENS REF 3 crps 2.11 2.11
## 10 IFSENS REF 3 crps_potential 1.47 1.47
## # ℹ 262 more rows
## # ℹ 11 more variables: ref_score_upper <dbl>, ref_score_lower <dbl>,
## # fcst_score_mean <dbl>, fcst_score_median <dbl>, fcst_score_upper <dbl>,
## # fcst_score_lower <dbl>, difference_mean <dbl>, difference_median <dbl>,
## # difference_upper <dbl>, difference_lower <dbl>, percent_better <dbl>
##
## --harp verification for T2m--
## # for forecasts from 00:00 UTC 08 Mar 2018 to 00:00 UTC 31 Mar 2018
## # using 179 observation stations
## # for verification groups:
## -> lead_time
## ℹ use `attributes()` to see detailed metadata
That’s pretty slow, so we can switch parallel to TRUE.
bs_verif <- bootstrap_verify(
  t2m,
  ens_verify,
  T2m,
  n = 100,
  parallel = TRUE,
  num_cores = 50
)
## Bootstrapping on 8 cores.
bs_verif
## ::ens_summary_scores:: # A tibble: 272 × 17
## fcst_model ref_model lead_time score ref_score_mean ref_score_median
## <chr> <chr> <int> <chr> <dbl> <dbl>
## 1 IFSENS REF 0 crps 1.97 1.97
## 2 IFSENS REF 0 crps_potential 1.43 1.43
## 3 IFSENS REF 0 crps_reliabil… 0.538 0.540
## 4 IFSENS REF 0 mean_bias 0.483 0.480
## 5 IFSENS REF 0 rmse 3.22 3.21
## 6 IFSENS REF 0 spread 0.849 0.849
## 7 IFSENS REF 0 spread_skill_… 0.264 0.265
## 8 IFSENS REF 0 stde 3.18 3.18
## 9 IFSENS REF 3 crps 2.11 2.11
## 10 IFSENS REF 3 crps_potential 1.47 1.47
## # ℹ 262 more rows
## # ℹ 11 more variables: ref_score_upper <dbl>, ref_score_lower <dbl>,
## # fcst_score_mean <dbl>, fcst_score_median <dbl>, fcst_score_upper <dbl>,
## # fcst_score_lower <dbl>, difference_mean <dbl>, difference_median <dbl>,
## # difference_upper <dbl>, difference_lower <dbl>, percent_better <dbl>
##
## --harp verification for T2m--
## # for forecasts from 00:00 UTC 08 Mar 2018 to 00:00 UTC 31 Mar 2018
## # using 179 observation stations
## # for verification groups:
## -> lead_time
## ℹ use `attributes()` to see detailed metadata
It’s worth noting what each of the columns in the output contains. Broadly, ref_score_mean, ref_score_median, ref_score_upper and ref_score_lower are the mean, median and upper and lower confidence bounds of the score over the bootstrap replicates for the reference model (ref_model), and the fcst_score_* columns are the same statistics for the forecast model (fcst_model); the difference_* columns give the same statistics for the difference between the scores of the two models; and percent_better is the proportion of replicates in which the forecast model has the better score.
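Since the output is just a data frame stored in the ens_summary_scores element, we can also inspect it directly with dplyr - for example (an illustrative snippet, not part of the harp workflow), pulling out the CRPS differences and percent_better for each lead time:
bs_verif$ens_summary_scores %>%
  filter(score == "crps") %>%
  select(
    fcst_model, ref_model, lead_time,
    difference_mean, difference_lower, difference_upper, percent_better
  )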
harpVis doesn’t yet have a function that is capable of dealing with this type of data, but a function to plot scores with confidence intervals might look something like this:
plot_func <- function(x, plot_score, conf = c("ribbon", "errorbar"), alpha = 0.5) {
  conf <- match.arg(conf)
  # Plot the bootstrap mean of each score with its confidence bounds
  p <- ggplot(
    filter(x$ens_summary_scores, score %in% plot_score),
    aes(
      x        = lead_time,
      y        = fcst_score_mean,
      ymin     = fcst_score_lower,
      ymax     = fcst_score_upper,
      colour   = fcst_model,
      fill     = fcst_model,
      linetype = score
    )
  )
  # Confidence intervals as error bars or as a shaded ribbon
  if (conf == "errorbar") {
    p <- p + geom_errorbar()
  }
  if (conf == "ribbon") {
    p <- p + geom_ribbon(alpha = alpha, colour = NA) +
      scale_fill_manual(values = c(REF = "#DD5555", IFSENS = "#5555DD")) +
      guides(fill = "none")
  }
  p <- p + geom_line() +
    scale_x_continuous(breaks = seq(0, 48, 6)) +
    labs(
      x = "Lead Time [h]",
      y = paste(harpVis:::totitle(gsub("_", " ", plot_score)), collapse = "; "),
      colour   = NULL,
      linetype = NULL,
      title = paste0(
        paste(harpVis:::totitle(gsub("_", " ", plot_score)), collapse = "; "),
        " : ",
        paste(
          format(as_dttm(range(attr(x, "dttm"))), "%H:%M %d %b %Y"),
          collapse = " - "
        )
      ),
      subtitle = paste(length(attr(x, "stations")), "Stations")
    ) +
    scale_colour_manual(values = c(REF = "#DD5555", IFSENS = "#5555DD"))
  # Only show a linetype legend when more than one score is plotted
  if (length(plot_score) < 2) {
    p <- p +
      guides(linetype = "none")
  } else {
    p <- p +
      guides(linetype = guide_legend(NULL, override.aes = list(fill = NA)))
  }
  p + theme(legend.position = "bottom")
}
Then we can use the function to construct our plot, but now with added error bars!
plot_func(bs_verif, c("spread", "rmse"), "errorbar") +
  plot_func(bs_verif, c("mean_bias"), "errorbar") /
  plot_func(bs_verif, c("crps"), "errorbar") +
  plot_layout(guides = "collect") &
  theme_bw() &
  theme(legend.position = "bottom")
Error bars can sometimes clutter the plot, so a ribbon around the confidence interval can be better.
plot_func(bs_verif, c("spread", "rmse"), "ribbon") +
  plot_func(bs_verif, c("mean_bias"), "ribbon") /
  plot_func(bs_verif, c("crps"), "ribbon") +
  plot_layout(guides = "collect") &
  theme_bw() &
  theme(legend.position = "bottom")
We can also make a function to plot the difference between two models and the confidence of that difference for a given confidence level.
plot_diff_func <- function(
  x, plot_score, fcst_model, ref_model, significance = 0.95,
  conf = c("ribbon", "errorbar"), alpha = 0.5
) {
  conf <- match.arg(conf)
  # Legend text, point shapes and colours for the three significance categories
  sig_text <- c(
    paste0(
      fcst_model, " better than ", ref_model,
      " with ", significance * 100, "% confidence"
    ),
    paste0(
      fcst_model, " worse than ", ref_model,
      " with ", significance * 100, "% confidence"
    ),
    paste0(
      "No difference between ", fcst_model, " and ", ref_model,
      " with ", significance * 100, "% confidence"
    )
  )
  shapes <- c(24, 25, 1)
  names(shapes) <- sig_text
  colours <- c("#5555DD", "#DD5555", "#ACACAC")
  names(colours) <- sig_text
  # Classify each point by the proportion of replicates in which
  # fcst_model beats ref_model
  plot_data <- filter(
    x$ens_summary_scores,
    score %in% plot_score,
    fcst_model == .env$fcst_model,
    ref_model == .env$ref_model
  ) %>%
    mutate(
      sig = case_when(
        percent_better >= significance ~ sig_text[1],
        percent_better <= (1 - significance) ~ sig_text[2],
        TRUE ~ sig_text[3]
      )
    )
  p <- ggplot(
    plot_data,
    aes(
      x        = lead_time,
      y        = difference_mean,
      ymin     = difference_lower,
      ymax     = difference_upper,
      linetype = score
    )
  ) +
    labs(
      x = "Lead Time [h]",
      y = paste(harpVis:::totitle(gsub("_", " ", plot_score)), collapse = "; "),
      colour   = NULL,
      linetype = NULL,
      shape    = NULL,
      title = paste0(
        paste(harpVis:::totitle(gsub("_", " ", plot_score)), collapse = "; "),
        " : ",
        paste(
          format(as_dttm(range(attr(x, "dttm"))), "%H:%M %d %b %Y"),
          collapse = " - "
        )
      ),
      subtitle = paste(
        fcst_model, "-", ref_model, "at",
        length(attr(x, "stations")), "Stations"
      )
    )
  # Only show a linetype legend when more than one score is plotted
  if (length(plot_score) < 2) {
    p <- p +
      guides(linetype = "none")
  } else {
    p <- p +
      guides(linetype = guide_legend(NULL, override.aes = list(fill = NA)))
  }
  if (conf == "errorbar") {
    p <- p + geom_errorbar(colour = "#555555")
  }
  if (conf == "ribbon") {
    p <- p + geom_ribbon(alpha = alpha, fill = "#AAAAAA") + guides(fill = "none")
  }
  p +
    geom_line(colour = "#555555") +
    geom_point(aes(shape = sig, colour = sig, fill = sig)) +
    scale_colour_manual(values = colours) +
    scale_fill_manual(values = colours) +
    scale_shape_manual(values = shapes) +
    scale_x_continuous(breaks = seq(0, 48, 6)) +
    guides(
      colour = guide_legend(nrow = 3),
      fill   = guide_legend(NULL, nrow = 3),
      shape  = guide_legend(nrow = 3)
    ) +
    theme_bw() +
    theme(legend.position = "bottom")
}
And now we can use the same layout.
plot_diff_func(bs_verif, c("spread", "rmse"), "REF", "IFSENS", 0.95, "ribbon") +
  plot_diff_func(bs_verif, "mean_bias", "REF", "IFSENS", 0.95, "ribbon") /
  plot_diff_func(bs_verif, "crps", "REF", "IFSENS", 0.95, "ribbon") +
  plot_layout(guides = "collect") &
  theme(legend.position = "bottom")
A problem arises in bootstrapping if the data are in some way serially correlated, whether in time or in space. To get a representative statistic for each bootstrap replicate, we need to ensure that those serial correlations are maintained in each replicate.
The computation and interpretation of spatial autocorrelations is quite complicated, so it won’t be covered here (but see, e.g., moran.test from the spdep package). However, we can ignore the impact of spatial autocorrelations altogether by making sure that every station is represented in each bootstrap replicate. This can be done by pooling the data so that a so-called "block" bootstrap can be used, whereby blocks of data are drawn with replacement from the dataset. To ensure that all stations are included in each bootstrap replicate we can pool the data by forecast date. This is done with the pool_by argument to bootstrap_verify(), where unique values in the column named in pool_by are used to group the data together.
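Conceptually, each replicate of a block bootstrap draws whole pools - here whole forecast dates - with replacement, rather than individual rows. A minimal sketch of that idea (illustrative only, with a hypothetical helper name; not how bootstrap_verify() is implemented):
# Illustrative only: resample whole blocks (e.g. forecast dates) with
# replacement, so the structure within each block is kept in every replicate
block_replicate <- function(df, block_col = "fcst_dttm") {
  blocks  <- unique(df[[block_col]])
  sampled <- sample(blocks, length(blocks), replace = TRUE)
  # blocks drawn more than once appear more than once, as a bootstrap requires
  bind_rows(lapply(sampled, function(b) df[df[[block_col]] == b, ]))
}
In harp all of this is handled internally by bootstrap_verify() - we simply tell it to pool by forecast date: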
bs_fcdate_block <- bootstrap_verify(
  t2m, ens_verify, T2m, 100, pool_by = "fcst_dttm", parallel = TRUE, num_cores = 50
)
## Bootstrapping on 8 cores.
We can now compare the confidence intervals for the differences - on the left is the basic bootstrap, and on the right the version with replicates drawn from pools of data grouped by forecast date.
(
  plot_diff_func(bs_verif, c("spread", "rmse"), "REF", "IFSENS") +
    plot_diff_func(bs_fcdate_block, c("spread", "rmse"), "REF", "IFSENS")
) /
  (
    plot_diff_func(bs_verif, c("mean_bias"), "REF", "IFSENS") +
      plot_diff_func(bs_fcdate_block, c("mean_bias"), "REF", "IFSENS")
  ) /
  (
    plot_diff_func(bs_verif, c("crps"), "REF", "IFSENS") +
      plot_diff_func(bs_fcdate_block, c("crps"), "REF", "IFSENS")
  ) +
  plot_layout(guides = "collect") &
  theme(legend.position = "bottom")
It is clear that we become less confident in the differences: the error bars widen and the significance of some of the differences changes, particularly for the bias.
Let’s look at how the errors are correlated in time. We can do this by grouping the verification by lead time and forecast date and then calculating the autocorrelations, using R’s acf() function, as a function of the forecast date for each lead time.
verif_time <- ens_verify(
  t2m, T2m, groupings = c("lead_time", "fcst_dttm"), verify_members = FALSE,
  hexbin = FALSE, rank_hist = FALSE
)
acf() returns a list, but we just want the lag and the autocorrelation values from it, so we make a function to extract those variables into a data frame.
acf_to_df <- function(x) {
  ac <- acf(x, plot = FALSE)
  data.frame(lag = as.vector(ac$lag), autocorrelation = as.vector(ac$acf))
}
We can then use dplyr’s summarise() function to compute the autocorrelations for each lead time and forecast model. We need to make sure that the data are arranged in order of fcst_dttm to get the correct lags.
autocorr <- group_by(verif_time$ens_summary_scores, fcst_model, lead_time) %>%
  arrange(fcst_dttm) %>%
  summarise(ac = list(acf_to_df(mean_bias))) %>%
  unnest(ac)
And now we see how the forecast errors are correlated in time for each of the models.
ggplot(autocorr, aes(lag, autocorrelation, fill = fcst_model, colour = fcst_model)) +
  geom_abline(colour = "grey20", slope = 0, intercept = 0) +
  geom_line() +
  facet_wrap(vars(
    paste0("LT = ", fct_inorder(formatC(lead_time, width = 2, flag = "0")), "h")
  )) +
  scale_colour_manual(NULL, values = c(REF = "#DD5555", IFSENS = "#5555DD")) +
  scale_x_continuous(breaks = seq(0, 20)) +
  theme_bw() +
  labs(x = "Lag (days)", y = "Autocorrelation", fill = NULL) +
  theme(legend.position = "bottom")
In general the errors are autocorrelated for 1 - 2 days, but in some cases it’s up to 4 days. We can use this information to create pools of consecutive days for the block bootstrap. For these data a pool size of 4 days (day 0 - day 3) seems like a good compromise. However, this could be different depending on the forecast parameter, time of year and model domain. Generating the pools is a two-stage process - first we define the pools using make_bootstrap_pools(), and then we pass the output of this to the pool_by argument in bootstrap_verify().
bs_4day_pools <- make_bootstrap_pools(t2m, fcst_dttm, "4d")
bs_4day_pools
## # A tibble: 23 × 2
## fcst_dttm pool
## <dttm> <dbl>
## 1 2018-03-08 00:00:00 1
## 2 2018-03-09 00:00:00 1
## 3 2018-03-10 00:00:00 1
## 4 2018-03-11 00:00:00 1
## 5 2018-03-12 00:00:00 2
## 6 2018-03-13 00:00:00 2
## 7 2018-03-14 00:00:00 2
## 8 2018-03-15 00:00:00 2
## 9 2018-03-16 00:00:00 3
## 10 2018-03-17 00:00:00 3
## # ℹ 13 more rows
bs_4day_block <- bootstrap_verify(
  t2m, ens_verify, T2m, 100, pool_by = bs_4day_pools, parallel = TRUE, num_cores = 50
)
## Bootstrapping on 8 cores.
Now we can compare the impact of including temporal autocorrelations in the bootstrapping procedure - pooling by forecast date is on the left and pooling by blocks of 4 forecast days on the right.
(
  plot_diff_func(bs_fcdate_block, c("spread", "rmse"), "REF", "IFSENS") +
    plot_diff_func(bs_4day_block, c("spread", "rmse"), "REF", "IFSENS")
) /
  (
    plot_diff_func(bs_fcdate_block, c("mean_bias"), "REF", "IFSENS") +
      plot_diff_func(bs_4day_block, c("mean_bias"), "REF", "IFSENS")
  ) /
  (
    plot_diff_func(bs_fcdate_block, c("crps"), "REF", "IFSENS") +
      plot_diff_func(bs_4day_block, c("crps"), "REF", "IFSENS")
  ) +
  plot_layout(guides = "collect") &
  theme(legend.position = "bottom")
A final thing to consider is whether the pools of data should overlap so that all serial correlations in time are kept. We can do that by setting overlap = TRUE when making the pools.
bs_4day_ol_pools <- make_bootstrap_pools(t2m, fcst_dttm, "4d", overlap = TRUE)
bs_4day_ol_block <- bootstrap_verify(
  t2m, ens_verify, T2m, 100, pool_by = bs_4day_ol_pools, parallel = TRUE, num_cores = 50
)
## Bootstrapping on 8 cores.
Again we can compare the difference - 4 day pools on the left, 4 day overlapping pools on the right.
(
  plot_diff_func(bs_4day_block, c("spread", "rmse"), "REF", "IFSENS") +
    plot_diff_func(bs_4day_ol_block, c("spread", "rmse"), "REF", "IFSENS")
) /
  (
    plot_diff_func(bs_4day_block, c("mean_bias"), "REF", "IFSENS") +
      plot_diff_func(bs_4day_ol_block, c("mean_bias"), "REF", "IFSENS")
  ) /
  (
    plot_diff_func(bs_4day_block, c("crps"), "REF", "IFSENS") +
      plot_diff_func(bs_4day_ol_block, c("crps"), "REF", "IFSENS")
  ) +
  plot_layout(guides = "collect") &
  theme(legend.position = "bottom")
Here there is no obvious difference, so maybe the pools do not need to overlap.
Plotting a score card from these data can be done with the plot_scorecard() function.
plot_scorecard(
  bs_4day_block,
  fcst_model = "REF",
  ref_model  = "IFSENS",
  scores     = c("mean_bias", "rmse", "spread", "stde", "crps")
)