library(ggplot2)
library(viridis)
library(hrbrthemes)
library(cmdstanr)
library(bayesplot)
library(spatstat)
library(dplyr)
library(tidyr)
source("presence_only_functions.R")
source("surface_functions.R")
5 Simulation and modelling
5.1 Implementation
The previous sections discussed the theoretical and practical aspects of data generation. We now proceed to implement the data generation procedure. We first generate observations at each site and create plots of observed intensities, bias, and counts. We then fit a Poisson Point Process to the generated data.
We create a ring-shaped environment based on two correlated covariates and generate PO points accordingly.
<- 100
area_D <- 20
k <- k^2
sites # Parameter values somewhere around Fithian 2015.
<- -2
alpha <- 2
beta <- -2
gamma <- 0.5
delta <- list(alpha=alpha, beta=beta, gamma=gamma, delta=delta)
params <- 0.7
aux_cor <- get_sampling_surface_donut(k) grid_points
<- get_bias_surface_correlated(grid_points, aux_cor) grid_points
[1] "Correlation between X and Z"
[,1] [,2]
[1,] 1.0000000 0.8318977
[2,] 0.8318977 1.0000000
<- sites/4
sites <- grid_points %>% dplyr::filter(x<(k/2)+1, y<(k/2)+1)
grid_points # Filter for only a quarter of the grid.
# If we filter, we need to change the total sites as well
# Need distances for LGCP
<- as.matrix(dist(grid_points[,1:2]))
distance_mat <- 1
data_reps <- specify_corr(grid_points[,1:2])
corr_matrix <- F
gp_bool <- generate_ppp_data_r(grid_points, params, sites, data_reps, corr_matrix, gp_bool, area_D)
sim_data <- which(sim_data$Y>0)
Y_positive_indices stopifnot(sum(sim_data$Y[Y_positive_indices]) == nrow(sim_data$coordinates))
$Y_coords sim_data
x y
1 8.875951 1.773338
2 8.418654 2.586547
3 7.682086 2.968490
4 9.443523 2.937034
5 9.232656 3.004253
6 9.419722 3.417642
7 9.369643 2.645977
8 10.242691 3.446854
9 10.425511 3.301856
10 9.850484 2.680007
11 5.661172 4.209399
12 6.244311 4.384666
13 6.620891 3.542528
14 6.521558 3.645729
15 9.643957 3.970256
16 4.693362 4.684367
17 5.261157 4.808354
18 7.162766 4.505626
19 4.276170 5.821939
20 3.873761 5.525741
21 4.134683 6.405719
22 1.843085 7.150085
23 2.543938 7.697512
24 2.708596 7.552906
25 2.873088 9.258479
26 2.668776 8.519745
27 3.293108 8.620217
28 2.620333 10.064382
29 3.022244 9.757110
30 3.112645 10.053107
grid_points[Y_positive_indices,]
x y aux_x aux_z
19 9 2 0.3425419 0.25429287
28 8 3 1.7474542 2.01497588
29 9 3 2.2578390 1.58412306
30 10 3 2.3263479 1.22750804
36 6 4 1.9218166 1.24545874
37 7 4 1.7189159 1.11232836
40 10 4 0.4314067 -0.43573796
45 5 5 2.2578390 2.30065458
47 7 5 0.1949836 0.02354021
54 4 6 1.9218166 2.54440191
62 2 7 -0.2854436 -0.59967665
73 3 8 1.7474542 1.04834902
83 3 9 2.2578390 1.26880450
93 3 10 2.3263479 2.20702428
The above output shows the exact location of the generated points and the associated \(x\) and \(z\) covariate values.
# Use the final generation iteration
<- ggplot(grid_points, aes(x, y, fill=sim_data$lambda[data_reps,])) +
p geom_tile() +
scale_fill_viridis(discrete=FALSE) +
ggtitle("Generated intensity per site")
print(p)
<- ggplot(grid_points, aes(x, y, fill=sim_data$bias[data_reps,])) +
p geom_tile() +
scale_fill_viridis(discrete=FALSE) +
ggtitle("Generated bias per site")
print(p)
<- sim_data$lambda[data_reps,]*sim_data$bias[data_reps,]
thinned_intensity <- ggplot() +
p geom_tile(grid_points, mapping=aes(x, y, fill=thinned_intensity, width=1, height=1), alpha=.6) +
scale_fill_viridis(discrete=FALSE, name="L*b") +
ggtitle("Generated thinning per site, including generated (or observed) individuals") +
geom_point(data=sim_data$Y_coords, mapping=aes(x=x, y=y), size=3, col="white") +
theme(panel.grid.minor = element_line(colour="white")) +
scale_y_continuous(breaks = seq(0, 20, 1)) +
scale_x_continuous(breaks = seq(0, 20, 1))
print(p)
The above plots show the PO points and the ring-shaped surface.
Next we fit the Thinned Nonhomogeneous Poisson Process model in Stan. We can make a few observations about this model. First, examine the likelihood: \[L(\lambda|Y) = \prod^{N}_{i=1}\frac{\lambda_i^{n_i}\exp[-\lambda_i]}{n_i!}\]. This is made using the assumption that the counts at sites \(s_i\) are distributed following a poisson process with mean \(\lambda\). This assumption allows for a simpler specification of the likelihood, since we can use the poisson distribution. The setting of \(|A|=1\) allows us to make this assumption, as we do not need to multiple the intensity parameter \(\lambda\) by the area in the model.
The next sections of code fitting and diagnosing the poisson process model in Stan, using the generated data.
<- stan_poisson_process
model_string write(model_string, model_path)
# data_reps is 1 here so just use the first index
= list(N = sites, X = grid_points$aux_x, Z = grid_points$aux_z, y = sim_data$Y[data_reps,])
data <- cmdstan_model(model_path)
model <- model$sample(data=data, seed=13, chains=3, iter_sampling=2000, iter_warmup=200) fit
The above code fits the table. We next examine the fit.
$summary(variables=c('alpha', 'beta', 'gamma', 'delta')) fit
# A tibble: 4 × 10
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 alpha -1.36 -1.37 6.74 6.72 -12.5 9.67 1.00 1995. 2148.
2 beta 1.81 1.81 0.386 0.372 1.18 2.45 1.00 2314. 2537.
3 gamma -1.42 -1.43 6.73 6.72 -12.5 9.73 1.00 2019. 2187.
4 delta -0.126 -0.119 0.377 0.382 -0.761 0.496 1.00 2401. 2583.
From the table above, we can observe that the parameters \(\beta\) and \(\delta\) have been recovered. However, due to the correlation of \(\alpha\) and \(\gamma\), only the sum \(\alpha+\gamma\) can be correctly identified. Our “true” sum of was equal to -4, and we can see the estimate here is \(~4.7\), which is reasonable given we incorporate priors and are estimating correlated parameters. The correlation between the two can be observed in the scatter plots below, as well as in the wide confidence interval of the posterior, relative to the other parameters.
5.2 Diagnostics
Here we check goodness of fit and Posterior Predictive Checks.
<- c('alpha', 'gamma', 'beta', 'delta')
param_vec <- fit$draws(variables=param_vec)
posterior color_scheme_set("mix-blue-pink")
<- mcmc_trace(posterior, pars = c("alpha", "beta", "gamma", "delta"),
p_trace facet_args = list(nrow = 2, labeller = label_parsed))
print(p_trace + facet_text(size = 15))
<- ggtitle(paste("Posterior distributions, with means and 90% interval"))
plot_title <- mcmc_areas(posterior, prob = 0.9, point_est="mean", regex_pars = c("alpha", "beta")) + plot_title
p_post print(p_post)
<- ggtitle(paste("Posterior distributions, with means and 90% interval"))
plot_title <- mcmc_areas(posterior, prob = 0.9, point_est="mean", regex_pars = c("gamma", "delta")) + plot_title
p_post print(p_post)
mcmc_intervals(posterior)
Importantly, note the wide credible intervals on \(\alpha\) and \(\gamma\). Why? See the scatter plots below.
<- ggtitle("Parameter posterior sample correlation")
plot_title <- mcmc_pairs(posterior)
p_post print(p_post)
mcmc_scatter(posterior, pars=c('alpha','gamma'))
There is near perfect correlation between \(\alpha\) and \(\gamma\), as expected.
5.3 PPD
We next look at the Posterior Predictive Distribution (PPD) of the observations per site. That is, we plot the generated counts \(Y_i\) for site \(s_i\) on the sampling surface (grid).
The PPD can be expressed as the samples from the predictive distribution of \(Y\) taking into account the uncertainty surrounding the parameters. The PPD can be written as \[ p(\tilde{y}|Y) = \int p(\tilde{y}|\lambda, b, Y)p(\lambda, b|Y) dbd\lambda \]
such that the posterior uncertainty of the parameters is integrated out.
Below we plot the generated values of \(\tilde{y}\) for each site. Additionally, the generated quantities of intensity, bias, and intensity*bias ($b) per site are also plotted on the grid.
<- c('lambda_bias', 'lambda_rep', 'b_rep', 'y_rep')
generated_vars <- fit$summary(variables=generated_vars[1])$mean
lambda_thinned <- fit$summary(variables=generated_vars[2])$mean
lambda_rep <- fit$summary(variables=generated_vars[3])$mean
bias_rep <- fit$summary(variables=generated_vars[4])$mean
y_ppd <- data.frame(x=grid_points$x, y=grid_points$y, y_rep=y_ppd, lt=lambda_thinned, l=lambda_rep, b=bias_rep)
ppd_df head(ppd_df)
x y y_rep lt l b
1 1 1 0.002000000 0.001857518 29666.24 243090.3
2 2 1 0.001166667 0.001859585 29731.43 243665.1
3 3 1 0.001666667 0.002067503 30430.25 263499.0
4 4 1 0.001333333 0.002184711 34790.56 244657.8
5 5 1 0.003500000 0.003014073 50946.62 245093.8
6 6 1 0.005166667 0.005084997 90423.75 243710.1
It is important to notice that while the thinned intensity is ostensibly realistic in estimation, the intensity and bias are not. This is due to the specification of the Thinned Poisson Process \[\lambda^* = \lambda(s)b(s)\] such that \[\lambda^* = \exp\left[\alpha + \beta*X(s) + \gamma + \delta*Z(s)\right]\]
We can then see the generated data in the sampling surface:
<- ggplot(ppd_df, aes(x, y, fill=y_rep)) +
p geom_tile() +
scale_fill_viridis(discrete=FALSE) +
ggtitle("PPD counts per site")
print(p)
<- ggplot(ppd_df, aes(x, y, fill=lt)) +
p geom_tile() +
scale_fill_viridis(discrete=FALSE) +
ggtitle("Thinned intensity per site")
print(p)
<- ggplot(ppd_df, aes(x, y, fill=l)) +
p geom_tile() +
scale_fill_viridis(discrete=FALSE) +
ggtitle("Intensity per site")
print(p)
<- ggplot(ppd_df, aes(x, y, fill=b)) +
p geom_tile() +
scale_fill_viridis(discrete=FALSE) +
ggtitle("Bias per site")
print(p)
The prediction of PO observatiosn at each site is realistic, as is the thinned intensity. But alone, the intensity and bias predictive disitributions are poorly estimated.
5.4 Correlation between coefficients
In the above plots, it seems that the bias and unthinned intensity are behaving strangely, even though the thinned intensity is generated as we would expect. This is a manifestation of the completely correlated parameters describing the behavior of the intensity and the bias. We can notice that the generated bias and intensity alone are both really large. The issue with the large values appears this way because we are taking the mean of the posterior distribution as our point estimate and showing that in the plot of the sites above. However, when we generate the thinned intensity, we multiply each draw from the posterior predictive chain together. So even though the mean of the posterior chain is high, the individual draws of the bias and intensity inversely fluctuate around each other, due the high correlation between the two parameters.
We take a look at the posterior intervals of the estimates to get a little better idea of this.
<- ggtitle(paste("Posterior distributions, with means and 90% interval"))
plot_title <- mcmc_intervals(fit$draws(), prob = 0.9, point_est="mean", regex_pars = generated_vars[1]) + plot_title
p_post print(p_post)
The intervals shown above are for the thinned bias. These are relatively reasonable, as stated.
<- mcmc_intervals(fit$draws(), prob = 0.9, point_est="mean", regex_pars = generated_vars[2]) + plot_title
p_post print(p_post)
Warning: Removed 37 rows containing missing values (`geom_point()`).
However, the intervals for the intensity (unthinned) are unrealistic, due to the unindentifiable parameters.
<- mcmc_intervals(fit$draws(), prob = 0.9, point_est="mean", regex_pars = generated_vars[3]) + plot_title
p_post print(p_post)
Warning: Removed 100 rows containing missing values (`geom_point()`).
And finally, the intervals of the generated bias is also not correct.
Then we look at the intervals and histograms of the posterior chains for just a few chains.
# The gen parameters for just a few of the sites
<- c('lambda_rep[1]', 'lambda_rep[10]', 'lambda_rep[20]')
gen_lambda <- c('lambda_bias[1]', 'lambda_bias[10]', 'lambda_bias[20]')
gen_lb <- c('b_rep[1]', 'b_rep[10]', 'b_rep[20]')
gen_b <- mcmc_intervals(fit$draws(), prob = 0.9, point_est="mean", pars = gen_lambda) + plot_title
p_post print(p_post)
Warning: Removed 2 rows containing missing values (`geom_point()`).
<- mcmc_intervals(fit$draws(), prob = 0.9, point_est="mean", pars = gen_b) + plot_title
p_post print(p_post)
Warning: Removed 3 rows containing missing values (`geom_point()`).
<- mcmc_intervals(fit$draws(), prob = 0.9, point_est="mean", pars = gen_lb) + plot_title
p_post print(p_post)
<- mcmc_hist(fit$draws(), pars = gen_lambda)
p_post print(p_post)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
<- mcmc_hist(fit$draws(), pars = gen_b)
p_post print(p_post)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
<- mcmc_hist(fit$draws(), pars = gen_lb)
p_post print(p_post)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Look at counts for each site
#gen_y <- c("y_rep[1]", "y_rep[10]", "y_rep[20]")
#p_post <- mcmc_hist(fit$draws(), pars=gen_y)
#print(p_post)
From this we notice that the intervals for some of the generated lambdas and biases are wide, but not for the thinned values. But even for those that have wide intervals, nearly all of the chain values in the histograms are the same. This is because it’s only a few values that cause the mean to be biased, due to the correlation of the parameters.
The take away is that the mean for the generated quantities is very skewed by the occassional large value. This high skew is caused by the correlation between b and l, since these parameters are not estimable between themselves. Therefore, we arrive at reasonable estimates for b*l since this is draw specific (where when one is high the other is low), but the overall mean for either b or l over multiple draws cannot be trusted. This is clear from the histograms of \(\lambda\) and the bias, where most of the values take on very low values, but do fluctuate higher. And when one goes higher, the other goes lower, which I’d like to check a specific example of, for one draw.
5.5 Diagnostics
Finally, it is possible to look at a variety of NUTS diagnostics metrics. I am not going to go into much detail here.
<- nuts_params(fit)
nuts_fit mcmc_parcoord(fit$ draws(), np=nuts_fit)
mcmc_nuts_divergence(nuts_fit, log_posterior(fit))
The parcord plot shows 1 line per iteration of the sampler. We can see the values each sampled parameter takes on, which allows us to notice the extent that correlation has on the sampled values. If there are any divergences in the sampling, these will show up in red and help us to pick out the parameters that may be contributing. to this.
There are no divergences so we don’t need to worry.
mcmc_rhat(rhat(fit))
mcmc_neff(neff_ratio(fit))
#mcmc_nuts_energy(nuts_fit, binwidth=1/2)
#mcmc_nuts_acceptance(nuts_fit, log_posterior(fit))
R hat, which compares between and within chain estimates, looks good.
“neff” is the effective sample size estimates the number of independent samples, after accounting for autocorrelation in the chains. The figure here shows there are a reasonable number of independent samples, though the threshold is somewhat arbitrary.
mcmc_acf(fit$draws(), pars=c('alpha', 'beta', 'gamma', 'delta'))
Autocorrelation in the chains looks well-handled.
$cmdstan_diagnose() fit
Processing csv files: C:/Users/msavery/AppData/Local/Temp/RtmpG2BpUK/thinned_poisson_process-202405241050-1-52106e.csv, C:/Users/msavery/AppData/Local/Temp/RtmpG2BpUK/thinned_poisson_process-202405241050-2-52106e.csv, C:/Users/msavery/AppData/Local/Temp/RtmpG2BpUK/thinned_poisson_process-202405241050-3-52106e.csv
Checking sampler transitions treedepth.
Treedepth satisfactory for all transitions.
Checking sampler transitions for divergences.
No divergent transitions found.
Checking E-BFMI - sampler transitions HMC potential energy.
E-BFMI satisfactory.
Effective sample size satisfactory.
Split R-hat values satisfactory all parameters.
Processing complete, no problems detected.
# Leave loo out.
#loo_results <- fit$loo(variables="lp__", cores=4)
#print(loo_results)
It appears we have no additional problems in the models according the the Stan diagnostics.
5.6 LGCP
We next move on to incorporating spatial correlation. To do so, we fit a Log Gaussian Cox Process (LGCP). Where before we specified the intensity as \[ \lambda(s) = \theta(s)b(s) = \exp[\alpha + \beta'x(s) + \gamma + \delta'z(s)] \] we now add a Gaussian process to incorporate spatial correlation between sites \[ \lambda(s) = \theta(s)b(s) = \exp[\alpha + \beta'x(s) + \gamma + \delta'z(s) + w(s)] \] so that the expected value for quadrat \(A\) will be the integral over the quadrat: \[ \Delta(A) = \int_A\lambda(s)ds = \int_A \exp[\alpha + \beta'x(s) + \gamma + \delta'z(s) + w(s)]ds \] We make the same approximations as before based on our limited covariate resolution.
While the topic of LGCPs on its own is quite interesting and deserves its own treatment, for now we examine its behavior only as an adjustment to the NHPP. In a future post I will discuss approximations to the LGCP, which are quite important as fitting the spatial correlation matrix is computationally intensive as the number of locations increases. If we want to apply the LGCP to developing optimal designs (another future topic), we will need an approximation.
For now, I proceed with fitting the model in Stan.
<- log_gaussian_cox_process
model_string write(model_string, model_path)
# Here we need to use the distance matrix
= list(N = sites, X = grid_points$aux_x, Z = grid_points$aux_z, y = sim_data$Y[data_reps,], D = distance_mat)
data <- cmdstan_model(model_path)
model <- model$sample(data=data, seed=13, chains=3, iter_sampling=2000, iter_warmup=200, thin = 10) lgcp_fit
Having fit the LGCP we can examine the parameters
$summary(variables=c('alpha', 'beta', 'gamma', 'delta')) lgcp_fit
# A tibble: 4 × 10
variable mean median sd mad q5 q95 rhat ess_bulk ess_tail
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 alpha -1.20 -1.19 0.746 0.800 -2.39 0.0364 1.00 636. 599.
2 beta 1.49 1.50 0.361 0.352 0.889 2.08 0.999 612. 602.
3 gamma -1.20 -1.25 0.753 0.794 -2.40 0.0336 1.00 536. 447.
4 delta 0.0336 0.0490 0.371 0.394 -0.587 0.622 1.00 642. 631.
<- lgcp_fit$draws()
posterior color_scheme_set("mix-blue-pink")
<- mcmc_trace(posterior, pars = c("alpha", "beta", "gamma", "delta"),
p_trace facet_args = list(nrow = 2, labeller = label_parsed))
print(p_trace + facet_text(size = 15))
The chains don’t look perfect and it would probably be better to run the sampling algorithm for longer. But for now this is ok.
Then check the posterior intervals:
<- ggtitle(paste("Posterior distributions, with means and 90% interval"))
plot_title <- mcmc_areas(posterior, prob = 0.9, point_est="mean", regex_pars = c("alpha", "beta")) + plot_title
p_post print(p_post)
<- ggtitle(paste("Posterior distributions, with means and 90% interval"))
plot_title <- mcmc_areas(posterior, prob = 0.9, point_est="mean", regex_pars = c("gamma", "delta")) + plot_title
p_post print(p_post)
mcmc_intervals(posterior, pars=c('alpha', 'beta', 'gamma', 'delta'))
#mcmc_hist(fit$draws(), pars = c('alpha', 'beta', 'gamma', 'delta'))
<- ggtitle("Parameter posterior sample correlation")
plot_title <- mcmc_pairs(posterior, pars=c('alpha', 'beta', 'gamma', 'delta'))
p_post print(p_post)
Interestingly, the \(\alpha\) and \(\gamma\) parameters are somewhat identified. We can see that the parameters are nearly recovered, though biased slightly towards 0. Additionally, The credible intervals are much smaller than in the Poisson point process, indicating that the parameters are indentifiable, though the scatter plots still show some correlation.
Finally we take a look at the posterior predictions for \(\lambda\), \(b\), and \(\lambda\cdot b\).
<- c('lambda_bias', 'lambda_rep', 'b_rep', 'y_rep')
generated_vars <- lgcp_fit$summary(variables=generated_vars[1])$mean
lambda_thinned <- lgcp_fit$summary(variables=generated_vars[2])$mean
lambda_rep <- lgcp_fit$summary(variables=generated_vars[3])$mean
bias_rep <- lgcp_fit$summary(variables=generated_vars[4])$mean
y_ppd <- data.frame(x=grid_points$x, y=grid_points$y, y_rep=y_ppd, lt=lambda_thinned, l=lambda_rep, b=bias_rep)
ppd_df <- ggplot(ppd_df, aes(x, y, fill=y_rep)) +
p geom_tile() +
scale_fill_viridis(discrete=FALSE) +
ggtitle("PPD counts per site")
print(p)
<- ggplot(ppd_df, aes(x, y, fill=lt)) +
p geom_tile() +
scale_fill_viridis(discrete=FALSE) +
ggtitle("Thinned intensity per site")
print(p)
<- ggplot(ppd_df, aes(x, y, fill=l)) +
p geom_tile() +
scale_fill_viridis(discrete=FALSE) +
ggtitle("Intensity per site")
print(p)
<- ggplot(ppd_df, aes(x, y, fill=b)) +
p geom_tile() +
scale_fill_viridis(discrete=FALSE) +
ggtitle("Bias per site")
print(p)
This time, we get realistic estimates of \(b\) and \(\lambda\), before thinning. Inducing the correlation structure allows us to better recover the parameters, albeit at a high computational cost.
Having fit the Poisson Process and LGCP to to simulated PO data and observed the behavior of each model, we have met the goal we set out to accomplish. Hopefully this can serve as a useful resource to anyone interested the topic.