RStan サンプル - you1025/my_something_flagments GitHub Wiki

パラメータの推定

二項分布を仮定し 32/42, 27/41 の 2 パターンにおけるパラメータ theta およびそれらの差を推定する。

library(tidyverse)
library(rstan)
library(HDInterval)

options(mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)

stan_code <- "
data {
  int n1;
  int x1;
  int n2;
  int x2;
}
parameters {
  real<lower=0, upper=1> theta1;
  real<lower=0, upper=1> theta2;
}
model {
  x1 ~ binomial(n1, theta1);
  x2 ~ binomial(n2, theta2);
}
generated quantities {
  real diff_thetas = (theta1 - theta2);
  real is_1_greater_than_2 = (diff_thetas > 0);
}
"

model <- rstan::stan_model(model_code = stan_code)
fit <- rstan::sampling(
  model,
  data = list(
    n1 = 42,
    x1 = 32,
    n2 = 41,
    x2 = 27
  ),
  chains = 4,
  iter = 3500,
  warmup = 1000,
  seed = 1234
)
fit
#Inference for Stan model: 61a36f3e0f198d75bc1f8b3ca3bb3043.
#4 chains, each with iter=3500; warmup=1000; thin=1; 
#post-warmup draws per chain=2500, total post-warmup draws=10000.

#                      mean se_mean   sd   2.5%    25%    50%    75%  97.5% n_eff Rhat
#theta1                0.75    0.00 0.06   0.61   0.71   0.75   0.79   0.86  8411    1
#theta2                0.65    0.00 0.07   0.50   0.60   0.65   0.70   0.78  7686    1
#diff_thetas           0.10    0.00 0.09  -0.09   0.04   0.10   0.16   0.28  8183    1
#is_1_greater_than_2   0.85    0.00 0.36   0.00   1.00   1.00   1.00   1.00  7136    1
#lp__                -53.55    0.01 0.99 -56.27 -53.95 -53.25 -52.85 -52.58  4587    1

#Samples were drawn using NUTS(diag_e) at Fri Oct 15 16:08:03 2021.
#For each parameter, n_eff is a crude measure of effective sample size,
#and Rhat is the potential scale reduction factor on split chains (at 
#convergence, Rhat=1).

ESS 比率

サンプル数に対する ESS(Effective Sample Size) の比率
0.1(=10%) より小さいとよろしくないらしい(https://dastatis.github.io/pdf/StanAdvent2018/bayesplot.html)

rstan::stan_ess(fit, c("theta1", "theta2", "diff_thetas"))$data

可視化

rstan::stan_ess(fit, c("theta1", "theta2", "diff_thetas"))$data %>%
  tibble::as_tibble(rownames = "parameter") %>%

  # 各推定値(の中央値)でソート
  dplyr::mutate(parameter = forcats::fct_reorder(parameter, stat, .desc = T)) %>%

  ggplot(aes(parameter, stat)) +
    geom_col() +
    scale_y_continuous(limits = c(0, 1))

MCMC サンプルの抽出

mcmc_samples <- rstan::extract(fit, c("theta1", "theta2", "diff_thetas"))

代表値

EAP

eap1 <- mean(mcmc_samples$theta1) # 0.7488828

MED

med1 <- median(mcmc_samples$theta1) # 0.753199

MAP

# 密度推定
d1 <- density(mcmc_samples$theta1)

map1 <- d1$x[which.max(d1$y)] # 0.7702992

ベイズ信用区間

四分位数 + α

quantile(mcmc_samples$theta1, probs = c(0.025, 0.25, 0.5, 0.75, 0.975))
#     2.5%       25%       50%       75%     97.5% 
#0.6124567 0.7066960 0.7531990 0.7948876 0.8639592

HDI

# 密度推定
d1 <- density(mcmc_samples$theta1)

# 95% 区間
HDInterval::hdi(d1, credMass = 0.95)
#     lower     upper 
# 0.6216844 0.8728245 

# 99% 区間
HDInterval::hdi(d1, credMass = 0.99)
#     lower     upper 
# 0.5708920 0.8972801

集計サンプル

tibble::as_tibble(mcmc_samples) %>%

  # 代表値および HDI の算出
  dplyr::summarise(
    # 各列(推定値)ごとに代表値および HDI を算出
    dplyr::across(.fns = function(theta) {
      # 密度推定
      d <- density(theta)

      # MAP
      map <- d$x[which.max(d$y)]

      # 95% HDI
      hdi_95 <- HDInterval::hdi(d, credMass = 0.95)

      list(
        map = map,
        mean = mean(theta),
        median = median(theta),
        
        hdi_95_lower = hdi_95[[1]],
        hdi_95_upper = hdi_95[[2]]
      )
    })
  ) %>%

  # 表形式に整形
  dplyr::mutate(estimation = names(theta1)) %>%
  tidyr::unnest(cols = -estimation) %>%
  tidyr::pivot_longer(cols = -estimation, names_to = "parameter") %>%
  tidyr::pivot_wider(names_from = estimation, values_from = value)

# A tibble: 3 × 6
#  parameter     map   mean median hdi_95_lower hdi_95_upper
#  <chr>       <dbl>  <dbl>  <dbl>        <dbl>        <dbl>
#1 theta1      0.767 0.749  0.754        0.620         0.872
#2 theta2      0.659 0.651  0.653        0.509         0.790
#3 diff_thetas 0.119 0.0986 0.0999      -0.0930        0.288
⚠️ **GitHub.com Fallback** ⚠️