パッケージの装備
# データ整形汎用パッケージ
library(tidyverse)
## ─ Attaching packages ───────── tidyverse 1.2.1 ─
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ─ Conflicts ─────────── tidyverse_conflicts() ─
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# MCMC乱数発生器stanをRからつかうパッケージ
library(rstan)
## Loading required package: StanHeaders
## rstan (Version 2.17.3, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
##
## Attaching package: 'rstan'
## The following object is masked from 'package:tidyr':
##
## extract
# rstanを並列で使うオプション
options(mc.cores = parallel::detectCores())
# 変更なしの実行ファイルは保存しておくオプション
rstan_options(auto_write = TRUE)
# データ要約・可視化パッケージ
library(summarytools)
# 複数のグラフを並べて表示するパッケージ
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
# ベイズモデル比較指標計算パッケージ
library(loo)
## This is loo version 2.0.0.
## **NOTE: As of version 2.0.0 loo defaults to 1 core but we recommend using as many as possible. Use the 'cores' argument or set options(mc.cores = NUM_CORES) for an entire session. Visit mc-stan.org/loo/news for details on other changes.
# ベイズモデルの結果を可視化するパッケージ
library(bayesplot)
## This is bayesplot version 1.6.0
## - Online documentation and vignettes at mc-stan.org/bayesplot
## - bayesplot theme set to bayesplot::theme_default()
## * Does _not_ affect other ggplot2 plots
## * See ?bayesplot_theme_set for details on theme setting
# 描画の際に文字化けするMacユーザは次の行のコメントアウトをとって実行する
old = theme_set(theme_gray(base_family = "HiraKakuProN-W3"))
乱数による近似
# 数値例を発生
set.seed(12345)
# 標準正規分布から発生する100個の乱数をつくってみる
x100 <- rnorm(100,0,1)
# 一部表示
head(x100)
## [1] 0.5855288 0.7094660 -0.1093033 -0.4534972 0.6058875 -1.8179560
mean(x100) # 平均値
## [1] 0.2451972
var(x100) # 分散
## [1] 1.242625
sd(x100) # 標準偏差
## [1] 1.114731
max(x100) # 最大値
## [1] 2.477111
min(x100) # 最小値
## [1] -2.380358
median(x100) # 中央値
## [1] 0.4837183
# パーセンタイル
# 0%, 25%, 50%, 75%, 100%
quantile(x100,probs=c(0,0.25,0.5,0.75,1))
## 0% 25% 50% 75% 100%
## -2.3803581 -0.5901091 0.4837183 0.9003897 2.4771109
毎回答えが違う
# 標準正規乱数1
x100.1 <- rnorm(100,0,1)
# 標準正規乱数2
x100.2 <- rnorm(100,0,1)
# 標準正規乱数3
x100.3 <- rnorm(100,0,1)
# それぞれの平均値
mean(x100.1)
## [1] 0.04523311
mean(x100.2)
## [1] -0.04621158
mean(x100.3)
## [1] 0.2152759
サンプルサイズを増やすと理論値に近づく
# 1000サンプル
x1000 <- rnorm(1000,0,1)
mean(x1000)
## [1] -0.03272243
# 10000サンプル
x10000 <- rnorm(10000,0,1)
mean(x10000)
## [1] -0.004254936
# 100000サンプル
x100000 <- rnorm(100000,0,1)
mean(x100000)
## [1] 0.002199223
確率点
# quantile関数でサンプルのパーセンタイル点を算出
quantile(x100000,probs=c(0,0.25,0.33,0.75,1))
## 0% 25% 33% 75% 100%
## -4.5631823 -0.6735926 -0.4335554 0.6758085 5.5830210
# 標準正規分布の理論的q点
qnorm(0.25,0,1)
## [1] -0.6744898
qnorm(0.33,0,1)
## [1] -0.4399132
qnorm(0.75,0,1)
## [1] 0.6744898
ある数字よりも大きく(小さく)なる確率
length(x100000[x100000<1.96])/length(x100000)
## [1] 0.97464
pnorm(1.96,0,1)
## [1] 0.9750021
可視化
# データをデータフレームにまとめる
data.frame(class=c(rep(1,NROW(x100)),
rep(2,NROW(x1000)),
rep(3,NROW(x10000)),
rep(4,NROW(x100000))),
value=c(x100,x1000,x10000,x100000)) %>%
# グループ名を作る変数を作成
mutate(class=as.factor(class)) %>%
# 作図。x軸は値。グループごとに分けたヒストグラム
ggplot(aes(x=value))+geom_histogram(binwidth = 0.1)+xlim(-4,4)+
facet_wrap(~class,scales = "free")
## Warning: Removed 8 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).
積分も簡単
# 該当する列数/総列数
NROW(x100000[x100000>0 & x100000 <1])/NROW(x100000)
## [1] 0.3428
data.frame(val=x100000) %>% mutate(itg=ifelse(val>0&val<1,1,2)) %>%
mutate(itg=as.factor(itg)) %>%
ggplot(aes(x=val,fill=itg))+geom_histogram(binwidth = 0.01) +
xlim(-4,4) +theme(legend.position = "none")
## Warning: Removed 8 rows containing non-finite values (stat_bin).