確率的推論と行動選択

!
! control as inference active inference
!
!
!
! Christopher L Buckley
!
!
!
2
! On the Relationship Between Active Inference and Control as Inference [Millidge+ 20] Control as inference active inference
! Active inference: demystified and compared [Sajid+ 20] Active inference
! Reinforcement Learning and Control as Probabilistic Inference: Tutorial and Review [Levine 18] Control as inference
! Reinforcement Learning as Iterative and Amortised Inference [Millidge+ 20] Control as Inference amortized
! What does the free energy principle tell us about the brain? [Gershman 19] Active inference
! Hindsight Expectation Maximization for Goal-conditioned Reinforcement Learning [Tang+ 20] Control as inference Variational RL

MDP
! MDP
! state action
state transition probability
! MDP
t st ∈ 𝒮 at ∈ 𝒜 t + 1
st+1 p (st+1 |st, at)
3
st−1 st st+1
at−1 at at+1

POMDP
! MDP observation
!
! POMDP
s o
o s p(o|s)
4
st−1 st st+1
at−1 at at+1
ot−1 ot ot+1

! MDP policy
! trajectory
!
! reward
!
p (a|s)
T τ = (s1, a1, . . . , sT, aT)
r (st, at)
$$\mathbb{E}_{p(\tau)}\left[\sum_{t=1}^{T} r(s_t, a_t)\right]$$
popt (a|s)
5
$$p(\tau) = p(s_{1:T}, a_{1:T}) = \prod_{t=1}^{T} p(a_t \mid s_t)\, p(s_t \mid s_{t-1}, a_{t-1})$$

! plan
!
! Active inference
!
!
!
π = [a1, . . . , aT]
T τ = (s1:T, π)
π
6
$$p(\tau) = p(\pi)\, p(s_{1:T} \mid \pi) = p(\pi) \prod_{t=1}^{T} p(s_t \mid s_{t-1}, \pi)$$

! preference
?
1.
! Control as inference RL as inference Planning as inference
! Variational RL
2.
!
! active inference
7

Control as Inference Variational RL
8

! optimality variable
!
!
=>
𝒪t ∈ {0,1}
t st at 𝒪t = 1 t
r
9
p(𝒪t = 1|st, at) := exp (r (st, at))
st
𝒪t
at
st+1
𝒪t+1
at+1
st−1
𝒪t−1
at−1

ELBO
! ELBO
! ELBO
! ELBO
!
q(τ) p(τ)
12
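$$
\begin{aligned}
\log p(\mathcal{O}_{1:T}) &= \log \int p(\mathcal{O}_{1:T}, \tau)\, d\tau \\
&= \log \mathbb{E}_{q(\tau)}\left[\frac{p(\mathcal{O}_{1:T}, \tau)}{q(\tau)}\right] \\
&\geq \mathbb{E}_{q(\tau)}\left[\log p(\mathcal{O}_{1:T} \mid \tau) + \log p(\tau) - \log q(\tau)\right] \\
&= \mathbb{E}_{q(\tau)}\left[\sum_{t=1}^{T} r(s_t, a_t)\right] - D_{\mathrm{KL}}\left[q(\tau)\,\|\,p(\tau)\right] =: L(q)
\end{aligned}
$$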
τ
𝒪1:t
p (τ| 𝒪1:T) ≈ q(τ)
p (τ)
p ( 𝒪1:T |τ)

1.
!
!
!
!
!
control as inference; CAI
$$p(a_t \mid s_t) = \frac{1}{|\mathcal{A}|}$$
qϕ (at ∣ st) ϕ
13
$$q_\phi(\tau) := \prod_{t=1}^{T} q_\phi(a_t \mid s_t)\, q(s_t \mid s_{t-1}, a_{t-1}) = \prod_{t=1}^{T} q_\phi(a_t \mid s_t)\, p(s_t \mid s_{t-1}, a_{t-1})$$
$$p(\tau) := \prod_{t=1}^{T} p(a_t \mid s_t)\, p(s_t \mid s_{t-1}, a_{t-1}) = \frac{1}{|\mathcal{A}|^{T}} \prod_{t=1}^{T} p(s_t \mid s_{t-1}, a_{t-1})$$

1.
! ELBO
!
!
14
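$$
\begin{aligned}
L(\phi) &= \mathbb{E}_{q_\phi(\tau)}\left[\sum_{t=1}^{T} r(s_t, a_t)\right] - D_{\mathrm{KL}}\left[q_\phi(\tau)\,\|\,p(\tau)\right] \\
&= \mathbb{E}_{q_\phi(\tau)}\left[\sum_{t=1}^{T} \bigl(r(s_t, a_t) - \log q_\phi(a_t \mid s_t)\bigr)\right] - T \log |\mathcal{A}| \\
&= \mathbb{E}_{q_\phi(\tau)}\left[\sum_{t=1}^{T} \bigl(r(s_t, a_t) + \mathcal{H}(q_\phi(a_t \mid s_t))\bigr)\right] - T \log |\mathcal{A}|
\end{aligned}
$$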
$$J(\phi) := \mathbb{E}_{q_\phi(\tau)}\left[\sum_{t=1}^{T} \bigl(r(s_t, a_t) + \mathcal{H}(q_\phi(a_t \mid s_t))\bigr)\right]$$

Soft Actor-Critic
! Soft Actor-Critic SAC [Haarnoja+ 17, 18]
! ELBO off-policy .
! Q
! Q critic actor
!
! Control as Inference https://deeplearning.jp/reinforcement_cource-2020s/
! Control as Inference https://www.slideshare.net/DeepLearningJP2016/dlcontrol-as-inference-201266247
Qθ (st, at) = r(st, at) + 𝔼p(st+1|st,at) [V(st+1)]
Qθ (st, at) qϕ(at |st)
15
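$$J_t^{q}(\phi) = \mathbb{E}_{q_\phi(a_t \mid s_t)\, p(s_t)}\left[\log q_\phi(a_t \mid s_t) - Q_\theta(s_t, a_t)\right]$$
$$J_t^{Q}(\theta) = \mathbb{E}_{q_\phi(a_t \mid s_t)\, p(s_t)}\left[\Bigl(r(s_t, a_t) + \mathbb{E}_{p(s_{t+1} \mid s_t, a_t)}\bigl[V_{\bar{\theta}}(s_{t+1})\bigr] - Q_\theta(s_t, a_t)\Bigr)^{2}\right]$$
$$V_\theta(s_{t+1}) = \mathbb{E}_{q_\phi(a_{t+1} \mid s_{t+1})}\left[Q_\theta(s_{t+1}, a_{t+1}) - \log q_\phi(a_{t+1} \mid s_{t+1})\right]$$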
Q

POMDP
! Control as inference POMDP
! VAE
16
! SLAC[Lee+ 19]
! RNN
!
! [Han+ 19]
! RNN VRNN[Chung+ 16]
! variational recurrent model VRMat

CAI
! CAI
! Mirror descent [Bubeck, 14]
=> Variational Inference Model Predictive Control VI-MPC [Okada+ 19]
!
π
𝒲(π) = 𝔼q(τ)[p(𝒪1:T |τ)]
p(𝒪1:T |τ) := f(r(τ))
17
$$q^{(i+1)}(\pi) \leftarrow \frac{q^{(i)}(\pi) \cdot \mathcal{W}(\pi) \cdot q^{(i)}(\pi)}{\mathbb{E}_{q^{(i)}(\pi)}\left[\mathcal{W}(\pi) \cdot q^{(i)}(\pi)\right]}$$
[Okada+ 19]

Control as inference
! CAI
! SAC VI-MPC
! amortized [Kingma+ 13]
! [Millidge+ 20]
! amortized
18

2.
! CAI
! ELBO
! ELBO
!
=> Variational RL
p (at ∣ st)
q θ
19
$$p_\theta(\tau) := \prod_{t=1}^{T} p_\theta(a_t \mid s_t)\, p(s_t \mid s_{t-1}, a_{t-1})$$
$$L(\theta, q) = \mathbb{E}_{q(\tau)}\left[\sum_{t=1}^{T} r(s_t, a_t)\right] - D_{\mathrm{KL}}\left[q(\tau)\,\|\,p_\theta(\tau)\right]$$

EM
! E
!
! M
! E ELBO
!
! MPO[Abdolmaleki+ 18] V-MPO[Song+ 19]
! M E
θ θ = θold
θ
θ
20
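$$\hat{\theta} = \operatorname*{arg\,max}_{\theta}\ \mathbb{E}_{q(\tau)}\left[\log p_\theta(\tau)\right] = \operatorname*{arg\,max}_{\theta}\ \mathbb{E}_{q(\tau)}\left[\sum_{t=1}^{T} \log p_\theta(a_t \mid s_t)\right]$$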
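$$q(\tau) = p_{\theta_{\mathrm{old}}}(\tau \mid \mathcal{O}_{1:T}) = \frac{p(\mathcal{O}_{1:T} \mid \tau)\, p_{\theta_{\mathrm{old}}}(\tau)}{\sum_{\tau} p(\mathcal{O}_{1:T} \mid \tau)\, p_{\theta_{\mathrm{old}}}(\tau)}$$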

MPO E
! Maximum a posteriori Policy Optimization MPO [Abdolmaleki+ 18]
!
! E Q
! Q off-policy
! MPO DL
! https://www.slideshare.net/DeepLearningJP2016/dlhyper-parameter-agnostic-methods-in-reinforcement-learning
$\theta_{\mathrm{old}}$ $p_{\theta_{\mathrm{old}}}(a_t \mid s_t)$ $\hat{Q}_{\theta_{\mathrm{old}}}(s_t, a_t)$
21
$$q(\tau) = \prod_{t=1}^{T} q(a_t \mid s_t)\, p(s_t \mid s_{t-1}, a_{t-1})$$
$$q(a_t \mid s_t) \propto p_{\theta_{\mathrm{old}}}(a_t \mid s_t)\, \exp\left(\frac{\hat{Q}_{\theta_{\mathrm{old}}}(s_t, a_t)}{\eta}\right)$$
η > 0

Control as inference Variational RL
! Control as inference
! Variational RL
!
22
τ
𝒪1:T
p (τ| 𝒪1:T) ≈ q(τ)
p (τ)
p ( 𝒪1:T |τ)
τ
𝒪1:T
pθ (τ| 𝒪1:T) ≈ q(τ)
pθ (τ)
p ( 𝒪1:T |τ)
θ
Control as inference Variational RL

!
! Friston
!
!
24
※ ver.3
https://www.slideshare.net/masatoshiyoshida/ss-238982118

!
!
!
!
! unconscious inference
!
!
!
!
25
?
要因結果
推論（知覚）

!
!
!
o s
o s
26
p(o, s) = p(o|s)p(s)
$$p(s \mid o) = \frac{p(s)\, p(o \mid s)}{\sum_{s} p(s)\, p(o \mid s)}$$
推論
状態
⽣成
観測
内部モデル
（世界モデル）環境
!
"
o s

!
!
! Bayesian surprise
! active learning
!
!
a o a
u(o) = DKL[p(s ∣ o, a)||p(s ∣ a)] I(a)
a
I(a) a s o
I(a)
27
$$I(a) := \sum_{o} p(o \mid a)\, D_{\mathrm{KL}}\left[p(s \mid o, a)\,\|\,p(s \mid a)\right] = \mathbb{E}_{p(o \mid a)}\left[u(o)\right]$$

!
.
!
!
o1:T π = [a1, . . . , aT]
$$U(o_{1:T}) = \sum_{t=1}^{T} u(o_t)$$
28
$$I(\pi) = \mathbb{E}_{p(o_{1:T} \mid \pi)}\left[U(o_{1:T})\right] = \sum_{o_{1:T}} p(o_{1:T} \mid \pi)\, U(o_{1:T})$$

!
! ELBO
! ELBO variational free energy
! free energy principle
!
!
q(s)
−log p(o)
29
$$\log p(o) \geq \mathbb{E}_{q(s)}\left[\log \frac{p(o, s)}{q(s)}\right]$$
$$F(o, q) := -\mathbb{E}_{q(s)}\left[\log \frac{p(o, s)}{q(s)}\right]$$

!
!
!
! 1
!
!
! 2
o
−log p(o)
q
q(s)
30
F(o, q) = − log p(o) + DKL[q(s)||p(s|o)]

Active inference
!
! active inference AIF
t Gt
q(st |ot, π) ≈ p(st |ot, π)
33
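$$
\begin{aligned}
G_t(\pi) &= -\mathbb{E}_{q(o_t, s_t \mid \pi)}\left[\log \frac{p(o_t, s_t \mid \pi)}{q(s_t \mid \pi)}\right] \\
&\approx -\mathbb{E}_{q(o_t, s_t \mid \pi)}\left[\log \frac{p(o_t \mid \pi)\, q(s_t \mid o_t, \pi)}{q(s_t \mid \pi)}\right] \\
&= -\mathbb{E}_{q(o_t, s_t \mid \pi)}\left[\log p(o_t \mid \pi)\right] - \mathbb{E}_{q(o_t \mid \pi)}\left[D_{\mathrm{KL}}\left[q(s_t \mid o_t, \pi)\,\|\,q(s_t \mid \pi)\right]\right]
\end{aligned}
$$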

Active inference
!
! 1
!
! active inference
!
! 1 0
q = p
34
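$$
\begin{aligned}
G_t(\pi) &= -\mathbb{E}_{q(o_t, s_t \mid \pi)}\left[\log p(o_t \mid \pi)\right] - \mathbb{E}_{q(o_t \mid \pi)}\left[D_{\mathrm{KL}}\left[q(s_t \mid o_t, \pi)\,\|\,q(s_t \mid \pi)\right]\right] \\
&= -\mathbb{E}_{p(o_t, s_t \mid \pi)}\left[\log p(o_t \mid \pi)\right] - \mathbb{E}_{p(o_t \mid \pi)}\left[D_{\mathrm{KL}}\left[p(s_t \mid o_t, \pi)\,\|\,p(s_t \mid \pi)\right]\right] \\
&= \mathbb{E}_{p(s_t \mid \pi)}\left[\mathcal{H}\bigl(p(o_t \mid \pi)\bigr)\right] - I(\pi)
\end{aligned}
$$
※ $p(s_t \mid s_{t-1}, \pi)$ $p(s_t \mid \pi)$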

Active inference
!
! 1
!
! extrinsic value
! 2
! bayesian surprise
! intrinsic value
=>
35
−Gt(π) = 𝔼q(ot,st|π) [log p(ot |π)] + 𝔼q(ot|π) [DKL[q(st |ot, π)||q(st |π)]]

Active inference
!
!
!
!
!
[Gershman+ 19]
!
36
˜p(o1:T) = exp(r(o1:T))
※ ˜p

Control as inference active inference
37

active inference
! Active inference AIF [Millidge+ 20]
!
!
! t −Gt(ϕ)
38
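$$\tilde{p}(s_t, o_t, a_t) = p(s_t \mid o_t, a_t)\, p(a_t \mid s_t)\, \tilde{p}(o_t \mid a_t) \approx q(s_t \mid o_t, a_t)\, p(a_t \mid s_t)\, \tilde{p}(o_t \mid a_t)$$
$$q_\phi(s_t, a_t) = q_\phi(a_t \mid s_t)\, q(s_t)$$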
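$$
\begin{aligned}
-G_t(\phi) &= \mathbb{E}_{q_\phi(o_t, s_t, a_t)}\left[\log \frac{\tilde{p}(s_t, o_t, a_t)}{q_\phi(s_t, a_t)}\right] \\
&\approx \mathbb{E}_{q_\phi(o_t, s_t, a_t)}\left[\log \tilde{p}(o_t \mid a_t) + \log p(a_t \mid s_t) + \log q(s_t \mid o_t, a_t) - \log q_\phi(a_t \mid s_t) - \log q(s_t)\right] \\
&= \mathbb{E}_{q_\phi(o_t, s_t, a_t)}\left[\log \tilde{p}(o_t \mid a_t)\right] - \mathbb{E}_{q_\phi(o_t, s_t, a_t)}\left[\log q_\phi(a_t \mid s_t) - \log p(a_t \mid s_t)\right] + \mathbb{E}_{q_\phi(o_t, s_t, a_t)}\left[\log q(s_t \mid o_t, a_t) - \log q(s_t)\right] \\
&\approx \mathbb{E}_{q(o_t \mid a_t)}\left[\log \tilde{p}(o_t \mid a_t)\right] - \mathbb{E}_{q(s_t)}\left[D_{\mathrm{KL}}\bigl(q_\phi(a_t \mid s_t)\,\|\,p(a_t \mid s_t)\bigr)\right] + \mathbb{E}_{q(o_t, a_t \mid s_t)}\left[D_{\mathrm{KL}}\bigl(q(s_t \mid o_t, a_t)\,\|\,q(s_t \mid a_t)\bigr)\right]
\end{aligned}
$$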
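$$= \mathbb{E}_{q(o_t \mid a_t)}\left[\log \tilde{p}(o_t \mid a_t)\right] + \mathbb{E}_{q(s_t)}\left[\mathcal{H}\bigl(q_\phi(a_t \mid s_t)\bigr)\right] + \mathbb{E}_{q(o_t, a_t \mid s_t)}\left[D_{\mathrm{KL}}\bigl(q(s_t \mid o_t, a_t)\,\|\,q(s_t \mid a_t)\bigr)\right]$$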
$$p(a_t \mid s_t) = \frac{1}{|\mathcal{A}|}$$

AIF CAI
! CAI
! AIF
! 1
! 2
! AIF
! AIF 3
! CAI AIF
!
39
$$\mathbb{E}_{q(s_t, a_t)}\left[\log p(\mathcal{O}_t \mid s_t, a_t)\right] + \mathbb{E}_{q(s_t)}\left[\mathcal{H}\bigl(q_\phi(a_t \mid s_t)\bigr)\right]$$
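$$\mathbb{E}_{q(o_t \mid a_t)}\left[\log \tilde{p}(o_t \mid a_t)\right] + \mathbb{E}_{q(s_t)}\left[\mathcal{H}\bigl(q_\phi(a_t \mid s_t)\bigr)\right] + \mathbb{E}_{q(o_t, a_t \mid s_t)}\left[D_{\mathrm{KL}}\bigl(q(s_t \mid o_t, a_t)\,\|\,q(s_t \mid a_t)\bigr)\right]$$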

Likelihood-AIF
! AIF CAI Likelihood-AIF
!
! CAI
˜p(ot) ˜p(ot |st)
−Gt
$q(s_t) = p(s_t)$, $p(a_t \mid s_t) = \frac{1}{|\mathcal{A}|}$
40
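$$
\begin{aligned}
-G_t(\phi) &= \mathbb{E}_{q_\phi(o_t, s_t, a_t)}\left[\log \frac{\tilde{p}(s_t, o_t, a_t)}{q_\phi(s_t, a_t)}\right] \\
&= \mathbb{E}_{q_\phi(o_t, s_t, a_t)}\left[\log \tilde{p}(o_t \mid s_t) + \log p(s_t) + \log p(a_t \mid s_t) - \log q_\phi(a_t \mid s_t) - \log q(s_t)\right] \\
&= \mathbb{E}_{q_\phi(s_t, a_t)}\left[\log \tilde{p}(o_t \mid s_t)\right] - D_{\mathrm{KL}}\bigl(q(s_t)\,\|\,p(s_t)\bigr) - \mathbb{E}_{q(s_t)}\left[D_{\mathrm{KL}}\bigl(q_\phi(a_t \mid s_t)\,\|\,p(a_t \mid s_t)\bigr)\right]
\end{aligned}
$$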
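$$-G_t(\phi) = \mathbb{E}_{q_\phi(s_t, a_t)}\left[\log \tilde{p}(o_t \mid s_t)\right] + \mathbb{E}_{q(s_t)}\left[\mathcal{H}\bigl(q_\phi(a_t \mid s_t)\bigr)\right]$$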

Likelihood-AIF CAI
! CAI
! Likelihood-AIF
! 2
! AIF POMDP MDP CAI 1
! CAI
! 2
log ˜p (ot ∣ st) = log p ( 𝒪t |st, at)
41
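$$\mathbb{E}_{q_\phi(s_t, a_t)}\left[\log p(\mathcal{O}_t \mid s_t, a_t)\right] + \mathbb{E}_{q(s_t)}\left[\mathcal{H}\bigl(q_\phi(a_t \mid s_t)\bigr)\right]$$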
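$$\mathbb{E}_{q_\phi(s_t, a_t)}\left[\log \tilde{p}(o_t \mid s_t)\right] + \mathbb{E}_{q(s_t)}\left[\mathcal{H}\bigl(q_\phi(a_t \mid s_t)\bigr)\right]$$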

CAI AIF
! CAI
!
!
!
!
! AIF
!
!
!
42

!
1.
! Control as inference
! Amortized
! Variational RL
2.
! active inference
!
!
!
43

確率的推論と行動選択

Recomendados

Recomendados

Más contenido relacionado

La actualidad más candente

La actualidad más candente (20)

Similar a 確率的推論と行動選択

Similar a 確率的推論と行動選択 (20)

Más de Masahiro Suzuki

Más de Masahiro Suzuki (17)

Último

Último (20)

確率的推論と行動選択