修改公式1，2

2ac2b97f · Ruizhi Chen · 916cb6a8 · 2ac2b97f
Commit 2ac2b97f authored Sep 10, 2020 by Ruizhi Chen
Show whitespace changes
Inline Side-by-side

Showing with 4 additions and 5 deletions

AAAI2021/tex/theory.tex
+4 -5

No files found.
--- a/AAAI2021/tex/theory.tex
+++ b/AAAI2021/tex/theory.tex
-
 \section{Symbolic Language Producing}
 \label{sec:thory}

@@ -121,10 +120,10 @@ use the predicted result $\hat{t}$ of the listener agent as the
 evidence of whether to give positive rewards. Then, the gradients of the
 expected reward $J(\theta^S, \theta^L)$ can be calculated as follows:
 \begin{align}
-  \nabla_{\theta^S} J &= \mathbb{E}_{\pi^S, \pi^L} \left[ r(\hat{t}, t) \cdot
-    \nabla_{\theta^S} \log{\pi^S(s_0, s_1 | t)} \right] \\
-  \nabla_{\theta^L} J &= \mathbb{E}_{\pi^S, \pi^L} \left[ r(\hat{t}, t) \cdot
-    \nabla_{\theta^L} \log{\pi^S(\hat{t} | s_0, s_1)} \right]
+  \nabla_{\theta^S} J &= \mathbb{E}_{\pi^S_{\mathrm{old}}, \pi^L} \left[ r(\hat{t}, t) \cdot
+     \frac{\nabla_{\theta^S}\pi^S(s_0, s_1 \mid t)}{\pi^S_{\mathrm{old}}(s_0, s_1 \mid t)} \right] \\
+  \nabla_{\theta^L} J &= \mathbb{E}_{\pi^S, \pi^L_{\mathrm{old}}} \left[ r(\hat{t}, t) \cdot
+    \frac{\nabla_{\theta^L} \pi^L(\hat{t} \mid s_0, s_1)}{\pi^L_{\mathrm{old}}(\hat{t} \mid s_0, s_1)} \right]
 \end{align}