~

458544ad · Zidong Du · 4302acd8 · 458544ad
Commit 458544ad authored Sep 10, 2020 by Zidong Du
Hide whitespace changes
Inline Side-by-side

Showing with 39 additions and 38 deletions

AAAI2021/tex/theory.tex
+39 -38

No files found.
--- a/AAAI2021/tex/theory.tex
+++ b/AAAI2021/tex/theory.tex
@@ -17,10 +17,48 @@
 \end{figure*}


-Before going to the detail of the training algorithms, we first introduce the environment, gaming rules, and agent architecture for enabling the emergence of symbolic language. 

+\begin{algorithm}[t]
+  \caption{Learning Algorithm$(t,\hat{t})$}
+  \label{al:learning}
+  \small
+	\begin{algorithmic}[1]
+		\IF{Training the speaker agent S}
+		\FOR{Batch T randomly selected from $M_0\times M_1$}
+        \FOR{$t=(c_0,c_1)$ in T}
+        \STATE $P(s_0|t),P(s_1|t)=\pi_{old}^S(s=(s_0,s_1)|t)$
+        \STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
+        \STATE $P(\hat{t}|s) = \pi^L(\hat{t}|s)$ 
+        \STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
+        \STATE Get reward $r(\hat{t},t)$
+        \STATE $J(\theta^S,\theta^L)=E_{\pi_{old}^S,\pi^L}[r(\hat{t},t)\cdot\frac{\pi^S(s|t)}{\pi^S_{old}(s|t)}]$
+        \STATE Update $\theta^S$ by $\nabla_{\theta^S}J$
+        \ENDFOR
+        \STATE $\pi_{old}^S\leftarrow \pi^S$
+		\ENDFOR
+		\ENDIF
+	
+		\IF{Training the listener agent L}
+		\FOR{Batch T randomly selected from $M_0\times M_1$}
+		\FOR{$t=(c_0,c_1)$ in T}
+		\STATE $P(s_0|t),P(s_1|t)=\pi^S(s=(s_0,s_1)|t)$
+		\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
+		\STATE $P(\hat{t}|s) = \pi^L_{old}(\hat{t}|s)$ 
+		\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
+		\STATE Get reward $r(\hat{t},t)$
+		\STATE $J(\theta^S,\theta^L)=E_{\pi^S,\pi_{old}^L}[r(\hat{t},t)\cdot\frac{\pi^L(\hat{t}|s)}{\pi^L_{old}(\hat{t}|s)}]$
+		\STATE Update $\theta^L$ by $\nabla_{\theta^L}J$
+		\ENDFOR
+		\STATE $\pi_{old}^L\leftarrow \pi^L$
+		\ENDFOR
+		\ENDIF
+	\end{algorithmic}
+\end{algorithm}


+
+Before going into the details of the training algorithms, we first introduce the environment, game rules, and agent architecture that enable the emergence of symbolic language.
+
 \subsection{Environment setup}
 \label{ssec:env}
 Figure~\ref{fig:game} shows the entire environment used in this study,
@@ -107,40 +145,3 @@ expected reward $ J(\theta_S, \theta_L)$ can be calculated as follows:
 \end{align}


-\begin{algorithm}[t]
-  \caption{Learning Algorithm$(t,\hat{t})$}
-  \label{al:learning}
-  \small
-	\begin{algorithmic}[1]
-		\IF{Training the speaker agent S}
-		\FOR{Batch T randomly selected from $M_0\times M_1$}
-        \FOR{$t=(c_0,c_1)$ in T}
-        \STATE $P(s_0|t),P(s_1|t)=\pi_{old}^S(s=(s_0,s_1)|t)$
-        \STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
-        \STATE $P(\hat{t}|s) = \pi^L(\hat{t}|s)$ 
-        \STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
-        \STATE Get reward $r(\hat{t},t)$
-        \STATE $J(\theta^S,\theta^L)=E_{\pi_{old}^S,\pi^L}[r(\hat{t},t)\cdot\frac{\pi^S(s|t)}{\pi^S_{old}(s|t)}]$
-        \STATE Update $\theta^S$ by $\bigtriangledown_{\theta^S}J$
-        \ENDFOR
-        \STATE $\pi_{old}^S\leftarrow \pi^S$
-		\ENDFOR
-		\ENDIF
-	
-		\IF{Training the listener agent L}
-		\FOR{Batch T randomly selected from $M_0\times M_1$}
-		\FOR{$t=(c_0,c_1)$ in T}
-		\STATE $P(s_0|t),P(s_1|t)=\pi^S(s=(s_0,s_1)|t)$
-		\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
-		\STATE $P(\hat{t}|s) = \pi^L_{old}(\hat{t}|s)$ 
-		\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
-		\STATE Get reward $r(\hat{t},t)$
-		\STATE $J(\theta^S,\theta^L)=E_{\pi_{old}^S,\pi^L}[r(\hat{t},t)\cdot\frac{\pi^L(s|t)}{\pi^L_{old}(s|t)}]$
-		\STATE Update $\theta^L$ by $\bigtriangledown_{\theta^L}J$
-		\ENDFOR
-		\STATE $\pi_{old}^L\leftarrow \pi^L$
-		\ENDFOR
-		\ENDIF
-	\end{algorithmic}
-\end{algorithm}
-