Commit 458544ad by Zidong Du

~

parent 4302acd8
......@@ -17,10 +17,48 @@
\end{figure*}
Before going to the detail of the training algorithms, we first introduce the environment, gaming rules, and agent architecture for enabling the emergence of symbolic language.
\begin{algorithm}[t]
\caption{Learning Algorithm$(t,\hat{t})$}
\label{al:learning}
\small
\begin{algorithmic}[1]
\IF{Training the speaker agent S}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi_{old}^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $r(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi_{old}^S,\pi^L}[r(\hat{t},t)\cdot\frac{\pi^S(s|t)}{\pi^S_{old}(s|t)}]$
\STATE Update $\theta^S$ by $\bigtriangledown_{\theta^S}J$
\ENDFOR
\STATE $\pi_{old}^S\leftarrow \pi^S$
\ENDFOR
\ENDIF
\IF{Training the listener agent L}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L_{old}(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $r(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi_{old}^S,\pi^L}[r(\hat{t},t)\cdot\frac{\pi^L(s|t)}{\pi^L_{old}(s|t)}]$
\STATE Update $\theta^L$ by $\bigtriangledown_{\theta^L}J$
\ENDFOR
\STATE $\pi_{old}^L\leftarrow \pi^L$
\ENDFOR
\ENDIF
\end{algorithmic}
\end{algorithm}
Before going to the detail of the training algorithms, we first introduce the environment, gaming rules, and agent architecture for enabling the emergence of symbolic language.
\subsection{Environment setup}
\label{ssec:env}
Figure~\ref{fig:game} shows the entire environment used in this study,
......@@ -107,40 +145,3 @@ expected reward $ J(\theta_S, \theta_L)$ can be calculated as follows:
\end{align}
\begin{algorithm}[t]
\caption{Learning Algorithm$(t,\hat{t})$}
\label{al:learning}
\small
\begin{algorithmic}[1]
\IF{Training the speaker agent S}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi_{old}^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $r(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi_{old}^S,\pi^L}[r(\hat{t},t)\cdot\frac{\pi^S(s|t)}{\pi^S_{old}(s|t)}]$
\STATE Update $\theta^S$ by $\bigtriangledown_{\theta^S}J$
\ENDFOR
\STATE $\pi_{old}^S\leftarrow \pi^S$
\ENDFOR
\ENDIF
\IF{Training the listener agent L}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L_{old}(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $r(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi_{old}^S,\pi^L}[r(\hat{t},t)\cdot\frac{\pi^L(s|t)}{\pi^L_{old}(s|t)}]$
\STATE Update $\theta^L$ by $\bigtriangledown_{\theta^L}J$
\ENDFOR
\STATE $\pi_{old}^L\leftarrow \pi^L$
\ENDFOR
\ENDIF
\end{algorithmic}
\end{algorithm}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment