Commit 9324ac4d by Zidong Du

~

parent 5401c11a
......@@ -8,6 +8,8 @@
\newcommand{\rmk}[1]{\textcolor{red}{--[#1]--}}
\newcommand{\note}[1]{\textcolor{red}{#1}}
\usepackage{enumitem}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{aaai21} % DO NOT CHANGE THIS
\usepackage{times} % DO NOT CHANGE THIS
......@@ -224,41 +226,6 @@
\input{tex/experiments.tex}
\input{tex/last.tex}
\begin{algorithm}[!h]
\caption{OurAlgorithm$(t,\hat{t})$}
\begin{algorithmic}[1]
\IF{Training the speaker agent S}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi_{old}^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $R(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi_{old}^S,\pi^L}[R(\hat{t},t)\cdot\frac{\pi^S(s|t)}{\pi^S_{old}(s|t)}]$
\STATE Update $\theta^S$ by $\bigtriangledown_{\theta^S}J$
\ENDFOR
\STATE $\pi_{old}^S\leftarrow \pi^S$
\ENDFOR
\ENDIF
\IF{Training the listener agent L}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L_{old}(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $R(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi_{old}^S,\pi^L}[R(\hat{t},t)\cdot\frac{\pi^L(s|t)}{\pi^L_{old}(s|t)}]$
\STATE Update $\theta^L$ by $\bigtriangledown_{\theta^L}J$
\ENDFOR
\STATE $\pi_{old}^L\leftarrow \pi^L$
\ENDFOR
\ENDIF
\end{algorithmic}
\end{algorithm}
\bibliography{ref.bib}
......
......@@ -54,29 +54,90 @@ circle''.
\label{fig:agents}
\end{figure}
The agents apply their own policy to play the referential game. Denote the
policy of the speaker agent S and the listener L as $\pi_S$ and $\pi_L$. $\pi_S$
indicates the conditional probability $P(s_0|t)$ and $P(s_1|t)$. $\pi_L$
indicates the conditional probability $P(\hat{t}|s_0,s_1)$. The listener agent
output predict result $\hat{t}$ through random sampling on the conditional
probability $P(\hat{t}|s_0,s_1)$. The neural networks are used to simulate the
agent policy. The agent architecture is shown in Figure 1.
For the speaker, the input object t is firstly passed to a MLP to get a hidden
layer vector $h^S$. Then, the hidden layer vector is split into two feature
vectors $h_0^S$ and $h_1^S$ with length h\_size. Through a MLP and a softmax layer,
these feature vectors are transformed as the output $o_0$ and $o_1$ with the length
|V| respectively. Lastly, the symbol sequences $s_0$ and $s_1$ are sampled from the
output $o_0$ and $o_1$.
For the listener, the input symbol sequences $s_0$ and $s_1$ are passed into a MLP
respectively to get the hidden layer vectors $h_0$ and $h_1$. The length of each
vector is h\_size. Concatenating these vectors, and passing the conjunctive
vector into a MLP and a softmax layer, the output $o^L$ with length $|M_0||M_1|$
denotes $P(\hat{t}|s_0,s_1)$. Lastly, the predict result is sampled from the
output $o^L$.
In the experiments, the symbol h\_size is used to denote the model capacity of
the agents.
\subsection{Training algorithm}
Figure~\ref{fig:agents} shows the architecture of the constructed agents,
including the Speaker $S$ and Listener $L$.
\textbf{Speaker.} Regarding the Speaker $S$, it is constructed as a three-layer neural
network. The Speaker $S$ processes the input object $t$ with a fully-connected
layer to obtain the hidden layer $h^s$, which is split into two sub-layers. Each
sub-layer is further processed with fully-connected layers to obtain the output
layer. The output layer results indicate the probability distribution of symbols
with given input object $t$, i.e., $o_i^{s}=P(s_i|t)$ $i\in{0,1}$. \note{The final
readout symbols are sampled based on such probability distribution.}
\textbf{Listener.} Regarding the Listener $L$, it is constructed as a
three-layer neural network, too. Different from Speaker $S$ that split the
hidden layer into two sub-layers, $L$ concatenates two sub-layers into one
output layer. The output layer results are also the probability distribution of
symbols $\hat{t}$ with given input sequence $s$, i.e, $o^{L}=P(\hat{t}|s_0,s_1)$.
\note{The final readout symbol is sampled based the probability.}
\subsection{Learning algorithm}
\label{ssec:training}
To remove all the handcrafted induction as well as for a more realistic
scenario, agents for this referential game are independent to each other,
without sharing model parameters or architectural connections. As shown in
Algorithm~\ref{al:learning}, we train the separate Speaker $S$ and Listener $L$ with
Stochastic Policy Gradient methodology in a tick-tock manner, i.e, training one
agent while keeping the other one. Roughly, when training the Speaker, the
target is set to maximize the expected reward
$J(\theta_S, \theta_L)=E_{\pi_S,\pi_L}[R(t, t^)]$ by adjusting the parameter
$\theta_S$, where $\theta_S$ is the neural network parameters of Speaker $S$
with learned output probability distribution $\pi_S$, and $\theta_L$ is the
neural network parameters of Listener with learned probability distribution $\pi_L$.
Similarly, when training the Listener, the target is set to maximize the
expected reward$ J(theta_S, theta_L)$ by fixing the parameter $\theta_S$ and
adjusting the parameter $\theta_L$.
Additionally, to avoid the handcrafted induction on emergent language, we only
use the predict result $\hat{t}$ of the listener agent as the
evidence of whether giving the positive rewards. Then, the gradients of the
expected reward $ J(theta_S, theta_L)$ can be calculated as follows:
\begin{align}
\nabla_{\theta^S} J &= \mathbb{E}_{\pi^S, \pi^L} \left[ R(\hat{t}, t) \cdot
\nabla_{\theta^S} \log{\pi^S(s_0, s_1 | t)} \right] \\
\nabla_{\theta^L} J &= \mathbb{E}_{\pi^S, \pi^L} \left[ R(\hat{t}, t) \cdot
\nabla_{\theta^L} \log{\pi^S(\hat{t} | s_0, s_1)} \right]
\end{align}
\begin{algorithm}[t]
\caption{Learning Algorithm$(t,\hat{t})$}
\label{al:learning}
\begin{algorithmic}[1]
\IF{Training the speaker agent S}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi_{old}^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $R(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi_{old}^S,\pi^L}[R(\hat{t},t)\cdot\frac{\pi^S(s|t)}{\pi^S_{old}(s|t)}]$
\STATE Update $\theta^S$ by $\bigtriangledown_{\theta^S}J$
\ENDFOR
\STATE $\pi_{old}^S\leftarrow \pi^S$
\ENDFOR
\ENDIF
\IF{Training the listener agent L}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L_{old}(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $R(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi_{old}^S,\pi^L}[R(\hat{t},t)\cdot\frac{\pi^L(s|t)}{\pi^L_{old}(s|t)}]$
\STATE Update $\theta^L$ by $\bigtriangledown_{\theta^L}J$
\ENDFOR
\STATE $\pi_{old}^L\leftarrow \pi^L$
\ENDFOR
\ENDIF
\end{algorithmic}
\end{algorithm}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment