Commit f40a1f4e by YZhao
parents 179ff574 57dfb313
...@@ -8,6 +8,8 @@
\newcommand{\rmk}[1]{\textcolor{red}{--[#1]--}}
\newcommand{\note}[1]{\textcolor{red}{#1}}
\usepackage{enumitem}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{aaai21} % DO NOT CHANGE THIS
\usepackage{times} % DO NOT CHANGE THIS
...@@ -95,7 +97,7 @@
% articles, conjunctions, and prepositions are lower case unless they
% directly follow a colon or long dash
\title{Enabling the Emergence of Symbolic Language without Handcrafted Inductions}
\author{
%Authors
% All authors must be in the same font size and format.
...@@ -177,7 +179,7 @@
inductions.
In this paper, we are the first to successfully achieve highly compositional symbolic
language in a \emph{natural} manner without handcrafted inductions.
Initially, by thoroughly investigating the compositionality of the emergent symbolic
language after removing the \emph{deliberately handcrafted}
inductions, we observe that the agent capacity plays a key role in
...@@ -194,7 +196,7 @@
experimental results lead to a counter-intuitive conclusion that lower agent
capacity facilitates the emergence of symbolic language with higher
compositionality. \note{Based on our conclusion, we can generate symbolic
language of higher compositionality with a higher probability.}
% The natural emergence of symbolic languages with high compositionality has
...@@ -224,42 +226,8 @@
\input{tex/experiments.tex}
\input{tex/last.tex}
\clearpage
\newpage
\bibliography{ref.bib}
\end{document}
...@@ -11,4 +11,53 @@
  timestamp = {Thu, 04 Apr 2019 13:20:09 +0200},
  biburl    = {https://dblp.org/rec/bib/conf/iclr/WuLCS18},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{kottur-etal-2017-natural,
title = "Natural Language Does Not Emerge {`}Naturally{'} in Multi-Agent Dialog",
author = "Kottur, Satwik and
Moura, Jos{\'e} and
Lee, Stefan and
Batra, Dhruv",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
month = sep,
year = "2017",
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1321",
doi = "10.18653/v1/D17-1321",
pages = "2962--2967",
abstract = "A number of recent works have proposed techniques for end-to-end learning of communication protocols among cooperative multi-agent populations, and have simultaneously found the emergence of grounded human-interpretable language in the protocols developed by the agents, learned without any human supervision! In this paper, using a Task {\&} Talk reference game between two agents as a testbed, we present a sequence of {`}negative{'} results culminating in a {`}positive{'} one {--} showing that while most agent-invented languages are effective (i.e. achieve near-perfect task rewards), they are decidedly not interpretable or compositional. In essence, we find that natural language does not emerge {`}naturally{'},despite the semblance of ease of natural-language-emergence that one may gather from recent literature. We discuss how it is possible to coax the invented languages to become more and more human-like and compositional by increasing restrictions on how two agents may communicate.",
}
@article{kirby2015compression,
title={Compression and communication in the cultural evolution of linguistic structure},
author={Kirby, Simon and Tamariz, Monica and Cornish, Hannah and Smith, Kenny},
journal={Cognition},
volume={141},
pages={87--102},
year={2015},
publisher={Elsevier}
}
@inproceedings{lazaridou2018emergence,
title={Emergence of Linguistic Communication from Referential Games with Symbolic and Pixel Input},
author={Lazaridou, Angeliki and Hermann, Karl Moritz and Tuyls, Karl and Clark, Stephen},
booktitle={International Conference on Learning Representations},
year={2018}
}
@inproceedings{li2019ease,
title={Ease-of-teaching and language structure from emergent communication},
author={Li, Fushan and Bowling, Michael},
booktitle={Advances in Neural Information Processing Systems},
pages={15851--15861},
year={2019}
}
@inproceedings{evtimova2018emergent,
title={Emergent Communication in a Multi-Modal, Multi-Step Referential Game},
author={Evtimova, Katrina and Drozdov, Andrew and Kiela, Douwe and Cho, Kyunghyun},
booktitle={International Conference on Learning Representations},
year={2018}
}
\ No newline at end of file
\section{Introduction}
\label{sec:introduction}
The emergence of symbolic language has always been an important issue,
which attracts attention from a broad range of communities,
including philology~\cite{}, biology~\cite{}, and computer
science~\cite{}. Especially in computer science, recent efforts try to explore
the emergence of symbolic language in virtual multi-agent environments, where
agents are trained to communicate with neural-network-based methods such as deep
reinforcement learning~\cite{}.
%Such works can be roughly classified into two categories,
%referential game~\cite{} and multi-agent reinforcement learning (MARL)~\cite{}, based on
%the environment setting.
The quality of emergent symbolic language is typically measured by its \emph{compositionality}.
Compositionality is a principle that determines
whether the meaning of a complex expression (e.g., a phrase), which is assembled out of a
given set of simple components (e.g., symbols), can be determined by its
constituent components and the rules that combine them~\cite{}.
\note{For example, the expression ``AAAI is a conference'' consists of two
meaningful words ``AAAI'' and ``conference'', and a rule for definition (``is'').
More recently, measuring the compositionality \note{xxxxx}.}
%It
...@@ -38,45 +37,46 @@ More recently, measuring the compositionality \note{xxxxx}.
\centering
\includegraphics[width=0.9\columnwidth]{fig/occupy}
\caption{\rmk{compositionality.}}
\label{fig:induction}
\end{figure}
Prior studies focus on achieving highly compositional symbolic language
through \emph{deliberately handcrafted} inductions, e.g., small vocabulary
sizes~\cite{}, memoryless agents~\cite{}, carefully constructed rewards~\cite{}, and
ease-of-teaching~\cite{}. \note{A possible intuition is that highly compositional symbolic
language cannot emerge without such inductions in existing multi-agent environments.}
Figure~\ref{fig:induction} reports the compositionality when training two agents in the widely-used
listener-speaker referential game, and it can be observed that \note{the compositionality
of the emergent symbolic language is extremely low without any induction}.
Though such unnatural inductions are useful, they prevent us from better understanding the mystery of
the emergence of language and even intelligence among our pre-human ancestors.
Yet, few works investigate the emergence of highly compositional symbolic language
\emph{naturally}, i.e., without handcrafted inductions.
In other words, it is never clear whether a \emph{natural}
environment and agents are sufficient for achieving high compositionality.
In this paper, we are the first to achieve highly compositional
symbolic language without any deliberately handcrafted induction. The key observation,
obtained by thoroughly analyzing the compositionality after removing the inductions in
the most widely-used listener-speaker referential game framework, is that the internal
\emph{agent capacity} plays a crucial role in the compositionality of symbolic language.
Concretely, the relationship between the agent capacity and the compositionality
of symbolic language is characterized both theoretically and experimentally.
%theoretically
Regarding the theoretical analysis, we use the
\note{Markov Series Channel (MSC)~\cite{} to model the language transmission process and a
novel mutual information-based metric to measure the compositionality quantitatively}.
%experimentally
Regarding the experimental validation, two different dedicated experiments, i.e.,
\note{XXX and XXX, are utilized for XXX}.
%Regarding the experimental validation, it is conducted on a listener-speaker
%referential game framework with eliminated unnatural inductions.
Both the theoretical analysis and experimental results lead to a counter-intuitive
conclusion that \emph{lower agent capacity facilitates the emergence of symbolic language
with higher compositionality}. \note{Therefore, by only reducing the agent capacity
in such a natural environment, we
can generate symbolic language of higher compositionality with a higher probability.}
%Prior studies focus on investigating how to affect the
...@@ -167,8 +167,9 @@ Both theoretical analysis and
In this paper, we make the following contributions:
\begin{itemize}[topsep=0pt,itemsep=0cm]
\item To the best of our knowledge, we are the first to successfully achieve
highly compositional symbolic
language naturally, without any deliberately handcrafted induction.
\item We thoroughly analyze the compositionality of the emergent symbolic language
after removing deliberately handcrafted inductions, and confirm that the agent
capacity acts as a key factor for compositionality.
...
...@@ -4,10 +4,11 @@
%external environmental factors
Previous works focus on the external environmental factors that impact the
compositionality of the emergent symbolic language.
For example, \citet{kirby2015compression} explored how the pressures for compressibility and expressivity lead to structured language.
\citet{kottur-etal-2017-natural} constrained the vocabulary size and the memory of the listener to coax compositionality out of the emergent language.
\citet{lazaridou2018emergence} showed that the degree of structure found in the input data affects the emergence of symbolic language.
\citet{li2019ease} studied how the ease-of-teaching pressure impacts the structure of the emergent language in an iterated population regime.
\citet{evtimova2018emergent} designed a novel multi-modal scenario, in which the speaker and the listener access different modalities of the input object, to explore language emergence.
Such factors are deliberately designed and are too idealized to hold in
the real world. None of these works realizes the importance of the model capacity of
the agent itself. \rmk{this should be largely emphasized.}
...
...@@ -24,35 +24,22 @@ In this paper, the task is xxxx.
\textbf{Game rules} In our referential game, agents follow the rules below
to finish the game in a cooperative manner. In each round, once receiving an
input object $t$, Speaker $S$ speaks a symbol sequence $s$ to Listener $L$;
Listener $L$ reconstructs the predicted result $\hat{t}$ based on the received
sequence $s$; if $t=\hat{t}$, the agents win the game and receive positive rewards
($R(t,\hat{t})=1$); otherwise, the agents fail the game and receive negative rewards
($R(t,\hat{t})=-1$).
Precisely, during the game, Speaker $S$ receives an input object $t$, which is
an expression with two words from the vocabulary set $V$, i.e., two
one-hot vectors representing shape and color, respectively. Based on $t$,
Speaker $S$ speaks a symbol sequence $s$, which similarly contains two words
from $V$. The Listener $L$ receives $s$ and outputs the predicted result $\hat{t}$,
a single word (one-hot vector) selected from the Cartesian product $V\times V$ of the
two vocabulary sets, which represents all meanings expressible by two combined words from $V$.
Please note that since $t$ and $\hat{t}$ have different lengths, we say
$t=\hat{t}$ if $t$ expresses the same meaning as $\hat{t}$, e.g.,
$t=([0,0,1],[0,1,0])$ equals $\hat{t}=[0,0,0,0,0,0,0,1,0]$ if they both mean ``red
circle''.
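To make the encodings and the reward concrete, the following is a minimal
sketch (Python/NumPy; an illustrative assumption rather than the authors'
released code, and the shape-major index $i\cdot|V|+j$ is likewise assumed) of
the input object $t$, the predicted result $\hat{t}$, and the meaning-level
equality check used for the reward:
\begin{verbatim}
# Minimal sketch of the encodings (assumed, for illustration only).
import numpy as np

V = 3  # per-word vocabulary size, chosen only for this example

def one_hot(i, n):
    v = np.zeros(n)
    v[i] = 1.0
    return v

# Input object t = (c0, c1): two one-hot words, e.g. shape index 2, color index 1.
t = (one_hot(2, V), one_hot(1, V))

# Predicted result t_hat: a single one-hot word over the product space V x V.
def encode_pair(i, j):
    return one_hot(i * V + j, V * V)

def meaning_equal(t, t_hat):
    # t equals t_hat when t_hat is the one-hot of the same (c0, c1) pair.
    i, j = int(np.argmax(t[0])), int(np.argmax(t[1]))
    return int(np.argmax(t_hat)) == i * V + j

def reward(t, t_hat):
    return 1.0 if meaning_equal(t, t_hat) else -1.0

t_hat = encode_pair(2, 1)   # "red circle" expressed as a single word
print(reward(t, t_hat))     # 1.0
\end{verbatim}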
...@@ -67,29 +54,91 @@ predict result indicate the same object.
\label{fig:agents}
\end{figure}
Figure~\ref{fig:agents} shows the architecture of the constructed agents,
including the Speaker $S$ and Listener $L$.

\textbf{Speaker.} The Speaker $S$ is constructed as a three-layer neural
network. It processes the input object $t$ with a fully-connected
layer to obtain the hidden layer $h^S$, which is split into two sub-layers. Each
sub-layer is further processed with a fully-connected layer to obtain the output
layer. The output layer results indicate the probability distributions of symbols
given the input object $t$, i.e., $o_i^{S}=P(s_i|t)$, $i\in\{0,1\}$. \note{The final
readout symbols are sampled based on these probability distributions.}

\textbf{Listener.} The Listener $L$ is also constructed as a three-layer
neural network. Different from Speaker $S$, which splits its hidden layer
into two sub-layers, $L$ concatenates its two hidden sub-layers into one
output layer. The output layer results give the probability distribution of
the predicted result $\hat{t}$ given the input sequence $s$, i.e., $o^{L}=P(\hat{t}|s_0,s_1)$.
\note{The final readout result is sampled based on this probability distribution.}
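As an illustration of the two architectures, the following PyTorch-style
sketch shows one way the Speaker and Listener described above could be
realized; the layer sizes, activation functions, and the hidden size
\verb|h_size| are our assumptions, not the authors' released implementation.
\begin{verbatim}
# Assumed PyTorch sketch of the Speaker/Listener architectures.
import torch
import torch.nn as nn

class Speaker(nn.Module):
    # Input object t (concatenated one-hots) -> two symbol distributions over V.
    def __init__(self, t_dim, h_size, v_size):
        super().__init__()
        self.fc_in = nn.Linear(t_dim, 2 * h_size)      # hidden layer h^S
        self.fc_out0 = nn.Linear(h_size, v_size)       # sub-layer for s_0
        self.fc_out1 = nn.Linear(h_size, v_size)       # sub-layer for s_1

    def forward(self, t):
        h = torch.relu(self.fc_in(t))
        h0, h1 = h.chunk(2, dim=-1)                    # split into two sub-layers
        p0 = torch.softmax(self.fc_out0(h0), dim=-1)   # P(s_0 | t)
        p1 = torch.softmax(self.fc_out1(h1), dim=-1)   # P(s_1 | t)
        return p0, p1

class Listener(nn.Module):
    # Symbols (s_0, s_1) as one-hots -> distribution over the |V|^2 meanings.
    def __init__(self, v_size, h_size, out_dim):
        super().__init__()
        self.fc0 = nn.Linear(v_size, h_size)
        self.fc1 = nn.Linear(v_size, h_size)
        self.fc_out = nn.Linear(2 * h_size, out_dim)

    def forward(self, s0, s1):
        h = torch.cat([torch.relu(self.fc0(s0)),
                       torch.relu(self.fc1(s1))], dim=-1)  # concatenate sub-layers
        return torch.softmax(self.fc_out(h), dim=-1)       # P(t_hat | s_0, s_1)
\end{verbatim}
In this sketch, varying \verb|h_size| corresponds to varying the model
capacity of the agents discussed in the experiments.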
\subsection{Learning algorithm}
\label{ssec:training}
To remove all the handcrafted inductions as well as for a more realistic
scenario, the agents in this referential game are independent of each other,
without sharing model parameters or architectural connections. As shown in
Algorithm~\ref{al:learning}, we train the separate Speaker $S$ and Listener $L$ with
a stochastic policy gradient methodology in a tick-tock manner, i.e., training one
agent while keeping the other one fixed. Roughly, when training the Speaker, the
target is set to maximize the expected reward
$J(\theta^S, \theta^L)=\mathbb{E}_{\pi^S,\pi^L}[R(t, \hat{t})]$ by adjusting the parameters
$\theta^S$, where $\theta^S$ denotes the neural network parameters of Speaker $S$
with learned output probability distribution $\pi^S$, and $\theta^L$ denotes the
neural network parameters of Listener $L$ with learned probability distribution $\pi^L$.
Similarly, when training the Listener, the target is set to maximize the
expected reward $J(\theta^S, \theta^L)$ by fixing the parameters $\theta^S$ and
adjusting the parameters $\theta^L$.

Additionally, to avoid handcrafted induction on the emergent language, we only
use the predicted result $\hat{t}$ of the listener agent as the
evidence for whether to give the positive reward. Then, the gradients of the
expected reward $J(\theta^S, \theta^L)$ can be calculated as follows:
\begin{align}
    \nabla_{\theta^S} J &= \mathbb{E}_{\pi^S, \pi^L} \left[ R(\hat{t}, t) \cdot
    \nabla_{\theta^S} \log{\pi^S(s_0, s_1 | t)} \right] \\
    \nabla_{\theta^L} J &= \mathbb{E}_{\pi^S, \pi^L} \left[ R(\hat{t}, t) \cdot
    \nabla_{\theta^L} \log{\pi^L(\hat{t} | s_0, s_1)} \right]
\end{align}
\begin{algorithm}[t]
\caption{Learning Algorithm$(t,\hat{t})$}
\label{al:learning}
\small
\begin{algorithmic}[1]
\IF{Training the speaker agent S}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi_{old}^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $R(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi_{old}^S,\pi^L}[R(\hat{t},t)\cdot\frac{\pi^S(s|t)}{\pi^S_{old}(s|t)}]$
\STATE Update $\theta^S$ by $\nabla_{\theta^S}J$
\ENDFOR
\STATE $\pi_{old}^S\leftarrow \pi^S$
\ENDFOR
\ENDIF
\IF{Training the listener agent L}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L_{old}(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $R(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi^S,\pi_{old}^L}[R(\hat{t},t)\cdot\frac{\pi^L(\hat{t}|s)}{\pi^L_{old}(\hat{t}|s)}]$
\STATE Update $\theta^L$ by $\nabla_{\theta^L}J$
\ENDFOR
\STATE $\pi_{old}^L\leftarrow \pi^L$
\ENDFOR
\ENDIF
\end{algorithmic}
\end{algorithm}
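For concreteness, the sketch below (Python/PyTorch; an assumed implementation
with illustrative hyper-parameters, not the authors' code) walks through one
speaker-training pass over $M_0\times M_1$ in the tick-tock scheme. It follows
the plain REINFORCE gradient estimators given above rather than the
importance-weighted surrogate of Algorithm~\ref{al:learning}; the listener
phase is symmetric, freezing the speaker and using $\log\pi^L(\hat{t}|s_0,s_1)$ instead.
\begin{verbatim}
# Assumed sketch of one speaker-training pass (listener frozen).
import itertools
import torch
import torch.nn as nn

M, V, H = 3, 3, 8   # meanings per attribute, vocab size, h_size (illustrative)
speaker  = nn.Sequential(nn.Linear(2 * M, H), nn.ReLU(), nn.Linear(H, 2 * V))
listener = nn.Sequential(nn.Linear(2 * V, H), nn.ReLU(), nn.Linear(H, M * M))
opt_s = torch.optim.Adam(speaker.parameters(), lr=1e-3)

def one_hot(i, n):
    return torch.eye(n)[i]

def speaker_step(c0, c1):
    # REINFORCE update of the speaker on one input object t = (c0, c1).
    t = torch.cat([one_hot(c0, M), one_hot(c1, M)])
    dist = torch.distributions.Categorical(logits=speaker(t).view(2, V))
    s = dist.sample()                                   # sampled symbols (s_0, s_1)
    with torch.no_grad():                               # listener is kept fixed
        p_t = listener(torch.cat([one_hot(s[0], V), one_hot(s[1], V)]))
        t_hat = torch.distributions.Categorical(logits=p_t).sample()
    r = 1.0 if t_hat.item() == c0 * M + c1 else -1.0    # reward R(t_hat, t)
    loss = -r * dist.log_prob(s).sum()                  # -R * log pi^S(s | t)
    opt_s.zero_grad()
    loss.backward()
    opt_s.step()

for c0, c1 in itertools.product(range(M), range(M)):    # one pass over M_0 x M_1
    speaker_step(c0, c1)
\end{verbatim}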