Commit f40a1f4e by YZhao
parents 179ff574 57dfb313
......@@ -8,6 +8,8 @@
\newcommand{\rmk}[1]{\textcolor{red}{--[#1]--}}
\newcommand{\note}[1]{\textcolor{red}{#1}}
\usepackage{enumitem}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{aaai21} % DO NOT CHANGE THIS
\usepackage{times} % DO NOT CHANGE THIS
......@@ -95,7 +97,7 @@
% articles, conjunctions, and prepositions are lower case unless they
% directly follow a colon or long dash
\title{Revisiting the Natural Emergence of Symbolic Language with Agent Capacity}
\title{Enabling the Emergence of Symbolic Language without Handcrafted Inductions}
\author{
%Authors
% All authors must be in the same font size and format.
......@@ -177,7 +179,7 @@
inductions.
In this paper, we are the first to successfully achieve high compositional symbolic
language in a \emph{natural} manner.
language in a \emph{natural} manner without handcrafted inductions.
Initially, by thoroughly investigating the compositionality of emerged symbolic
language after removing the \emph{deliberately handcrafted}
inductions, we observe that the agent capacity plays a key role in
......@@ -194,7 +196,7 @@
experimental results lead to a counter-intuitive conclusion that lower agent
capacity facilitates the emergence of symbolic language with higher
compositionality. \note{Based on our conclusion, we can generate higher
compositional symbolic language with a high probability.}
compositional symbolic language with a higher probability.}
% The natural emergence of symbolic languages with high compositionality has
......@@ -224,42 +226,8 @@
\input{tex/experiments.tex}
\input{tex/last.tex}
\begin{algorithm}[!h]
\caption{OurAlgorithm$(t,\hat{t})$}
\begin{algorithmic}[1]
\IF{Training the speaker agent S}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi_{old}^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $R(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi_{old}^S,\pi^L}[R(\hat{t},t)\cdot\frac{\pi^S(s|t)}{\pi^S_{old}(s|t)}]$
\STATE Update $\theta^S$ by $\bigtriangledown_{\theta^S}J$
\ENDFOR
\STATE $\pi_{old}^S\leftarrow \pi^S$
\ENDFOR
\ENDIF
\IF{Training the listener agent L}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L_{old}(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $R(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi^S,\pi_{old}^L}[R(\hat{t},t)\cdot\frac{\pi^L(\hat{t}|s)}{\pi^L_{old}(\hat{t}|s)}]$
\STATE Update $\theta^L$ by $\nabla_{\theta^L}J$
\ENDFOR
\STATE $\pi_{old}^L\leftarrow \pi^L$
\ENDFOR
\ENDIF
\end{algorithmic}
\end{algorithm}
\clearpage
\newpage
\bibliography{ref.bib}
\end{document}
......@@ -12,3 +12,52 @@
biburl = {https://dblp.org/rec/bib/conf/iclr/WuLCS18},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{kottur-etal-2017-natural,
title = "Natural Language Does Not Emerge {`}Naturally{'} in Multi-Agent Dialog",
author = "Kottur, Satwik and
Moura, Jos{\'e} and
Lee, Stefan and
Batra, Dhruv",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
month = sep,
year = "2017",
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1321",
doi = "10.18653/v1/D17-1321",
pages = "2962--2967",
abstract = "A number of recent works have proposed techniques for end-to-end learning of communication protocols among cooperative multi-agent populations, and have simultaneously found the emergence of grounded human-interpretable language in the protocols developed by the agents, learned without any human supervision! In this paper, using a Task {\&} Talk reference game between two agents as a testbed, we present a sequence of {`}negative{'} results culminating in a {`}positive{'} one {--} showing that while most agent-invented languages are effective (i.e. achieve near-perfect task rewards), they are decidedly not interpretable or compositional. In essence, we find that natural language does not emerge {`}naturally{'},despite the semblance of ease of natural-language-emergence that one may gather from recent literature. We discuss how it is possible to coax the invented languages to become more and more human-like and compositional by increasing restrictions on how two agents may communicate.",
}
@article{kirby2015compression,
title={Compression and communication in the cultural evolution of linguistic structure},
author={Kirby, Simon and Tamariz, Monica and Cornish, Hannah and Smith, Kenny},
journal={Cognition},
volume={141},
pages={87--102},
year={2015},
publisher={Elsevier}
}
@inproceedings{lazaridou2018emergence,
title={Emergence of Linguistic Communication from Referential Games with Symbolic and Pixel Input},
author={Lazaridou, Angeliki and Hermann, Karl Moritz and Tuyls, Karl and Clark, Stephen},
booktitle={International Conference on Learning Representations},
year={2018}
}
@inproceedings{li2019ease,
title={Ease-of-teaching and language structure from emergent communication},
author={Li, Fushan and Bowling, Michael},
booktitle={Advances in Neural Information Processing Systems},
pages={15851--15861},
year={2019}
}
@inproceedings{evtimova2018emergent,
title={Emergent Communication in a Multi-Modal, Multi-Step Referential Game},
author={Evtimova, Katrina and Drozdov, Andrew and Kiela, Douwe and Cho, Kyunghyun},
booktitle={International Conference on Learning Representations},
year={2018}
}
\ No newline at end of file
\section{Introduction}
\label{sec:introduction}
The emergence of symbolic language has always been an important and controversial
issue. This problem attracts attentions from a broad range of communities,
The emergence of symbolic language has always been an important issue,
which attracts attention from a broad range of communities,
including philology~\cite{}, biology~\cite{}, and computer
science~\cite{}. Especially in computer science, efforts in recent years try to explore
the emergence of symbolic language in virtual, multi-agent environments, where
agents are trained to communicate with neural network based method, i.e., deep
reinforcement learning~\cite{}. For example, \note{XXXX}
the emergence of symbolic language in virtual multi-agent environments, where
agents are trained to communicate with neural network based methods such as deep
reinforcement learning~\cite{}.
%Such works can be roughly classified into two categories,
%referential game~\cite{} and multi-agent reinforcement learning (MARL)~\cite{}, based on
%the environment setting.
Compositionality is widely used and
taken as an important metric to evaluate the emerged symbolic language.
Originally, compositionality is a principle that
whether the meaning of a complex expression (e.g, phase), which is assembled out of the
The quality of emergent symbolic language is typically measured by its \emph{compositionality}.
Compositionality is a principle that determines
whether the meaning of a complex expression (e.g., a phrase), which is assembled out of a
given set of simple components (e.g., symbols), can be determined by its
constituent components and the rules that combine them~\cite{}.
\note{For example, the expression ``AAAI is a conference'' consists of two
meaningful words ``AAAI'' and ``conference'', and a rule for definition (``is'').
More recently, measuring the compositionality \note{xxxxx}.
meaningful words ``AAAI'' and ``conference'', and a rule for definition (``is'').
More recently, measuring the compositionality \note{xxxxx}.}
%It
......@@ -38,45 +37,46 @@ More recently, measuring the compositionality \note{xxxxx}.
\centering
\includegraphics[width=0.9\columnwidth]{fig/occupy}
\caption{\rmk{compositionality.}}
\label{fig:symbols}
\label{fig:induction}
\end{figure}
Prior studies focus on achieving high compositionality
Prior studies focus on achieving high compositional symbolic language
through \emph{deliberately handcrafted} inductions, e.g., small vocabulary
sizes~\cite{}, memoryless~\cite{}, carefully constructed rewards~\cite{}, and
ease-of-teaching~\cite{}. \note{xxxxxxx}
However, these unnatural inductions prevent us from better understanding the mystery of
ease-of-teaching~\cite{}. \note{The possible intuition is that high compositional symbolic
language cannot emerge without inductions in existing multi-agent environments.}
Figure~\ref{fig:induction} reports the compositionality when training two agents in the widely-used
listener-speaker referential game, and it can be observed that \note{the compositionality
of emerged symbolic language is extremely low without any induction}.
Though such unnatural inductions are useful, they prevent us from better understanding the mystery of
the emergence of language and even intelligence among our pre-human ancestors.
Yet, few works investigate the emergence of high compositional symbolic language
\emph{naturally}, i.e., without \emph{deliberately
handcrafted} inductions.
As a results, it is never clear whether \emph{natural}
environment and agent are sufficient for compositionality.
\emph{naturally}, i.e., without handcrafted inductions.
In other words, it is never clear whether \emph{natural}
environment and agents are sufficient for achieving high compositionality.
In this work, we focus on the emergence of high compositional symbolic
language naturally, without any handcrafted induction.
Initially, we thoroughly analyze the compositionality of emerged symbolic
language after removing the \emph{deliberately handcrafted}
inductions. Figure~\ref{fig:comp} reports the compositionality when train two
agents in a listener-speaker referential game. It can be observed that \note{it
is challenging to achieve high compositionality without induction as
xxxxxx}. Moreover, we observe that the agent capacity plays a key role in
compositionality, see Figure xxx.
We reveal and characterize the quantitative relationship
between the agent capacity and the compositionality of symbolic language both
theoretically and experimentally.
In this paper, we are the first work to achieve high compositional
symbolic language without any deliberately handcrafted induction. The key observation
is that the internal \emph{agent capacity} plays a crucial role in the compositionality
of symbolic language,
by thoroughly analyzing the compositionality after removing the inductions in
the most widely-used listener-speaker referential game framework.
Concretely, the relationship between the agent capacity and the compositionality
of symbolic language is characterized both theoretically and experimentally.
%theoretically
Regarding the theoretical analysis, we use the
Markov Series Channel (MSC)~\cite{} to model the language transmission process and a
novel mutual information-based metric to measure the compositionality quantitatively.
\note{Markov Series Channel (MSC)~\cite{} to model the language transmission process and a
novel mutual information-based metric to measure the compositionality quantitatively}.
%experimentally
Regarding the experimental verification, it is conducted on a listener-speaker
referential game framework with eliminated unnatural inductions.
Both theoretical analysis and
experimental results lead to a counter-intuitive conclusion that lower agent
capacity facilitates the emergence of symbolic language with higher
compositionality.
Regarding the experimental validation, two different dedicated experiments, i.e.,
\note{XXX and XXX, are utilized for XXX}.
%Regarding the experimental validation, it is conducted on a listener-speaker
%referential game framework with eliminated unnatural inductions.
Both the theoretical analysis and experimental results lead to a counter-intuitive
conclusion that \emph{lower agent capacity facilitates the emergence of symbolic language
with higher compositionality}. \note{Therefore, by only reducing the agent capacity
in such a natural environment, we
can generate a higher compositional symbolic language with a higher probability.}
%Prior studies focus on investigating how to affect the
......@@ -167,8 +167,9 @@ Both theoretical analysis and
In this paper, we made the following contributions:
\begin{itemize}[topsep=0pt,itemsep=0cm]
\item We are the first to successfully achieve high compositional symbolic
language naturally, without any deliberately handcrafted inductions.
\item To the best of our knowledge, we are the first work to successfully achieve
high compositional symbolic
language naturally, without any deliberately handcrafted induction.
\item We thoroughly analyze the compositionality of emerged symbolic language
after removing deliberately handcrafted inductions, and confirm that the agent
capacity acts as a key factor for compositionality.
......
......@@ -4,10 +4,11 @@
%external environmental factors
Previous works focus on the external environmental factors that impact the
compositionality of emerged symbolic language.
For example, XXX proposed small vocabulary sizes~\cite{}.
XXX proposed memoryless~\cite{}.
XXX proposed carefully constructed distractors~\cite{}.
XXX proposed ease-of-teaching~\cite{}.
For example, \citet{kirby2015compression} explored how the pressures for compressibility and expressivity lead to structured language.
\citet{kottur-etal-2017-natural} constrained the vocabulary size and whether the listener has memory to coax the compositionality of the emergent language.
\citet{lazaridou2018emergence} showed that the degree of structure found in the input data affects the emergence of the symbolic language.
\citet{li2019ease} studied how the pressure of ease of teaching impacts the iterated language in a population regime.
\citet{evtimova2018emergent} designed a novel multi-modal scenario, in which the speaker and the listener access different modalities of the input object, to explore the language emergence.
Such factors are deliberately designed, which are too ideal to be true in
the real world. None of these works realizes the importance of model capacity of
agent itself. \rmk{this should be largely emphasized.}
......
......@@ -24,35 +24,22 @@ In this paper, the task is xxxx.
\textbf{Game rules} In our referential game, agents follow the following rules
to finish the game in a cooperative manner. In each round, once receiving an
input object $t$, Speaker $S$ speaks a symbol sequence $s$ to Listener $L$;
Listener $L$ reconstruct the predict result $\hat{t}$ based on the listened
Listener $L$ reconstructs the predicted result $\hat{t}$ based on the listened
sequence $s$; if $t=\hat{t}$, agents win this game and receive positive rewards
($R(t,\hat{t})=1$); otherwise agents fail this game and receive negative rewards
($R(t,\hat{t})=-1$).
Precisely,
An input object t is a concept sequence with fixed length, denoted
$t=(c_0,c_1)$.
The concept $c_0(shape)$ and $c_1(color)$ are indicated as a
one-hot vector respectively.
The length of each one-hot vector ranges from 3 to 6.
These two vectors are concatenated to denote the input object t.
Each symbol sequence s contains two words, denoted $(s_0,s_1)$. Each word $s_i$
is chosen in the vocabulary set $V$. In this game, let the card $|V|$ range from
4 to 10, and the inequality $|V|^2\geq|M_0||M_1|$ is satisfied to ensure the
symbol sequence $(s_0,s_1)$ can be used to denote all the input object t. The
one-hot vector with the length $|V|$ is used to indicate the word $s_0$ and
$s_1$ respectively. Then, the two one-hot vectors are concatenated to denote the
symbol sequence s.
The predict result $\hat{t}$ is denoted as a one-hot vector with the length
$|M_0||M_1|$. Each bit of the one-hot vector denotes one input object. If the
predict result $\hat{t}[i*|M_1|+j]=1$, the one-hot vector of each predict
concept $\hat{c}_0$ and $\hat{c}_1$ respectively satisfied $\hat{c}_0[i]=1$ and
$\hat{c}_1[j]=1$.
If $(c_0,c_1)$ is equal to $(\hat{c}_0,\hat{c}_1)$, the input object and the
predict result indicate the same object.
Precisely, during the game, Speaker $S$ receives an input object $t$, which is
an expression with two words from the vocabulary set $V$, i.e., two
one-hot vector representing shape and color, respectively. Based on the $t$,
Speaker $S$ speaks a symbol sequence $s$, which similarly contains two words
from $V$. The Listener $L$ receives $s$ and outputs the predicted result $\hat{t}$,
a single word (one-hot vector) selected from the Cartesian product of two sets $V$
($V\times V$), which represents all the meanings of two combined words from $V$.
Please note that since $t$ and $\hat{t}$ have different lengths, we say
$t=\hat{t}$ if $t$ expresses the same meaning as $\hat{t}$, e.g.,
$t=([0,0,1],[0,1,0])$ would be equal to $\hat{t}=[0,0,0,0,0,1]$ if they both mean ``red
circle''.
......@@ -67,29 +54,91 @@ predict result indicate the same object.
\label{fig:agents}
\end{figure}
The agents apply their own policy to play the referential game. Denote the
policy of the speaker agent S and the listener L as $\pi_S$ and $\pi_L$. $\pi_S$
indicates the conditional probability $P(s_0|t)$ and $P(s_1|t)$. $\pi_L$
indicates the conditional probability $P(\hat{t}|s_0,s_1)$. The listener agent
output predict result $\hat{t}$ through random sampling on the conditional
probability $P(\hat{t}|s_0,s_1)$. The neural networks are used to simulate the
agent policy. The agent architecture is shown in Figure 1.
For the speaker, the input object t is firstly passed to a MLP to get a hidden
layer vector $h^S$. Then, the hidden layer vector is split into two feature
vectors $h_0^S$ and $h_1^S$ with length h\_size. Through a MLP and a softmax layer,
these feature vectors are transformed as the output $o_0$ and $o_1$ with the length
|V| respectively. Lastly, the symbol sequences $s_0$ and $s_1$ are sampled from the
output $o_0$ and $o_1$.
For the listener, the input symbol sequences $s_0$ and $s_1$ are passed into a MLP
respectively to get the hidden layer vectors $h_0$ and $h_1$. The length of each
vector is h\_size. Concatenating these vectors, and passing the conjunctive
vector into a MLP and a softmax layer, the output $o^L$ with length $|M_0||M_1|$
denotes $P(\hat{t}|s_0,s_1)$. Lastly, the predict result is sampled from the
output $o^L$.
In the experiments, the symbol h\_size is used to denote the model capacity of
the agents.
\subsection{Training algorithm}
Figure~\ref{fig:agents} shows the architecture of the constructed agents,
including the Speaker $S$ and Listener $L$.
\textbf{Speaker.} Regarding the Speaker $S$, it is constructed as a three-layer neural
network. The Speaker $S$ processes the input object $t$ with a fully-connected
layer to obtain the hidden layer $h^s$, which is split into two sub-layers. Each
sub-layer is further processed with fully-connected layers to obtain the output
layer. The output layer results indicate the probability distribution of symbols
with given input object $t$, i.e., $o_i^{s}=P(s_i|t)$, $i\in\{0,1\}$. \note{The final
readout symbols are sampled based on such probability distribution.}
\textbf{Listener.} Regarding the Listener $L$, it is constructed as a
three-layer neural network, too. Different from Speaker $S$, which splits the
hidden layer into two sub-layers, $L$ concatenates two sub-layers into one
output layer. The output layer results are also the probability distribution of
the predicted result $\hat{t}$ with given input sequence $s$, i.e., $o^{L}=P(\hat{t}|s_0,s_1)$.
\note{The final readout symbol is sampled based on the probability.}
\subsection{Learning algorithm}
\label{ssec:training}
To remove all the handcrafted inductions as well as for a more realistic
scenario, agents for this referential game are independent of each other,
without sharing model parameters or architectural connections. As shown in
Algorithm~\ref{al:learning}, we train the separate Speaker $S$ and Listener $L$ with
Stochastic Policy Gradient methodology in a tick-tock manner, i.e., training one
agent while fixing the other one. Roughly, when training the Speaker, the
target is set to maximize the expected reward
$J(\theta_S, \theta_L)=E_{\pi_S,\pi_L}[R(\hat{t}, t)]$ by adjusting the parameter
$\theta_S$, where $\theta_S$ is the neural network parameters of Speaker $S$
with learned output probability distribution $\pi_S$, and $\theta_L$ is the
neural network parameters of Listener with learned probability distribution $\pi_L$.
Similarly, when training the Listener, the target is set to maximize the
expected reward $J(\theta_S, \theta_L)$ by fixing the parameter $\theta_S$ and
adjusting the parameter $\theta_L$.
Additionally, to avoid the handcrafted induction on the emergent language, we only
use the predicted result $\hat{t}$ of the listener agent as the
evidence for whether to give the positive reward. Then, the gradients of the
expected reward $J(\theta_S, \theta_L)$ can be calculated as follows:
\begin{align}
\nabla_{\theta^S} J &= \mathbb{E}_{\pi^S, \pi^L} \left[ R(\hat{t}, t) \cdot
\nabla_{\theta^S} \log{\pi^S(s_0, s_1 | t)} \right] \\
\nabla_{\theta^L} J &= \mathbb{E}_{\pi^S, \pi^L} \left[ R(\hat{t}, t) \cdot
\nabla_{\theta^L} \log{\pi^L(\hat{t} | s_0, s_1)} \right]
\end{align}
\begin{algorithm}[t]
\caption{Learning Algorithm$(t,\hat{t})$}
\label{al:learning}
\small
\begin{algorithmic}[1]
\IF{Training the speaker agent S}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi_{old}^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $R(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi_{old}^S,\pi^L}[R(\hat{t},t)\cdot\frac{\pi^S(s|t)}{\pi^S_{old}(s|t)}]$
\STATE Update $\theta^S$ by $\nabla_{\theta^S}J$
\ENDFOR
\STATE $\pi_{old}^S\leftarrow \pi^S$
\ENDFOR
\ENDIF
\IF{Training the listener agent L}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L_{old}(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $R(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi^S,\pi_{old}^L}[R(\hat{t},t)\cdot\frac{\pi^L(\hat{t}|s)}{\pi^L_{old}(\hat{t}|s)}]$
\STATE Update $\theta^L$ by $\nabla_{\theta^L}J$
\ENDFOR
\STATE $\pi_{old}^L\leftarrow \pi^L$
\ENDFOR
\ENDIF
\end{algorithmic}
\end{algorithm}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment