Commit db11311b by Zidong Du

~

parents a4844a72 56daeae5
No preview for this file type
......@@ -30,6 +30,24 @@
abstract = "A number of recent works have proposed techniques for end-to-end learning of communication protocols among cooperative multi-agent populations, and have simultaneously found the emergence of grounded human-interpretable language in the protocols developed by the agents, learned without any human supervision! In this paper, using a Task {\&} Talk reference game between two agents as a testbed, we present a sequence of {`}negative{'} results culminating in a {`}positive{'} one {--} showing that while most agent-invented languages are effective (i.e. achieve near-perfect task rewards), they are decidedly not interpretable or compositional. In essence, we find that natural language does not emerge {`}naturally{'},despite the semblance of ease of natural-language-emergence that one may gather from recent literature. We discuss how it is possible to coax the invented languages to become more and more human-like and compositional by increasing restrictions on how two agents may communicate.",
}
@inproceedings{chaabouni-etal-2019-word,
  title     = {Word-order Biases in Deep-agent Emergent Communication},
  author    = {Chaabouni, Rahma and
               Kharitonov, Eugene and
               Lazaric, Alessandro and
               Dupoux, Emmanuel and
               Baroni, Marco},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  month     = jul,
  year      = {2019},
  address   = {Florence, Italy},
  publisher = {Association for Computational Linguistics},
  url       = {https://www.aclweb.org/anthology/P19-1509},
  doi       = {10.18653/v1/P19-1509},
  pages     = {5166--5175},
  abstract  = {Sequence-processing neural networks led to remarkable progress on many NLP tasks. As a consequence, there has been increasing interest in understanding to what extent they process language as humans do. We aim here to uncover which biases such models display with respect to {``}natural{''} word-order constraints. We train models to communicate about paths in a simple gridworld, using miniature languages that reflect or violate various natural language trends, such as the tendency to avoid redundancy or to minimize long-distance dependencies. We study how the controlled characteristics of our miniature languages affect individual learning and their stability across multiple network generations. The results draw a mixed picture. On the one hand, neural networks show a strong tendency to avoid long-distance dependencies. On the other hand, there is no clear preference for the efficient, non-redundant encoding of information that is widely attested in natural language. We thus suggest inoculating a notion of {``}effort{''} into neural networks, as a possible way to make their linguistic behavior more human-like.},
}
@article{kirby2015compression,
title={Compression and communication in the cultural evolution of linguistic structure},
author={Kirby, Simon and Tamariz, Monica and Cornish, Hannah and Smith, Kenny},
......
......@@ -65,14 +65,14 @@ emerging high compositional symbolic language.
We further break down our results to investigate the importance of agent capacity
to the compositionality of symbolic language. Figure~\ref{fig:exp2} reports the
ratio of high compositional symbolic language in all emerged languages,
Figure~\ref{fig:exp2} (a) and (b) for $MIS>0.99$ and $MIS>0.9$, respectively. It
Figure~\ref{fig:exp2} (a) and (b) for $\mathit{MIS}>0.99$ and $\mathit{MIS}>0.9$, respectively. It
can be observed that the ratio of high compositional symbolic languages
decreases drastically with the increase of $h_{size}$.
Taking vocabulary size $|V|=4$ as an example, symbolic languages with
compositionality $MIS>0.99$ take $>$10\% mainly over all the emerged symbolic
compositionality $\mathit{MIS}>0.99$ account for more than 10\% of all the emerged symbolic
languages, when $h_{size}<20$; the ratio reduces to 0\%$\sim$5\% when $h_{size}$
increases to 40; the ratio reduces to around 3\% when $h_{size}$ goes beyond 40.
$MIS>0.9$ reports similar results.
$\mathit{MIS}>0.9$ reports similar results.
Notably, when $h_{size}$ is large enough (e.g., $>40$), high compositional
symbolic language is hard to emerge in a natural referential game, since
easy-to-emerge low compositional symbolic language is sufficient in scenarios of
......@@ -82,12 +82,6 @@ more meanings, for the constraint from low capacity.
Additionally, we also perform a $\chi^2$ test to check the statistical
significance of the correlation between high compositionality and agent
capacity. Figure~\ref{fig:exp10} reports the $\chi^2$ test results for
......@@ -106,7 +100,7 @@ capacity.
\begin{figure*}[t]
\centering
\includegraphics[width=1.99\columnwidth]{fig/Figure9.pdf}
\includegraphics[width=\textwidth]{fig/Figure9.pdf}
\caption{Accuracy of Listeners when varying $h_{size}$ from 1 to 8. Each curve
represents an average accuracy trend from 50 repeated training, with the
range of [$\mu - \sigma$, $\mu + \sigma$], where $\mu$ is the average
......@@ -127,7 +121,7 @@ capacity.
We further break down the learning process to investigate the language teaching
scenario, where the Speaker teaches the Listener its fixed symbolic language.
We define three symbolic languages in different compositionality for Speaker to
teach, i.e., high (LA, $MIS=1$), mediate (LB, $MIS=0.83$), low (LC, $MIS=0.41$), see
teach, i.e., high (LA, $\mathit{MIS}=1$), moderate (LB, $\mathit{MIS}=0.83$), and low (LC, $\mathit{MIS}=0.41$), see
Figure~\ref{fig:bench}.
Figure~\ref{fig:exp3} reports the accuracy of Listener, i.e., ratio of the correctly
......
......@@ -39,7 +39,7 @@ vocabulary can express almost infinite concepts.}
%
\begin{figure}[t]
\centering
\includegraphics[width=0.99\columnwidth]{fig/Figure1_motivation.pdf}
\includegraphics[width=\columnwidth]{fig/Figure1_motivation.pdf}
\caption{The distribution of compositionality when training for 100 symbolic
languages without
any induction. It can be observed that high compositional symbolic language
......@@ -51,8 +51,7 @@ vocabulary can express almost infinite concepts.}
Prior studies focus on achieving high compositional symbolic language
through \emph{deliberately handcrafted} inductions, e.g., small vocabulary
sizes~\cite{}, memoryless~\cite{}, additional rewards~\cite{}, constructed loss functions~\cite{}, and
ease-of-teaching~\cite{}. \note{The possible intuition is that high compositional symbolic
language cannot emerge without induction in existing multi-agent environment.}
ease-of-teaching~\cite{}. \note{Such optimization methodologies are driven by the challenge of generating high compositional symbolic language without induction in existing multi-agent environments.}
Figure~\ref{fig:induction} reports the compositionality when training two agents
in the widely-used listener-speaker referential game for emerging 100 symbolic
languages, and it can be observed that \note{the compositionality
......@@ -65,7 +64,7 @@ Yet, few works investigate the emergence of high compositional symbolic language
In other words, it is never clear whether \emph{natural}
environment and agents are sufficient for achieving high compositionality.
In this paper, we are the first work to achieve high compositional
This paper is the first one to achieve high compositional
symbolic language without any deliberately handcrafted induction. The key observation
is that the internal \emph{agent capacity} plays a crucial role in the
compositionality of symbolic language.
......
\section{conclusion}
\section{Conclusion}
\label{sec:con}
This paper is the first work to achieve high compositional
......
\begin{table*}[h]
\begin{table*}[htbp]
\centering
\small
\caption{Handcrafted inductions in related works.}
......@@ -9,11 +9,12 @@
Works & Handcrafted induction & Compositionality\\
\midrule
\cite{kirby2015compression}&Expressivity and compressibility&Qualitative, Speaker\\
\cite{kottur-etal-2017-natural}&Vocabulary size, listener's memory&Qualitative, Speaker\\
\cite{choi2018compositional}&Vocabulary size, maximum message length&Qualitative, Speaker+Listener\\
\cite{kottur-etal-2017-natural}&Listener's memory&Qualitative, Speaker\\
\cite{choi2018compositional}&Maximum message length&Qualitative, Speaker+Listener\\
\cite{lazaridou2018emergence}&Structure of input data&Quantitative, Speaker\\
\cite{evtimova2018emergent}&Multi-modal scenarios&Quantitative, Speaker\\
\cite{li2019ease}&Population size, resetting all listeners&Quantitative, Speaker\\
\cite{chaabouni-etal-2019-word}&Word-order constraints&Qualitative, Speaker\\
\cite{chaabouni2020compositionality}&Easier to decode&Quantitative, Speaker\\
\textbf{Ours} & \textbf{None} & \textbf{Quantitative, Speaker+Listener} \\
\bottomrule
......@@ -40,7 +41,7 @@ agent itself. \rmk{this should be largely emphasized.}
%measure
To measure the compositionality of emerged symbolic language, many metrics are
proposed~\cite{}.
proposed~\cite{kottur-etal-2017-natural,choi2018compositional,lazaridou2018emergence,evtimova2018emergent,chaabouni2020compositionality}.
%Widely accepted metrics can be classified into two categories, measuring
%positive signaling~\cite{} and measuring positive listening~\cite{}. The former
%metrics measure the relationship between spoken symbols and received concepts
......@@ -59,7 +60,8 @@ For example, ~\citet{choi2018compositional} printed the agent messages with the
~\citet{kottur-etal-2017-natural} introduced the dialog tree to show the evolution of language compositionality during the training process.
Later, some quantitative metrics were explored.
The topographic similarity\cite{lazaridou2018emergence} is introduced to measure the distances between all the possible pairs of meanings and the corresponding pairs of signals.
\citet{chaabouni2020compositionality} proposed the positional disentanglement and the bag-of-symbols disentanglement. The positional disentanglement measures whether symbols in specific postion clearly relate to the specific attribute of the input object. The bag-of-symbols measure the permutation-invariant characteristic of a language.
\citet{chaabouni2020compositionality} proposed the positional disentanglement, which measures whether symbols in a specific position clearly relate to a specific attribute of the input object.
From Table~\ref{tab:rel}, most metrics are proposed from the perspective of the speaker. In our view, human beings developed language based on both the speaker and the listener. Only one study, \cite{choi2018compositional} in Table~\ref{tab:rel}, qualitatively considered both the speaker's and the listener's perspectives. In this paper, we propose a novel quantitative metric based on both the speaker's and the listener's perspectives.
......
\section{Experimental Setup}
\section{ Symbolic Language Producing }
\label{sec:thory}
In this section, we introduce the experimental setup used in this paper,
including the environment setup, agent architecture, and training algorithm.
Before going to the detail of the training algorithms, we first introduce the environment, gaming rules, and agent architecture for enabling the emergence of symbolic language.
\begin{figure}[t]
\centering \includegraphics[width=0.99\columnwidth]{fig/Figure2_The_referential_game_environment.pdf}
\centering \includegraphics[width=\columnwidth]{fig/Figure2_The_referential_game_environment.pdf}
\caption{The referential game in this paper.}
\label{fig:game}
\end{figure}
......@@ -45,7 +44,7 @@ $t=\hat{t}$ if $t$ expresses the same meaning as $\hat{t}$, e.g., ``red circle''
\begin{figure*}[t]
\centering
\includegraphics[width=1.8\columnwidth]{fig/Figure3_The_architecture_of_agents.pdf}
\includegraphics[width=\textwidth]{fig/Figure3_The_architecture_of_agents.pdf}
\caption{The architecture of agents. \emph{Left:} speaker. \emph{Right:} listener.}
\label{fig:agents}
\end{figure*}
......@@ -79,7 +78,7 @@ symbols $\hat{t}$ with given input sequence $s$, i.e, $o^{L}=P(\hat{t}|s_0,s_1)$
To remove all the handcrafted induction as well as for a more realistic
scenario, agents for this referential game are independent of each other,
without sharing model parameters or architectural connections. As shown in
with no shared model parameters or architectural connections. As shown in
Algorithm~\ref{al:learning}, we train the separate Speaker $S$ and Listener $L$ with
Stochastic Policy Gradient methodology in a tick-tock manner, i.e., training one
agent while keeping the other one fixed. Roughly, when training the Speaker, the
......
......@@ -5,7 +5,7 @@ MIS is the similarity between an identity matrix and the mutual information matr
\begin{figure}[t]
\centering
\includegraphics[width=0.99\columnwidth]{fig/Figure4_The_information_channel.pdf}
\includegraphics[width=\columnwidth]{fig/Figure4_The_information_channel.pdf}
\caption{The information channel modeling of the agents in the referential game.}
\label{fig:modeling}
\end{figure}
......@@ -42,22 +42,22 @@ Each column of $M$ correspond to the semantic information carried by one symbol.
\end{figure}
Finally, we define \emph{raw mutual information similarity} ($MIS_0$)
Finally, we define \emph{raw mutual information similarity} ($\mathit{MIS}_0$)
as the average cosine similarity of $M$ columns and one-hot vectors, as
Equation~\ref{eq:mis2}. Furthermore, $MIS$ is the normalized raw mutual
Equation~\ref{eq:mis2}. Furthermore, $\mathit{MIS}$ is the normalized mutual
information similarity into the $[0,1]$ value range, which can be computed with
the following formula:
\begin{equation}\label{eq:mis2}\begin{aligned}
MIS_0 &= \frac{1}{2}\sum_{j=0}^1\frac{\max_{i=0,1}RI\left(c_i,s_j\right)}{\epsilon + \sqrt{\sum_{i=0}^{1}RI^2\left(c_i,s_j\right)}}, \epsilon > 0\\
MIS &= 2MIS_0 - 1
\mathit{MIS}_0 &= \frac{1}{2}\sum_{j=0}^1\frac{\max_{i=0,1}R\left(c_i,s_j\right)}{\epsilon + \sqrt{\sum_{i=0}^{1}R^2\left(c_i,s_j\right)}}, \epsilon > 0\\
\mathit{MIS} &= 2\mathit{MIS}_0 - 1
\end{aligned}\end{equation}
Generalized to $m$ symbols and $n$ objects, $MIS$ can be computed with
Generalized to $m$ symbols and $n$ objects, MIS can be computed with
the following formula:
\begin{equation}\label{eq:mis2}\begin{aligned}
MIS_0 &= \frac{1}{m}\sum_{j=0}^{m-1}\frac{\max_{i\in[0,n-1]}R\left(c_i,s_j\right)}{\epsilon + \sqrt{\sum_{i=0}^{n-1}R^2\left(c_i,s_j\right)}}, \epsilon > 0\\
MIS &= \frac{n\cdot MIS_0 - 1}{n-1}
\mathit{MIS}_0 &= \frac{1}{m}\sum_{j=0}^{m-1}\frac{\max_{i\in[0,n-1]}R\left(c_i,s_j\right)}{\epsilon + \sqrt{\sum_{i=0}^{n-1}R^2\left(c_i,s_j\right)}}, \epsilon > 0\\
\mathit{MIS} &= \frac{n\cdot \mathit{MIS}_0 - 1}{n-1}
\end{aligned}\end{equation}
MIS is a bilateral metric. Unilateral metrics, e.g. \emph{topographic similarity (topo)}\cite{} and \emph{posdis}\cite{}, only take the policy of the speaker into consideration. We provide an example to illustrate the inadequacy of unilateral metrics, shown in Figure~\ref{fig:unilateral}. In this example, the speaker only uses $s_1$ to represent shape. From the perspective of speaker, the language is perfectly compositional (i.e. both topo and posdis are 1). However, the listener cannot distinguish the shape depend only on $s_1$, showing the non-compositionality in this language. The bilateral metric MIS addresses such defect by taking the policy of the listener into account, thus $MIS < 1$.
MIS is a bilateral metric. Unilateral metrics, e.g. \emph{topographic similarity (topo)}\cite{} and \emph{posdis}\cite{}, only take the policy of the speaker into consideration. We provide an example to illustrate the inadequacy of unilateral metrics, shown in Figure~\ref{fig:unilateral}. In this example, the speaker only uses $s_1$ to represent shape. From the perspective of the speaker, the language is perfectly compositional (i.e. both topo and posdis are 1). However, the listener cannot distinguish the shape depending only on $s_1$, revealing the non-compositionality of this language. The bilateral metric MIS addresses this defect by taking the policy of the listener into account, thus $\mathit{MIS} < 1$.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment