Commit 37c7a25f by haoyifan

haoyifan add

parent 1c2e5ba1
-\begin{thebibliography}{16}
+\begin{thebibliography}{14}
 \providecommand{\natexlab}[1]{#1}
 \providecommand{\url}[1]{\texttt{#1}}
 \providecommand{\urlprefix}{URL }
...
@@ -25,16 +25,6 @@ Chaabouni, R.; Kharitonov, E.; Bouchacourt, D.; Dupoux, E.; and Baroni, M.
 \newblock Compositionality and generalization in emergent languages.
 \newblock \emph{arXiv preprint arXiv:2004.09124} .
-\bibitem[{Chaabouni et~al.(2019)Chaabouni, Kharitonov, Lazaric, Dupoux, and
-Baroni}]{chaabouni-etal-2019-word}
-Chaabouni, R.; Kharitonov, E.; Lazaric, A.; Dupoux, E.; and Baroni, M. 2019.
-\newblock Word-order Biases in Deep-agent Emergent Communication.
-\newblock In \emph{Proceedings of the 57th Annual Meeting of the Association
-for Computational Linguistics}, 5166--5175. Florence, Italy: Association for
-Computational Linguistics.
-\newblock \doi{10.18653/v1/P19-1509}.
-\newblock \urlprefix\url{https://www.aclweb.org/anthology/P19-1509}.
 \bibitem[{Choi, Lazaridou, and de~Freitas(2018)}]{choi2018compositional}
 Choi, E.; Lazaridou, A.; and de~Freitas, N. 2018.
 \newblock Compositional Obverter Communication Learning from Raw Visual Input.
...
@@ -67,13 +57,6 @@ Kharitonov, E.; Chaabouni, R.; Bouchacourt, D.; and Baroni, M. 2019.
 Natural Language Processing and the 9th International Joint Conference on
 Natural Language Processing (EMNLP-IJCNLP): System Demonstrations}, 55--60.
-\bibitem[{Kirby et~al.(2015)Kirby, Tamariz, Cornish, and
-Smith}]{kirby2015compression}
-Kirby, S.; Tamariz, M.; Cornish, H.; and Smith, K. 2015.
-\newblock Compression and communication in the cultural evolution of linguistic
-structure.
-\newblock \emph{Cognition} 141: 87--102.
 \bibitem[{Kottur et~al.(2017)Kottur, Moura, Lee, and
 Batra}]{kottur-etal-2017-natural}
 Kottur, S.; Moura, J.; Lee, S.; and Batra, D. 2017.
...
@@ -3,46 +3,44 @@ Capacity: max_strings=35307, hash_size=35307, hash_prime=30011
 The top-level auxiliary file: AAAI.aux
 The style file: aaai21.bst
 Database file #1: ref.bib.bib
-Warning--I didn't find a database entry for ""
-You've used 16 entries,
+You've used 14 entries,
...
@@ -27,7 +27,8 @@
 \setlength\headheight{0pt} \setlength\headsep{0pt}
 %\setlength\footheight{0pt} \setlength\footskip{0pt}
 \thispagestyle{empty} \pagestyle{empty}
-\flushbottom \twocolumn \sloppy
+%\flushbottom \twocolumn \sloppy
+\flushbottom \sloppy
 % We're never going to need a table of contents, so just flush it to
 % save space --- suggested by drstrip@sandia-2
 \def\addcontentsline#1#2#3{}
...
@@ -47,7 +48,8 @@ All rights reserved.}
 \def\thefootnote{\fnsymbol{footnote}}
 % gf: Don't see why we'd want the footnotemark to be 0pt wide
 %\def\@makefnmark{\hbox to 0pt{$^{\@thefnmark}$\hss}}
-\twocolumn[\@maketitle] \@thanks
+%\twocolumn[\@maketitle] \@thanks
+\@thanks
 \endgroup
 % gf: Insert copyright slug unless turned off
 \if T\copyright@on\insert\footins{\noindent\footnotesize\copyright@text}\fi
...
@@ -170,7 +170,7 @@
 \DeclareMathOperator*{\argmax}{arg\,max}
 \begin{document}
-\linenumbers
+%\linenumbers
 \maketitle
...
@@ -229,17 +229,17 @@ compositional language with a higher probability.}
 \end{abstract}
 \input{tex/introduction.tex}
-\input{tex/relatedwork.tex}
-\input{tex/theory.tex}
-\input{tex/theory2.tex}
-\input{tex/experiments.tex}
-\input{tex/last.tex}
+%\input{tex/relatedwork.tex}
+%\input{tex/theory.tex}
+%\input{tex/theory2.tex}
+%\input{tex/experiments.tex}
+%\input{tex/last.tex}
 %\clearpage
 %\newpage
 \bibliography{ref.bib}
-%\newpage
-%\input{tex/appendix.tex}
+\newpage
+\input{tex/appendix.tex}
 \end{document}
...
@@ -37,39 +37,39 @@ vocabulary can express almost infinite concepts.}
 %extract information from a single symbol.
 %
 %
-\begin{figure}[t]
-\centering
-\includegraphics[width=\columnwidth]{fig/Figure1_motivation.pdf}
-\caption{The distribution of compositionality for 100 emerged symbolic
-languages without
-any induction. It can be observed that high compositional symbolic language
-seldom emerged (e.g., $<5\%$ for compositionality $>0.99$). Moreover, varying
-the vocabulary size does not affect the compositionality notably.}
-\label{fig:induction}
-\end{figure}
-\begin{table*}[t]
-\centering
-\small
-\caption{Handcrafted inductions in related works.}
-\label{tab:rel}
-\begin{tabular}{llllll}
-\toprule
-Works & Handcrafted induction & Compositionality\\
-\midrule
-\cite{kirby2015compression}&Expressivity and compressibility&Not quantitative, Speaker\\
-\cite{kottur-etal-2017-natural}&Listener's memory&Not quantitative, Speaker\\
-\cite{choi2018compositional}&Maximum message length&Not quantitative, Speaker+Listener\\
-\cite{lazaridou2018emergence}&Structure of input data&Quantitative, Speaker\\
-\cite{evtimova2018emergent}&Multi-modal scenarios&Quantitative, Speaker\\
-\cite{li2019ease}&Population size, resetting all listeners&Quantitative, Speaker\\
-\cite{chaabouni-etal-2019-word}&Word-order constraints&Not quantitative, Speaker\\
-\cite{chaabouni2020compositionality}&Easier to decode&Quantitative, Speaker\\
-\textbf{Ours} & \textbf{None} & \textbf{Quantitative, Speaker+Listener} \\
-\bottomrule
-\end{tabular}
-\end{table*}
+%\begin{figure}[t]
+% \centering
+% \includegraphics[width=\columnwidth]{fig/Figure1_motivation.pdf}
+% \caption{The distribution of compositionality for 100 emerged symbolic
+% languages without
+% any induction. It can be observed that high compositional symbolic language
+% seldom emerged (e.g., $<5\%$ for compositionality $>0.99$). Moreover, varying
+% the vocabulary size does not affect the compositionality notably.}
+% \label{fig:induction}
+% \end{figure}
+%\begin{table*}[t]
+% \centering
+% \small
+% \caption{Handcrafted inductions in related works.}
+% \label{tab:rel}
+% \begin{tabular}{llllll}
+% \toprule
+% Works & Handcrafted induction & Compositionality\\
+% \midrule
+% \cite{kirby2015compression}&Expressivity and compressibility&Not quantitative, Speaker\\
+% \cite{kottur-etal-2017-natural}&Listener's memory&Not quantitative, Speaker\\
+% \cite{choi2018compositional}&Maximum message length&Not quantitative, Speaker+Listener\\
+% \cite{lazaridou2018emergence}&Structure of input data&Quantitative, Speaker\\
+% \cite{evtimova2018emergent}&Multi-modal scenarios&Quantitative, Speaker\\
+% \cite{li2019ease}&Population size, resetting all listeners&Quantitative, Speaker\\
+% \cite{chaabouni-etal-2019-word}&Word-order constraints&Not quantitative, Speaker\\
+% \cite{chaabouni2020compositionality}&Easier to decode&Quantitative, Speaker\\
+% \textbf{Ours} & \textbf{None} & \textbf{Quantitative, Speaker+Listener} \\
+% \bottomrule
+% \end{tabular}
+% \end{table*}
 Prior studies focus on achieving high compositional symbolic language
 through \emph{deliberately handcrafted} inductions, e.g., additional rewards~\cite{mordatch2017emergence},
...
%%%% ijcai21-multiauthor.tex
\typeout{IJCAI--21 Multiple authors example}
% These are the instructions for authors for IJCAI-21.
\documentclass{article}
\pdfpagewidth=8.5in
\pdfpageheight=11in
% The file ijcai21.sty is NOT the same than previous years'
\usepackage{ijcai21}
% Use the postscript times font!
\usepackage{times}
\renewcommand*\ttdefault{txtt}
\usepackage{soul}
\usepackage{url}
\usepackage[hidelinks]{hyperref}
\usepackage[utf8]{inputenc}
\usepackage[small]{caption}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{booktabs}
\urlstyle{same}
% the following package is optional:
%\usepackage{latexsym}
% Following comment is from ijcai97-submit.tex:
% The preparation of these files was supported by Schlumberger Palo Alto
% Research, AT\&T Bell Laboratories, and Morgan Kaufmann Publishers.
% Shirley Jowell, of Morgan Kaufmann Publishers, and Peter F.
% Patel-Schneider, of AT\&T Bell Laboratories collaborated on their
% preparation.
% These instructions can be modified and used in other conferences as long
% as credit to the authors and supporting agencies is retained, this notice
% is not changed, and further modification or reuse is not restricted.
% Neither Shirley Jowell nor Peter F. Patel-Schneider can be listed as
% contacts for providing assistance without their prior permission.
% To use for other conferences, change references to files and the
% conference appropriate and use other authors, contacts, publishers, and
% organizations.
% Also change the deadline and address for returning papers and the length and
% page charge instructions.
% Put where the files are available in the appropriate places.
\title{IJCAI--21 Example on typesetting multiple authors}
\author{
First Author$^1$\footnote{Contact Author}\and
Second Author$^2$\and
Third Author$^{2,3}$\And
Fourth Author$^4$\\
\affiliations
$^1$First Affiliation\\
$^2$Second Affiliation\\
$^3$Third Affiliation\\
$^4$Fourth Affiliation\\
\emails
\{first, second\}@example.com,
third@other.example.com,
fourth@example.com
}
\begin{document}
\maketitle
\begin{abstract}
This short example shows a contrived example on how to format the authors' information for {\it IJCAI--21 Proceedings} using \LaTeX{}.
\end{abstract}
\section{Introduction}
This short example shows a contrived example on how to format the authors' information for {\it IJCAI--21 Proceedings}.
\section{Author names}
Each author name must be followed by:
\begin{itemize}
\item A newline {\tt \textbackslash{}\textbackslash{}} command for the last author.
\item An {\tt \textbackslash{}And} command for the second to last author.
\item An {\tt \textbackslash{}and} command for the other authors.
\end{itemize}
\section{Affiliations}
After all authors, start the affiliations section by using the {\tt \textbackslash{}affiliations} command.
Each affiliation must be terminated by a newline {\tt \textbackslash{}\textbackslash{}} command. Make sure that you include the newline on the last affiliation too.
\section{Mapping authors to affiliations}
In some scenarios, the affiliation of each author is clear without any further indication (\emph{e.g.}, all authors share the same affiliation, all authors have a single and different affiliation). In these situations you don't need to do anything special.
In more complex scenarios you will have to clearly indicate the affiliation(s) for each author. This is done by using numeric math superscripts {\tt \$\{\^{}$i,j, \ldots$\}\$}. You must use numbers, not symbols, because those are reserved for footnotes in this section (should you need them). Check the authors definition in this example for reference.
\section{Emails}
This section is optional, and can be omitted entirely if you prefer. If you want to include e-mails, you should either include all authors' e-mails or just the contact author(s)' ones.
Start the e-mails section with the {\tt \textbackslash{}emails} command. After that, write all emails you want to include separated by a comma and a space, following the same order used for the authors (\emph{i.e.}, the first e-mail should correspond to the first author, the second e-mail to the second author and so on).
You may ``contract" consecutive e-mails on the same domain as shown in this example (write the users' part within curly brackets, followed by the domain name). Only e-mails of the exact same domain may be contracted. For instance, you cannot contract ``person@example.com" and ``other@test.example.com" because the domains are different.
\end{document}
@book{ abelson-et-al:scheme,
author = "Harold Abelson and Gerald~Jay Sussman and Julie Sussman",
title = "Structure and Interpretation of Computer Programs",
publisher = "MIT Press",
address = "Cambridge, Massachusetts",
year = "1985"
}
@inproceedings{ bgf:Lixto,
author = "Robert Baumgartner and Georg Gottlob and Sergio Flesca",
title = "Visual Information Extraction with {Lixto}",
booktitle = "Proceedings of the 27th International Conference on Very Large Databases",
pages = "119--128",
publisher = "Morgan Kaufmann",
address = "Rome, Italy",
month = "September",
year = "2001"
}
@article{ brachman-schmolze:kl-one,
author = "Ronald~J. Brachman and James~G. Schmolze",
title = "An overview of the {KL-ONE} knowledge representation system",
journal = "Cognitive Science",
volume = "9",
number = "2",
pages = "171--216",
month = "April--June",
year = "1985"
}
@article{ gottlob:nonmon,
author = "Georg Gottlob",
title = "Complexity results for nonmonotonic logics",
journal = "Journal of Logic and Computation",
volume = "2",
number = "3",
pages = "397--425",
month = "June",
year = "1992"
}
@article{ gls:hypertrees,
author = "Georg Gottlob and Nicola Leone and Francesco Scarcello",
title = "Hypertree Decompositions and Tractable Queries",
journal = "Journal of Computer and System Sciences",
volume = "64",
number = "3",
pages = "579--627",
month = "May",
year = "2002"
}
@article{ levesque:functional-foundations,
author = "Hector~J. Levesque",
title = "Foundations of a functional approach to knowledge representation",
journal = "Artificial Intelligence",
volume = "23",
number = "2",
pages = "155--212",
month = "July",
year = "1984"
}
@inproceedings{ levesque:belief,
author = "Hector~J. Levesque",
title = "A logic of implicit and explicit belief",
booktitle = "Proceedings of the Fourth National Conference on Artificial Intelligence",
publisher = "American Association for Artificial Intelligence",
pages = "198--202",
address = "Austin, Texas",
month = "August",
year = "1984"
}
@article{ nebel:jair-2000,
author = "Bernhard Nebel",
title = "On the compilability and expressive power of propositional planning formalisms",
journal = "Journal of Artificial Intelligence Research",
volume = "12",
pages = "271--315",
year = "2000"
}
@misc{proceedings,
author = {{IJCAI Proceedings}},
title = {{IJCAI} Camera Ready Submission},
howpublished = {\url{https://proceedings.ijcai.org/info}},
}
MAKE = make
file = paper
all: IJCAI
paper-writting:
pdflatex -jobname $@ "\newcommand{\submitmode}{false}\input{$(file)}"
pdflatex -jobname $@ "\newcommand{\submitmode}{false}\input{$(file)}"
bibtex $@
pdflatex -jobname $@ "\newcommand{\submitmode}{false}\input{$(file)}"
pdflatex -jobname $@ "\newcommand{\submitmode}{false}\input{$(file)}"
paper-submission:
pdflatex -jobname $@ "\newcommand{\submitmode}{true}\input{$(file)}"
pdflatex -jobname $@ "\newcommand{\submitmode}{true}\input{$(file)}"
bibtex $@
pdflatex -jobname $@ "\newcommand{\submitmode}{true}\input{$(file)}"
pdflatex -jobname $@ "\newcommand{\submitmode}{true}\input{$(file)}"
final:
pdflatex -jobname $@ "\newcommand{\submitmode}{true}\input{$(file)}"
pdflatex -jobname $@ "\newcommand{\submitmode}{true}\input{$(file)}"
bibtex $@
pdflatex -jobname $@ "\newcommand{\submitmode}{true}\input{$(file)}"
pdflatex -jobname $@ "\newcommand{\submitmode}{true}\input{$(file)}"
pdftops final.pdf
ps2pdf14 -dPDFSETTINGS=/prepress final.pdf final-output.pdf
IJCAI:
pdflatex -jobname $@ $(file)
pdflatex -jobname $@ $(file)
bibtex $@
pdflatex -jobname $@ $(file)
pdflatex -jobname $@ $(file)
pull:
#git pull gitlab master:master
git pull gitlab master --allow-unrelated-histories
push:
git push gitlab master:master
clean:
rm -f *.aux *.bbl *.blg *.log *.out *.pdf *.gz *.fls *.fdb_latexmk
%%%% ijcai21.tex
\typeout{IJCAI--21 Instructions for Authors}
% These are the instructions for authors for IJCAI-21.
\documentclass{article}
\pdfpagewidth=8.5in
\pdfpageheight=11in
% The file ijcai21.sty is NOT the same than previous years'
\usepackage{ijcai21}
% Use the postscript times font!
\usepackage{times}
\usepackage{soul}
\usepackage{url}
\usepackage[hidelinks]{hyperref}
\usepackage[utf8]{inputenc}
\usepackage[small]{caption}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{booktabs}
\usepackage{algorithm}
\usepackage{algorithmic}
\urlstyle{same}
% the following package is optional:
%\usepackage{latexsym}
% See https://www.overleaf.com/learn/latex/theorems_and_proofs
% for a nice explanation of how to define new theorems, but keep
% in mind that the amsthm package is already included in this
% template and that you must *not* alter the styling.
\newtheorem{example}{Example}
\newtheorem{theorem}{Theorem}
% Following comment is from ijcai97-submit.tex:
% The preparation of these files was supported by Schlumberger Palo Alto
% Research, AT\&T Bell Laboratories, and Morgan Kaufmann Publishers.
% Shirley Jowell, of Morgan Kaufmann Publishers, and Peter F.
% Patel-Schneider, of AT\&T Bell Laboratories collaborated on their
% preparation.
% These instructions can be modified and used in other conferences as long
% as credit to the authors and supporting agencies is retained, this notice
% is not changed, and further modification or reuse is not restricted.
% Neither Shirley Jowell nor Peter F. Patel-Schneider can be listed as
% contacts for providing assistance without their prior permission.
% To use for other conferences, change references to files and the
% conference appropriate and use other authors, contacts, publishers, and
% organizations.
% Also change the deadline and address for returning papers and the length and
% page charge instructions.
% Put where the files are available in the appropriate places.
%PDF Info Is REQUIRED.
\pdfinfo{
/TemplateVersion (IJCAI.2021.0)
}
\title{Enabling the Emergence of Symbolic Language without Handcrafted Inductions}
% Single author syntax
\author{
% Zhi-Hua Zhou
% \affiliations
% Nanjing University
% \emails
% pcchair@ijcai-21.org
}
% Multiple author syntax (remove the single-author syntax above and the \iffalse ... \fi here)
% Check the ijcai21-multiauthor.tex file for detailed instructions
\iffalse
\author{
First Author$^1$
\and
Second Author$^2$\and
Third Author$^{2,3}$\And
Fourth Author$^4$
\affiliations
$^1$First Affiliation\\
$^2$Second Affiliation\\
$^3$Third Affiliation\\
$^4$Fourth Affiliation
\emails
\{first, second\}@example.com,
third@other.example.com,
fourth@example.com
}
\fi
\begin{document}
\maketitle
\begin{abstract}
The emergence of symbolic languages with high compositionality has
attracted extensive attention from a broad range of communities. Existing
studies achieve high compositionality through \emph{deliberately handcrafted}
inductions (e.g., additional rewards, constructed
loss functions, and structured input data) in multi-agent learning, which are unnatural.
Yet, few studies investigate the emergence of symbolic language with high
compositionality \emph{naturally}, i.e., without deliberately handcrafted
inductions.
In this paper, \note{we are the first to successfully achieve high compositional
symbolic language} in a \emph{natural} manner without handcrafted inductions.
Initially, by investigating the emergent
language after removing the \emph{deliberately handcrafted}
inductions, we observe the difficulty in naturally generating high compositional
language.
%the agent capacity plays a key role in compositionality.
Further, we reveal and characterize the \note{quantitative relationship}
between the agent capacity and the compositionality of emergent language, with
a novel mutual information-based metric that measures the compositionality more reasonably.
The experimental results lead to a counter-intuitive conclusion that lower agent
capacity facilitates the emergence of language with higher
compositionality. \note{Based on this conclusion, we can obtain a more
compositional language with a higher probability.}
\end{abstract}
\input{tex/introduction.tex}
\input{tex/relatedwork.tex}
\input{tex/theory.tex}
\input{tex/theory2.tex}
\input{tex/experiments.tex}
\input{tex/last.tex}
%\clearpage
%\newpage
\bibliographystyle{ijcai21}
\bibliography{ref.bib}
\end{document}
@inproceedings{DBLP:conf/iclr/WuLCS18,
author = {Shuang Wu and
Guoqi Li and
Feng Chen and
Luping Shi},
title = {Training and Inference with Integers in Deep Neural Networks},
booktitle = {6th International Conference on Learning Representations, {ICLR} 2018,
Vancouver, BC, Canada, April 30 - May 3, 2018, Conference Track Proceedings},
year = {2018},
url = {https://openreview.net/forum?id=HJGXzmspb},
timestamp = {Thu, 04 Apr 2019 13:20:09 +0200},
biburl = {https://dblp.org/rec/bib/conf/iclr/WuLCS18},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%Related Work%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@inproceedings{kottur-etal-2017-natural,
title = "Natural Language Does Not Emerge {`}Naturally{'} in Multi-Agent Dialog",
author = "Kottur, Satwik and
Moura, Jos{\'e} and
Lee, Stefan and
Batra, Dhruv",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
month = sep,
year = "2017",
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/D17-1321",
doi = "10.18653/v1/D17-1321",
pages = "2962--2967",
abstract = "A number of recent works have proposed techniques for end-to-end learning of communication protocols among cooperative multi-agent populations, and have simultaneously found the emergence of grounded human-interpretable language in the protocols developed by the agents, learned without any human supervision! In this paper, using a Task {\&} Talk reference game between two agents as a testbed, we present a sequence of {`}negative{'} results culminating in a {`}positive{'} one {--} showing that while most agent-invented languages are effective (i.e. achieve near-perfect task rewards), they are decidedly not interpretable or compositional. In essence, we find that natural language does not emerge {`}naturally{'},despite the semblance of ease of natural-language-emergence that one may gather from recent literature. We discuss how it is possible to coax the invented languages to become more and more human-like and compositional by increasing restrictions on how two agents may communicate.",
}
@inproceedings{chaabouni-etal-2019-word,
title = "Word-order Biases in Deep-agent Emergent Communication",
author = "Chaabouni, Rahma and
Kharitonov, Eugene and
Lazaric, Alessandro and
Dupoux, Emmanuel and
Baroni, Marco",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/P19-1509",
doi = "10.18653/v1/P19-1509",
pages = "5166--5175",
abstract = "Sequence-processing neural networks led to remarkable progress on many NLP tasks. As a consequence, there has been increasing interest in understanding to what extent they process language as humans do. We aim here to uncover which biases such models display with respect to {``}natural{''} word-order constraints. We train models to communicate about paths in a simple gridworld, using miniature languages that reflect or violate various natural language trends, such as the tendency to avoid redundancy or to minimize long-distance dependencies. We study how the controlled characteristics of our miniature languages affect individual learning and their stability across multiple network generations. The results draw a mixed picture. On the one hand, neural networks show a strong tendency to avoid long-distance dependencies. On the other hand, there is no clear preference for the efficient, non-redundant encoding of information that is widely attested in natural language. We thus suggest inoculating a notion of {``}effort{''} into neural networks, as a possible way to make their linguistic behavior more human-like.",
}
@article{kirby2015compression,
title={Compression and communication in the cultural evolution of linguistic structure},
author={Kirby, Simon and Tamariz, Monica and Cornish, Hannah and Smith, Kenny},
journal={Cognition},
volume={141},
pages={87--102},
year={2015},
publisher={Elsevier}
}
@inproceedings{lazaridou2018emergence,
title={Emergence of Linguistic Communication from Referential Games with Symbolic and Pixel Input},
author={Lazaridou, Angeliki and Hermann, Karl Moritz and Tuyls, Karl and Clark, Stephen},
booktitle={International Conference on Learning Representations},
year={2018}
}
@inproceedings{li2019ease,
title={Ease-of-teaching and language structure from emergent communication},
author={Li, Fushan and Bowling, Michael},
booktitle={Advances in Neural Information Processing Systems},
pages={15851--15861},
year={2019}
}
@inproceedings{evtimova2018emergent,
title={Emergent Communication in a Multi-Modal, Multi-Step Referential Game},
author={Evtimova, Katrina and Drozdov, Andrew and Kiela, Douwe and Cho, Kyunghyun},
booktitle={International Conference on Learning Representations},
year={2018}
}
@inproceedings{choi2018compositional,
title={Compositional Obverter Communication Learning from Raw Visual Input},
author={Choi, Edward and Lazaridou, Angeliki and de Freitas, Nando},
booktitle={International Conference on Learning Representations},
year={2018}
}
@article{chaabouni2020compositionality,
title={Compositionality and generalization in emergent languages},
author={Chaabouni, Rahma and Kharitonov, Eugene and Bouchacourt, Diane and Dupoux, Emmanuel and Baroni, Marco},
journal={arXiv preprint arXiv:2004.09124},
year={2020}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@article{bogin2018emergence,
title={Emergence of Communication in an Interactive World with Consistent Speakers},
author={Bogin, Ben and Geva, Mor and Berant, Jonathan},
journal={arXiv},
pages={arXiv--1809},
year={2018}
}
@inproceedings{jaques2019social,
title={Social influence as intrinsic motivation for multi-agent deep reinforcement learning},
author={Jaques, Natasha and Lazaridou, Angeliki and Hughes, Edward and Gulcehre, Caglar and Ortega, Pedro and Strouse, DJ and Leibo, Joel Z and De Freitas, Nando},
booktitle={International Conference on Machine Learning},
pages={3040--3049},
year={2019},
organization={PMLR}
}
@article{mul2019mastering,
title={Mastering emergent language: learning to guide in simulated navigation},
author={Mul, Mathijs and Bouchacourt, Diane and Bruni, Elia},
journal={arXiv preprint arXiv:1908.05135},
year={2019}
}
@inproceedings{kharitonov2019egg,
title={EGG: a toolkit for research on Emergence of lanGuage in Games},
author={Kharitonov, Eugene and Chaabouni, Rahma and Bouchacourt, Diane and Baroni, Marco},
booktitle={Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP): System Demonstrations},
pages={55--60},
year={2019}
}
@article{labash2020perspective,
title={Perspective taking in deep reinforcement learning agents},
author={Labash, Aqeel and Aru, Jaan and Matiisen, Tambet and Tampuu, Ardi and Vicente, Raul},
journal={Frontiers in Computational Neuroscience},
volume={14},
year={2020},
publisher={Frontiers Media SA}
}
@inproceedings{andreas2018measuring,
title={Measuring Compositionality in Representation Learning},
author={Andreas, Jacob},
booktitle={International Conference on Learning Representations},
year={2018}
}
@book{partee2008compositionality,
title={Compositionality in formal semantics: Selected papers},
author={Partee, Barbara H},
year={2008},
publisher={John Wiley \& Sons}
}
@article{mordatch2017emergence,
title={Emergence of grounded compositional language in multi-agent populations},
author={Mordatch, Igor and Abbeel, Pieter},
journal={arXiv preprint arXiv:1703.04908},
year={2017}
}
@misc{david1969convention,
title={Convention: a philosophical study},
author={Lewis, David},
year={1969},
publisher={Cambridge, Harvard university press}
}
\ No newline at end of file
In this section, we introduce a referential game platform and a speaker-listener model. The referential game is commonly used in emergent language studies, such as [][]. In this game, the speaker needs to communicate with the listener to complete a task cooperatively. We first describe the setup of the referential game, then explain how to construct the speaker and the listener with neural networks, and finally discuss the training algorithm and the evaluation
methods.
#subsection1: Set up
In the referential game, the agents obey the following rules:
a) The speaker agent S uses the input object $t$ to output the corresponding symbol sequence $s$;
b) The listener agent L uses the symbol sequence $s$ to output the predicted result $\hat{t}$;
c) If $t=\hat{t}$, the game is successful and each agent receives the reward $R(t,\hat{t})=1$; otherwise, the game fails and the reward is $R(t,\hat{t})=-1$.
An input object $t$ is a fixed-length concept sequence, denoted $t=(c_0,c_1)$. The concepts $c_0$ (shape) and $c_1$ (color) are each represented as a one-hot vector, whose length ranges from 3 to 6. These two vectors are concatenated to represent the input object $t$.
Each symbol sequence $s$ contains two words, denoted $(s_0,s_1)$. Each word $s_i$ is chosen from the vocabulary set $V$. In this game, the cardinality $|V|$ ranges from 4 to 10, and the inequality $|V|^2\geq|M_0||M_1|$ is satisfied to ensure that the symbol sequences $(s_0,s_1)$ can denote all input objects $t$. A one-hot vector of length $|V|$ represents each of the words $s_0$ and $s_1$, and the two one-hot vectors are concatenated to represent the symbol sequence $s$.
The predicted result $\hat{t}$ is denoted as a one-hot vector of length $|M_0||M_1|$, in which each bit denotes one input object. If the predicted result satisfies $\hat{t}[i\cdot|M_1|+j]=1$, the one-hot vectors of the predicted concepts $\hat{c}_0$ and $\hat{c}_1$ satisfy $\hat{c}_0[i]=1$ and $\hat{c}_1[j]=1$, respectively.
If $(c_0,c_1)$ is equal to $(\hat{c}_0,\hat{c}_1)$, the input object and the predicted result indicate the same object.
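As a minimal sketch of this encoding (the helper names below are ours, and the concrete sizes are only examples; the game itself parameterizes $|M_0|$, $|M_1|$, and $|V|$):
\begin{verbatim}
import numpy as np

def one_hot(index, length):
    # One-hot vector of the given length.
    v = np.zeros(length)
    v[index] = 1.0
    return v

def encode_object(c0, c1, m0_size, m1_size):
    # t = concatenation of the one-hot vectors of the two concepts (shape, color).
    return np.concatenate([one_hot(c0, m0_size), one_hot(c1, m1_size)])

def encode_symbols(s0, s1, v_size):
    # s = concatenation of the one-hot vectors of the two words.
    return np.concatenate([one_hot(s0, v_size), one_hot(s1, v_size)])

def decode_prediction(t_hat_index, m1_size):
    # Map the index of the one-hot prediction (length |M_0||M_1|) back to (c0, c1).
    return t_hat_index // m1_size, t_hat_index % m1_size

# The game succeeds when the predicted concept pair equals the input pair.
c0, c1, m1_size = 2, 1, 3
assert decode_prediction(c0 * m1_size + c1, m1_size) == (c0, c1)
\end{verbatim}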
#subsection2: Agent architecture
Each agent applies its own policy to play the referential game. Denote the policies of the speaker agent S and the listener agent L as $\pi_S$ and $\pi_L$, respectively. $\pi_S$ gives the conditional probabilities $P(s_0|t)$ and $P(s_1|t)$; $\pi_L$ gives the conditional probability $P(\hat{t}|s_0,s_1)$. The listener agent outputs the predicted result $\hat{t}$ by sampling from $P(\hat{t}|s_0,s_1)$. Neural networks are used to model the agent policies. The agent architecture is shown in Figure 1.
For the speaker, the input object $t$ is first passed to an MLP to obtain a hidden vector $h^S$. The hidden vector is then split into two feature vectors $h_0^S$ and $h_1^S$ of length $h_{size}$. Through an MLP and a softmax layer, these feature vectors are transformed into the outputs $o_0$ and $o_1$ of length $|V|$, respectively. Lastly, the words $s_0$ and $s_1$ are sampled from the outputs $o_0$ and $o_1$.
For the listener, the input words $s_0$ and $s_1$ are each passed into an MLP to obtain the hidden vectors $h_0$ and $h_1$, each of length $h_{size}$. These vectors are concatenated and passed into an MLP and a softmax layer; the output $o^L$ of length $|M_0||M_1|$ denotes $P(\hat{t}|s_0,s_1)$. Lastly, the predicted result is sampled from $o^L$.
In the experiments, $h_{size}$ is used to denote the model capacity of the agents.
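A PyTorch-style sketch of the two networks described above follows; the module names, activation choices, and exact layer shapes are illustrative assumptions rather than the exact implementation:
\begin{verbatim}
import torch
import torch.nn as nn

class Speaker(nn.Module):
    # Maps an input object t to two word distributions o_0, o_1 over V.
    def __init__(self, obj_dim, h_size, vocab_size):
        super().__init__()
        self.hidden = nn.Linear(obj_dim, 2 * h_size)  # h^S, split into h_0^S, h_1^S
        self.word0 = nn.Linear(h_size, vocab_size)
        self.word1 = nn.Linear(h_size, vocab_size)

    def forward(self, t):
        h0, h1 = torch.relu(self.hidden(t)).chunk(2, dim=-1)
        o0 = torch.softmax(self.word0(h0), dim=-1)    # P(s_0 | t)
        o1 = torch.softmax(self.word1(h1), dim=-1)    # P(s_1 | t)
        return o0, o1

class Listener(nn.Module):
    # Maps two one-hot words (s_0, s_1) to a distribution over the |M_0||M_1| objects.
    def __init__(self, vocab_size, h_size, num_objects):
        super().__init__()
        self.embed0 = nn.Linear(vocab_size, h_size)
        self.embed1 = nn.Linear(vocab_size, h_size)
        self.out = nn.Linear(2 * h_size, num_objects)

    def forward(self, s0, s1):
        h = torch.cat([torch.relu(self.embed0(s0)),
                       torch.relu(self.embed1(s1))], dim=-1)
        return torch.softmax(self.out(h), dim=-1)     # P(t_hat | s_0, s_1)
\end{verbatim}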
#subsection3: Training Algorithm
In this paper, the Stochastic Policy Gradient methodology is used to train the speaker and the listener. The symbols $\theta_S$ and $\theta_L$ denote the neural network parameters of the policies $\pi_S$ and $\pi_L$, respectively. When training the speaker, the parameters $\theta_L$ are fixed, and the training objective is to maximize the expected reward $J(\theta_S, \theta_L) = E_{\pi_S,\pi_L}[R(t, \hat{t})]$ by adjusting the parameters $\theta_S$. In a similar way, the listener is trained to maximize the expected reward $J(\theta_S, \theta_L)$ by fixing the parameters $\theta_S$ and adjusting the parameters $\theta_L$. To minimize the influence of artificial induction on the emergent language, we only use the predicted result $\hat{t}$ of the listener agent as the evidence for whether to give positive rewards. Then, the gradients of the expected reward $J(\theta_S, \theta_L)$ can be calculated as follows:
\begin{align}
\nabla_{\theta^S} J &= \mathbb{E}_{\pi^S, \pi^L} \left[ R(\hat{t}, t) \cdot \nabla_{\theta^S} \log{\pi^S(s_0, s_1 | t)} \right] \\
\nabla_{\theta^L} J &= \mathbb{E}_{\pi^S, \pi^L} \left[ R(\hat{t}, t) \cdot \nabla_{\theta^L} \log{\pi^L(\hat{t} | s_0, s_1)} \right]
\end{align}
Unlike previous studies [][], the agents in this paper are completely independent: no neural network parameters are shared between the agents, and there is no connection between their network architectures. The training procedure is shown in Figure 2. Training alternates between two procedures, speaker training and listener training; while one agent is being trained, the parameters of the other agent are fixed.
\begin{algorithm}[!h]
\caption{OurAlgorithm$(t,\hat{t})$}
\begin{algorithmic}[1]
\IF{Training the speaker agent S}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi_{old}^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $R(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi_{old}^S,\pi^L}[R(\hat{t},t)\cdot\frac{\pi^S(s|t)}{\pi^S_{old}(s|t)}]$
\STATE Update $\theta^S$ by $\bigtriangledown_{\theta^S}J$
\ENDFOR
\STATE $\pi_{old}^S\leftarrow \pi^S$
\ENDFOR
\ENDIF
\IF{Training the listener agent L}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L_{old}(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $R(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi^S,\pi_{old}^L}[R(\hat{t},t)\cdot\frac{\pi^L(\hat{t}|s)}{\pi^L_{old}(\hat{t}|s)}]$
\STATE Update $\theta^L$ by $\bigtriangledown_{\theta^L}J$
\ENDFOR
\STATE $\pi_{old}^L\leftarrow \pi^L$
\ENDFOR
\ENDIF
\end{algorithmic}
\end{algorithm}
Figure 2. Training algorithm of the agents.
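For concreteness, here is a hedged sketch of one speaker-training step in the alternation above, reusing the Speaker and Listener sketches from the previous subsection; the batching, optimizer, and reward wiring are our assumptions. It implements the plain log-probability form of the first gradient above; the algorithm additionally reuses an importance ratio against the frozen snapshot $\pi^S_{old}$.
\begin{verbatim}
import torch
import torch.nn.functional as F
from torch.distributions import Categorical

def train_speaker_step(speaker, listener, objects, targets, optimizer):
    # One policy-gradient update of the speaker; the listener stays frozen.
    # objects: float tensor of encoded inputs t, targets: integer object indices.
    o0, o1 = speaker(objects)
    d0, d1 = Categorical(o0), Categorical(o1)
    s0, s1 = d0.sample(), d1.sample()
    vocab_size = o0.shape[-1]
    with torch.no_grad():
        probs = listener(F.one_hot(s0, vocab_size).float(),
                         F.one_hot(s1, vocab_size).float())
        pred = Categorical(probs).sample()
    reward = (pred == targets).float() * 2 - 1        # R = +1 on success, -1 otherwise
    # Surrogate whose gradient is E[R * grad log pi_S(s_0, s_1 | t)].
    loss = -(reward * (d0.log_prob(s0) + d1.log_prob(s1))).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()                                  # optimizer holds only speaker params
\end{verbatim}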
#subsection4: Evaluation
Our objective is to study the relationship between the agent model capacity and the compositionality of the emergent language, within the range afforded by the need for successful communication. When the accuracy of the listener converges to 100\%, the training process is considered finished. For each training run, the agent model is evaluated from two aspects: the model capacity and the compositionality of the emergent language.
\begin{table}[b]
\centering
\small
\caption{The Chi-square test between high-compositionality and agent capacity.}
\label{tab:exp10}
\begin{tabular}{cccc}
\toprule
\multicolumn{4}{c}{$H_0$: $\mathit{MIS} > 0.90$ is independent of $h_{\mathit{size}}$}\\
\midrule
Configuration & $\chi^2$ & $df$ & $p$-value \\
\midrule
$|M_0|=5,|M_1|=3,|V|=10$ & 87.20 & 10 & $1.72\times 10^{-13}$ \\
$|M_0|=4,|M_1|=4,|V|=10$ & 71.47 & 10 & $1.70\times 10^{-10}$ \\
\bottomrule
\multicolumn{4}{c}{\vspace{1em}}\\
\toprule
\multicolumn{4}{c}{$H_0$: $\mathit{MIS} > 0.99$ is independent of $h_{\mathit{size}}$}\\
\midrule
Configuration & $\chi^2$ & $df$ & $p$-value \\
\midrule
$|M_0|=5,|M_1|=3,|V|=10$ & 34.15 & 10 & $6.39\times 10^{-4}$ \\
$|M_0|=4,|M_1|=4,|V|=10$ & 38.26 & 10 & $1.39\times 10^{-4}$ \\
\bottomrule
\end{tabular}
\end{table}
\section{Appendix}
\label{sec:exp}
We add two sets of experimental results to further verify the relationship between
agent capacity and the compositionality of the symbolic language that emerged in our natural referential game.
As a supplement to the \emph{Experiments} section, these two sets of data (corresponding to two
configurations) are used to show that the relationship is independent of the configuration.
Specifically, with the configurations a) $|M_0|=5,|M_1|=3,|V|=10$ and b) $|M_0|=4,|M_1|=4,|V|=10$,
we train the speaker-listener agents so that symbolic languages emerge while varying the agent capacity,
i.e., the hidden layer size ($h_{size}$), from 6 to 100.
\begin{figure}[t]
\centering \includegraphics[width=0.99\columnwidth]{fig/Appendix_Figure1_MIS.pdf}
\caption{Compositionality of symbolic language under different parameters
($[\mu-\sigma,\mu+\sigma]$, where $\mu$ is the mean value and $\sigma$ is
the standard deviation).}
\label{fig:exp1}
\end{figure}
\begin{figure}[t]
\centering \includegraphics[width=0.99\columnwidth]{fig/Appendix_Figure2_Ratio.pdf}
\caption{The ratio of high compositional language. (a) $MIS>0.99$. (b)
$MIS>0.9$. }
\label{fig:exp2}
\end{figure}
Figure~\ref{fig:exp1} reports the supplementary experimental results. Consistent with
previous experiments, it can be observed that the mean value of MIS decreases as the value
of $h_{size}$ increases, no matter what configuration we take. MIS significantly decreases
from around 0.8 to less than 0.7 when $h_{size}$ increases from 6 to 100.
As in the \emph{Experiments} section, we further break down our results to show the importance
of agent capacity for the emergence of a symbolic language with high compositionality. Figure~\ref{fig:exp2} reports
the ratio of highly compositional symbolic languages among all emerged languages,
with Figure~\ref{fig:exp2} (a) and (b) for $\mathit{MIS}>0.99$ and $\mathit{MIS}>0.9$, respectively.
Under these two supplementary configurations, we also find that the ratio of highly compositional symbolic languages
decreases drastically as $h_{size}$ increases, and that the ratio approaches zero when the agent capacity
becomes too large (i.e., $h_{size} > 80$).
For these two supplementary sets of data, we also perform a $\chi^2$ test to check the statistical
significance of the relationship between high compositionality and agent
capacity. Table~\ref{tab:exp10} reports the $\chi^2$ test results for
$\mathit{MIS}>0.99$ and $\mathit{MIS}>0.9$, respectively. It can be observed that
for both configurations, the $p$-value is always less than 0.05, which means
that high compositionality is significantly related to agent
capacity.
In conclusion, combining these two supplementary sets of data with the experimental results in
the \emph{Experiments} section, we show that the negative correlation between agent capacity
and the compositionality of emergent language is independent of the configuration (i.e., the vocabulary size
and the numbers of colors and shapes).
%\section{Agent Capacity vs. Compositionality}
%\label{ssec:exp}
\begin{figure}[t]
\centering \includegraphics[width=0.99\columnwidth]{fig/Figure7_The_ratio_of_high_compositional_language.pdf}
\caption{The ratio of high compositional language. (a) $MIS>0.99$. (b)
$MIS>0.9$. }
\label{fig:exp2}
\end{figure}
\begin{figure*}[t]
\centering
\includegraphics[width=\textwidth]{fig/Figure9.pdf}
\caption{Accuracy of Listeners when varying $h_{size}$ from 1 to 8. Each curve
represents an average accuracy trend over 50 repeated training runs, with the
range of [$\mu - \sigma$, $\mu + \sigma$], where $\mu$ is the average
accuracy and $\sigma$ is the standard deviation.}
\label{fig:exp3}
\end{figure*}
%\begin{figure}[t]
% \centering
% \includegraphics[width=0.99\columnwidth]{fig/Figure10_p_value.pdf}
% \caption{The Chi-square test between high-compositionality and agent
% capacity. (a) $MIS>0.99$. (b)
% $MIS>0.9$.}
% \label{fig:exp10}
%\end{figure}
\begin{table}[b]
\centering
\small
\caption{The Chi-square test between high-compositionality and agent capacity.}
\label{tab:exp10}
\begin{tabular}{cccc}
\toprule
\multicolumn{4}{c}{$H_0$: $\mathit{MIS} > 0.90$ is independent of $h_{\mathit{size}}$}\\
\midrule
Vocabulary size & $\chi^2$ & $df$ & $p$-value \\
\midrule
4 & 22.20 & 10 & $1.41\times 10^{-2}$ \\
6 & 27.52 & 10 & $2.16\times 10^{-3}$ \\
10 & 64.46 & 10 & $5.14\times 10^{-10}$ \\
\bottomrule
\multicolumn{4}{c}{\vspace{1em}}\\
\toprule
\multicolumn{4}{c}{$H_0$: $\mathit{MIS} > 0.99$ is independent of $h_{\mathit{size}}$}\\
\midrule
Vocabulary size & $\chi^2$ & $df$ & $p$-value \\
\midrule
4 & 30.19 & 10 & $7.97\times 10^{-4}$ \\
6 & 25.96 & 10 & $3.80\times 10^{-3}$ \\
10 & 33.80 & 10 & $2.00\times 10^{-4}$ \\
\bottomrule
\end{tabular}
\end{table}
\begin{figure}[t]
\centering
\includegraphics[width=0.8\columnwidth]{fig/Figure8_Three_artificial_languages_with_different_MIS.pdf}
\caption{Three pre-defined languages for teaching. (a) LA: high compositionality
($MIS=1$). (b) LB: medium compositionality ($MIS=0.83$). (c) LC: low compositionality ($MIS=0.41$).}
\label{fig:bench}
\end{figure}
\section{Experiments}
\label{sec:exp}
We explore the relationship between agent capacity and the compositionality of
the symbolic language that emerged in our natural referential game.
For various configurations of
vocabulary size, we fix $|M_0|=|M_1|=3$ and train the speaker-listener agents so that symbolic
languages emerge while varying the agent capacity, i.e., the hidden layer size
($h_{size}$), from 6 to 100.
Figure~\ref{fig:exp1} reports the experimental results. It can be observed that
the mean value of MIS decreases as the value of $h_{size}$ increases. Taking the
configuration of vocabulary size $|V|=10$ as an example, the mean value of MIS
is around 0.8 when $h_{size}\le 20$; MIS significantly decreases to 0.75 when
$h_{size}$ increases from 20 to 40; MIS further reduces to 0.7 when $h_{size}$
increases from 40 to 100.
For different vocabulary sizes, MIS shows similar behavior.
This is because symbols in low-compositional languages carry semantic information
about more concepts; as a result, higher capacity is required to characterize the
more complex semantic information needed for a low-compositional language to emerge.
In summary, lower agent capacity increases the probability that a highly
compositional symbolic language emerges.
\subsection{Ratio of high compositional language.}
We further break down our results to investigate the importance of agent capacity
for the compositionality of symbolic language. Figure~\ref{fig:exp2} reports the
ratio of highly compositional symbolic languages among all emerged languages, with
Figure~\ref{fig:exp2} (a) and (b) for $\mathit{MIS}>0.99$ and $\mathit{MIS}>0.9$, respectively. It
can be observed that the ratio of highly compositional symbolic languages
decreases drastically as $h_{size}$ increases.
Taking vocabulary size $|V|=4$ as an example, symbolic languages with
compositionality $\mathit{MIS}>0.99$ account for more than 10\% of all emerged symbolic
languages when $h_{size}<20$; the ratio drops to 0\%$\sim$5\% when $h_{size}$
increases to 40, and stays around 3\% when $h_{size}$ goes beyond 40.
The results for $\mathit{MIS}>0.9$ are similar.
Notably, when $h_{size}$ is large enough (e.g., $>40$), highly compositional
symbolic languages hardly emerge in the natural referential game, because
low-compositional symbolic languages, which emerge more easily, are already
sufficient for the referential game.
Conversely, when constrained by low capacity, agents are forced to rely on
compositionality to express more meanings.
Additionally, we also perform a $\chi^2$ test to check the statistical
significance of the relationship between high compositionality and agent
capacity. Table~\ref{tab:exp10} reports the $\chi^2$ test results for
$\mathit{MIS}>0.99$ and $\mathit{MIS}>0.9$, respectively. It can be observed that
for different vocabulary sizes, the $p$-value is always less than 0.05, which means
that high compositionality is significantly related to agent
capacity.
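The exact binning of runs behind Table~\ref{tab:exp10} is not spelled out in this section, so the following is only a hedged sketch of such a test, in which each training run is labeled by whether its MIS exceeds the threshold and binned by $h_{size}$ (the bins and the toy data are our assumptions):
\begin{verbatim}
import numpy as np
from scipy.stats import chi2_contingency

# Toy data: one (h_size, MIS) pair per training run.
runs = [(6, 0.995), (6, 0.97), (20, 0.93), (20, 0.88),
        (40, 0.85), (40, 0.91), (80, 0.72), (100, 0.68)]

def chi_square_test(runs, mis_threshold, h_size_bins):
    # H0: "MIS > threshold" is independent of the h_size bin.
    table = np.zeros((2, len(h_size_bins) - 1))
    for h, mis in runs:
        col = np.digitize(h, h_size_bins) - 1
        table[int(mis > mis_threshold), col] += 1
    chi2, p, df, _ = chi2_contingency(table)
    return chi2, df, p

chi2, df, p = chi_square_test(runs, 0.90, h_size_bins=[0, 20, 40, 80, 120])
print(f"chi2={chi2:.2f}, df={df}, p={p:.3g}")
\end{verbatim}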
%\subsection{Breakdown}
%\label{ssec:language}
%\begin{figure}[t]
% \centering
% \includegraphics[width=0.9\columnwidth]{fig/occupy}
% \caption{}
% \label{fig:exp4}
%\end{figure}
\subsection{Breakdown into language teaching.}
We further break down the learning process to investigate the language teaching
scenario, where the Speaker teaches the Listener its fixed symbolic language.
We define three symbolic languages with different compositionality for the Speaker to
teach, i.e., high (LA, $\mathit{MIS}=1$), medium (LB, $\mathit{MIS}=0.83$), and low (LC, $\mathit{MIS}=0.41$); see
Figure~\ref{fig:bench}.
Figure~\ref{fig:exp3} reports the accuracy of the Listener, i.e., the ratio of correctly
predicted symbols spoken by the Speaker ($t=\hat{t}$), as it varies with the
training iterations under different agent capacities.
Figure~\ref{fig:exp3} (a) shows that when $h_{size}$ equals 1, the agent capacity is
too low to handle any of the languages. Figure~\ref{fig:exp3} (b) shows that when $h_{size}$
equals 2, the agent can only learn LA, whose compositionality (i.e., \emph{MIS})
is the highest of the three languages. Combining these two observations, we can infer that
a language with lower compositionality requires higher agent capacity (i.e., larger $h_{size}$)
to ensure successful communication.
Additionally, Figure~\ref{fig:exp3} (c)$\sim$(h) shows that
higher agent capacity leads to faster training for all three languages, but the
improvement differs considerably across languages: a language with lower compositionality also requires higher agent
capacity to train quickly.
%In conclude, teaching an artificial language with
%lower compositionality to agent require higher agent capacity both for learning
%successfully and training faster.
\section{Conclusion}
\label{sec:con}
In this paper, we are the first to achieve a highly compositional
symbolic language without any deliberately handcrafted induction.
We made the key observation that the internal \emph{agent capacity} plays a crucial role in the compositionality of symbolic language.
Together with the theoretical analysis, the experimental results led to the
counter-intuitive conclusion that \emph{lower agent capacity facilitates the emergence of symbolic language with higher compositionality}.
Therefore, by only reducing the agent capacity in such a natural environment, we
can generate a more compositional symbolic language with a higher probability.
\section{Related Works}
\label{sec:relatedwork}
%external environmental factors
Previous works focus on \emph{deliberately handcrafted} inductions that affect the
compositionality of emergent language.
Significant works studying such environmental inductions are summarized in Table~\ref{tab:rel}.
For example, \citet{kirby2015compression} explored how the pressures for expressivity and compressibility lead to structured language.
\citet{kottur-etal-2017-natural} constrained the vocabulary size and whether the listener has memory in order to coax compositionality out of the emergent language.
\citet{lazaridou2018emergence} showed that the degree of structure in the input data affects the emergence of the symbolic language.
\citet{li2019ease} studied how the pressure of ease of teaching impacts the iterated language in a population regime.
\citet{evtimova2018emergent} designed novel multi-modal scenarios, in which the speaker and the listener access different modalities of the input object, to explore language emergence.
These inductions are deliberately designed and too idealized to hold in the real world.
In this paper, all of the above handcrafted inductions are removed, and a highly compositional language is obtained only through the agent capacity.
%measure
To measure the compositionality of emergent language, various metrics have been
proposed~\cite{kottur-etal-2017-natural,choi2018compositional,lazaridou2018emergence,evtimova2018emergent,chaabouni2020compositionality}.
%Widely accepted metrics can be classified into two categories, measuring
%positive signaling~\cite{} and measuring positive listening~\cite{}. The former
%metrics measure the relationship between spoken symbols and received concepts
%\rmk{not clear}, from the perspective of \emph{speakers}.
%For example,.
%The latter metrics measure the relationship between received symbols and
%predicted concepts \rmk{not clear}, from the perspective of \emph{listeners}.
%For example,.
%However, these metrics are not appropriate, for they only measure
%compositionality of symbolic language in \emph{unilateral} role\rmk{not sure},
%either speakers or listeners. They can not measure the degree of \emph{bilateral}
%understanding between speakers and listeners, i.e., the concept-symbol mapping
%consistency between speakers and listeners.
In early studies, language compositionality was only analyzed qualitatively (i.e., not quantitatively).
For example, \citet{choi2018compositional} printed the agent messages with the letters `abcd' at certain training rounds and directly analyzed the compositionality of these messages.
\citet{kottur-etal-2017-natural} introduced a dialog tree to show the evolution of language compositionality during training.
Later, quantitative metrics were explored.
Topographic similarity~\cite{lazaridou2018emergence} measures the correlation between the distances of all possible pairs of meanings and those of the corresponding pairs of signals.
\citet{chaabouni2020compositionality} proposed positional disentanglement, which measures whether symbols in a specific position relate to a specific attribute of the input object.
As Table~\ref{tab:rel} shows, most metrics are proposed from the perspective of the speaker. In our view, human beings developed language through bilateral communication between the speaker and the listener. One study~\cite{choi2018compositional} considered the metric bilaterally, but it is not quantitative. In this paper, we propose a novel quantitative metric from both the speaker's and the listener's perspectives.
In conclusion, previous works induced compositional language through deliberately handcrafted inductions,
and a quantitative metric that covers both the speaker and the listener is still lacking.
In this paper, we remove all the handcrafted inductions shown in Table~\ref{tab:rel} and obtain a highly compositional language through the internal agent capacity alone.
Moreover, we propose a quantitative metric that takes both the speaker and the listener into account.
\section{Framework of Language Emergence}
\label{sec:thory}
\begin{figure}[t]
\centering \includegraphics[width=\columnwidth]{fig/Figure2_The_referential_game_environment.pdf}
\caption{The referential game in this paper.}
\label{fig:game}
\end{figure}
\begin{figure*}[t]
\centering
\includegraphics[width=1.8\columnwidth]{fig/Figure3_The_architecture_of_agents.pdf}
\caption{The architecture of agents. \emph{Left:} speaker. \emph{Right:} listener.}
\label{fig:agents}
\end{figure*}
\begin{algorithm}[t]
\caption{Learning Algorithm$(t,\hat{t})$}
\label{al:learning}
\small
\begin{algorithmic}[1]
\IF{Training the speaker agent S}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi_{old}^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $r(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi_{old}^S,\pi^L}[r(\hat{t},t)\cdot\frac{\pi^S(s|t)}{\pi^S_{old}(s|t)}]$
\STATE Update $\theta^S$ by $\bigtriangledown_{\theta^S}J$
\ENDFOR
\STATE $\pi_{old}^S\leftarrow \pi^S$
\ENDFOR
\ENDIF
\IF{Training the listener agent L}
\FOR{Batch T randomly selected from $M_0\times M_1$}
\FOR{$t=(c_0,c_1)$ in T}
\STATE $P(s_0|t),P(s_1|t)=\pi^S(s=(s_0,s_1)|t)$
\STATE Sample $s_0$ with $P(s_0|t)$, $s_1$ with $P(s_1|t)$
\STATE $P(\hat{t}|s) = \pi^L_{old}(\hat{t}|s)$
\STATE Sample $\hat{t}$ with $P(\hat{t}|s)$
\STATE Get reward $r(\hat{t},t)$
\STATE $J(\theta^S,\theta^L)=E_{\pi^S,\pi_{old}^L}[r(\hat{t},t)\cdot\frac{\pi^L(\hat{t}|s)}{\pi^L_{old}(\hat{t}|s)}]$
\STATE Update $\theta^L$ by $\bigtriangledown_{\theta^L}J$
\ENDFOR
\STATE $\pi_{old}^L\leftarrow \pi^L$
\ENDFOR
\ENDIF
\end{algorithmic}
\end{algorithm}
Before going into the details of the training algorithm, we first introduce the environment, game rules, and agent architecture that enable the emergence of symbolic language.
\subsection{Environment setup}
\label{ssec:env}
Figure~\ref{fig:game} shows the entire environment used in this study,
i.e., a commonly used referential game. Roughly, the referential game requires the speaker and listener to work cooperatively to accomplish a certain task.
In this paper, the task is to have the listener agent reconstruct the object
that the speaker claims it has seen, only through their emerged communication protocol. Consistent success in this game indicates that a language has emerged between the speaker and the listener.
\textbf{Game rules} In our referential game, the agents follow the rules below to finish the game in a cooperative manner. In each round, upon receiving an input object $t$, Speaker $S$ speaks symbols $s$ to Listener $L$; Listener $L$ reconstructs the predicted result $\hat{t}$ based on the received symbols $s$; if $t=\hat{t}$, the agents win the game and receive a positive reward ($r(t,\hat{t})=1$); otherwise the agents fail the game and receive a negative reward ($r(t,\hat{t})=-1$).
More precisely, during the game, Speaker $S$ receives an input object $t$, a concept-pair with one concept from each of the concept sets $M_0$ and $M_1$, i.e., two one-hot vectors representing shape and color, respectively. Based on $t$, Speaker $S$ utters a symbol sequence $s$, which similarly contains two words from the vocabulary $V$.
Listener $L$ receives $s$ and outputs the predicted result $\hat{t}$, a single word (one-hot vector) corresponding to a concept-pair from the Cartesian product $M_0\times M_1$, which covers all meanings of two combined concepts
from $M_0$ and $M_1$. Note that since $t$ and $\hat{t}$ have different lengths, we say $t=\hat{t}$ if $t$ expresses the same concept-pair as $\hat{t}$, e.g., ``red circle''.
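To make the interaction concrete, the following minimal sketch replays one round of the game exactly as described above; the names \texttt{N\_SHAPE}, \texttt{N\_COLOR}, \texttt{V\_SIZE}, \texttt{speak}, and \texttt{listen} are illustrative placeholders rather than identifiers from our implementation.
\begin{verbatim}
# One round of the referential game sketched above; `speak` and
# `listen` stand for the (hypothetical) agent policies.
import numpy as np

N_SHAPE, N_COLOR = 3, 3   # sizes of the concept sets M0 and M1
V_SIZE = 4                # vocabulary size |V| per word slot

def play_round(speak, listen, rng):
    # Draw an input object t = (c0, c1) from M0 x M1.
    c0, c1 = rng.integers(N_SHAPE), rng.integers(N_COLOR)
    s0, s1 = speak((c0, c1))     # speaker utters two words from V
    t_hat = listen((s0, s1))     # listener picks an index in M0 x M1
    # r(t, t_hat) = +1 on success, -1 on failure.
    return 1 if t_hat == c0 * N_COLOR + c1 else -1
\end{verbatim}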
\subsection{Agent architecture}
\label{ssec:agent}
Figure~\ref{fig:agents} shows the architecture of the constructed agents,
including the Speaker $S$ and Listener $L$.
\textbf{Speaker.} The Speaker $S$ is constructed as a three-layer neural
network. It processes the input object $t$ with a fully-connected
layer to obtain the hidden representation $h^s$, which is further processed by fully-connected layers to obtain the output
layer. The output layer gives the probability distribution of each symbol
given the input object $t$, i.e., $o_i^{s}=P(s_i|t)$, $i\in\{0,1\}$. The final
readout symbols are sampled from these distributions.
\textbf{Listener.} The Listener $L$ is likewise constructed as a
three-layer neural network. Unlike Speaker $S$, which separates the input object into words, $L$ combines the received words to understand their joint meaning. The output layer gives the probability distribution of
the predicted result $\hat{t}$ given the input sequence $s$, i.e., $o^{L}=P(\hat{t}|s_0,s_1)$.
\subsection{Learning algorithm}
\label{ssec:training}
To remove all handcrafted induction and to keep the scenario realistic, the
agents in this referential game are independent of each other,
with no shared model parameters or architectural connections. As shown in
Algorithm~\ref{al:learning}, we train Speaker $S$ and Listener $L$ separately with
a stochastic policy gradient method in a tick-tock manner, i.e., training one
agent while keeping the other fixed. Roughly, when training the Speaker, the
objective is to maximize the expected reward
$J(\theta^S, \theta^L)=\mathbb{E}_{\pi^S,\pi^L}[r(t, \hat{t})]$ by adjusting the parameters
$\theta^S$, where $\theta^S$ denotes the neural network parameters of Speaker $S$
with learned output distribution $\pi^S$, and $\theta^L$ denotes the
neural network parameters of Listener $L$ with learned output distribution $\pi^L$.
Similarly, when training the Listener, the objective is to maximize the
expected reward $J(\theta^S, \theta^L)$ by fixing $\theta^S$ and
adjusting $\theta^L$.
Additionally, to avoid handcrafted induction on the emergent language, we use
only the predicted result $\hat{t}$ of the listener agent as the
evidence for whether to give a positive reward. The gradients of the
expected reward $J(\theta^S, \theta^L)$ can then be calculated as follows:
\begin{align}
\nabla_{\theta^S} J &= \mathbb{E}_{\pi^S, \pi^L} \left[ r(\hat{t}, t) \cdot
\frac{\nabla_{\theta^S}\pi^S(s_0, s_1 | t)}{\pi^S_{old}(s_0, s_1 | t)} \right] \\
\nabla_{\theta^L} J &= \mathbb{E}_{\pi^S, \pi^L} \left[ r(\hat{t}, t) \cdot
\frac{\nabla_{\theta^L} \pi^L(\hat{t} | s_0, s_1)}{\pi^L_{old}(\hat{t} | s_0, s_1)} \right]
\end{align}
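For completeness, the sketch below shows how one speaker update (the ``tick'') of Algorithm~\ref{al:learning} could be implemented on top of the hypothetical \texttt{Speaker}/\texttt{Listener} networks sketched earlier; the listener update (the ``tock'') is symmetric with the roles of $\pi^S$ and $\pi^L$ swapped. This is an illustrative rendering, not our exact implementation.
\begin{verbatim}
# One speaker update of Algorithm 1 (illustrative only).
# `speaker_old` is a frozen snapshot of `speaker` taken when the
# batch loop starts; it plays the role of pi^S_old.
# Assumes V_SIZE from the previous sketch.
import torch
import torch.nn.functional as F

def train_speaker_step(speaker, speaker_old, listener,
                       t_onehot, t_index, opt):
    with torch.no_grad():
        p_old = speaker_old(t_onehot)            # pi^S_old(s_i | t)
        s0 = torch.multinomial(p_old[0], 1).squeeze(-1)
        s1 = torch.multinomial(p_old[1], 1).squeeze(-1)
        s_onehot = torch.cat([F.one_hot(s0, V_SIZE),
                              F.one_hot(s1, V_SIZE)], -1).float()
        p_hat = listener(s_onehot)               # pi^L(t_hat | s)
        t_hat = torch.multinomial(p_hat, 1).squeeze(-1)
        reward = (t_hat == t_index).float() * 2 - 1   # r in {+1, -1}
    p_new = speaker(t_onehot)                    # pi^S(s | t)
    num = (p_new[0].gather(-1, s0[:, None]).squeeze(-1)
           * p_new[1].gather(-1, s1[:, None]).squeeze(-1))
    den = (p_old[0].gather(-1, s0[:, None]).squeeze(-1)
           * p_old[1].gather(-1, s1[:, None]).squeeze(-1))
    loss = -(reward * num / den).mean()          # maximize E[r * pi/pi_old]
    opt.zero_grad(); loss.backward(); opt.step()
\end{verbatim}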
\section{Mutual Information Similarity (MIS)}\label{sec:mis}
In this section, we propose the \emph{Mutual Information Similarity (MIS)} as a metric of compositionality and give a thorough theoretical analysis.
MIS is the similarity between an identity matrix and the mutual information matrix of concepts and symbols.
\begin{figure}[t]
\centering
\includegraphics[width=\columnwidth]{fig/Figure4_The_information_channel.pdf}
\caption{The information channel modeling of the agents in the referential game.}
\label{fig:modeling}
\end{figure}
\begin{figure}[t]
\centering
\includegraphics[width=0.8\columnwidth]{fig/Figure5_An_emergent_language.pdf}
\caption{An emergent language whose non-compositionality unilateral metrics cannot capture. Notice that given $s_1 = \mathrm{a}$, the listener can determine neither the shape nor the color without knowledge of $s_0$.}
\label{fig:unilateral}
\end{figure}
Before giving the definition of MIS, we first model the agents in the referential game. As shown in Figure~\ref{fig:modeling}, the speaker and the listener in the referential game are connected in tandem. The speaker agent can be regarded as a channel whose input is an object $t = (c_0, c_1)$ and whose output is a symbol $s = (s_0, s_1)$. The listener agent can be regarded as another channel, whose input is a symbol $s = (s_0, s_1)$ and whose output is a predicted result $\hat{t} = (\hat{c}_0, \hat{c}_1)$. Since the output of the listener depends only on the symbol $s$, we can model the policies of the speaker agent and the listener agent by the probability distributions $P(s = (s_0, s_1) | t = (c_0, c_1))$ and $P(\hat{t} = (\hat{c}_0, \hat{c}_1) | s_0, s_1)$, respectively.
Now we can analyze the information about the concepts preserved in the transmission process given the transmitted symbol, i.e., the conditional mutual information $I\left(t,\hat{t}|s\right)$. Whenever a stable language emerges, the speaker and the listener consistently use a specific symbol $s$ to refer to a specific object $t$. Therefore, we can safely write $I\left(t,\hat{t}|s\right) = I\left(t,\hat{t}|s_{t,\hat{t}}\right)$, where $s_{t,\hat{t}}=\arg\max_s\left\{P\left(\hat{t}|s\right)P\left(s|t\right)\right\}$. This conditional mutual information can be obtained by Equation~\ref{eq:cmi}.
\begin{equation}\label{eq:cmi}
I\left(t,\hat{t}|s_{t,\hat{t}}\right) = \sum_t\sum_{\hat{t}}P\left(t,\hat{t}|s_{t,\hat{t}}\right)\log\frac{P\left(t,\hat{t}|s_{t,\hat{t}}\right)}{P\left(t\right) P\left(\hat{t}|s_{t,\hat{t}}\right)}
\end{equation}
We define the ratio of preserved information $R(t, s)$ as in Equation~\ref{eq:ri}, where $H(t)$ denotes the information entropy of $t$. $R(t,s)$ measures the degree of alignment between symbols and objects.
\begin{equation}\label{eq:ri}
R\left(t,s\right)=\frac{I\left(t,\hat{t}|s=s_{t,\hat{t}}\right)}{H\left(t\right)}
\end{equation}
Following Equation~\ref{eq:ri}, we obtain the normalized mutual information matrix $M$ by collecting $R(c_i, s_j)$ for all $i, j$, as in Equation~\ref{eq:mri}.
\begin{equation}\label{eq:mri}
M =
\begin{pmatrix}
R\left(c_0,s_0\right) & R\left(c_0,s_1\right)\\
R\left(c_1,s_0\right) & R\left(c_1,s_1\right)
\end{pmatrix}
\end{equation}
Each column of $M$ corresponds to the semantic information carried by one symbol. In a perfectly compositional language, each symbol represents one specific concept exclusively. Therefore, the similarity between the columns of $M$ and a one-hot vector is aligned with the compositionality of the emergent language.
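For illustration (a constructed example, not one taken from our experiments), consider a perfectly compositional setting in which $s_0$ exclusively encodes $c_0$ and $s_1$ exclusively encodes $c_1$. The matrix then reduces to
\begin{equation*}
M =
\begin{pmatrix}
1 & 0\\
0 & 1
\end{pmatrix},
\end{equation*}
whose columns are exactly one-hot vectors and therefore attain the maximum cosine similarity of $1$.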
\begin{figure}[t]
\centering \includegraphics[width=0.99\columnwidth]{fig/Figure6_Compostionality_of_symbolic_language.pdf}
\caption{Compositionality of symbolic language under different parameters
($[\mu-\sigma,\mu+\sigma]$, where $\mu$ is the mean value and $\sigma$ is
the standard deviation).}
\label{fig:exp1}
\end{figure}
Finally, we define the \emph{raw mutual information similarity} ($\mathit{MIS}_0$)
as the average cosine similarity between the columns of $M$ and one-hot vectors, as
in Equation~\ref{eq:mis2}. $\mathit{MIS}$ is then the mutual
information similarity normalized into the $[0,1]$ range, computed with the
following formula:
\begin{equation}\label{eq:mis2}\begin{aligned}
\mathit{MIS}_0 &= \frac{1}{2}\sum_{j=0}^1\frac{\max_{i=0,1}R\left(c_i,s_j\right)}{\epsilon + \sqrt{\sum_{i=0}^{1}R^2\left(c_i,s_j\right)}}, \epsilon > 0\\
\mathit{MIS} &= 2\mathit{MIS}_0 - 1
\end{aligned}\end{equation}
Generalized to $m$ symbols and $n$ concepts, MIS can be computed with the
following formula:
\begin{equation}\label{eq:misn}\begin{aligned}
\mathit{MIS}_0 &= \frac{1}{m}\sum_{j=0}^{m-1}\frac{\max_{i\in[0,n-1]}R\left(c_i,s_j\right)}{\epsilon + \sqrt{\sum_{i=0}^{n-1}R^2\left(c_i,s_j\right)}}, \epsilon > 0\\
\mathit{MIS} &= \frac{n\cdot \mathit{MIS}_0 - 1}{n-1}
\end{aligned}\end{equation}
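The generalized formula is straightforward to evaluate. The sketch below (our own illustrative code, assuming the entries $R(c_i,s_j)$ have already been estimated from the two agents' policies and stacked into an $n\times m$ matrix) computes $\mathit{MIS}$ with NumPy.
\begin{verbatim}
# MIS from a precomputed matrix M with M[i, j] = R(c_i, s_j)
# (rows index concepts, columns index symbols).
import numpy as np

def mis(M, eps=1e-8):
    n, m = M.shape
    col_norm = np.sqrt((M ** 2).sum(axis=0)) + eps  # per-column L2 norm
    mis0 = (M.max(axis=0) / col_norm).mean()        # mean cosine similarity
    return (n * mis0 - 1) / (n - 1)

# A perfectly compositional 2x2 language yields MIS close to 1:
print(mis(np.array([[1.0, 0.0], [0.0, 1.0]])))      # ~1.0
\end{verbatim}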
MIS is a bilateral metric. Unilateral metrics, e.g., \emph{topographic similarity (topo)}~\cite{} and \emph{posdis}~\cite{}, take only the policy of the speaker into consideration. We provide an example, shown in Figure~\ref{fig:unilateral}, to illustrate the inadequacy of unilateral metrics. In this example, the speaker uses only $s_1$ to represent the shape. From the perspective of the speaker, the language is perfectly compositional (i.e., both topo and posdis are 1). However, the listener cannot determine the shape from $s_1$ alone, revealing the non-compositionality of this language. The bilateral metric MIS addresses this defect by taking the policy of the listener into account, and thus $\mathit{MIS} < 1$.