diff --git a/miniconf/load_site_data.py b/miniconf/load_site_data.py
index 0533044..0dbf4b2 100644
--- a/miniconf/load_site_data.py
+++ b/miniconf/load_site_data.py
@@ -55,26 +55,14 @@ def load_site_data(
         # papers.html
         "main_papers",
         "demo_papers",
-        "srw_papers",
-        "cl_papers",
-        "tacl_papers",
         "paper_recs",
         "papers_projection",
         "main_paper_sessions",
         "demo_paper_sessions",
-        "srw_paper_sessions",
-        "cl_paper_sessions",
-        "tacl_paper_sessions",
         "main_paper_zoom_links",
         "demo_paper_zoom_links",
-        "srw_paper_zoom_links",
-        "cl_paper_zoom_links",
-        "tacl_paper_zoom_links",
         "main_paper_slideslive_ids",
         "demo_paper_slideslive_ids",
-        "srw_paper_slideslive_ids",
-        "cl_paper_slideslive_ids",
-        "tacl_paper_slideslive_ids",
         # socials.html
         "socials",
         # workshops.html
@@ -160,48 +148,28 @@ def load_site_data(
     site_data["plenary_session_days"][0][-1] = "active"
 
     # Papers' progam to their data
-    for p in (
-        site_data["main_papers"] + site_data["cl_papers"] + site_data["tacl_papers"]
-    ):
+    for p in site_data["main_papers"]:
         p["program"] = "main"
 
     for p in site_data["demo_papers"]:
         p["program"] = "demo"
 
-    for p in site_data["srw_papers"]:
-        p["program"] = "srw"
-
-    site_data["programs"] = ["main", "demo", "findings", "srw", "workshop"]
+    site_data["programs"] = ["main", "demo", "findings", "workshop"]
 
     # papers.{html,json}
     papers = build_papers(
-        raw_papers=site_data["main_papers"]
-        + site_data["demo_papers"]
-        + site_data["srw_papers"]
-        + site_data["cl_papers"]
-        + site_data["tacl_papers"],
+        raw_papers=site_data["main_papers"] + site_data["demo_papers"],
         all_paper_sessions=[
             site_data["main_paper_sessions"],
             site_data["demo_paper_sessions"],
-            site_data["srw_paper_sessions"],
-            site_data["cl_paper_sessions"],
-            site_data["tacl_paper_sessions"],
         ],
         qa_session_length_hr=qa_session_length_hr,
         all_paper_zoom_links=site_data["main_paper_zoom_links"]
-        + site_data["demo_paper_zoom_links"]
-        + site_data["srw_paper_zoom_links"]
-        + site_data["cl_paper_zoom_links"]
-        + site_data["tacl_paper_zoom_links"],
-        all_paper_slideslive_ids=site_data["main_paper_slideslive_ids"]
-        + site_data["demo_paper_slideslive_ids"]
-        + site_data["srw_paper_slideslive_ids"]
-        + site_data["cl_paper_slideslive_ids"]
-        + site_data["tacl_paper_slideslive_ids"],
+        + site_data["demo_paper_zoom_links"],
        paper_recs=site_data["paper_recs"],
        paper_images_path=site_data["config"]["paper_images_path"],
     )
-    for prefix in ["main", "demo", "srw", "cl", "tacl"]:
+    for prefix in ["main", "demo"]:
         for suffix in [
             "papers",
             "paper_sessions",
@@ -575,7 +543,6 @@ def build_papers(
     all_paper_sessions: List[Dict[str, Dict[str, Any]]],
     qa_session_length_hr: int,
     all_paper_zoom_links: List[Dict[str, str]],
-    all_paper_slideslive_ids: List[Dict[str, str]],
     paper_recs: Dict[str, List[str]],
     paper_images_path: str,
 ) -> List[Paper]:
@@ -615,14 +582,6 @@ def build_papers(
         assert paper_session_id not in zoom_info_for_paper_session
         zoom_info_for_paper_session[paper_session_id] = item
 
-    # build the lookup from paper to slideslive presentation ID
-    presentation_id_for_paper: Dict[str, str] = {}
-    for item in all_paper_slideslive_ids:
-        paper_id = item["UID"]
-        presentation_id = item["presentation_id"]
-        assert paper_id not in presentation_id_for_paper
-        presentation_id_for_paper[paper_id] = presentation_id
-
     # build the lookup from paper to slots
     sessions_for_paper: DefaultDict[str, List[SessionInfo]] = defaultdict(list)
     for session_name, session_info in chain(
@@ -654,7 +613,7 @@ def build_papers(
             card_image_path=get_card_image_path_for_paper(
                 item["UID"], paper_images_path
             ),
-            presentation_id=presentation_id_for_paper.get(item["UID"]),
+            presentation_id=item.get("presentation_id", None),
             content=PaperContent(
                 title=item["title"],
                 authors=extract_list_field(item, "authors"),
@@ -700,16 +659,16 @@ def build_tutorials(raw_tutorials: List[Dict[str, Any]]) -> List[Tutorial]:
             title=item["title"],
             organizers=item["organizers"],
             abstract=item["abstract"],
-            website=item["website"],
-            material=item["material"],
-            slides=item["slides"],
+            website=item.get("website", None),
+            material=item.get("material", None),
+            slides=item.get("slides", None),
             prerecorded=item.get("prerecorded", ""),
             rocketchat_channel=item.get("rocketchat_channel", ""),
             sessions=[
                 TutorialSessionInfo(
                     session_name=session.get("name"),
                     start_time=session.get("start_time"),
-                    end_time=session.get("start_time"),
+                    end_time=session.get("end_time"),
                     livestream_id=session.get("livestream_id"),
                     zoom_link=session.get("zoom_link"),
                 )
diff --git a/sitedata/cl_paper_sessions.yml b/sitedata/cl_paper_sessions.yml
deleted file mode 100644
index 1617578..0000000
--- a/sitedata/cl_paper_sessions.yml
+++ /dev/null
@@ -1,50 +0,0 @@
-3B:
-  date: 2020-07-06_13:00:00
-  papers:
-    - cl.1482
-4A:
-  date: 2020-07-06_17:00:00
-  papers:
-    - cl.1508
-4B:
-  date: 2020-07-06_18:00:00
-  papers:
-    - cl.1482
-5B:
-  date: 2020-07-06_21:00:00
-  papers:
-    - cl.1508
-6A:
-  date: 2020-07-07_05:00:00
-  papers:
-    - cl.1550
-7A:
-  date: 2020-07-07_08:00:00
-  papers:
-    - cl.1552
-    - cl.1554
-8A:
-  date: 2020-07-07_12:00:00
-  papers:
-    - cl.1554
-8B:
-  date: 2020-07-07_13:00:00
-  papers:
-    - cl.1550
-    - cl.1552
-9A:
-  date: 2020-07-07_17:00:00
-  papers:
-    - cl.1547
-10A:
-  date: 2020-07-07_20:00:00
-  papers:
-    - cl.1547
-13A:
-  date: 2020-07-08_12:00:00
-  papers:
-    - cl.1543
-14B:
-  date: 2020-07-08_18:00:00
-  papers:
-    - cl.1543
diff --git a/sitedata/cl_paper_slideslive_ids.csv b/sitedata/cl_paper_slideslive_ids.csv
deleted file mode 100644
index dfc138b..0000000
--- a/sitedata/cl_paper_slideslive_ids.csv
+++ /dev/null
@@ -1,8 +0,0 @@
-UID,presentation_id
-cl.1482,38929477
-cl.1508,38929478
-cl.1543,38929479
-cl.1547,38929480
-cl.1550,38929481
-cl.1552,38929482
-cl.1554,38929483
diff --git a/sitedata/cl_paper_zoom_links.csv b/sitedata/cl_paper_zoom_links.csv
deleted file mode 100644
index 5705bca..0000000
--- a/sitedata/cl_paper_zoom_links.csv
+++ /dev/null
@@ -1,15 +0,0 @@
-UID,session_name,starttime,endtime,timezone,zoom_join_link
-cl.1482,3B,2020-07-06T13:00:00Z,2020-07-06T14:00:00Z,UTC,https://zoom.us/j/96380723235
-cl.1482,4B,2020-07-06T18:00:00Z,2020-07-06T19:00:00Z,UTC,https://zoom.us/j/92818205051
-cl.1508,4A,2020-07-06T17:00:00Z,2020-07-06T18:00:00Z,UTC,https://zoom.us/j/99765897930
-cl.1508,5B,2020-07-06T21:00:00Z,2020-07-06T22:00:00Z,UTC,https://zoom.us/j/93195309046
-cl.1543,13A,2020-07-08T12:00:00Z,2020-07-08T13:00:00Z,UTC,https://zoom.us/j/92770418210
-cl.1543,14B,2020-07-08T18:00:00Z,2020-07-08T19:00:00Z,UTC,https://zoom.us/j/91164222207
-cl.1547,9A,2020-07-07T17:00:00Z,2020-07-07T18:00:00Z,UTC,https://zoom.us/j/92511745456
-cl.1547,10A,2020-07-07T20:00:00Z,2020-07-07T21:00:00Z,UTC,https://zoom.us/j/92950490812
-cl.1550,6A,2020-07-07T05:00:00Z,2020-07-07T06:00:00Z,UTC,https://zoom.us/j/97243878419
-cl.1550,8B,2020-07-07T13:00:00Z,2020-07-07T14:00:00Z,UTC,https://zoom.us/j/98008069894
-cl.1552,7A,2020-07-07T08:00:00Z,2020-07-07T09:00:00Z,UTC,https://zoom.us/j/99856112071
-cl.1552,8B,2020-07-07T13:00:00Z,2020-07-07T14:00:00Z,UTC,https://zoom.us/j/99082908217
-cl.1554,7A,2020-07-07T08:00:00Z,2020-07-07T09:00:00Z,UTC,https://zoom.us/j/97410112807
-cl.1554,8A,2020-07-07T12:00:00Z,2020-07-07T13:00:00Z,UTC,https://zoom.us/j/97746901084
diff --git a/sitedata/cl_papers.csv b/sitedata/cl_papers.csv
deleted file mode 100644
index 107f62d..0000000
--- a/sitedata/cl_papers.csv
+++ /dev/null
@@ -1,8 +0,0 @@
-UID,title,authors,abstract,keywords,track,paper_type,pdf_url
-cl.1508,"The Design and Implementation of XiaoIce, an Empathetic Social Chatbot",Li Zhou|Jianfeng Gao|Di Li|Heung-Yeung Shum,"This article describes the development of Microsoft XiaoIce, the most popular social chatbot in the world. XiaoIce is uniquely designed as an artifical intelligence companion with an emotional connection to satisfy the human need for communication, affection, and social belonging. We take into account both intelligent quotient and emotional quotient in system design, cast human–machine social chat as decision-making over Markov Decision Processes, and optimize XiaoIce for long-term user engagement, measured in expected Conversation-turns Per Session (CPS). We detail the system architecture and key components, including dialogue manager, core chat, skills, and an empathetic computing module. We show how XiaoIce dynamically recognizes human feelings and states, understands user intent, and responds to user needs throughout long conversations. Since the release in 2014, XiaoIce has communicated with over 660 million active users and succeeded in establishing long-term relationships with many of them. Analysis of large-scale online logs shows that XiaoIce has achieved an average CPS of 23, which is significantly higher than that of other chatbots and even human conversations.",XiaoIce|Microsoft XiaoIce|system design|Markov Processes,Dialogue and Interactive Systems,CL,https://www.mitpressjournals.org/doi/pdf/10.1162/COLI_a_00368
-cl.1482,On the Linguistic Representational Power of Neural Machine Translation Models,Yonatan Belinkov|Nadir Durrani|Fahim Dalvi|Hassan Sajjad|James Glass,"Despite the recent success of deep neural networks in natural language processing and other spheres of artificial intelligence, their interpretability remains a challenge. We analyze the representations learned by neural machine translation (NMT) models at various levels of granularity and evaluate their quality through relevant extrinsic properties. In particular, we seek answers to the following questions: (i) How accurately is word structure captured within the learned representations, which is an important aspect in translating morphologically rich languages? (ii) Do the representations capture long-range dependencies, and effectively handle syntactically divergent languages? (iii) Do the representations capture lexical semantics? We conduct a thorough investigation along several parameters: (i) Which layers in the architecture capture each of these linguistic phenomena; (ii) How does the choice of translation unit (word, character, or subword unit) impact the linguistic properties captured by the underlying representations? (iii) Do the encoder and decoder learn differently and independently? (iv) Do the representations learned by multilingual NMT models capture the same amount of linguistic information as their bilingual counterparts? Our data-driven, quantitative evaluation illuminates important aspects in NMT models and their ability to capture various linguistic phenomena.
We show that deep NMT models trained in an end-to-end fashion, without being provided any direct supervision during the training process, learn a non-trivial amount of linguistic information. Notable findings include the following observations: (i) Word morphology and part-of-speech information are captured at the lower layers of the model; (ii) In contrast, lexical semantics or non-local syntactic and semantic dependencies are better represented at the higher layers of the model; (iii) Representations learned using characters are more informed about word-morphology compared to those learned using subword units; and (iv) Representations learned by multilingual models are richer compared to bilingual models.",Linguistic Models|natural processing|artificial intelligence|translating languages,Machine Translation,CL,https://www.mitpressjournals.org/doi/pdf/10.1162/coli_a_00367 -cl.1552,A Systematic Study of Inner-Attention-Based Sentence Representations in Multilingual Neural Machine Translation,Raúl Vázquez|Alessandro Raganato|Mathias Creutz|Jörg Tiedemann,"Neural machine translation has considerably improved the quality of automatic translations by learning good representations of input sentences. In this article, we explore a multilingual translation model capable of producing fixed-size sentence representations by incorporating an intermediate crosslingual shared layer, which we refer to as attention bridge. This layer exploits the semantics from each language and develops into a language-agnostic meaning representation that can be efficiently used for transfer learning.We systematically study the impact of the size of the attention bridge and the effect of including additional languages in the model. In contrast to related previous work, we demonstrate that there is no conflict between translation performance and the use of sentence representations in downstream tasks. In particular, we show that larger intermediate layers not only improve translation quality, especially for long sentences, but also push the accuracy of trainable classification tasks. Nevertheless, shorter representations lead to increased compression that is beneficial in non-trainable similarity tasks. Similarly, we show that trainable downstream tasks benefit from multilingual models, whereas additional language signals do not improve performance in non-trainable benchmarks. This is an important insight that helps to properly design models for specific applications. Finally, we also include an in-depth analysis of the proposed attention bridge and its ability of encoding linguistic properties. We carefully analyze the information that is captured by individual attention heads and identify interesting patterns that explain the performance of specific settings in linguistic probing tasks.",Multilingual Translation|Neural translation|transfer learning|translation,Machine Translation,CL,https://www.mitpressjournals.org/doi/pdf/10.1162/coli_a_00377 -cl.1547,LINSPECTOR: Multilingual Probing Tasks for Word Representations,Gözde Gül Sahin|Clara Vania|Ilia Kuznetsov|Iryna Gurevych,"Despite an ever growing number of word representation models introduced for a large number of languages, there is a lack of a standardized technique to provide insights into what is captured by these models. 
Such insights would help the community to get an estimate of the downstream task performance, as well as to design more informed neural architectures, while avoiding extensive experimentation which requires substantial computational resources not all researchers have access to. A recent development in NLP is to use simple classification tasks, also called probing tasks, that test for a single linguistic feature such as part-of-speech. Existing studies mostly focus on exploring the linguistic information encoded by the continuous representations of English text. However, from a typological perspective the morphologically poor English is rather an outlier: the information encoded by the word order and function words in English is often stored on a subword, morphological level in other languages. To address this, we introduce 15 type-level probing tasks such as case marking, possession, word length, morphological tag count and pseudoword identification for 24 languages. We present a reusable methodology for creation and evaluation of such tests in a multilingual setting, which is challenging due to lack of resources, lower quality of tools and differences among languages. We then present experiments on several diverse multilingual word embedding models, in which we relate the probing task performance for a diverse set of languages to a range of five classic NLP tasks: POS-tagging, dependency parsing, semantic role labeling, named entity recognition and natural language inference. We find that a number of probing tests have significantly high positive correlation to the downstream tasks, especially for morphologically rich languages. We show that our tests can be used to explore word embeddings or black-box neural models for linguistic cues in a multilingual setting. We release the probing datasets and the evaluation suite LINSPECTOR with https://github.com/UKPLab/linspector.",Word Representations|NLP|classification tasks|probing tasks,Resources and Evaluation,CL,https://www.mitpressjournals.org/doi/pdf/10.1162/coli_a_00376?mobileUi=0 -cl.1550,Unsupervised Word Translation with Adversarial Autoencoder,Tasnim Mohiuddin|Shafiq Joty,"Crosslingual word embeddings learned from monolingual embeddings have a crucial role in many downstream tasks, ranging from machine translation to transfer learning. Adversarial training has shown impressive success in learning crosslingual embeddings and the associated word translation task without any parallel data by mapping monolingual embeddings to a shared space. However, recent work has shown superior performance for non-adversarial methods in more challenging language pairs. In this article, we investigate adversarial autoencoder for unsupervised word translation and propose two novel extensions to it that yield more stable training and improved results. Our method includes regularization terms to enforce cycle consistency and input reconstruction, and puts the target encoders as an adversary against the corresponding discriminator. We use two types of refinement procedures sequentially after obtaining the trained encoders and mappings from the adversarial training, namely, refinement with Procrustes solution and refinement with symmetric re-weighting. Extensive experimentations with high- and low-resource languages from two different data sets show that our method achieves better performance than existing adversarial and non-adversarial approaches and is also competitive with the supervised system. 
Along with performing comprehensive ablation studies to understand the contribution of different components of our adversarial model, we also conduct a thorough analysis of the refinement procedures to understand their effects.",Unsupervised Translation|machine translation|transfer learning|word task,Machine Translation,CL,https://www.mitpressjournals.org/doi/pdf/10.1162/COLI_a_00374 -cl.1554,Abstract Syntax as Interlingua: Scaling Up the Grammatical Framework from Controlled Languages to Robust Pipelines,Aarne Ranta|Krasimir Angelov|Normunds Gruzitis|Prasanth Kolachina,"Abstract syntax is an interlingual representation used in compilers. Grammatical Framework (GF) applies the abstract syntax idea to natural languages. The development of GF started in 1998, first as a tool for controlled language implementations, where it has gained an established position in both academic and commercial projects. GF provides grammar resources for over 40 languages, enabling accurate generation and translation, as well as grammar engineering tools and components for mobile and Web applications. On the research side, the focus in the last ten years has been on scaling up GF to wide-coverage language processing. The concept of abstract syntax offers a unified view on many other approaches: Universal Dependencies, WordNets, FrameNets, Construction Grammars, and Abstract Meaning Representations. This makes it possible for GF to utilize data from the other approaches and to build robust pipelines. In return, GF can contribute to data-driven approaches by methods to transfer resources from one language to others, to augment data by rule-based generation, to check the consistency of hand-annotated corpora, and to pipe analyses into high-precision semantic back ends. This article gives an overview of the use of abstract syntax as interlingua through both established and emerging NLP applications involving GF.",Abstract Syntax|controlled implementations|accurate generation|accurate translation,"Syntax: Tagging, Chunking and Parsing",CL,https://www.mitpressjournals.org/doi/pdf/10.1162/coli_a_00378 -cl.1543,LESSLEX: Linking Multilingual Embeddings to SenSe Representations of Lexical Items,Davide Colla|Enrico Mensa|Daniele P. Radicioni,"We present LESSLEX, a novel multilingual lexical resource. Different from the vast majority of existing approaches, we ground our embeddings on a sense inventory made available from the BabelNet semantic network. In this setting, multilingual access is governed by the mapping of terms onto their underlying sense descriptions, such that all vectors co-exist in the same semantic space. As a result, for each term we have thus the 'blended' terminological vector along with those describing all senses associated to that term. LessLex has been tested on three tasks relevant to lexical semantics: conceptual similarity, contextual similarity, and semantic text similarity: we experimented over the principal data sets for such tasks in their multilingual and cross-lingual variants, improving on or closely approaching state-of-the-art results. 
We conclude by arguing that LessLex vectors may be relevant for practical applications and for research on conceptual and lexical access and competence.",SenSe Items|SenSe |LESSLEX|Multilingual Embeddings,Semantics: Lexical,CL,https://www.mitpressjournals.org/doi/pdf/10.1162/coli_a_00375 diff --git a/sitedata/main_papers.csv b/sitedata/main_papers.csv index 7a305eb..faad196 100644 --- a/sitedata/main_papers.csv +++ b/sitedata/main_papers.csv @@ -1,779 +1,783 @@ -UID,title,authors,abstract,keywords,track,paper_type,pdf_url -main.8,Large Scale Multi-Actor Generative Dialog Modeling,Alex Boyd|Raul Puri|Mohammad Shoeybi|Mostofa Patwary|Bryan Catanzaro,"Non-goal oriented dialog agents (i.e. chatbots) aim to produce varying and engaging conversations with a user; however, they typically exhibit either inconsistent personality across conversations or the average personality of all users. This paper addresses these issues by controlling an agent's persona upon generation via conditioning on prior conversations of a target actor. In doing so, we are able to utilize more abstract patterns within a person's speech and better emulate them in generated responses. This work introduces the Generative Conversation Control model, an augmented and fine-tuned GPT-2 language model that conditions on past reference conversations to probabilistically model multi-turn conversations in the actor's persona. We introduce an accompanying data collection procedure to obtain 10.3M conversations from 6 months worth of Reddit comments. We demonstrate that scaling model sizes from 117M to 8.3B parameters yields an improvement from 23.14 to 13.14 perplexity on 1.7M held out Reddit conversations. Increasing model scale yielded similar improvements in human evaluations that measure preference of model samples to the held out target distribution in terms of realism (31% increased to 37% preference), style matching (37% to 42%), grammar and content quality (29% to 42%), and conversation coherency (32% to 40%). We find that conditionally modeling past conversations improves perplexity by 0.47 in automatic evaluations. Through human trials we identify positive trends between conditional modeling and style matching and outline steps to further improve persona control.",Large Modeling|generation|style matching|automatic evaluations,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.8.pdf -main.52,CDL: Curriculum Dual Learning for Emotion-Controllable Response Generation,Lei Shen|Yang Feng,"Emotion-controllable response generation is an attractive and valuable task that aims to make open-domain conversations more empathetic and engaging. Existing methods mainly enhance the emotion expression by adding regularization terms to standard cross-entropy loss and thus influence the training process. However, due to the lack of further consideration of content consistency, the common problem of response generation tasks, safe response, is intensified. Besides, query emotions that can help model the relationship between query and response are simply ignored in previous models, which would further hurt the coherence. To alleviate these problems, we propose a novel framework named Curriculum Dual Learning (CDL) which extends the emotion-controllable response generation to a dual task to generate emotional responses and emotional queries alternatively. CDL utilizes two rewards focusing on emotion and content to improve the duality. 
Additionally, it applies curriculum learning to gradually generate high-quality responses based on the difficulties of expressing various emotions. Experimental results show that CDL significantly outperforms the baselines in terms of coherence, diversity, and relation to emotion factors.",Emotion-Controllable Generation|training process|response tasks|CDL,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.52.pdf -main.46,Emergence of Syntax Needs Minimal Supervision,Raphaël Bailly|Kata Gábor,"This paper is a theoretical contribution to the debate on the learnability of syntax from a corpus without explicit syntax-specific guidance. Our approach originates in the observable structure of a corpus, which we use to define and isolate grammaticality (syntactic information) and meaning/pragmatics information. We describe the formal characteristics of an autonomous syntax and show that it becomes possible to search for syntax-based lexical categories with a simple optimization process, without any prior hypothesis on the form of the model.",Syntax|optimization process|syntax|corpus guidance,Theory and Formalism in NLP (Linguistic and Mathematical),Long,https://www.aclweb.org/anthology/2020.acl-main.46.pdf -main.359,Selecting Backtranslated Data from Multiple Sources for Improved Neural Machine Translation,Xabier Soto|Dimitar Shterionov|Alberto Poncelas|Andy Way,"Machine translation (MT) has benefited from using synthetic training data originating from translating monolingual corpora, a technique known as backtranslation. Combining backtranslated data from different sources has led to better results than when using such data in isolation. In this work we analyse the impact that data translated with rule-based, phrase-based statistical and neural MT systems has on new MT systems. We use a real-world low-resource use-case (Basque-to-Spanish in the clinical domain) as well as a high-resource language pair (German-to-English) to test different scenarios with backtranslation and employ data selection to optimise the synthetic corpora. We exploit different data selection strategies in order to reduce the amount of data used, while at the same time maintaining high-quality MT systems. We further tune the data selection method by taking into account the quality of the MT systems used for backtranslation and lexical diversity of the resulting corpora. Our experiments show that incorporating backtranslated data from different sources can be beneficial, and that availing of data selection can yield improved performance.",Neural Translation|Machine MT|new systems|MT systems,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.359.pdf -main.417,ParaCrawl: Web-Scale Acquisition of Parallel Corpora,Marta Bañón|Pinzhen Chen|Barry Haddow|Kenneth Heafield|Hieu Hoang|Miquel Esplà-Gomis|Mikel L. Forcada|Amir Kamran|Faheem Kirefu|Philipp Koehn|Sergio Ortiz Rojas|Leopoldo Pla Sempere|Gema Ramírez-Sánchez|Elsa Sarrías|Marek Strelec|Brian Thompson|William Waites|Dion Wiggins|Jaume Zaragoza,"We report on methods to create the largest publicly available parallel corpora by crawling the web, using open source software. We empirically compare alternative methods and publish benchmark data sets for sentence alignment and sentence pair filtering. 
We also describe the parallel corpora released and evaluate their quality and their usefulness to create machine translation systems.",sentence alignment|sentence filtering|machine systems|ParaCrawl,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.417.pdf -main.371,From Arguments to Key Points: Towards Automatic Argument Summarization,Roy Bar-Haim|Lilach Eden|Roni Friedman|Yoav Kantor|Dan Lahav|Noam Slonim,"Generating a concise summary from a large collection of arguments on a given topic is an intriguing yet understudied problem. We propose to represent such summaries as a small set of talking points, termed key points, each scored according to its salience. We show, by analyzing a large dataset of crowd-contributed arguments, that a small number of key points per topic is typically sufficient for covering the vast majority of the arguments. Furthermore, we found that a domain expert can often predict these key points in advance. We study the task of argument-to-key point mapping, and introduce a novel large-scale dataset for this task. We report empirical results for an extensive set of experiments with this dataset, showing promising performance.",Automatic Summarization|argument-to-key mapping|Arguments|crowd-contributed arguments,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.371.pdf -main.365,Analysing Lexical Semantic Change with Contextualised Word Representations,Mario Giulianelli|Marco Del Tredici|Raquel Fernández,"This paper presents the first unsupervised approach to lexical semantic change that makes use of contextualised word representations. We propose a novel method that exploits the BERT neural language model to obtain representations of word usages, clusters these representations into usage types, and measures change along time with three proposed metrics. We create a new evaluation dataset and show that the model representations and the detected semantic shifts are positively correlated with human judgements. Our extensive qualitative analysis demonstrates that our method captures a variety of synchronic and diachronic linguistic phenomena. We expect our work to inspire further research in this direction.",Contextualised Representations|unsupervised approach|BERT model|model representations,Semantics: Lexical,Long,https://www.aclweb.org/anthology/2020.acl-main.365.pdf -main.403,Analyzing Political Parody in Social Media,Antonios Maronikolakis|Danae Sánchez Villegas|Daniel Preotiuc-Pietro|Nikolaos Aletras,"Parody is a figurative device used to imitate an entity for comedic or critical purposes and represents a widespread phenomenon in social media through many popular parody accounts. In this paper, we present the first computational study of parody. We introduce a new publicly available data set of tweets from real politicians and their corresponding parody accounts. We run a battery of supervised machine learning models for automatically detecting parody tweets with an emphasis on robustness by testing on tweets from accounts unseen in training, across different genders and across countries. Our results show that political parody tweets can be predicted with an accuracy up to 90%. Finally, we identify the markers of parody through a linguistic analysis. 
Beyond research in linguistics and political communication, accurately and automatically detecting parody is important to improving fact checking for journalists and analytics such as sentiment analysis through filtering out parodical utterances.",comedic purposes|computational parody|automatically tweets|fact checking,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.403.pdf -main.91,Query Graph Generation for Answering Multi-hop Complex Questions from Knowledge Bases,Yunshi Lan|Jing Jiang,"Previous work on answering complex questions from knowledge bases usually separately addresses two types of complexity: questions with constraints and questions with multiple hops of relations. In this paper, we handle both types of complexity at the same time. Motivated by the observation that early incorporation of constraints into query graphs can more effectively prune the search space, we propose a modified staged query graph generation method with more flexible ways to generate query graphs. Our experiments clearly show that our method achieves the state of the art on three benchmark KBQA datasets.",Query Generation|Answering Questions|staged method|constraints,Question Answering,Short,https://www.aclweb.org/anthology/2020.acl-main.91.pdf -main.85,Contextualized Sparse Representations for Real-Time Open-Domain Question Answering,Jinhyuk Lee|Minjoon Seo|Hannaneh Hajishirzi|Jaewoo Kang,"Open-domain question answering can be formulated as a phrase retrieval problem, in which we can expect huge scalability and speed benefit but often suffer from low accuracy due to the limitation of existing phrase representation models. In this paper, we aim to improve the quality of each phrase embedding by augmenting it with a contextualized sparse representation (Sparc). Unlike previous sparse vectors that are term-frequency-based (e.g., tf-idf) or directly learned (only few thousand dimensions), we leverage rectified self-attention to indirectly learn sparse vectors in n-gram vocabulary space. By augmenting the previous phrase retrieval model (Seo et al., 2019) with Sparc, we show 4%+ improvement in CuratedTREC and SQuAD-Open. Our CuratedTREC score is even better than the best known retrieve & read model with at least 45x faster inference speed.",Real-Time Answering|Open-domain answering|phrase problem|Contextualized Representations,Question Answering,Short,https://www.aclweb.org/anthology/2020.acl-main.85.pdf -main.629,Transition-based Semantic Dependency Parsing with Pointer Networks,Daniel Fernández-González|Carlos Gómez-Rodríguez,"Transition-based parsers implemented with Pointer Networks have become the new state of the art in dependency parsing, excelling in producing labelled syntactic trees and outperforming graph-based models in this task. In order to further test the capabilities of these powerful neural networks on a harder NLP problem, we propose a transition system that, thanks to Pointer Networks, can straightforwardly produce labelled directed acyclic graphs and perform semantic dependency parsing. In addition, we enhance our approach with deep contextualized word embeddings extracted from BERT. 
The resulting system not only outperforms all existing transition-based models, but also matches the best fully-supervised accuracy to date on the SemEval 2015 Task 18 datasets among previous state-of-the-art graph-based parsers.",dependency parsing|harder problem|NLP problem|semantic parsing,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.629.pdf -main.173,On Faithfulness and Factuality in Abstractive Summarization,Joshua Maynez|Shashi Narayan|Bernd Bohnet|Ryan McDonald,"It is well known that the standard likelihood training and approximate decoding objectives in neural text generation models lead to less human-like responses for open-ended tasks such as language modeling and story generation. In this paper we have analyzed limitations of these models for abstractive document summarization and found that these models are highly prone to hallucinate content that is unfaithful to the input document. We conducted a large scale human evaluation of several neural abstractive summarization systems to better understand the types of hallucinations they produce. Our human annotators found substantial amounts of hallucinated content in all model generated summaries. However, our analysis does show that pretrained models are better summarizers not only in terms of raw metrics, i.e., ROUGE, but also in generating faithful and factual summaries as evaluated by humans. Furthermore, we show that textual entailment measures better correlate with faithfulness than standard metrics, potentially leading the way to automatic evaluation metrics as well as training and decoding criteria.",Abstractive Summarization|likelihood objectives|open-ended tasks|language modeling,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.173.pdf -main.615,Generalized Entropy Regularization or: There's Nothing Special about Label Smoothing,Clara Meister|Elizabeth Salesky|Ryan Cotterell,"Prior work has explored directly regularizing the output distributions of probabilistic models to alleviate peaky (i.e. over-confident) predictions, a common sign of overfitting. This class of techniques, of which label smoothing is one, has a connection to entropy regularization. Despite the consistent success of label smoothing across architectures and data sets in language generation tasks, two problems remain open: (1) there is little understanding of the underlying effects entropy regularizers have on models, and (2) the full space of entropy regularization techniques is largely unexplored. We introduce a parametric family of entropy regularizers, which includes label smoothing as a special case, and use it to gain a better understanding of the relationship between the entropy of a model and its performance on language generation tasks. We also find that variance in model performance can be explained largely by the resulting entropy of the model. Lastly, we find that label smoothing provably does not allow for sparsity in an output distribution, an undesirable property for language generation models, and therefore advise the use of other entropy regularization methods in its place.",label smoothing|language tasks|Generalized Regularization|Label Smoothing,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.615.pdf -main.601,Low-Resource Generation of Multi-hop Reasoning Questions,Jianxing Yu|Wei Liu|Shuang Qiu|Qinliang Su|Kai Wang|Xiaojun Quan|Jian Yin,"This paper focuses on generating multi-hop reasoning questions from the raw text in a low resource circumstance. 
Such questions have to be syntactically valid and need to logically correlate with the answers by deducing over multiple relations on several sentences in the text. Specifically, we first build a multi-hop generation model and guide it to satisfy the logical rationality by the reasoning chain extracted from a given text. Since the labeled data is limited and insufficient for training, we propose to learn the model with the help of a large scale of unlabeled data that is much easier to obtain. Such data contains rich expressive forms of the questions with structural patterns on syntax and semantics. These patterns can be estimated by the neural hidden semi-Markov model using latent variables. With latent patterns as a prior, we can regularize the generation model and produce the optimal results. Experimental results on the HotpotQA data set demonstrate the effectiveness of our model. Moreover, we apply the generated results to the task of machine reading comprehension and achieve significant performance improvements.",Low-Resource Questions|generating questions|machine comprehension|multi-hop model,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.601.pdf -main.167,GPT-too: A Language-Model-First Approach for AMR-to-Text Generation,Manuel Mager|Ramón Fernandez Astudillo|Tahira Naseem|Md Arafat Sultan|Young-Suk Lee|Radu Florian|Salim Roukos,"Abstract Meaning Representations (AMRs) are broad-coverage sentence-level semantic graphs. Existing approaches to generating text from AMR have focused on training sequence-to-sequence or graph-to-sequence models on AMR annotated data only. In this paper, we propose an alternative approach that combines a strong pre-trained language model with cycle consistency-based re-scoring. Despite the simplicity of the approach, our experimental results show these models outperform all previous techniques on the English LDC2017T10 dataset, including the recent use of transformer architectures. In addition to the standard evaluation metrics, we provide human evaluation experiments that further substantiate the strength of our approach.",AMR-to-Text Generation|GPT-too|Language-Model-First Approach|AMRs,Generation,Short,https://www.aclweb.org/anthology/2020.acl-main.167.pdf -main.198,Stolen Probability: A Structural Weakness of Neural Language Models,David Demeter|Gregory Kimmel|Doug Downey,"Neural Network Language Models (NNLMs) generate probability distributions by applying a softmax function to a distance metric formed by taking the dot product of a prediction vector with all word vectors in a high-dimensional embedding space. The dot-product distance metric forms part of the inductive bias of NNLMs. Although NNLMs optimize well with this inductive bias, we show that this results in a sub-optimal ordering of the embedding space that structurally impoverishes some words at the expense of others when assigning probability. We present numerical, theoretical and empirical analyses which show that words on the interior of the convex hull in the embedding space have their probability bounded by the probabilities of the words on the hull.",Neural Models|Neural NNLMs|NNLMs|softmax function,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.198.pdf -main.749,Hierarchical Entity Typing via Multi-level Learning to Rank,Tongfei Chen|Yunmo Chen|Benjamin Van Durme,"We propose a novel method for hierarchical entity classification that embraces ontological structure at both training and during prediction. 
At training, our novel multi-level learning-to-rank loss compares positive types against negative siblings according to the type tree. During prediction, we define a coarse-to-fine decoder that restricts viable candidates at each level of the ontology based on already predicted parent type(s). Our approach significantly outperform prior work on strict accuracy, demonstrating the effectiveness of our method.",Hierarchical Typing|hierarchical classification|prediction|Multi-level Learning,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.749.pdf -main.761,DeSePtion: Dual Sequence Prediction and Adversarial Examples for Improved Fact-Checking,Christopher Hidey|Tuhin Chakrabarty|Tariq Alhindi|Siddharth Varia|Kriste Krstovski|Mona Diab|Smaranda Muresan,"The increased focus on misinformation has spurred development of data and systems for detecting the veracity of a claim as well as retrieving authoritative evidence. The Fact Extraction and VERification (FEVER) dataset provides such a resource for evaluating endto- end fact-checking, requiring retrieval of evidence from Wikipedia to validate a veracity prediction. We show that current systems for FEVER are vulnerable to three categories of realistic challenges for fact-checking – multiple propositions, temporal reasoning, and ambiguity and lexical variation – and introduce a resource with these types of claims. Then we present a system designed to be resilient to these “attacks” using multiple pointer networks for document selection and jointly modeling a sequence of evidence sentences and veracity relation predictions. We find that in handling these attacks we obtain state-of-the-art results on FEVER, largely due to improved evidence retrieval.",retrieving evidence|endto- fact-checking|veracity prediction|FEVER,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.761.pdf -main.775,"Extracting Headless MWEs from Dependency Parse Trees: Parsing, Tagging, and Joint Modeling Approaches",Tianze Shi|Lillian Lee,"An interesting and frequent type of multi-word expression (MWE) is the headless MWE, for which there are no true internal syntactic dominance relations; examples include many named entities (“Wells Fargo”) and dates (“July 5, 2020”) as well as certain productive constructions (“blow for blow”, “day after day”). Despite their special status and prevalence, current dependency-annotation schemes require treating such flat structures as if they had internal syntactic heads, and most current parsers handle them in the same fashion as headed constructions. Meanwhile, outside the context of parsing, taggers are typically used for identifying MWEs, but taggers might benefit from structural information. We empirically compare these two common strategies—parsing and tagging—for predicting flat MWEs. Additionally, we propose an efficient joint decoding algorithm that combines scores from both strategies. 
Experimental results on the MWE-Aware English Dependency Corpus and on six non-English dependency treebanks with frequent flat structures show that: (1) tagging is more accurate than parsing for identifying flat-structure MWEs, (2) our joint decoder reconciles the two different views and, for non-BERT features, leads to higher accuracies, and (3) most of the gains result from feature sharing between the parsers and taggers.",parsing|tagging|predicting MWEs|identifying MWEs,"Syntax: Tagging, Chunking and Parsing",Long,https://www.aclweb.org/anthology/2020.acl-main.775.pdf -main.239,Low Resource Sequence Tagging using Sentence Reconstruction,Tal Perl|Sriram Chaudhury|Raja Giryes,"This work revisits the task of training sequence tagging models with limited resources using transfer learning. We investigate several proposed approaches introduced in recent works and suggest a new loss that relies on sentence reconstruction from normalized embeddings. Specifically, our method demonstrates how by adding a decoding layer for sentence reconstruction, we can improve the performance of various baselines. We show improved results on the CoNLL02 NER and UD 1.2 POS datasets and demonstrate the power of the method for transfer learning with low-resources achieving 0.6 F1 score in Dutch using only one sample from it.",sentence reconstruction|transfer learning|Low Tagging|Sentence Reconstruction,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.239.pdf -main.563,A Contextual Hierarchical Attention Network with Adaptive Objective for Dialogue State Tracking,Yong Shan|Zekang Li|Jinchao Zhang|Fandong Meng|Yang Feng|Cheng Niu|Jie Zhou,"Recent studies in dialogue state tracking (DST) leverage historical information to determine states which are generally represented as slot-value pairs. However, most of them have limitations to efficiently exploit relevant context due to the lack of a powerful mechanism for modeling interactions between the slot and the dialogue history. Besides, existing methods usually ignore the slot imbalance problem and treat all slots indiscriminately, which limits the learning of hard slots and eventually hurts overall performance. In this paper, we propose to enhance the DST through employing a contextual hierarchical attention network to not only discern relevant information at both word level and turn level but also learn contextual representations. We further propose an adaptive objective to alleviate the slot imbalance problem by dynamically adjust weights of different slots during training. Experimental results show that our approach reaches 52.68% and 58.55% joint accuracy on MultiWOZ 2.0 and MultiWOZ 2.1 datasets respectively and achieves new state-of-the-art performance with considerable improvements (+1.24% and +5.98%).",Dialogue Tracking|slot problem|Contextual Network|DST,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.563.pdf -main.205,Efficient Strategies for Hierarchical Text Classification: External Knowledge and Auxiliary Tasks,Kervy Rivas Rojas|Gina Bustamante|Arturo Oncevay|Marco Antonio Sobrevilla Cabezudo,"In hierarchical text classification, we perform a sequence of inference steps to predict the category of a document from top to bottom of a given class taxonomy. Most of the studies have focused on developing novels neural network architectures to deal with the hierarchical structure, but we prefer to look for efficient ways to strengthen a baseline model. 
We first define the task as a sequence-to-sequence problem. Afterwards, we propose an auxiliary synthetic task of bottom-up-classification. Then, from external dictionaries, we retrieve textual definitions for the classes of all the hierarchy's layers, and map them into the word vector space. We use the class-definition embeddings as an additional input to condition the prediction of the next layer and in an adapted beam search. Whereas the modified search did not provide large gains, the combination of the auxiliary task and the additional input of class-definitions significantly enhance the classification accuracy. With our efficient approaches, we outperform previous studies, using a drastically reduced number of parameters, in two well-known English datasets.",Hierarchical Classification|External Tasks|sequence-to-sequence problem|auxiliary bottom-up-classification,NLP Applications,Short,https://www.aclweb.org/anthology/2020.acl-main.205.pdf -main.211,Interactive Machine Comprehension with Information Seeking Agents,Xingdi Yuan|Jie Fu|Marc-Alexandre Côté|Yi Tay|Chris Pal|Adam Trischler,"Existing machine reading comprehension (MRC) models do not scale effectively to real-world applications like web-level information retrieval and question answering (QA). We argue that this stems from the nature of MRC datasets: most of these are static environments wherein the supporting documents and all necessary information are fully observed. In this paper, we propose a simple method that reframes existing MRC datasets as interactive, partially observable environments. Specifically, we ""occlude"" the majority of a document's text and add context-sensitive commands that reveal ""glimpses"" of the hidden text to a model. We repurpose SQuAD and NewsQA as an initial case study, and then show how the interactive corpora can be used to train a model that seeks relevant information through sequential decision making. We believe that this setting can contribute in scaling models to web-level QA scenarios.",Interactive Comprehension|real-world applications|web-level retrieval|question answering,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.211.pdf -main.577,Named Entity Recognition as Dependency Parsing,Juntao Yu|Bernd Bohnet|Massimo Poesio,"Named Entity Recognition (NER) is a fundamental task in Natural Language Processing, concerned with identifying spans of text expressing references to entities. NER research is often focused on flat entities only (flat NER), ignoring the fact that entity references can be nested, as in [Bank of [China]] (Finkel and Manning, 2009). In this paper, we use ideas from graph-based dependency parsing to provide our model a global view on the input via a biaffine model (Dozat and Manning, 2017). The biaffine model scores pairs of start and end tokens in a sentence which we use to explore all spans, so that the model is able to predict named entities accurately. 
We show that the model works well for both nested and flat NER through evaluation on 8 corpora and achieving SoTA performance on all of them, with accuracy gains of up to 2.2 percentage points.",Named Recognition|NER|Natural Processing|NER research,Information Extraction,Short,https://www.aclweb.org/anthology/2020.acl-main.577.pdf -main.588,Dependency Graph Enhanced Dual-transformer Structure for Aspect-based Sentiment Classification,Hao Tang|Donghong Ji|Chenliang Li|Qiji Zhou,"Aspect-based sentiment classification is a popular task aimed at identifying the corresponding emotion of a specific aspect. One sentence may contain various sentiments for different aspects. Many sophisticated methods such as attention mechanism and Convolutional Neural Networks (CNN) have been widely employed for handling this challenge. Recently, semantic dependency tree implemented by Graph Convolutional Networks (GCN) is introduced to describe the inner connection between aspects and the associated emotion words. But the improvement is limited due to the noise and instability of dependency trees. To this end, we propose a dependency graph enhanced dual-transformer network (named DGEDT) by jointly considering the flat representations learnt from Transformer and graph-based representations learnt from the corresponding dependency graph in an iterative interaction manner. Specifically, a dual-transformer structure is devised in DGEDT to support mutual reinforcement between the flat representation learning and graph-based representation learning. The idea is to allow the dependency graph to guide the representation learning of the transformer encoder and vice versa. The results on five datasets demonstrate that the proposed DGEDT outperforms all state-of-the-art alternatives with a large margin.",Aspect-based Classification|Dependency Structure|attention mechanism|Convolutional CNN,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.588.pdf -main.589,Differentiable Window for Dynamic Local Attention,Thanh-Tung Nguyen|Xuan-Phi Nguyen|Shafiq Joty|Xiaoli Li,"We propose Differentiable Window, a new neural module and general purpose component for dynamic window selection. While universally applicable, we demonstrate a compelling use case of utilizing Differentiable Window to improve standard attention modules by enabling more focused attentions over the input regions. We propose two variants of Differentiable Window, and integrate them within the Transformer architecture in two novel ways. We evaluate our proposed approach on a myriad of NLP tasks, including machine translation, sentiment analysis, subject-verb agreement and language modeling. Our experimental results demonstrate consistent and sizable improvements across all tasks.",Dynamic Attention|dynamic selection|NLP tasks|machine translation,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.589.pdf -main.210,INFOTABS: Inference on Tables as Semi-structured Data,Vivek Gupta|Maitrey Mehta|Pegah Nokhiz|Vivek Srikumar,"In this paper, we observe that semi-structured tabulated text is ubiquitous; understanding them requires not only comprehending the meaning of text fragments, but also implicit relationships between them. We argue that such data can prove as a testing ground for understanding how we reason about information. To study this, we introduce a new dataset called INFOTABS, comprising of human-written textual hypotheses based on premises that are tables extracted from Wikipedia info-boxes. 
Our analysis shows that the semi-structured, multi-domain and heterogeneous nature of the premises admits complex, multi-faceted reasoning. Experiments reveal that, while human annotators agree on the relationships between a table-hypothesis pair, several standard modeling strategies are unsuccessful at the task, suggesting that reasoning about tables can pose a difficult modeling challenge.",INFOTABS|complex reasoning|modeling strategies|meaning fragments,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.210.pdf -main.576,MIE: A Medical Information Extractor towards Medical Dialogues,Yuanzhe Zhang|Zhongtao Jiang|Tao Zhang|Shiwan Liu|Jiarun Cao|Kang Liu|Shengping Liu|Jun Zhao,"Electronic Medical Records (EMRs) have become key components of modern medical care systems. Despite the merits of EMRs, many doctors suffer from writing them, which is time-consuming and tedious. We believe that automatically converting medical dialogues to EMRs can greatly reduce the burdens of doctors, and extracting information from medical dialogues is an essential step. To this end, we annotate online medical consultation dialogues in a window-sliding style, which is much easier than the sequential labeling annotation. We then propose a Medical Information Extractor (MIE) towards medical dialogues. MIE is able to extract mentioned symptoms, surgeries, tests, other information and their corresponding status. To tackle the particular challenges of the task, MIE uses a deep matching architecture, taking dialogue turn-interaction into account. The experimental results demonstrate MIE is a promising solution to extract medical information from doctor-patient dialogues.",medical systems|MIE|Medical Extractor|EMRs,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.576.pdf -main.562,To Boldly Query What No One Has Annotated Before? The Frontiers of Corpus Querying,Markus Gärtner|Kerstin Jung,"Corpus query systems exist to address the multifarious information needs of any person interested in the content of annotated corpora. In this role they play an important part in making those resources usable for a wider audience. Over the past decades, several such query systems and languages have emerged, varying greatly in their expressiveness and technical details. This paper offers a broad overview of the history of corpora and corpus query tools. It focusses strongly on the query side and hints at exciting directions for future development.",Corpus systems|query systems|corpus tools|multifarious needs,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.562.pdf -main.204,DeeBERT: Dynamic Early Exiting for Accelerating BERT Inference,Ji Xin|Raphael Tang|Jaejun Lee|Yaoliang Yu|Jimmy Lin,"Large-scale pre-trained language models such as BERT have brought significant improvements to NLP applications. However, they are also notorious for being slow in inference, which makes them difficult to deploy in real-time applications. We propose a simple but effective method, DeeBERT, to accelerate BERT inference. Our approach allows samples to exit earlier without passing through the entire model. Experiments show that DeeBERT is able to save up to ~40% inference time with minimal degradation in model quality. Further analyses show different behaviors in the BERT transformer layers and also reveal their redundancy. Our work provides new ideas to efficiently apply deep transformer-based models to downstream tasks. 
Code is available at https://github.com/castorini/DeeBERT.",Accelerating Inference|NLP applications|inference|real-time applications,NLP Applications,Short,https://www.aclweb.org/anthology/2020.acl-main.204.pdf -main.238,Knowledge Graph Embedding Compression,Mrinmaya Sachan,"Knowledge graph (KG) representation learning techniques that learn continuous embeddings of entities and relations in the KG have become popular in many AI applications. With a large KG, the embeddings consume a large amount of storage and memory. This is problematic and prohibits the deployment of these techniques in many real world settings. Thus, we propose an approach that compresses the KG embedding layer by representing each entity in the KG as a vector of discrete codes and then composes the embeddings from these codes. The approach can be trained end-to-end with simple modifications to any existing KG embedding technique. We evaluate the approach on various standard KG embedding evaluations and show that it achieves 50-1000x compression of embeddings with a minor loss in performance. The compressed embeddings also retain the ability to perform various reasoning tasks such as KG inference.",AI applications|reasoning tasks|KG inference|Knowledge Compression,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.238.pdf -main.774,Uncertain Natural Language Inference,Tongfei Chen|Zhengping Jiang|Adam Poliak|Keisuke Sakaguchi|Benjamin Van Durme,"We introduce Uncertain Natural Language Inference (UNLI), a refinement of Natural Language Inference (NLI) that shifts away from categorical labels, targeting instead the direct prediction of subjective probability assessments. We demonstrate the feasibility of collecting annotations for UNLI by relabeling a portion of the SNLI dataset under a probabilistic scale, where items even with the same categorical label differ in how likely people judge them to be true given a premise. We describe a direct scalar regression modeling approach, and find that existing categorically-labeled NLI data can be used in pre-training. Our best models correlate well with humans, demonstrating models are capable of more subtle inferences than the categorical bin assignment employed in current NLI tasks.",Uncertain Inference|Natural Inference|NLI|UNLI,Semantics: Textual Inference and Other Areas of Semantics,Short,https://www.aclweb.org/anthology/2020.acl-main.774.pdf -main.760,Clinical Concept Linking with Contextualized Neural Representations,Elliot Schumacher|Andriy Mulyar|Mark Dredze,"In traditional approaches to entity linking, linking decisions are based on three sources of information -- the similarity of the mention string to an entity's name, the similarity of the context of the document to the entity, and broader information about the knowledge base (KB). In some domains, there is little contextual information present in the KB and thus we rely more heavily on mention string similarity. We consider one example of this, concept linking, which seeks to link mentions of medical concepts to a medical concept ontology. We propose an approach to concept linking that leverages recent work in contextualized neural models, such as ELMo (Peters et al. 2018), which create a token representation that integrates the surrounding context of the mention and concept name. We find a neural ranking approach paired with contextualized embeddings provides gains over a competitive baseline (Leaman et al. 2013). 
Additionally, we find that a pre-training step using synonyms from the ontology offers a useful initialization for the ranker.",Clinical Linking|entity linking|linking decisions|concept linking,NLP Applications,Short,https://www.aclweb.org/anthology/2020.acl-main.760.pdf -main.748,A Generate-and-Rank Framework with Semantic Type Regularization for Biomedical Concept Normalization,Dongfang Xu|Zeyu Zhang|Steven Bethard,"Concept normalization, the task of linking textual mentions of concepts to concepts in an ontology, is challenging because ontologies are large. In most cases, annotated datasets cover only a small sample of the concepts, yet concept normalizers are expected to predict all concepts in the ontology. In this paper, we propose an architecture consisting of a candidate generator and a list-wise ranker based on BERT. The ranker considers pairings of concept mentions and candidate concepts, allowing it to make predictions for any concept, not just those seen during training. We further enhance this list-wise approach with a semantic type regularizer that allows the model to incorporate semantic type information from the ontology during training. Our proposed concept normalization framework achieves state-of-the-art performance on multiple datasets.",Biomedical Normalization|Concept normalization|Generate-and-Rank Framework|Semantic Regularization,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.748.pdf -main.199,Taxonomy Construction of Unseen Domains via Graph-based Cross-Domain Knowledge Transfer,Chao Shang|Sarthak Dash|Md. Faisal Mahbub Chowdhury|Nandana Mihindukulasooriya|Alfio Gliozzo,"Extracting lexico-semantic relations as graph-structured taxonomies, also known as taxonomy construction, has been beneficial in a variety of NLP applications. Recently Graph Neural Network (GNN) has shown to be powerful in successfully tackling many tasks. However, there has been no attempt to exploit GNN to create taxonomies. In this paper, we propose Graph2Taxo, a GNN-based cross-domain transfer framework for the taxonomy construction task. Our main contribution is to learn the latent features of taxonomy construction from existing domains to guide the structure learning of an unseen domain. We also propose a novel method of directed acyclic graph (DAG) generation for taxonomy construction. Specifically, our proposed Graph2Taxo uses a noisy graph constructed from automatically extracted noisy hyponym hypernym candidate pairs, and a set of taxonomies for some known domains for training. The learned model is then used to generate taxonomy for a new unknown domain given a set of terms for that domain. Experiments on benchmark datasets from science and environment domains show that our approach attains significant improvements correspondingly over the state of the art.",Taxonomy Domains|Graph-based Transfer|Extracting relations|taxonomy construction,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.199.pdf -main.600,Harvesting and Refining Question-Answer Pairs for Unsupervised QA,Zhongli Li|Wenhui Wang|Li Dong|Furu Wei|Ke Xu,"Question Answering (QA) has shown great success thanks to the availability of large-scale datasets and the effectiveness of neural models. Recent research works have attempted to extend these successes to the settings with few or no labeled data available. In this work, we introduce two approaches to improve unsupervised QA. 
First, we harvest lexically and syntactically divergent questions from Wikipedia to automatically construct a corpus of question-answer pairs (named as RefQA). Second, we take advantage of the QA model to extract more appropriate answers, which iteratively refines data over RefQA. We conduct experiments on SQuAD 1.1, and NewsQA by fine-tuning BERT without access to manually annotated data. Our approach outperforms previous unsupervised approaches by a large margin, and is competitive with early supervised models. We also show the effectiveness of our approach in the few-shot learning setting.",Unsupervised QA|Question Answering|Question QA|QA,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.600.pdf -main.166,Conversational Graph Grounded Policy Learning for Open-Domain Conversation Generation,Jun Xu|Haifeng Wang|Zheng-Yu Niu|Hua Wu|Wanxiang Che|Ting Liu,"To address the challenge of policy learning in open-domain multi-turn conversation, we propose to represent prior information about dialog transitions as a graph and learn a graph grounded dialog policy, aimed at fostering a more coherent and controllable dialog. To this end, we first construct a conversational graph (CG) from dialog corpora, in which there are vertices to represent ``what to say'' and ``how to say'', and edges to represent natural transition between a message (the last utterance in a dialog context) and its response. We then present a novel CG grounded policy learning framework that conducts dialog flow planning by graph traversal, which learns to identify a what-vertex and a how-vertex from the CG at each turn to guide response generation. In this way, we effectively leverage the CG to facilitate policy learning as follows: (1) it enables more effective long-term reward design, (2) it provides high-quality candidate actions, and (3) it gives us more control over the policy. Results on two benchmark corpora demonstrate the effectiveness of this framework.",Conversational Learning|Open-Domain Generation|policy learning|dialog planning,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.166.pdf -main.172,Attend to Medical Ontologies: Content Selection for Clinical Abstractive Summarization,Sajad Sotudeh Gharebagh|Nazli Goharian|Ross Filice,"Sequence-to-sequence (seq2seq) network is a well-established model for text summarization task. It can learn to produce readable content; however, it falls short in effectively identifying key regions of the source. In this paper, we approach the content selection problem for clinical abstractive summarization by augmenting salient ontological terms into the summarizer. Our experiments on two publicly available clinical data sets (107,372 reports of MIMIC-CXR, and 3,366 reports of OpenI) show that our model statistically significantly boosts state-of-the-art results in terms of ROUGE metrics (with improvements: 2.9% RG-1, 2.5% RG-2, 1.9% RG-L), in the healthcare domain where any range of improvement impacts patients’ welfare.",Content Selection|Clinical Summarization|text task|content problem,Summarization,Short,https://www.aclweb.org/anthology/2020.acl-main.172.pdf -main.614,Learning Robust Models for e-Commerce Product Search,Thanh Nguyen|Nikhil Rao|Karthik Subbian,"Showing items that do not match search query intent degrades customer experience in e-commerce. These mismatches result from counterfactual biases of the ranking algorithms toward noisy behavioral signals such as clicks and purchases in the search logs. 
Mitigating the problem requires a large labeled dataset, which is expensive and time-consuming to obtain. In this paper, we develop a deep, end-to-end model that learns to effectively classify mismatches and to generate hard mismatched examples to improve the classifier. We train the model end-to-end by introducing a latent variable into the cross-entropy loss that alternates between using the real and generated samples. This not only makes the classifier more robust but also boosts the overall ranking performance. Our model achieves a relative gain compared to baselines by over 26% in F-score, and over 17% in Area Under PR curve. On live search traffic, our model gains significant improvement in multiple countries.",e-Commerce Search|Mitigating problem|ranking algorithms|deep model,Information Retrieval and Text Mining,Short,https://www.aclweb.org/anthology/2020.acl-main.614.pdf -main.628,Sentence Meta-Embeddings for Unsupervised Semantic Textual Similarity,Nina Poerner|Ulli Waltinger|Hinrich Schütze,"We address the task of unsupervised Semantic Textual Similarity (STS) by ensembling diverse pre-trained sentence encoders into sentence meta-embeddings. We apply, extend and evaluate different meta-embedding methods from the word embedding literature at the sentence level, including dimensionality reduction (Yin and Schütze, 2016), generalized Canonical Correlation Analysis (Rastogi et al., 2015) and cross-view auto-encoders (Bollegala and Bao, 2018). Our sentence meta-embeddings set a new unsupervised State of The Art (SoTA) on the STS Benchmark and on the STS12-STS16 datasets, with gains of between 3.7% and 6.4% Pearson’s r over single-source systems.",Unsupervised Similarity|unsupervised STS|dimensionality reduction|pre-trained encoders,Semantics: Sentence Level,Short,https://www.aclweb.org/anthology/2020.acl-main.628.pdf -main.84,A Methodology for Creating Question Answering Corpora Using Inverse Data Annotation,Jan Deriu|Katsiaryna Mlynchyk|Philippe Schläpfer|Alvaro Rodrigo|Dirk von Grünigen|Nicolas Kaiser|Kurt Stockinger|Eneko Agirre|Mark Cieliebak,"In this paper, we introduce a novel methodology to efficiently construct a corpus for question answering over structured data. For this, we introduce an intermediate representation that is based on the logical query plan in a database, called Operation Trees (OT). This representation allows us to invert the annotation process without losing flexibility in the types of queries that we generate. Furthermore, it allows for fine-grained alignment of the tokens to the operations. Thus, we randomly generate OTs from a context free grammar and annotators just have to write the appropriate question and assign the tokens. We compare our corpus OTTA (Operation Trees and Token Assignment), a large semantic parsing corpus for evaluating natural language interfaces to databases, to Spider and LC-QuaD 2.0 and show that our methodology more than triples the annotation speed while maintaining the complexity of the queries. 
Finally, we train a state-of-the-art semantic parsing model on our data and show that our dataset is a challenging dataset and that the token alignment can be leveraged to significantly increase the performance.",question answering|annotation|Inverse Annotation|intermediate representation,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.84.pdf -main.90,Learning to Identify Follow-Up Questions in Conversational Question Answering,Souvik Kundu|Qian Lin|Hwee Tou Ng,"Despite recent progress in conversational question answering, most prior work does not focus on follow-up questions. Practical conversational question answering systems often receive follow-up questions in an ongoing conversation, and it is crucial for a system to be able to determine whether a question is a follow-up question of the current conversation, for more effective answer finding subsequently. In this paper, we introduce a new follow-up question identification task. We propose a three-way attentive pooling network that determines the suitability of a follow-up question by capturing pair-wise interactions between the associated passage, the conversation history, and a candidate follow-up question. It enables the model to capture topic continuity and topic shift while scoring a particular candidate follow-up question. Experiments show that our proposed three-way attentive pooling network outperforms all baseline systems by significant margins.",Conversational Answering|answer finding|follow-up task|conversational systems,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.90.pdf -main.364,Adaptive Compression of Word Embeddings,Yeachan Kim|Kang-Min Kim|SangKeun Lee,"Distributed representations of words have been an indispensable component for natural language processing (NLP) tasks. However, the large memory footprint of word embeddings makes it challenging to deploy NLP models to memory-constrained devices (e.g., self-driving cars, mobile devices). In this paper, we propose a novel method to adaptively compress word embeddings. We fundamentally follow a code-book approach that represents words as discrete codes such as (8, 5, 2, 4). However, unlike prior works that assign the same length of codes to all words, we adaptively assign different lengths of codes to each word by learning downstream tasks. The proposed method works in two steps. First, each word directly learns to select its code length in an end-to-end manner by applying the Gumbel-softmax tricks. After selecting the code length, each word learns discrete codes through a neural network with a binary constraint. To showcase the general applicability of the proposed method, we evaluate the performance on four different downstream tasks. Comprehensive evaluation results clearly show that our method is effective and makes the highly compressed word embeddings without hurting the task accuracy. Moreover, we show that our model assigns word to each code-book by considering the significance of tasks.",Adaptive Embeddings|Distributed words|natural tasks|downstream tasks,Semantics: Lexical,Long,https://www.aclweb.org/anthology/2020.acl-main.364.pdf -main.402,Towards Emotion-aided Multi-modal Dialogue Act Classification,Tulika Saha|Aditya Patra|Sriparna Saha|Pushpak Bhattacharyya,"The task of Dialogue Act Classification (DAC) that purports to capture communicative intent has been studied extensively. But these studies limit themselves to text. Non-verbal features (change of tone, facial expressions etc.) 
can provide cues to identify DAs, thus stressing the benefit of incorporating multi-modal inputs in the task. Also, the emotional state of the speaker has a substantial effect on the choice of the dialogue act, since conversations are often influenced by emotions. Hence, the effect of emotion too on automatic identification of DAs needs to be studied. In this work, we address the role of both multi-modality and emotion recognition (ER) in DAC. DAC and ER help each other by way of multi-task learning. One of the major contributions of this work is a new dataset- multimodal Emotion aware Dialogue Act dataset called EMOTyDA, collected from open-sourced dialogue datasets. To demonstrate the utility of EMOTyDA, we build an attention based (self, inter-modal, inter-task) multi-modal, multi-task Deep Neural Network (DNN) for joint learning of DAs and emotions. We show empirically that multi-modality and multi-tasking achieve better performance of DAC compared to uni-modal and single task DAC variants.",Emotion-aided Classification|Dialogue Classification|automatic DAs|ER,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.acl-main.402.pdf -main.416,Dscorer: A Fast Evaluation Metric for Discourse Representation Structure Parsing,Jiangming Liu|Shay B. Cohen|Mirella Lapata,"Discourse representation structures (DRSs) are scoped semantic representations for texts of arbitrary length. Evaluating the accuracy of predicted DRSs plays a key role in developing semantic parsers and improving their performance. DRSs are typically visualized as boxes which are not straightforward to process automatically. Counter transforms DRSs to clauses and measures clause overlap by searching for variable mappings between two DRSs. However, this metric is computationally costly (with respect to memory and CPU time) and does not scale with longer texts. We introduce Dscorer, an efficient new metric which converts box-style DRSs to graphs and then measures the overlap of n-grams. Experiments show that Dscorer computes accuracy scores that are correlated with Counter at a fraction of the time.",Discourse Parsing|Dscorer|Fast Metric|,Resources and Evaluation,Short,https://www.aclweb.org/anthology/2020.acl-main.416.pdf -main.370,Adversarial and Domain-Aware BERT for Cross-Domain Sentiment Analysis,Chunning Du|Haifeng Sun|Jingyu Wang|Qi Qi|Jianxin Liao,"Cross-domain sentiment classification aims to address the lack of massive amounts of labeled data. It demands to predict sentiment polarity on a target domain utilizing a classifier learned from a source domain. In this paper, we investigate how to efficiently apply the pre-training language model BERT on the unsupervised domain adaptation. Due to the pre-training task and corpus, BERT is task-agnostic, which lacks domain awareness and can not distinguish the characteristic of source and target domain when transferring knowledge. To tackle these problems, we design a post-training procedure, which contains the target domain masked language model task and a novel domain-distinguish pre-training task. The post-training procedure will encourage BERT to be domain-aware and distill the domain-specific features in a self-supervised way. Based on this, we could then conduct the adversarial training to derive the enhanced domain-invariant features. Extensive experiments on Amazon dataset show that our model outperforms state-of-the-art methods by a large margin. 
The ablation study demonstrates that the remarkable improvement is not only from BERT but also from our method.",Cross-Domain Analysis|Cross-domain classification|unsupervised adaptation|transferring knowledge,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.370.pdf -main.358,SEEK: Segmented Embedding of Knowledge Graphs,Wentao Xu|Shun Zheng|Liang He|Bin Shao|Jian Yin|Tie-Yan Liu,"In recent years, knowledge graph embedding becomes a pretty hot research topic of artificial intelligence and plays increasingly vital roles in various downstream applications, such as recommendation and question answering. However, existing methods for knowledge graph embedding can not make a proper trade-off between the model complexity and the model expressiveness, which makes them still far from satisfactory. To mitigate this problem, we propose a lightweight modeling framework that can achieve highly competitive relational expressiveness without increasing the model complexity. Our framework focuses on the design of scoring functions and highlights two critical characteristics: 1) facilitating sufficient feature interactions; 2) preserving both symmetry and antisymmetry properties of relations. It is noteworthy that owing to the general and elegant design of scoring functions, our framework can incorporate many famous existing methods as special cases. Moreover, extensive experiments on public benchmarks demonstrate the efficiency and effectiveness of our framework. Source codes and data can be found at https://github.com/Wentao-Xu/SEEK.",Segmented Graphs|knowledge embedding|artificial intelligence|recommendation,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.358.pdf -main.47,Language Models as an Alternative Evaluator of Word Order Hypotheses: A Case Study in Japanese,Tatsuki Kuribayashi|Takumi Ito|Jun Suzuki|Kentaro Inui,"We examine a methodology using neural language models (LMs) for analyzing the word order of language. This LM-based method has the potential to overcome the difficulties existing methods face, such as the propagation of preprocessor errors in count-based methods. In this study, we explore whether the LM-based method is valid for analyzing the word order. As a case study, this study focuses on Japanese due to its complex and flexible word order. To validate the LM-based method, we test (i) parallels between LMs and human word order preference, and (ii) consistency of the results obtained using the LM-based method with previous linguistic studies. Through our experiments, we tentatively conclude that LMs display sufficient word order knowledge for usage as an analysis tool. Finally, using the LM-based method, we demonstrate the relationship between the canonical word order and topicalization, which had yet to be analyzed by large-scale experiments.",Evaluator Hypotheses|analyzing order|Language Models|neural models,Theory and Formalism in NLP (Linguistic and Mathematical),Long,https://www.aclweb.org/anthology/2020.acl-main.47.pdf -main.9,PLATO: Pre-trained Dialogue Generation Model with Discrete Latent Variable,Siqi Bao|Huang He|Fan Wang|Hua Wu|Haifeng Wang,"Pre-training models have been proved effective for a wide range of natural language processing tasks. Inspired by this, we propose a novel dialogue generation pre-training framework to support various kinds of conversations, including chit-chat, knowledge grounded dialogues, and conversational question answering. 
In this framework, we adopt flexible attention mechanisms to fully leverage the bi-directional context and the uni-directional characteristic of language generation. We also introduce discrete latent variables to tackle the inherent one-to-many mapping problem in response generation. Two reciprocal tasks of response generation and latent act recognition are designed and carried out simultaneously within a shared network. Comprehensive experiments on three publicly available datasets verify the effectiveness and superiority of the proposed framework.",natural tasks|conversational answering|language generation|one-to-many problem,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.9.pdf -main.53,Efficient Dialogue State Tracking by Selectively Overwriting Memory,Sungdong Kim|Sohee Yang|Gyuwan Kim|Sang-Woo Lee,"Recent works in dialogue state tracking (DST) focus on an open vocabulary-based setting to resolve scalability and generalization issues of the predefined ontology-based approaches. However, they are inefficient in that they predict the dialogue state at every turn from scratch. Here, we consider dialogue state as an explicit fixed-sized memory and propose a selectively overwriting mechanism for more efficient DST. This mechanism consists of two steps: (1) predicting state operation on each of the memory slots, and (2) overwriting the memory with new values, of which only a few are generated according to the predicted state operations. Our method decomposes DST into two sub-tasks and guides the decoder to focus only on one of the tasks, thus reducing the burden of the decoder. This enhances the effectiveness of training and DST performance. Our SOM-DST (Selectively Overwriting Memory for Dialogue State Tracking) model achieves state-of-the-art joint goal accuracy with 51.72% in MultiWOZ 2.0 and 53.01% in MultiWOZ 2.1 in an open vocabulary-based DST setting. In addition, we analyze the accuracy gaps between the current and the ground truth-given situations and suggest that it is a promising direction to improve state operation prediction to boost the DST performance.",Dialogue Tracking|predicting operation|training|open setting,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.53.pdf -main.45,Dice Loss for Data-imbalanced NLP Tasks,Xiaoya Li|Xiaofei Sun|Yuxian Meng|Junjun Liang|Fei Wu|Jiwei Li,"Many NLP tasks such as tagging and machine reading comprehension are faced with the severe data imbalance issue: negative examples significantly outnumber positive examples, and the huge number of easy-negative examples overwhelms the training. The most commonly used cross entropy (CE) criteria is actually an accuracy-oriented objective, and thus creates a discrepancy between training and test: at training time, each training instance contributes equally to the objective function, while at test time F1 score concerns more about positive examples. In this paper, we propose to use dice loss in replacement of the standard cross-entropy objective for data-imbalanced NLP tasks. Dice loss is based on the Sørensen–Dice coefficient or Tversky index , which attaches similar importance to false positives and false negatives, and is more immune to the data-imbalance issue. To further alleviate the dominating influence from easy-negative examples in training, we propose to associate training examples with dynamically adjusted weights to deemphasize easy-negative examples. 
Theoretical analysis shows that this strategy narrows down the gap between the F1 score in evaluation and the dice loss in training. With the proposed training objective, we observe significant performance boost on a wide range of data imbalanced NLP tasks. Notably, we are able to achieve SOTA results on CTB5, CTB6 and UD1.4 for the part of speech tagging task; SOTA results on CoNLL03, OntoNotes5.0, MSRA and OntoNotes4.0 for the named entity recognition task; along with competitive results on the tasks of machine reading comprehension and paraphrase identification.",Data-imbalanced Tasks|NLP tasks|tagging|machine comprehension,Theory and Formalism in NLP (Linguistic and Mathematical),Long,https://www.aclweb.org/anthology/2020.acl-main.45.pdf -main.51,"Simple, Interpretable and Stable Method for Detecting Words with Usage Change across Corpora",Hila Gonen|Ganesh Jawahar|Djamé Seddah|Yoav Goldberg,"The problem of comparing two bodies of text and searching for words that differ in their usage between them arises often in digital humanities and computational social science. This is commonly approached by training word embeddings on each corpus, aligning the vector spaces, and looking for words whose cosine distance in the aligned space is large. However, these methods often require extensive filtering of the vocabulary to perform well, and - as we show in this work - result in unstable, and hence less reliable, results. We propose an alternative approach that does not use vector space alignment, and instead considers the neighbors of each word. The method is simple, interpretable and stable. We demonstrate its effectiveness in 9 different setups, considering different corpus splitting criteria (age, gender and profession of tweet authors, time of tweet) and different languages (English, French and Hebrew).",computational science|word embeddings|vector alignment|vector spaces,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.51.pdf -main.428,Don’t Say That! Making Inconsistent Dialogue Unlikely with Unlikelihood Training,Margaret Li|Stephen Roller|Ilia Kulikov|Sean Welleck|Y-Lan Boureau|Kyunghyun Cho|Jason Weston,"Generative dialogue models currently suffer from a number of problems which standard maximum likelihood training does not address. They tend to produce generations that (i) rely too much on copying from the context, (ii) contain repetitions within utterances, (iii) overuse frequent words, and (iv) at a deeper level, contain logical flaws. In this work we show how all of these problems can be addressed by extending the recently introduced unlikelihood loss (Welleck et al., 2019) to these cases. We show that appropriate loss functions which regularize generated outputs to match human distributions are effective for the first three issues. For the last important general issue, we show applying unlikelihood to collected data of what a model should not do is effective for improving logical consistency, potentially paving the way to generative models with greater reasoning ability. 
We demonstrate the efficacy of our approach across several dialogue tasks.",dialogue tasks|Unlikelihood Training|Generative models|maximum training,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.428.pdf -main.400,Multimodal Transformer for Multimodal Machine Translation,Shaowei Yao|Xiaojun Wan,"Multimodal Machine Translation (MMT) aims to introduce information from other modality, generally static images, to improve the translation quality. Previous works propose various incorporation methods, but most of them do not consider the relative importance of multiple modalities. Equally treating all modalities may encode too much useless information from less important modalities. In this paper, we introduce the multimodal self-attention in Transformer to solve the issues above in MMT. The proposed method learns the representation of images based on the text, which avoids encoding irrelevant information in images. Experiments and visualization analysis demonstrate that our model benefits from visual information and substantially outperforms previous works and competitive baselines in terms of various metrics.",Multimodal MMT|Multimodal|MMT|representation images,Speech and Multimodality,Short,https://www.aclweb.org/anthology/2020.acl-main.400.pdf -main.79,Interpreting Twitter User Geolocation,Ting Zhong|Tianliang Wang|Fan Zhou|Goce Trajcevski|Kunpeng Zhang|Yi Yang,"Identifying user geolocation in online social networks is an essential task in many location-based applications. Existing methods rely on the similarity of text and network structure, however, they suffer from a lack of interpretability on the corresponding results, which is crucial for understanding model behavior. In this work, we adopt influence functions to interpret the behavior of GNN-based models by identifying the importance of training users when predicting the locations of the testing users. This methodology helps with providing meaningful explanations on prediction results. Furthermore, it also initiates an attempt to uncover the so-called ""black-box"" GNN-based models by investigating the effect of individual nodes.",Interpreting Geolocation|Identifying geolocation|Identifying networks|user geolocation,NLP Applications,Short,https://www.aclweb.org/anthology/2020.acl-main.79.pdf -main.366,Autoencoding Keyword Correlation Graph for Document Clustering,Billy Chiu|Sunil Kumar Sahu|Derek Thomas|Neha Sengupta|Mohammady Mahdy,"Document clustering requires a deep understanding of the complex structure of long-text; in particular, the intra-sentential (local) and inter-sentential features (global). Existing representation learning models do not fully capture these features. To address this, we present a novel graph-based representation for document clustering that builds a graph autoencoder (GAE) on a Keyword Correlation Graph. The graph is constructed with topical keywords as nodes and multiple local and global features as edges. A GAE is employed to aggregate the two sets of features by learning a latent representation which can jointly reconstruct them. Clustering is then performed on the learned representations, using vector dimensions as features for inducing document classes. 
Extensive experiments on two datasets show that the features learned by our approach can achieve better clustering performance than other existing features, including term frequency-inverse document frequency and average embedding.",Document Clustering|inducing classes|clustering|Autoencoding Graph,Semantics: Lexical,Short,https://www.aclweb.org/anthology/2020.acl-main.366.pdf -main.372,GoEmotions: A Dataset of Fine-Grained Emotions,Dorottya Demszky|Dana Movshovitz-Attias|Jeongwoo Ko|Alan Cowen|Gaurav Nemade|Sujith Ravi,"Understanding emotion expressed in language has a wide range of applications, from building empathetic chatbots to detecting harmful online behavior. Advancement in this area can be improved using large-scale datasets with a fine-grained typology, adaptable to multiple downstream tasks. We introduce GoEmotions, the largest manually annotated dataset of 58k English Reddit comments, labeled for 27 emotion categories or Neutral. We demonstrate the high quality of the annotations via Principal Preserved Component Analysis. We conduct transfer learning experiments with existing emotion benchmarks to show that our dataset generalizes well to other domains and different emotion taxonomies. Our BERT-based model achieves an average F1-score of .46 across our proposed taxonomy, leaving much room for improvement.",transfer learning|GoEmotions|Principal Analysis|BERT-based model,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.372.pdf -main.414,Unsupervised Alignment-based Iterative Evidence Retrieval for Multi-hop Question Answering,Vikas Yadav|Steven Bethard|Mihai Surdeanu,"Evidence retrieval is a critical stage of question answering (QA), necessary not only to improve performance, but also to explain the decisions of the QA method. We introduce a simple, fast, and unsupervised iterative evidence retrieval method, which relies on three ideas: (a) an unsupervised alignment approach to soft-align questions and answers with justification sentences using only GloVe embeddings, (b) an iterative process that reformulates queries focusing on terms that are not covered by existing justifications, which (c) stops when the terms in the given question and candidate answers are covered by the retrieved justifications. Despite its simplicity, our approach outperforms all the previous methods (including supervised methods) on the evidence selection task on two datasets: MultiRC and QASC. When these evidence sentences are fed into a RoBERTa answer classification component, we achieve state-of-the-art QA performance on these two datasets.",Unsupervised Retrieval|Multi-hop Answering|Evidence retrieval|question answering,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.414.pdf -main.399,Target Inference in Argument Conclusion Generation,Milad Alshomary|Shahbaz Syed|Martin Potthast|Henning Wachsmuth,"In argumentation, people state premises to reason towards a conclusion. The conclusion conveys a stance towards some target, such as a concept or statement. Often, the conclusion remains implicit, though, since it is self-evident in a discussion or left out for rhetorical reasons. However, the conclusion is key to understanding an argument and, hence, to any application that processes argumentation. We thus study the question to what extent an argument's conclusion can be reconstructed from its premises. 
In particular, we argue here that a decisive step is to infer a conclusion's target, and we hypothesize that this target is related to the premises' targets. We develop two complementary target inference approaches: one ranks premise targets and selects the top-ranked target as the conclusion target, the other finds a new conclusion target in a learned embedding space using a triplet neural network. Our evaluation on corpora from two domains indicates that a hybrid of both approaches is best, outperforming several strong baselines. According to human annotators, we infer a reasonably adequate conclusion target in 89% of the cases.",Target Inference|Argument Generation|argumentation|triplet network,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.399.pdf -main.86,Dynamic Sampling Strategies for Multi-Task Reading Comprehension,Ananth Gottumukkala|Dheeru Dua|Sameer Singh|Matt Gardner,"Building general reading comprehension systems, capable of solving multiple datasets at the same time, is a recent aspirational goal in the research community. Prior work has focused on model architecture or generalization to held out datasets, and largely passed over the particulars of the multi-task learning set up. We show that a simple dynamic sampling strategy, selecting instances for training proportional to the multi-task model's current performance on a dataset relative to its single task performance, gives substantive gains over prior multi-task sampling strategies, mitigating the catastrophic forgetting that is common in multi-task learning. We also demonstrate that allowing instances of different tasks to be interleaved as much as possible between each epoch and batch has a clear benefit in multitask performance over forcing task homogeneity at the epoch or batch level. Our final model shows greatly increased performance over the best model on ORB, a recently-released multitask reading comprehension benchmark.",Multi-Task Comprehension|generalization|Dynamic Strategies|general systems,Question Answering,Short,https://www.aclweb.org/anthology/2020.acl-main.86.pdf -main.92,A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers,Shen-yun Miao|Chao-Chun Liang|Keh-Yih Su,"We present ASDiv (Academia Sinica Diverse MWP Dataset), a diverse (in terms of both language patterns and problem types) English math word problem (MWP) corpus for evaluating the capability of various MWP solvers. Existing MWP corpora for studying AI progress remain limited either in language usage patterns or in problem types. We thus present a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem types taught in elementary school. Each MWP is annotated with its problem type and grade level (for indicating the level of difficulty). Furthermore, we propose a metric to measure the lexicon usage diversity of a given MWP corpus, and demonstrate that ASDiv is more diverse than existing corpora. 
Experiments show that our proposed corpus reflects the true capability of MWP solvers more faithfully.",AI progress|English Solvers|MWP solvers|ASDiv,Resources and Evaluation,Short,https://www.aclweb.org/anthology/2020.acl-main.92.pdf -main.158,A Systematic Assessment of Syntactic Generalization in Neural Language Models,Jennifer Hu|Jon Gauthier|Peng Qian|Ethan Wilcox|Roger Levy,"While state-of-the-art neural network models continue to achieve lower perplexity scores on language modeling benchmarks, it remains unknown whether optimizing for broad-coverage predictive performance leads to human-like syntactic knowledge. Furthermore, existing work has not provided a clear picture about the model properties required to produce proper syntactic generalizations. We present a systematic evaluation of the syntactic knowledge of neural language models, testing 20 combinations of model types and data sizes on a set of 34 English-language syntactic test suites. We find substantial differences in syntactic generalization performance by model architecture, with sequential models underperforming other architectures. Factorially manipulating model architecture and training dataset size (1M-40M words), we find that variability in syntactic generalization performance is substantially greater by architecture than by dataset size for the corpora tested in our experiments. Our results also reveal a dissociation between perplexity and syntactic generalization performance.",Systematic Generalization|Syntactic Generalization|syntactic generalizations|Neural Models,Cognitive Modeling and Psycholinguistics,Long,https://www.aclweb.org/anthology/2020.acl-main.158.pdf -main.164,Automatic Detection of Generated Text is Easiest when Humans are Fooled,Daphne Ippolito|Daniel Duckworth|Chris Callison-Burch|Douglas Eck,"Recent advancements in neural language modelling make it possible to rapidly generate vast amounts of human-sounding text. The capabilities of humans and automatic discriminators to detect machine-generated text have been a large source of research interest, but humans and machines rely on different cues to make their decisions. Here, we perform careful benchmarking and analysis of three popular sampling-based decoding strategies---top-_k_, nucleus sampling, and untruncated random sampling---and show that improvements in decoding methods have primarily optimized for fooling humans. This comes at the expense of introducing statistical abnormalities that make detection easy for automatic systems. We also show that though both human and automatic detector performance improve with longer excerpt length, even multi-sentence excerpts can fool expert human raters over 30% of the time. Our findings reveal the importance of using both human and automatic detectors to assess the humanness of text generation systems.",Automatic Text|detection|humanness systems|neural modelling,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.164.pdf -main.602,R4C: A Benchmark for Evaluating RC Systems to Get the Right Answer for the Right Reason,Naoya Inoue|Pontus Stenetorp|Kentaro Inui,"Recent studies have revealed that reading comprehension (RC) systems learn to exploit annotation artifacts and other biases in current datasets. This prevents the community from reliably measuring the progress of RC systems. To address this issue, we introduce R4C, a new task for evaluating RC systems' internal reasoning. R4C requires giving not only answers but also derivations: explanations that justify predicted answers. 
We present a reliable, crowdsourced framework for scalably annotating RC datasets with derivations. We create and publicly release the R4C dataset, the first, quality-assured dataset consisting of 4.6k questions, each of which is annotated with 3 reference derivations (i.e. 13.8k derivations). Experiments show that our automatic evaluation metrics using multiple reference derivations are reliable, and that R4C assesses different skills from an existing benchmark.",R4C|RC Systems|reading systems|RC reasoning,Question Answering,Short,https://www.aclweb.org/anthology/2020.acl-main.602.pdf -main.616,Highway Transformer: Self-Gating Enhanced Self-Attentive Networks,Yekun Chai|Shuo Jin|Xinwen Hou,"Self-attention mechanisms have made striking state-of-the-art (SOTA) progress in various sequence learning tasks, standing on the multi-headed dot product attention by attending to all the global contexts at different locations. Through a pseudo information highway, we introduce a gated component self-dependency units (SDU) that incorporates LSTM-styled gating units to replenish internal semantic importance within the multi-dimensional latent space of individual representations. The subsidiary content-based SDU gates allow for the information flow of modulated latent embeddings through skipped connections, leading to a clear margin of convergence speed with gradient descent algorithms. We may unveil the role of gating mechanism to aid in the context-based Transformer modules, with hypothesizing that SDU gates, especially on shallow layers, could push it faster to step towards suboptimal points during the optimization process.",sequence tasks|optimization process|Highway Transformer|Self-Gating Networks,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.616.pdf -main.170,BPE-Dropout: Simple and Effective Subword Regularization,Ivan Provilkov|Dmitrii Emelianenko|Elena Voita,"Subword segmentation is widely used to address the open vocabulary problem in machine translation. The dominant approach to subword segmentation is Byte Pair Encoding (BPE), which keeps the most frequent words intact while splitting the rare ones into multiple tokens. While multiple segmentations are possible even with the same vocabulary, BPE splits words into unique sequences; this may prevent a model from better learning the compositionality of words and being robust to segmentation errors. So far, the only way to overcome this BPE imperfection, its deterministic nature, was to create another subword segmentation algorithm (Kudo, 2018). In contrast, we show that BPE itself incorporates the ability to produce multiple segmentations of the same word. We introduce BPE-dropout - simple and effective subword regularization method based on and compatible with conventional BPE. It stochastically corrupts the segmentation procedure of BPE, which leads to producing multiple segmentations within the same fixed BPE framework. Using BPE-dropout during training and the standard BPE during inference improves translation quality up to 2.3 BLEU compared to BPE and up to 0.9 BLEU compared to the previous subword regularization.",open problem|machine translation|subword segmentation|training,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.170.pdf -main.776,Revisiting Higher-Order Dependency Parsers,Erick Fonseca|André F. T. Martins,"Neural encoders have allowed dependency parsers to shift from higher-order structured models to simpler first-order ones, making decoding faster and still achieving better accuracy than non-neural parsers. 
This has led to a belief that neural encoders can implicitly encode structural constraints, such as siblings and grandparents in a tree. We tested this hypothesis and found that neural parsers may benefit from higher-order features, even when employing a powerful pre-trained encoder, such as BERT. While the gains of higher-order features are small in the presence of a powerful encoder, they are consistent for long-range dependencies and long sentences. In particular, higher-order models are more accurate on full sentence parses and on the exact match of modifier lists, indicating that they deal better with larger, more complex structures.",Higher-Order Parsers|Neural encoders|dependency parsers|higher-order models,"Syntax: Tagging, Chunking and Parsing",Short,https://www.aclweb.org/anthology/2020.acl-main.776.pdf -main.762,Let Me Choose: From Verbal Context to Font Selection,Amirreza Shirani|Franck Dernoncourt|Jose Echevarria|Paul Asente|Nedim Lipka|Thamar Solorio,"In this paper, we aim to learn associations between visual attributes of fonts and the verbal context of the texts they are typically applied to. Compared to related work leveraging the surrounding visual context, we choose to focus only on the input text, which can enable new applications for which the text is the only visual element in the document. We introduce a new dataset, containing examples of different topics in social media posts and ads, labeled through crowd-sourcing. Due to the subjective nature of the task, multiple fonts might be perceived as acceptable for an input text, which makes this problem challenging. To this end, we investigate different end-to-end models to learn label distributions on crowd-sourced data, to capture inter-subjectivity across all annotations.",Font Selection|crowd-sourcing|end-to-end models|Verbal Context,NLP Applications,Short,https://www.aclweb.org/anthology/2020.acl-main.762.pdf -main.548,Neural Mixed Counting Models for Dispersed Topic Discovery,Jiemin Wu|Yanghui Rao|Zusheng Zhang|Haoran Xie|Qing Li|Fu Lee Wang|Ziye Chen,"Mixed counting models that use the negative binomial distribution as the prior can well model over-dispersed and hierarchically dependent random variables; thus they have attracted much attention in mining dispersed document topics. However, the existing parameter inference method like Monte Carlo sampling is quite time-consuming. In this paper, we propose two efficient neural mixed counting models, i.e., the Negative Binomial-Neural Topic Model (NB-NTM) and the Gamma Negative Binomial-Neural Topic Model (GNB-NTM) for dispersed topic discovery. Neural variational inference algorithms are developed to infer model parameters by using the reparameterization of Gamma distribution and the Gaussian approximation of Poisson distribution. Experiments on real-world datasets indicate that our models outperform state-of-the-art baseline models in terms of perplexity and topic coherence. 
The results also validate that both NB-NTM and GNB-NTM can produce explainable intermediate variables by generating dispersed proportions of document topics.",Dispersed Discovery|mining topics|Neural Models|Mixed models,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.548.pdf -main.574,Handling Rare Entities for Neural Sequence Labeling,Yangming Li|Han Li|Kaisheng Yao|Xiaolong Li,"One great challenge in neural sequence labeling is the data sparsity problem for rare entity words and phrases. Most of test set entities appear only few times and are even unseen in training corpus, yielding large number of out-of-vocabulary (OOV) and low-frequency (LF) entities during evaluation. In this work, we propose approaches to address this problem. For OOV entities, we introduce local context reconstruction to implicitly incorporate contextual information into their representations. For LF entities, we present delexicalized entity identification to explicitly extract their frequency-agnostic and entity-type-specific representations. Extensive experiments on multiple benchmark datasets show that our model has significantly outperformed all previous methods and achieved new state-of-the-art results. Notably, our methods surpass the model fine-tuned on pre-trained language models without external resource.",Neural Labeling|data problem|delexicalized identification|local reconstruction,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.574.pdf -main.212,Syntactic Data Augmentation Increases Robustness to Inference Heuristics,Junghyun Min|R. Thomas McCoy|Dipanjan Das|Emily Pitler|Tal Linzen,"Pretrained neural models such as BERT, when fine-tuned to perform natural language inference (NLI), often show high accuracy on standard datasets, but display a surprising lack of sensitivity to word order on controlled challenge sets. We hypothesize that this issue is not primarily caused by the pretrained model's limitations, but rather by the paucity of crowdsourced NLI examples that might convey the importance of syntactic structure at the fine-tuning stage. We explore several methods to augment standard training sets with syntactically informative examples, generated by applying syntactic transformations to sentences from the MNLI corpus. The best-performing augmentation method, subject/object inversion, improved BERT's accuracy on controlled examples that diagnose sensitivity to word order from 0.28 to 0.73, without affecting performance on the MNLI test set. This improvement generalized beyond the particular construction used for data augmentation, suggesting that augmentation causes BERT to recruit abstract syntactic representations.",Syntactic Augmentation|natural inference|natural NLI|NLI,Semantics: Textual Inference and Other Areas of Semantics,Short,https://www.aclweb.org/anthology/2020.acl-main.212.pdf -main.206,Investigating the effect of auxiliary objectives for the automated grading of learner English speech transcriptions,Hannah Craighead|Andrew Caines|Paula Buttery|Helen Yannakoudakis,"We address the task of automatically grading the language proficiency of spontaneous speech based on textual features from automatic speech recognition transcripts. 
Motivated by recent advances in multi-task learning, we develop neural networks trained in a multi-task fashion that learn to predict the proficiency level of non-native English speakers by taking advantage of inductive transfer between the main task (grading) and auxiliary prediction tasks: morpho-syntactic labeling, language modeling, and native language identification (L1). We encode the transcriptions with both bi-directional recurrent neural networks and with bi-directional representations from transformers, compare against a feature-rich baseline, and analyse performance at different proficiency levels and with transcriptions of varying error rates. Our best performance comes from a transformer encoder with L1 prediction as an auxiliary task. We discuss areas for improvement and potential applications for text-only speech scoring.",automated transcriptions|automatically speech|multi-task learning|inductive transfer,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.206.pdf -main.560,The State and Fate of Linguistic Diversity and Inclusion in the NLP World,Pratik Joshi|Sebastin Santy|Amar Budhiraja|Kalika Bali|Monojit Choudhury,"Language technologies contribute to promoting multilingualism and linguistic diversity around the world. However, only a very small number of the over 7000 languages of the world are represented in the rapidly evolving language technologies and applications. In this paper we look at the relation between the types of languages, resources, and their representation in NLP conferences to understand the trajectory that different languages have followed over time. Our quantitative investigation underlines the disparity between languages, especially in terms of their resources, and calls into question the ""language agnostic"" status of current models and systems. Through this paper, we attempt to convince the EMNLP community to prioritise the resolution of the predicaments highlighted here, so that no language is left behind.",NLP conferences|Language technologies|Linguistic Diversity|language status,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.560.pdf -main.207,SPECTER: Document-level Representation Learning using Citation-informed Transformers,Arman Cohan|Sergey Feldman|Iz Beltagy|Doug Downey|Daniel Weld,"Representation learning is a critical ingredient for natural language processing systems. Recent Transformer language models like BERT learn powerful textual representations, but these models are targeted towards token- and sentence-level training objectives and do not leverage information on inter-document relatedness, which limits their document-level representation power. For applications on scientific documents, such as classification and recommendation, accurate embeddings of documents are a necessity. We propose SPECTER, a new method to generate document-level embedding of scientific papers based on pretraining a Transformer language model on a powerful signal of document-level relatedness: the citation graph. Unlike existing pretrained language models, Specter can be easily applied to downstream applications without task-specific fine-tuning. Additionally, to encourage further research on document-level models, we introduce SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation. 
We show that Specter outperforms a variety of competitive baselines on the benchmark.",Document-level Learning|Representation learning|natural systems|classification,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.207.pdf -main.561,The Unstoppable Rise of Computational Linguistics in Deep Learning,James Henderson,"In this paper, we trace the history of neural networks applied to natural language understanding tasks, and identify key contributions which the nature of language has made to the development of neural network architectures. We focus on the importance of variable binding and its instantiation in attention-based models, and argue that Transformer is not a sequence model but an induced-structure model. This perspective leads to predictions of the challenges facing research in deep learning architectures for natural language understanding.",Deep Learning|natural tasks|natural understanding|neural networks,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.561.pdf -main.575,Instance-Based Learning of Span Representations: A Case Study through Named Entity Recognition,Hiroki Ouchi|Jun Suzuki|Sosuke Kobayashi|Sho Yokoi|Tatsuki Kuribayashi|Ryuto Konno|Kentaro Inui,"Interpretable rationales for model predictions play a critical role in practical applications. In this study, we develop models possessing interpretable inference process for structured prediction. Specifically, we present a method of instance-based learning that learns similarities between spans. At inference time, each span is assigned a class label based on its similar spans in the training set, where it is easy to understand how much each training instance contributes to the predictions. Through empirical analysis on named entity recognition, we demonstrate that our method enables to build models that have high interpretability without sacrificing performance.",Named Recognition|Interpretable rationales|model predictions|structured prediction,Information Extraction,Short,https://www.aclweb.org/anthology/2020.acl-main.575.pdf -main.213,Improved Speech Representations with Multi-Target Autoregressive Predictive Coding,Yu-An Chung|James Glass,"Training objectives based on predictive coding have recently been shown to be very effective at learning meaningful representations from unlabeled speech. One example is Autoregressive Predictive Coding (Chung et al., 2019), which trains an autoregressive RNN to generate an unseen future frame given a context such as recent past frames. The basic hypothesis of these approaches is that hidden states that can accurately predict future frames are a useful representation for many downstream tasks. In this paper we extend this hypothesis and aim to enrich the information encoded in the hidden states by training the model to make more accurate future predictions. We propose an auxiliary objective that serves as a regularization to improve generalization of the future frame prediction task. 
Experimental results on phonetic classification, speech recognition, and speech translation not only support the hypothesis, but also demonstrate the effectiveness of our approach in learning representations that contain richer phonetic content.",Training objectives|downstream tasks|generalization task|phonetic classification,Speech and Multimodality,Short,https://www.aclweb.org/anthology/2020.acl-main.213.pdf -main.549,Reasoning Over Semantic-Level Graph for Fact Checking,Wanjun Zhong|Jingjing Xu|Duyu Tang|Zenan Xu|Nan Duan|Ming Zhou|Jiahai Wang|Jian Yin,"Fact checking is a challenging task because verifying the truthfulness of a claim requires reasoning about multiple retrievable evidence. In this work, we present a method suitable for reasoning about the semantic-level structure of evidence. Unlike most previous works, which typically represent evidence sentences with either string concatenation or fusing the features of isolated evidence sentences, our approach operates on rich semantic structures of evidence obtained by semantic role labeling. We propose two mechanisms to exploit the structure of evidence while leveraging the advances of pre-trained models like BERT, GPT or XLNet. Specifically, using XLNet as the backbone, we first utilize the graph structure to re-define the relative distances of words, with the intuition that semantically related words should have short distances. Then, we adopt graph convolutional network and graph attention network to propagate and aggregate information from neighboring nodes on the graph. We evaluate our system on FEVER, a benchmark dataset for fact checking, and find that rich structural information is helpful and both our graph-based mechanisms improve the accuracy. Our model is the state-of-the-art system in terms of both official evaluation metrics, namely claim verification accuracy and FEVER score.",Reasoning Graph|Fact Checking|string concatenation|semantic labeling,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.549.pdf -main.763,Multi-Label and Multilingual News Framing Analysis,Afra Feyza Akyürek|Lei Guo|Randa Elanwar|Prakash Ishwar|Margrit Betke|Derry Tanti Wijaya,"News framing refers to the practice in which aspects of specific issues are highlighted in the news to promote a particular interpretation. In NLP, although recent works have studied framing in English news, few have studied how the analysis can be extended to other languages and in a multi-label setting. In this work, we explore multilingual transfer learning to detect multiple frames from just the news headline in a genuinely low-resource context where there are few/no frame annotations in the target language. We propose a novel method that can leverage elementary resources consisting of a dictionary and few annotations to detect frames in the target language. Our method performs comparably or better than translating the entire target language headline to the source language for which we have annotated data. This work opens up an exciting new capability of scaling up frame analysis to many languages, even those without existing translation technologies. Lastly, we apply our method to detect frames on the issue of U.S. gun violence in multiple languages and obtain exciting insights on the relationship between different frames of the same problem across different countries with different languages.",News framing|NLP|multi-label setting|U.S. 
violence,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.763.pdf -main.777,SeqVAT: Virtual Adversarial Training for Semi-Supervised Sequence Labeling,Luoxin Chen|Weitong Ruan|Xinyue Liu|Jianhua Lu,"Virtual adversarial training (VAT) is a powerful technique to improve model robustness in both supervised and semi-supervised settings. It is effective and can be easily adopted on lots of image classification and text classification tasks. However, its benefits to sequence labeling tasks such as named entity recognition (NER) have not been shown as significant, mostly, because the previous approach can not combine VAT with the conditional random field (CRF). CRF can significantly boost accuracy for sequence models by putting constraints on label transitions, which makes it an essential component in most state-of-the-art sequence labeling model architectures. In this paper, we propose SeqVAT, a method which naturally applies VAT to sequence labeling models with CRF. Empirical studies show that SeqVAT not only significantly improves the sequence labeling performance over baselines under supervised settings, but also outperforms state-of-the-art approaches under semi-supervised settings.",Semi-Supervised Labeling|supervised settings|image classification|image tasks,"Syntax: Tagging, Chunking and Parsing",Long,https://www.aclweb.org/anthology/2020.acl-main.777.pdf -main.617,Low-Dimensional Hyperbolic Knowledge Graph Embeddings,Ines Chami|Adva Wolf|Da-Cheng Juan|Frederic Sala|Sujith Ravi|Christopher Ré,"Knowledge graph (KG) embeddings learn low- dimensional representations of entities and relations to predict missing facts. KGs often exhibit hierarchical and logical patterns which must be preserved in the embedding space. For hierarchical data, hyperbolic embedding methods have shown promise for high-fidelity and parsimonious representations. However, existing hyperbolic embedding methods do not account for the rich logical patterns in KGs. In this work, we introduce a class of hyperbolic KG embedding models that simultaneously capture hierarchical and logical patterns. Our approach combines hyperbolic reflections and rotations with attention to model complex relational patterns. Experimental results on standard KG benchmarks show that our method improves over previous Euclidean- and hyperbolic-based efforts by up to 6.1% in mean reciprocal rank (MRR) in low dimensions. Furthermore, we observe that different geometric transformations capture different types of relations while attention- based transformations generalize to multiple relations. In high dimensions, our approach yields new state-of-the-art MRRs of 49.6% on WN18RR and 57.7% on YAGO3-10.",high-fidelity representations|Low-Dimensional Embeddings|Knowledge embeddings|KGs,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.617.pdf -main.171,Improving Non-autoregressive Neural Machine Translation with Monolingual Data,Jiawei Zhou|Phillip Keung,"Non-autoregressive (NAR) neural machine translation is usually done via knowledge distillation from an autoregressive (AR) model. Under this framework, we leverage large monolingual corpora to improve the NAR model's performance, with the goal of transferring the AR model's generalization ability while preventing overfitting. 
On top of a strong NAR baseline, our experimental results on the WMT14 En-De and WMT16 En-Ro news translation tasks confirm that monolingual data augmentation consistently improves the performance of the NAR model to approach the teacher AR model's performance, yields comparable or better results than the best non-iterative NAR methods in the literature and helps reduce overfitting in the training process.",Non-autoregressive Translation|WMT14 tasks|monolingual augmentation|knowledge distillation,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.171.pdf -main.165,Multi-Domain Neural Machine Translation with Word-Level Adaptive Layer-wise Domain Mixing,Haoming Jiang|Chen Liang|Chong Wang|Tuo Zhao,"Many multi-domain neural machine translation (NMT) models achieve knowledge transfer by enforcing one encoder to learn shared embedding across domains. However, this design lacks adaptation to individual domains. To overcome this limitation, we propose a novel multi-domain NMT model using individual modules for each domain, on which we apply word-level, adaptive and layer-wise domain mixing. We first observe that words in a sentence are often related to multiple domains. Hence, we assume each word has a domain proportion, which indicates its domain preference. Then word representations are obtained by mixing their embedding in individual domains based on their domain proportions. We show this can be achieved by carefully designing multi-head dot-product attention modules for different domains, and eventually taking weighted averages of their parameters by word-level layer-wise domain proportions. Through this, we can achieve effective domain knowledge sharing and capture fine-grained domain-specific knowledge as well. Our experiments show that our proposed model outperforms existing ones in several NMT tasks.",knowledge transfer|domain sharing|NMT tasks|Multi-Domain Translation,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.165.pdf -main.603,Recurrent Chunking Mechanisms for Long-Text Machine Reading Comprehension,Hongyu Gong|Yelong Shen|Dian Yu|Jianshu Chen|Dong Yu,"In this paper, we study machine reading comprehension (MRC) on long texts: where a model takes as inputs a lengthy document and a query, extracts a text span from the document as an answer. State-of-the-art models (e.g., BERT) tend to use a stack of transformer layers that are pre-trained from a large number of unlabeled language corpora to encode the joint contextual information of query and document. However, these transformer models can only take as input a fixed-length (e.g., 512) text. To deal with even longer text inputs, previous approaches usually chunk them into equally-spaced segments and predict answers based on each segment independently without considering the information from other segments. As a result, they may form segments that fail to cover complete answers or retain insufficient contexts around the correct answer required for question answering. Moreover, they are less capable of answering questions that need cross-segment information. We propose to let a model learn to chunk in a more flexible way via reinforcement learning: a model can decide the next segment that it wants to process in either direction. We also apply recurrent mechanisms to enable information to flow across segments. 
Experiments on three MRC tasks -- CoQA, QuAC, and TriviaQA -- demonstrate the effectiveness of our proposed recurrent chunking mechanisms: we can obtain segments that are more likely to contain complete answers and at the same time provide sufficient contexts around the ground truth answers for better predictions.",Long-Text Comprehension|machine comprehension|MRC|question answering,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.603.pdf -main.159,Inflecting When There's No Majority: Limitations of Encoder-Decoder Neural Networks as Cognitive Models for German Plurals,Kate McCurdy|Sharon Goldwater|Adam Lopez,"Can artificial neural networks learn to represent inflectional morphology and generalize to new words as human speakers do? Kirov and Cotterell (2018) argue that the answer is yes: modern Encoder-Decoder (ED) architectures learn human-like behavior when inflecting English verbs, such as extending the regular past tense form /-(e)d/ to novel words. However, their work does not address the criticism raised by Marcus et al. (1995): that neural models may learn to extend not the regular, but the most frequent class — and thus fail on tasks like German number inflection, where infrequent suffixes like /-s/ can still be productively generalized. To investigate this question, we first collect a new dataset from German speakers (production and ratings of plural forms for novel nouns) that is designed to avoid sources of information unavailable to the ED model. The speaker data show high variability, and two suffixes evince 'regular' behavior, appearing more often with phonologically atypical inputs. Encoder-decoder models do generalize the most frequently produced plural class, but do not show human-like variability or 'regular' extension of these other plural markers. We conclude that modern neural models may still struggle with minority-class generalization.",minority-class generalization|Encoder-Decoder Networks|Cognitive Models|artificial networks,Cognitive Modeling and Psycholinguistics,Long,https://www.aclweb.org/anthology/2020.acl-main.159.pdf -main.93,Improving Image Captioning Evaluation by Considering Inter References Variance,Yanzhi Yi|Hangyu Deng|Jinglu Hu,"Evaluating image captions is very challenging partially due to the fact that there are multiple correct captions for every single image. Most of the existing one-to-one metrics operate by penalizing mismatches between reference and generative caption without considering the intrinsic variance between ground truth captions. It usually leads to over-penalization and thus a bad correlation to human judgment. Recently, the latest one-to-one metric BERTScore can achieve high human correlation in system-level tasks while some issues can be fixed for better performance. In this paper, we propose a novel metric based on BERTScore that could handle such a challenge and extend BERTScore with a few new features appropriately for image captioning evaluation. 
The experimental results show that our metric achieves state-of-the-art human judgment correlation.",Image Evaluation|Evaluating captions|system-level tasks|BERTScore,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.93.pdf -main.87,Enhancing Answer Boundary Detection for Multilingual Machine Reading Comprehension,Fei Yuan|Linjun Shou|Xuanyu Bai|Ming Gong|Yaobo Liang|Nan Duan|Yan Fu|Daxin Jiang,"Multilingual pre-trained models could leverage the training data from a rich source language (such as English) to improve performance on low resource languages. However, the transfer quality for multilingual Machine Reading Comprehension (MRC) is significantly worse than sentence classification tasks mainly due to the requirement of MRC to detect the word level answer boundary. In this paper, we propose two auxiliary tasks in the fine-tuning stage to create additional phrase boundary supervision: (1) A mixed MRC task, which translates the question or passage to other languages and builds cross-lingual question-passage pairs; (2) A language-agnostic knowledge masking task by leveraging knowledge phrases mined from web. Besides, extensive experiments on two cross-lingual MRC datasets show the effectiveness of our proposed approach.",Multilingual Comprehension|multilingual MRC|MRC|sentence tasks,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.87.pdf -main.398,TaPas: Weakly Supervised Table Parsing via Pre-training,Jonathan Herzig|Pawel Krzysztof Nowak|Thomas Müller|Francesco Piccinno|Julian Eisenschlos,"Answering natural language questions over tables is usually seen as a semantic parsing task. To alleviate the collection cost of full logical forms, one popular approach focuses on weak supervision consisting of denotations instead of logical forms. However, training semantic parsers from weak supervision poses difficulties, and in addition, the generated logical forms are only used as an intermediate step prior to retrieving the denotation. In this paper, we present TaPas, an approach to question answering over tables without generating logical forms. TaPas trains from weak supervision, and predicts the denotation by selecting table cells and optionally applying a corresponding aggregation operator to such selection. TaPas extends BERT's architecture to encode tables as input, initializes from an effective joint pre-training of text segments and tables crawled from Wikipedia, and is trained end-to-end. We experiment with three different semantic parsing datasets, and find that TaPas outperforms or rivals semantic parsing models by improving state-of-the-art accuracy on SQA from 55.1 to 67.2 and performing on par with the state-of-the-art on WikiSQL and WikiTQ, but with a simpler model architecture. 
We additionally find that transfer learning, which is trivial in our setting, from WikiSQL to WikiTQ, yields 48.7 accuracy, 4.2 points above the state-of-the-art.",Weakly Parsing|semantic task|question tables|SQA,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.398.pdf -main.373,"He said ""who's gonna take care of your children when you are at EMNLP?"": Reported Sexist Acts are Not Sexist",Patricia Chiril|Véronique MORICEAU|Farah Benamara|Alda Mari|Gloria Origgi|Marlène Coulomb-Gully,"In a context of offensive content mediation on social media now regulated by European laws, it is important not only to be able to automatically detect sexist content but also to identify if a message with a sexist content is really sexist or is a story of sexism experienced by a woman. We propose: (1) a new characterization of sexist content inspired by speech acts theory and discourse analysis studies, (2) the first French dataset annotated for sexism detection, and (3) a set of deep learning experiments trained on top of a combination of several tweet’s vectorial representations (word embeddings, linguistic features, and various generalization strategies). Our results are encouraging and constitute a first step towards offensive content moderation.",offensive mediation|characterization content|sexism detection|offensive moderation,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.373.pdf -main.415,A Corpus for Large-Scale Phonetic Typology,Elizabeth Salesky|Eleanor Chodroff|Tiago Pimentel|Matthew Wiesner|Ryan Cotterell|Alan W Black|Jason Eisner,"A major hurdle in data-driven research on typology is having sufficient data in many languages to draw meaningful conclusions. We present VoxClamantis v1.0, the first large-scale corpus for phonetic typology, with aligned segments and estimated phoneme-level labels in 690 readings spanning 635 languages, along with acoustic-phonetic measures of vowels and sibilants. Access to such data can greatly facilitate investigation of phonetic typology at a large scale and across many languages. However, it is non-trivial and computationally intensive to obtain such alignments for hundreds of languages, many of which have few to no resources presently available. We describe the methodology to create our corpus, discuss caveats with current methods and their impact on the utility of this data, and illustrate possible research directions through a series of case studies on the 48 highest-quality readings. Our corpus and scripts are publicly available for non-commercial use at https://voxclamantisproject.github.io.",Large-Scale Typology|phonetic typology|cross-linguistic variation|domain knowledge,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.415.pdf -main.401,"Sentiment and Emotion help Sarcasm? A Multi-task Learning Framework for Multi-Modal Sarcasm, Sentiment and Emotion Analysis",Dushyant Singh Chauhan|Dhanush S R|Asif Ekbal|Pushpak Bhattacharyya,"In this paper, we hypothesize that sarcasm is closely related to sentiment and emotion, and thereby propose a multi-task deep learning framework to solve all these three problems simultaneously in a multi-modal conversational scenario. We, at first, manually annotate the recently released multi-modal MUStARD sarcasm dataset with sentiment and emotion classes, both implicit and explicit. For multi-tasking, we propose two attention mechanisms, viz. 
Inter-segment Inter-modal Attention (Ie-Attention) and Intra-segment Inter-modal Attention (Ia-Attention). The main motivation of Ie-Attention is to learn the relationship between the different segments of the sentence across the modalities. In contrast, Ia-Attention focuses within the same segment of the sentence across the modalities. Finally, representations from both the attentions are concatenated and shared across the five classes (i.e., sarcasm, implicit sentiment, explicit sentiment, implicit emotion, explicit emotion) for multi-tasking. Experimental results on the extended version of the MUStARD dataset show the efficacy of our proposed approach for sarcasm detection over the existing state-of-the-art systems. The evaluation also shows that the proposed multi-task framework yields better performance for the primary task, i.e., sarcasm detection, with the help of two secondary tasks, emotion and sentiment analysis.",Sentiment Analysis|Sentiment |multi-tasking|sarcasm detection,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.acl-main.401.pdf -main.367,Autoencoding Pixies: Amortised Variational Inference with Graph Convolutions for Functional Distributional Semantics,Guy Emerson,"Functional Distributional Semantics provides a linguistically interpretable framework for distributional semantics, by representing the meaning of a word as a function (a binary classifier), instead of a vector. However, the large number of latent variables means that inference is computationally expensive, and training a model is therefore slow to converge. In this paper, I introduce the Pixie Autoencoder, which augments the generative model of Functional Distributional Semantics with a graph-convolutional neural network to perform amortised variational inference. This allows the model to be trained more effectively, achieving better results on two tasks (semantic similarity in context and semantic composition), and outperforming BERT, a large pre-trained language model.",Amortised Inference|inference|semantic composition|Autoencoding,Semantics: Lexical,Long,https://www.aclweb.org/anthology/2020.acl-main.367.pdf -main.78,Interpretable Operational Risk Classification with Semi-Supervised Variational Autoencoder,Fan Zhou|Shengming Zhang|Yi Yang,"Operational risk management is one of the biggest challenges nowadays faced by financial institutions. There are several major challenges of building a text classification system for automatic operational risk prediction, including imbalanced labeled/unlabeled data and lacking interpretability. To tackle these challenges, we present a semi-supervised text classification framework that integrates multi-head attention mechanism with Semi-supervised variational inference for Operational Risk Classification (SemiORC). We empirically evaluate the framework on a real-world dataset. The results demonstrate that our method can better utilize unlabeled data and learn visually interpretable document representations. 
SemiORC also outperforms other baseline methods on operational risk classification.",Interpretable Classification|Operational management|automatic prediction|Operational Classification,NLP Applications,Short,https://www.aclweb.org/anthology/2020.acl-main.78.pdf -main.50,Predicting the Topical Stance and Political Leaning of Media using Tweets,Peter Stefanov|Kareem Darwish|Atanas Atanasov|Preslav Nakov,"Discovering the stances of media outlets and influential people on current, debatable topics is important for social statisticians and policy makers. Many supervised solutions exist for determining viewpoints, but manually annotating training data is costly. In this paper, we propose a cascaded method that uses unsupervised learning to ascertain the stance of Twitter users with respect to a polarizing topic by leveraging their retweet behavior; then, it uses supervised learning based on user labels to characterize both the general political leaning of online media and of popular Twitter users, as well as their stance with respect to the target polarizing topic. We evaluate the model by comparing its predictions to gold labels from the Media Bias/Fact Check website, achieving 82.6% accuracy.",supervised solutions|cascaded method|unsupervised learning|supervised learning,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.50.pdf -main.429,How does BERT's attention change when you fine-tune? An analysis methodology and a case study in negation scope,Yiyun Zhao|Steven Bethard,"Large pretrained language models like BERT, after fine-tuning to a downstream task, have achieved high performance on a variety of NLP problems. Yet explaining their decisions is difficult despite recent work probing their internal representations. We propose a procedure and analysis methods that take a hypothesis of how a transformer-based model might encode a linguistic phenomenon, and test the validity of that hypothesis based on a comparison between knowledge-related downstream tasks with downstream control tasks, and measurement of cross-dataset consistency. We apply this methodology to test BERT and RoBERTa on a hypothesis that some attention heads will consistently attend from a word in negation scope to the negation cue. We find that after fine-tuning BERT and RoBERTa on a negation scope task, the average attention head improves its sensitivity to negation and its attention consistency across negation datasets compared to the pre-trained models. However, only the base models (not the large models) improve compared to a control task, indicating there is evidence for a shallow encoding of negation only in the base models.",downstream task|NLP problems|knowledge-related tasks|downstream tasks,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.429.pdf -main.44,A Three-Parameter Rank-Frequency Relation in Natural Languages,Chenchen Ding|Masao Utiyama|Eiichiro Sumita,"We present that the rank-frequency relation in textual data follows f ∝ r^{-ɑ}(r+ɣ)^{-β}, where f is the token frequency and r is the rank by frequency, with (ɑ, β, ɣ) as parameters. The formulation is derived based on the empirical observation that d^2 (x+y)/dx^2 is a typical impulse function, where (x,y)=(log r, log f). The formulation is the power law when β=0 and the Zipf–Mandelbrot law when ɑ=0. 
We illustrate that ɑ is related to the analytic features of syntax and β+ɣ to those of morphology in natural languages from an investigation of multilingual corpora.",Three-Parameter Relation|rank-frequency relation|f|token frequency,Theory and Formalism in NLP (Linguistic and Mathematical),Short,https://www.aclweb.org/anthology/2020.acl-main.44.pdf -main.363,An Effectiveness Metric for Ordinal Classification: Formal Properties and Experimental Results,Enrique Amigo|Julio Gonzalo|Stefano Mizzaro|Jorge Carrillo-de-Albornoz,"In Ordinal Classification tasks, items have to be assigned to classes that have a relative ordering, such as ""positive"", ""neutral"", ""negative"" in sentiment analysis. Remarkably, the most popular evaluation metrics for ordinal classification tasks either ignore relevant information (for instance, precision/recall on each of the classes ignores their relative ordering) or assume additional information (for instance, Mean Average Error assumes absolute distances between classes). In this paper we propose a new metric for Ordinal Classification, Closeness Evaluation Measure, that is rooted on Measurement Theory and Information Theory. Our theoretical analysis and experimental results over both synthetic data and data from NLP shared tasks indicate that the proposed metric captures quality aspects from different traditional tasks simultaneously. In addition, it generalizes some popular classification (nominal scale) and error minimization (interval scale) metrics, depending on the measurement scale in which it is instantiated.",Ordinal Classification|Ordinal tasks|sentiment analysis|NLP tasks,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.363.pdf -main.405,When do Word Embeddings Accurately Reflect Surveys on our Beliefs About People?,Kenneth Joseph|Jonathan Morgan,"Social biases are encoded in word embeddings. This presents a unique opportunity to study society historically and at scale, and a unique danger when embeddings are used in downstream applications. Here, we investigate the extent to which publicly-available word embeddings accurately reflect beliefs about certain kinds of people as measured via traditional survey methods. We find that biases found in word embeddings do, on average, closely mirror survey data across seventeen dimensions of social meaning. However, we also find that biases in embeddings are much more reflective of survey data for some dimensions of meaning (e.g. gender) than others (e.g. race), and that we can be highly confident that embedding-based measures reflect survey data only for the most salient biases.",downstream applications|Word Embeddings|survey methods|embedding-based measures,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.405.pdf -main.411,DeFormer: Decomposing Pre-trained Transformers for Faster Question Answering,Qingqing Cao|Harsh Trivedi|Aruna Balasubramanian|Niranjan Balasubramanian,"Transformer-based QA models use input-wide self-attention -- i.e. across both the question and the input passage -- at all layers, causing them to be slow and memory-intensive. It turns out that we can get by without input-wide self-attention at all layers, especially in the lower layers. We introduce DeFormer, a decomposed transformer, which substitutes the full self-attention with question-wide and passage-wide self-attentions in the lower layers. 
This allows for question-independent processing of the input text representations, which in turn enables pre-computing passage representations reducing runtime compute drastically. Furthermore, because DeFormer is largely similar to the original model, we can initialize DeFormer with the pre-training weights of a standard transformer, and directly fine-tune on the target QA dataset. We show DeFormer versions of BERT and XLNet can be used to speed up QA by over 4.3x and with simple distillation-based losses they incur only a 1% drop in accuracy. We open source the code at https://github.com/StonyBrookNLP/deformer.",Faster Answering|question-independent processing|DeFormer|Decomposing Transformers,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.411.pdf -main.68,Rigid Formats Controlled Text Generation,Piji Li|Haisong Zhang|Xiaojiang Liu|Shuming Shi,"Neural text generation has made tremendous progress in various tasks. One common characteristic of most of the tasks is that the texts are not restricted to some rigid formats when generating. However, we may confront some special text paradigms such as Lyrics (assume the music score is given), Sonnet, SongCi (classical Chinese poetry of the Song dynasty), etc. The typical characteristics of these texts are in three folds: (1) They must comply fully with the rigid predefined formats. (2) They must obey some rhyming schemes. (3) Although they are restricted to some formats, the sentence integrity must be guaranteed. To the best of our knowledge, text generation based on the predefined rigid formats has not been well investigated. Therefore, we propose a simple and elegant framework named SongNet to tackle this problem. The backbone of the framework is a Transformer-based auto-regressive language model. Sets of symbols are tailor-designed to improve the modeling performance especially on format, rhyme, and sentence integrity. We improve the attention mechanism to impel the model to capture some future information on the format. A pre-training and fine-tuning framework is designed to further improve the generation quality. Extensive experiments conducted on two collected corpora demonstrate that our proposed framework generates significantly better results in terms of both automatic metrics and the human evaluation.",Rigid Generation|Neural generation|text generation|rhyming schemes,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.68.pdf -main.377,"Exact yet Efficient Graph Parsing, Bi-directional Locality and the Constructivist Hypothesis",Yajie Ye|Weiwei Sun,"A key problem in processing graph-based meaning representations is graph parsing, i.e. computing all possible derivations of a given graph according to a (competence) grammar. We demonstrate, for the first time, that exact graph parsing can be efficient for large graphs and with large Hyperedge Replacement Grammars (HRGs). The advance is achieved by exploiting locality as terminal edge-adjacency in HRG rules. In particular, we highlight the importance of 1) a terminal edge-first parsing strategy, 2) a categorization of a subclass of HRG, i.e. 
what we call Weakly Regular Graph Grammar, and 3) distributing argument-structures to both lexical and phrasal rules.",graph parsing|exact parsing|Graph Parsing|graph-based representations,"Syntax: Tagging, Chunking and Parsing",Long,https://www.aclweb.org/anthology/2020.acl-main.377.pdf -main.40,Multiscale Collaborative Deep Models for Neural Machine Translation,Xiangpeng Wei|Heng Yu|Yue Hu|Yue Zhang|Rongxiang Weng|Weihua Luo,"Recent evidence reveals that Neural Machine Translation (NMT) models with deeper neural networks can be more effective but are difficult to train. In this paper, we present a MultiScale Collaborative (MSC) framework to ease the training of NMT models that are substantially deeper than those used previously. We explicitly boost the gradient back-propagation from top to bottom levels by introducing a block-scale collaboration mechanism into deep NMT models. Then, instead of forcing the whole encoder stack directly learns a desired representation, we let each encoder block learns a fine-grained representation and enhance it by encoding spatial dependencies using a context-scale collaboration. We provide empirical evidence showing that the MSC nets are easy to optimize and can obtain improvements of translation quality from considerably increased depth. On IWSLT translation tasks with three translation directions, our extremely deep models (with 72-layer encoders) surpass strong baselines by +2.2~+3.1 BLEU points. In addition, our deep MSC achieves a BLEU score of 30.56 on WMT14 English-to-German task that significantly outperforms state-of-the-art deep NMT models. We have included the source code in supplementary materials.",Neural Translation|training models|IWSLT tasks|WMT14 task,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.40.pdf -main.439,Pretraining with Contrastive Sentence Objectives Improves Discourse Performance of Language Models,Dan Iter|Kelvin Guu|Larry Lansing|Dan Jurafsky,"Recent models for unsupervised representation learning of text have employed a number of techniques to improve contextual word representations but have put little focus on discourse-level representations. We propose Conpono, an inter-sentence objective for pretraining language models that models discourse coherence and the distance between sentences. Given an anchor sentence, our model is trained to predict the text k sentences away using a sampled-softmax objective where the candidates consist of neighboring sentences and sentences randomly sampled from the corpus. On the discourse representation benchmark DiscoEval, our model improves over the previous state-of-the-art by up to 13% and on average 4% absolute across 7 tasks. Our model is the same size as BERT-Base, but outperforms the much larger BERT-Large model and other more recent approaches that incorporate discourse. We also show that Conpono yields gains of 2%-6% absolute even for tasks that do not explicitly evaluate discourse: textual entailment (RTE), common sense reasoning (COPA) and reading comprehension (ReCoRD).",Discourse|unsupervised text|contextual representations|discourse-level representations,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.439.pdf -main.54,End-to-End Neural Pipeline for Goal-Oriented Dialogue Systems using GPT-2,Donghoon Ham|Jeong-Gwan Lee|Youngsoo Jang|Kee-Eung Kim,"The goal-oriented dialogue system needs to be optimized for tracking the dialogue flow and carrying out an effective conversation under various situations to meet the user goal. 
The traditional approach to build such a dialogue system is to take a pipelined modular architecture, where its modules are optimized individually. However, such an optimization scheme does not necessarily yield the overall performance improvement of the whole system. On the other hand, end-to-end dialogue systems with monolithic neural architecture are often trained only with input-output utterances, without taking into account the entire annotations available in the corpus. This scheme makes it difficult for goal-oriented dialogues where the system needs to integrate with external systems or to provide interpretable information about why the system generated a particular response. In this paper, we present an end-to-end neural architecture for dialogue systems that addresses both challenges above. In the human evaluation, our dialogue system achieved the success rate of 68.32%, the language understanding score of 4.149, and the response appropriateness score of 4.287, which ranked the system at the top position in the end-to-end multi-domain dialogue system task in the 8th dialogue systems technology challenge (DSTC8).",tracking flow|dialogue systems|human evaluation|End-to-End Pipeline,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.54.pdf -main.83,A Frame-based Sentence Representation for Machine Reading Comprehension,Shaoru Guo|Ru Li|Hongye Tan|Xiaoli Li|Yong Guan|Hongyan Zhao|Yueping Zhang,"Sentence representation (SR) is the most crucial and challenging task in Machine Reading Comprehension (MRC). MRC systems typically only utilize the information contained in the sentence itself, while human beings can leverage their semantic knowledge. To bridge the gap, we proposed a novel Frame-based Sentence Representation (FSR) method, which employs frame semantic knowledge to facilitate sentence modelling. Specifically, different from existing methods that only model lexical units (LUs), Frame Representation Models, which utilize both LUs in frame and Frame-to-Frame (F-to-F) relations, are designed to model frames and sentences with attention schema. Our proposed FSR method is able to integrate multiple-frame semantic information to get much better sentence representations. Our extensive experimental results show that it performs better than state-of-the-art technologies on machine reading comprehension task.",Machine Comprehension|Sentence representation|SR|Machine MRC,Question Answering,Short,https://www.aclweb.org/anthology/2020.acl-main.83.pdf -main.388,Tchebycheff Procedure for Multi-task Text Classification,Yuren Mao|Shuang Yun|Weiwei Liu|Bo Du,"Multi-task Learning methods have achieved great progress in text classification. However, existing methods assume that multi-task text classification problems are convex multiobjective optimization problems, which is unrealistic in real-world applications. To address this issue, this paper presents a novel Tchebycheff procedure to optimize the multi-task classification problems without convex assumption. 
The extensive experiments back up our theoretical analysis and validate the superiority of our proposals.",Multi-task Classification|text classification|multi-task problems|convex problems,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.388.pdf -main.97,DTCA: Decision Tree-based Co-Attention Networks for Explainable Claim Verification,Lianwei Wu|Yuan Rao|Yongqiang Zhao|Hao Liang|Ambreen Nazir,"Recently, many methods discover effective evidence from reliable sources by appropriate neural networks for explainable claim verification, which has been widely recognized. However, in these methods, the discovery process of evidence is nontransparent and unexplained. Simultaneously, the discovered evidence is aimed at the interpretability of the whole sequence of claims but insufficient to focus on the false parts of claims. In this paper, we propose a Decision Tree-based Co-Attention model (DTCA) to discover evidence for explainable claim verification. Specifically, we first construct Decision Tree-based Evidence model (DTE) to select comments with high credibility as evidence in a transparent and interpretable way. Then we design Co-attention Self-attention networks (CaSa) to make the selected evidence interact with claims, which is for 1) training DTE to determine the optimal decision thresholds and obtain more powerful evidence; and 2) utilizing the evidence to find the false parts in the claim. Experiments on two public datasets, RumourEval and PHEME, demonstrate that DTCA not only provides explanations for the results of claim verification but also achieves the state-of-the-art performance, boosting the F1-score by more than 3.11%, 2.41%, respectively.",Explainable Verification|discovery evidence|claim verification|DTCA,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.97.pdf -main.607,Semi-Supervised Semantic Dependency Parsing Using CRF Autoencoders,Zixia Jia|Youmi Ma|Jiong Cai|Kewei Tu,"Semantic dependency parsing, which aims to find rich bi-lexical relationships, allows words to have multiple dependency heads, resulting in graph-structured representations. We propose an approach to semi-supervised learning of semantic dependency parsers based on the CRF autoencoder framework. Our encoder is a discriminative neural semantic dependency parser that predicts the latent parse graph of the input sentence. Our decoder is a generative neural model that reconstructs the input sentence conditioned on the latent parse graph. Our model is arc-factored and therefore parsing and learning are both tractable. Experiments show our model achieves significant and consistent improvement over the supervised baseline.",Semantic parsing|semi-supervised parsers|parsing|learning,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.607.pdf -main.161,Suspense in Short Stories is Predicted By Uncertainty Reduction over Neural Story Representation,David Wilmot|Frank Keller,"Suspense is a crucial ingredient of narrative fiction, engaging readers and making stories compelling. While there is a vast theoretical literature on suspense, it is computationally not well understood. We compare two ways for modelling suspense: surprise, a backward-looking measure of how unexpected the current state is given the story so far; and uncertainty reduction, a forward-looking measure of how unexpected the continuation of the story is. 
Both can be computed either directly over story representations or over their probability distributions. We propose a hierarchical language model that encodes stories and computes surprise and uncertainty reduction. Evaluating against short stories annotated with human suspense judgements, we find that uncertainty reduction over representations is the best predictor, resulting in near human accuracy. We also show that uncertainty reduction can be used to predict suspenseful events in movie synopses.",Uncertainty Reduction|narrative fiction|surprise reduction|Neural Representation,Cognitive Modeling and Psycholinguistics,Long,https://www.aclweb.org/anthology/2020.acl-main.161.pdf -main.175,Unsupervised Opinion Summarization with Noising and Denoising,Reinald Kim Amplayo|Mirella Lapata,"The supervised training of high-capacity models on large datasets containing hundreds of thousands of document-summary pairs is critical to the recent success of deep learning techniques for abstractive summarization. Unfortunately, in most domains (other than news) such training data is not available and cannot be easily sourced. In this paper we enable the use of supervised learning for the setting where there are only documents available (e.g., product or business reviews) without ground truth summaries. We create a synthetic dataset from a corpus of user reviews by sampling a review, pretending it is a summary, and generating noisy versions thereof which we treat as pseudo-review input. We introduce several linguistically motivated noise generation functions and a summarization model which learns to denoise the input and generate the original review. At test time, the model accepts genuine reviews and generates a summary containing salient opinions, treating those that do not reach consensus as noise. Extensive automatic and human evaluation shows that our model brings substantial improvements over both abstractive and extractive baselines.",Unsupervised Summarization|supervised models|abstractive summarization|Noising,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.175.pdf -main.613,Document Translation vs. Query Translation for Cross-Lingual Information Retrieval in the Medical Domain,Shadi Saleh|Pavel Pecina,"We present a thorough comparison of two principal approaches to Cross-Lingual Information Retrieval: document translation (DT) and query translation (QT). Our experiments are conducted using the cross-lingual test collection produced within the CLEF eHealth information retrieval tasks in 2013–2015 containing English documents and queries in several European languages. We exploit the Statistical Machine Translation (SMT) and Neural Machine Translation (NMT) paradigms and train several domain-specific and task-specific machine translation systems to translate the non-English queries into English (for the QT approach) and the English documents to all the query languages (for the DT approach). The results show that the quality of QT by SMT is sufficient enough to outperform the retrieval results of the DT approach for all the languages. 
NMT then further boosts translation quality and retrieval quality for both QT and DT for most languages, but still, QT provides generally better retrieval results than DT.",Document Translation|Query Translation|Cross-Lingual Retrieval|DT,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.acl-main.613.pdf -main.149,It's Easier to Translate out of English than into it: Measuring Neural Translation Difficulty by Cross-Mutual Information,Emanuele Bugliarello|Sabrina J. Mielke|Antonios Anastasopoulos|Ryan Cotterell|Naoaki Okazaki,"The performance of neural machine translation systems is commonly evaluated in terms of BLEU. However, due to its reliance on target language properties and generation, the BLEU metric does not allow an assessment of which translation directions are more difficult to model. In this paper, we propose cross-mutual information (XMI): an asymmetric information-theoretic metric of machine translation difficulty that exploits the probabilistic nature of most neural machine translation models. XMI allows us to better evaluate the difficulty of translating text into the target language while controlling for the difficulty of the target-side generation component independent of the translation task. We then present the first systematic and controlled study of cross-lingual translation difficulties using modern neural translation systems. Code for replicating our experiments is available online at https://github.com/e-bug/nmt-difficulty.",Measuring Difficulty|generation|asymmetric difficulty|machine difficulty,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.149.pdf -main.773,Towards Robustifying NLI Models Against Lexical Dataset Biases,Xiang Zhou|Mohit Bansal,"While deep learning models are making fast progress on the task of Natural Language Inference, recent studies have also shown that these models achieve high accuracy by exploiting several dataset biases, and without deep understanding of the language semantics. Using contradiction-word bias and word-overlapping bias as our two bias examples, this paper explores both data-level and model-level debiasing methods to robustify models against lexical dataset biases. First, we debias the dataset through data augmentation and enhancement, but show that the model bias cannot be fully removed via this method. Next, we also compare two ways of directly debiasing the model without knowing what the dataset biases are in advance. The first approach aims to remove the label bias at the embedding level. The second approach employs a bag-of-words sub-model to capture the features that are likely to exploit the bias and prevents the original model from learning these biased features by forcing orthogonality between these two sub-models. 
We performed evaluations on new balanced datasets extracted from the original MNLI dataset as well as the NLI stress tests, and show that the orthogonality approach is better at debiasing the model while maintaining competitive overall accuracy.",Natural Inference|data augmentation|Robustifying Models|deep models,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.773.pdf -main.767,Smart To-Do: Automatic Generation of To-Do Items from Emails,Sudipto Mukherjee|Subhabrata Mukherjee|Marcello Hasegawa|Ahmed Hassan Awadallah|Ryen White,"Intelligent features in email service applications aim to increase productivity by helping people organize their folders, compose their emails and respond to pending tasks. In this work, we explore a new application, Smart-To-Do, that helps users with task management over emails. We introduce a new task and dataset for automatically generating To-Do items from emails where the sender has promised to perform an action. We design a two-stage process leveraging recent advances in neural text generation and sequence-to-sequence learning, obtaining BLEU and ROUGE scores of 0.23 and 0.63 for this task. To the best of our knowledge, this is the first work to address the problem of composing To-Do items from emails.",Automatic Items|email applications|task management|automatically items,NLP Applications,Short,https://www.aclweb.org/anthology/2020.acl-main.767.pdf -main.217,Phone Features Improve Speech Translation,Elizabeth Salesky|Alan W Black,"End-to-end models for speech translation (ST) more tightly couple speech recognition (ASR) and machine translation (MT) than a traditional cascade of separate ASR and MT models, with simpler model architectures and the potential for reduced error propagation. Their performance is often assumed to be superior, though in many conditions this is not yet the case. We compare cascaded and end-to-end models across high, medium, and low-resource conditions, and show that cascades remain stronger baselines. Further, we introduce two methods to incorporate phone features into ST models. We show that these features improve both architectures, closing the gap between end-to-end models and cascades, and outperforming previous academic work -- by up to 9 BLEU on our low-resource setting.",Speech Translation|speech recognition|ASR|machine translation,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.acl-main.217.pdf -main.571,Bipartite Flat-Graph Network for Nested Named Entity Recognition,Ying Luo|Hai Zhao,"In this paper, we propose a novel bipartite flat-graph network (BiFlaG) for nested named entity recognition (NER), which contains two subgraph modules: a flat NER module for outermost entities and a graph module for all the entities located in inner layers. Bidirectional LSTM (BiLSTM) and graph convolutional network (GCN) are adopted to jointly learn flat entities and their inner dependencies. Different from previous models, which only consider the unidirectional delivery of information from innermost layers to outer ones (or outside-to-inside), our model effectively captures the bidirectional interaction between them. We first use the entities recognized by the flat NER module to construct an entity graph, which is fed to the next graph module. The richer representation learned from graph module carries the dependencies of inner entities and can be exploited to improve outermost entity predictions. 
Experimental results on three standard nested NER datasets demonstrate that our BiFlaG outperforms previous state-of-the-art models.",Nested Recognition|NER|Bipartite Network|bipartite,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.571.pdf -main.565,Dynamic Fusion Network for Multi-Domain End-to-end Task-Oriented Dialog,Libo Qin|Xiao Xu|Wanxiang Che|Yue Zhang|Ting Liu,"Recent studies have shown remarkable success in end-to-end task-oriented dialog system. However, most neural models rely on large training data, which are only available for a certain number of task domains, such as navigation and scheduling. This makes it difficult to scalable for a new domain with limited labeled data. However, there has been relatively little research on how to effectively use data from all domains to improve the performance of each domain and also unseen domains. To this end, we investigate methods that can make explicit use of domain knowledge and introduce a shared-private network to learn shared and specific knowledge. In addition, we propose a novel Dynamic Fusion Network (DF-Net) which automatically exploit the relevance between the target domain and each domain. Results show that our models outperforms existing methods on multi-domain dialogue, giving the state-of-the-art in the literature. Besides, with little training data, we show its transferability by outperforming prior best model by 13.9% on average.",Multi-Domain Dialog|end-to-end system|navigation|scheduling,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.565.pdf -main.203,A Girl Has A Name: Detecting Authorship Obfuscation,Asad Mahmood|Zubair Shafiq|Padmini Srinivasan,"Authorship attribution aims to identify the author of a text based on the stylometric analysis. Authorship obfuscation, on the other hand, aims to protect against authorship attribution by modifying a text’s style. In this paper, we evaluate the stealthiness of state-of-the-art authorship obfuscation methods under an adversarial threat model. An obfuscator is stealthy to the extent an adversary finds it challenging to detect whether or not a text modified by the obfuscator is obfuscated – a decision that is key to the adversary interested in authorship attribution. We show that the existing authorship obfuscation methods are not stealthy as their obfuscated texts can be identified with an average F1 score of 0.87. The reason for the lack of stealthiness is that these obfuscators degrade text smoothness, as ascertained by neural language models, in a detectable manner. Our results highlight the need to develop stealthy authorship obfuscation methods that can better protect the identity of an author seeking anonymity.",Detecting Obfuscation|Authorship attribution|Authorship obfuscation|stylometric analysis,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.203.pdf -main.559,Language (Re)modelling: Towards Embodied Language Understanding,Ronen Tamari|Chen Shani|Tom Hope|Miriam R L Petruck|Omri Abend|Dafna Shahaf,"While natural language understanding (NLU) is advancing rapidly, today's technology differs from human-like language understanding in fundamental ways, notably in its inferior efficiency, interpretability, and generalization. This work proposes an approach to representation and learning based on the tenets of embodied cognitive linguistics (ECL). 
According to ECL, natural language is inherently executable (like programming languages), driven by mental simulation and metaphoric mappings over hierarchical compositions of structures and schemata learned through embodied interaction. This position paper argues that the use of grounding by metaphoric reasoning and simulation will greatly benefit NLU systems, and proposes a system architecture along with a roadmap towards realizing this vision.",Embodied Understanding|natural understanding|representation|NLU systems,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.559.pdf -main.558,Are we Estimating or Guesstimating Translation Quality?,Shuo Sun|Francisco Guzmán|Lucia Specia,"Recent advances in pre-trained multilingual language models lead to state-of-the-art results on the task of quality estimation (QE) for machine translation. A carefully engineered ensemble of such models won the QE shared task at WMT19. Our in-depth analysis, however, shows that the success of using pre-trained language models for QE is over-estimated due to three issues we observed in current QE datasets: (i) The distributions of quality scores are imbalanced and skewed towards good quality scores; (ii) QE models can perform well on these datasets while looking at only source or translated sentences; (iii) They contain statistical artifacts that correlate well with human-annotated QE labels. Our findings suggest that although QE models might capture fluency of translated sentences and complexity of source sentences, they cannot model adequacy of translations effectively.",Estimating Quality|quality estimation|machine translation|QE task,Theme,Short,https://www.aclweb.org/anthology/2020.acl-main.558.pdf -main.564,Data Manipulation: Towards Effective Instance Learning for Neural Dialogue Generation via Learning to Augment and Reweight,Hengyi Cai|Hongshen Chen|Yonghao Song|Cheng Zhang|Xiaofang Zhao|Dawei Yin,"Current state-of-the-art neural dialogue models learn from human conversations following the data-driven paradigm. As such, a reliable training corpus is the crux of building a robust and well-behaved dialogue model. However, due to the open-ended nature of human conversations, the quality of user-generated training data varies greatly, and effective training samples are typically insufficient while noisy samples frequently appear. This impedes the learning of those data-driven neural dialogue models. Therefore, effective dialogue learning requires not only more reliable learning samples, but also fewer noisy samples. In this paper, we propose a data manipulation framework to proactively reshape the data distribution towards reliable samples by augmenting and highlighting effective learning samples as well as reducing the effect of inefficient samples simultaneously. In particular, the data manipulation model selectively augments the training samples and assigns an importance weight to each instance to reform the training data. Note that the proposed data manipulation framework is fully data-driven and learnable. It not only manipulates training samples to optimize the dialogue generation model, but also learns to increase its manipulation skills through gradient descent with validation samples. 
Extensive experiments show that our framework can improve the dialogue generation performance with respect to various automatic evaluation metrics and human judgments.",Data Manipulation|Neural Generation|learning|dialogue generation,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.564.pdf -main.202,XtremeDistil: Multi-stage Distillation for Massive Multilingual Models,Subhabrata Mukherjee|Ahmed Hassan Awadallah,"Deep and large pre-trained language models are the state-of-the-art for various natural language processing tasks. However, the huge size of these models could be a deterrent to using them in practice. Some recent works use knowledge distillation to compress these huge models into shallow ones. In this work we study knowledge distillation with a focus on multilingual Named Entity Recognition (NER). In particular, we study several distillation strategies and propose a stage-wise optimization scheme leveraging teacher internal representations, that is agnostic of teacher architecture, and show that it outperforms strategies employed in prior works. Additionally, we investigate the role of several factors like the amount of unlabeled data, annotation resources, model architecture and inference latency to name a few. We show that our approach leads to massive compression of teacher models like mBERT by upto 35x in terms of parameters and 51x in terms of latency for batch inference while retaining 95% of its F1-score for NER over 41 languages.",natural tasks|knowledge distillation|multilingual Recognition|multilingual NER,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.202.pdf -main.216,Multimodal and Multiresolution Speech Recognition with Transformers,Georgios Paraskevopoulos|Srinivas Parthasarathy|Aparna Khare|Shiva Sundaram,"This paper presents an audio visual automatic speech recognition (AV-ASR) system using a Transformer-based architecture. We particularly focus on the scene context provided by the visual information, to ground the ASR. We extract representations for audio features in the encoder layers of the transformer and fuse video features using an additional crossmodal multihead attention layer. Additionally, we incorporate a multitask training criterion for multiresolution ASR, where we train the model to generate both character and subword level transcriptions. Experimental results on the How2 dataset, indicate that multiresolution training can speed up convergence by around 50% and relatively improves word error rate (WER) performance by upto 18% over subword prediction models. Further, incorporating visual information improves performance with relative gains upto 3.76% over audio only models. Our results are comparable to state-of-the-art Listen, Attend and Spell-based architectures.",Multimodal Recognition|ASR|multiresolution ASR|Transformers,Speech and Multimodality,Short,https://www.aclweb.org/anthology/2020.acl-main.216.pdf -main.570,"Amalgamation of protein sequence, structure and textual information for improving protein-protein interaction identification",Pratik Dutta|Sriparna Saha,"An in-depth exploration of protein-protein interactions (PPI) is essential to understand the metabolism in addition to the regulations of biological entities like proteins, carbohydrates, and many more. Most of the recent PPI tasks in BioNLP domain have been carried out solely using textual data. In this paper, we argue that incorporating multimodal cues can improve the automatic identification of PPI. 
As a first step towards enabling the development of multimodal approaches for PPI identification, we have developed two multi-modal datasets which are extensions and multi-modal versions of two popular benchmark PPI corpora (BioInfer and HRPD50). Besides the existing textual modalities, two new modalities, 3D protein structure and underlying genomic sequence, are also added to each instance. Further, a novel deep multi-modal architecture is also implemented to efficiently predict the protein interactions from the developed datasets. A detailed experimental analysis reveals the superiority of the multi-modal approach in comparison to the strong baselines including unimodal approaches and state-of-the-art methods over both the generated multi-modal datasets. The developed multi-modal datasets are available for use at https://github.com/sduttap16/MM_PPI_NLP.",protein-protein identification|in-depth interactions|PPI tasks|automatic PPI,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.570.pdf -main.766,Should All Cross-Lingual Embeddings Speak English?,Antonios Anastasopoulos|Graham Neubig,"Most recent work in cross-lingual word embeddings is severely Anglocentric. The vast majority of lexicon induction evaluation dictionaries are between English and another language, and the English embedding space is selected by default as the hub when learning in a multilingual setting. With this work, however, we challenge these practices. First, we show that the choice of hub language can significantly impact downstream lexicon induction and zero-shot POS tagging performance. Second, we both expand a standard English-centered evaluation dictionary collection to include all language pairs using triangulation, and create new dictionaries for under-represented languages. Evaluating established methods over all these language pairs sheds light on their suitability for aligning embeddings from distant languages and presents new challenges for the field. Finally, in our analysis we identify general guidelines for strong cross-lingual embedding baselines, that extend to language pairs that do not include English.",cross-lingual embeddings|lexicon tagging|lexicon dictionaries|cross-lingual baselines,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.766.pdf -main.772,QuASE: Question-Answer Driven Sentence Encoding,Hangfeng He|Qiang Ning|Dan Roth,"Question-answering (QA) data often encodes essential information in many facets. This paper studies a natural question: Can we get supervision from QA data for other tasks (typically, non-QA ones)? For example, can we use QAMR (Michael et al., 2017) to improve named entity recognition? We suggest that simply further pre-training BERT is often not the best option, and propose the question-answer driven sentence encoding (QuASE) framework. QuASE learns representations from QA data, using BERT or other state-of-the-art contextual language models. In particular, we observe the need to distinguish between two types of sentence encodings, depending on whether the target task is a single- or multi-sentence input; in both cases, the resulting encoding is shown to be an easy-to-use plugin for many downstream tasks. 
This work may point out an alternative way to supervise NLP tasks.",named recognition|NLP tasks|QuASE|QAMR,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.772.pdf -main.148,Improving Massively Multilingual Neural Machine Translation and Zero-Shot Translation,Biao Zhang|Philip Williams|Ivan Titov|Rico Sennrich,"Massively multilingual models for neural machine translation (NMT) are theoretically attractive, but often underperform bilingual models and deliver poor zero-shot translations. In this paper, we explore ways to improve them. We argue that multilingual NMT requires stronger modeling capacity to support language pairs with varying typological characteristics, and overcome this bottleneck via language-specific components and deepening NMT architectures. We identify the off-target translation issue (i.e. translating into a wrong target language) as the major source of the inferior zero-shot performance, and propose random online backtranslation to enforce the translation of unseen training language pairs. Experiments on OPUS-100 (a novel multilingual dataset with 100 languages) show that our approach substantially narrows the performance gap with bilingual models in both one-to-many and many-to-many settings, and improves zero-shot performance by ~10 BLEU, approaching conventional pivot-based methods.",Massively Translation|Zero-Shot Translation|neural translation|NMT,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.148.pdf -main.174,Screenplay Summarization Using Latent Narrative Structure,Pinelopi Papalampidi|Frank Keller|Lea Frermann|Mirella Lapata,"Most general-purpose extractive summarization models are trained on news articles, which are short and present all important information upfront. As a result, such models are biased on position and often perform a smart selection of sentences from the beginning of the document. When summarizing long narratives, which have complex structure and present information piecemeal, simple position heuristics are not sufficient. In this paper, we propose to explicitly incorporate the underlying structure of narratives into general unsupervised and supervised extractive summarization models. We formalize narrative structure in terms of key narrative events (turning points) and treat it as latent in order to summarize screenplays (i.e., extract an optimal sequence of scenes). Experimental results on the CSI corpus of TV screenplays, which we augment with scene-level summarization labels, show that latent turning points correlate with important aspects of a CSI episode and improve summarization performance over general extractive algorithms leading to more complete and diverse summaries.",Screenplay Summarization|summarization|general-purpose models|position heuristics,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.174.pdf -main.612,Improving Entity Linking through Semantic Reinforced Entity Embeddings,Feng Hou|Ruili Wang|Jun He|Yi Zhou,"Entity embeddings, which represent different aspects of each entity with a single vector like word embeddings, are a key component of neural entity linking models. Existing entity embeddings are learned from canonical Wikipedia articles and local contexts surrounding target entities. Such entity embeddings are effective, but too distinctive for linking models to learn contextual commonality. 
We propose a simple yet effective method, FGS2EE, to inject fine-grained semantic information into entity embeddings to reduce the distinctiveness and facilitate the learning of contextual commonality. FGS2EE first uses the embeddings of semantic type words to generate semantic embeddings, and then combines them with existing entity embeddings through linear aggregation. Extensive experiments show the effectiveness of such embeddings. Based on our entity embeddings, we achieved new state-of-the-art performance on entity linking.",Entity Linking|learning commonality|Semantic Embeddings|Entity embeddings,Information Extraction,Short,https://www.aclweb.org/anthology/2020.acl-main.612.pdf -main.606,Semantic Parsing for English as a Second Language,Yuanyuan Zhao|Weiwei Sun|Junjie Cao|Xiaojun Wan,"This paper is concerned with semantic parsing for English as a second language (ESL). Motivated by the theoretical emphasis on the learning challenges that occur at the syntax-semantics interface during second language acquisition, we formulate the task based on the divergence between literal and intended meanings. We combine the complementary strengths of English Resource Grammar, a linguistically-precise hand-crafted deep grammar, and TLE, an existing manually annotated ESL UD-TreeBank with a novel reranking model. Experiments demonstrate that in comparison to human annotations, our method can obtain a very promising SemBanking quality. By means of the newly created corpus, we evaluate state-of-the-art semantic parsing as well as grammatical error correction models. The evaluation profiles the performance of neural NLP techniques for handling ESL data and suggests some research directions.",semantic parsing|second acquisition|Semantic Parsing|ESL,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.606.pdf -main.160,Overestimation of Syntactic Representation in Neural Language Models,Jordan Kodner|Nitish Gupta,"With the advent of powerful neural language models over the last few years, research attention has increasingly focused on what aspects of language they represent that make them so successful. Several testing methodologies have been developed to probe models' syntactic representations. One popular method for determining a model's ability to induce syntactic structure trains a model on strings generated according to a template then tests the model's ability to distinguish such strings from superficially similar ones with different syntax. We illustrate a fundamental problem with this approach by reproducing positive results from a recent paper with two non-syntactic baseline language models: an n-gram model and an LSTM model trained on scrambled inputs.",Overestimation Representation|Neural Models|models representations|non-syntactic models,Cognitive Modeling and Psycholinguistics,Short,https://www.aclweb.org/anthology/2020.acl-main.160.pdf -main.96,"Code-Switching Patterns Can Be an Effective Route to Improve Performance of Downstream NLP Applications: A Case Study of Humour, Sarcasm and Hate Speech Detection",Srijan Bansal|Vishal Garimella|Ayush Suhane|Jasabanta Patro|Animesh Mukherjee,"In this paper, we demonstrate how code-switching patterns can be utilised to improve various downstream NLP applications. In particular, we encode various switching features to improve humour, sarcasm and hate speech detection tasks. 
We believe that this simple linguistic observation can also be potentially helpful in improving other similar NLP applications.",Downstream Applications|hate tasks|speech tasks|NLP applications,Computational Social Science and Social Media,Short,https://www.aclweb.org/anthology/2020.acl-main.96.pdf -main.389,Modeling Word Formation in English–German Neural Machine Translation,Marion Weller-Di Marco|Alexander Fraser,"This paper studies strategies to model word formation in NMT using rich linguistic information, namely a word segmentation approach that goes beyond splitting into substrings by considering fusional morphology. Our linguistically sound segmentation is combined with a method for target-side inflection to accommodate modeling word formation. The best system variants employ source-side morphological analysis and model complex target-side words, improving over a standard system.",Word Formation|English Translation|NMT|word approach,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.389.pdf -main.82,Spelling Error Correction with Soft-Masked BERT,Shaohua Zhang|Haoran Huang|Jicong Liu|Hang Li,"Spelling error correction is an important yet challenging task because a satisfactory solution of it essentially needs human-level language understanding ability. Without loss of generality we consider Chinese spelling error correction (CSC) in this paper. A state-of-the-art method for the task selects a character from a list of candidates for correction (including non-correction) at each position of the sentence on the basis of BERT, the language representation model. The accuracy of the method can be sub-optimal, however, because BERT does not have sufficient capability to detect whether there is an error at each position, apparently due to the way of pre-training it using mask language modeling. In this work, we propose a novel neural architecture to address the aforementioned issue, which consists of a network for error detection and a network for error correction based on BERT, with the former being connected to the latter with what we call soft-masking technique. Our method of using `Soft-Masked BERT' is general, and it may be employed in other language detection-correction problems. Experimental results on two datasets, including one large dataset which we create and plan to release, demonstrate that the performance of our proposed method is significantly better than the baselines including the one solely based on BERT.",Spelling Correction|Chinese correction|Chinese CSC|error detection,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.82.pdf -main.55,Evaluating Dialogue Generation Systems via Response Selection,Shiki Sato|Reina Akama|Hiroki Ouchi|Jun Suzuki|Kentaro Inui,"Existing automatic evaluation metrics for open-domain dialogue response generation systems correlate poorly with human evaluation. We focus on evaluating response generation systems via response selection. To evaluate systems properly via response selection, we propose a method to construct response selection test sets with well-chosen false candidates. Specifically, we propose to construct test sets filtering out some types of false candidates: (i) those unrelated to the ground-truth response and (ii) those acceptable as appropriate responses. 
Through experiments, we demonstrate that evaluating systems via response selection with the test set developed by our method correlates more strongly with human evaluation, compared with widely used automatic evaluation metrics such as BLEU.",open-domain systems|human evaluation|Dialogue Systems|Response Selection,Dialogue and Interactive Systems,Short,https://www.aclweb.org/anthology/2020.acl-main.55.pdf -main.41,Norm-Based Curriculum Learning for Neural Machine Translation,Xuebo Liu|Houtim Lai|Derek F. Wong|Lidia S. Chao,"A neural machine translation (NMT) system is expensive to train, especially with high-resource settings. As the NMT architectures become deeper and wider, this issue gets worse and worse. In this paper, we aim to improve the efficiency of training an NMT by introducing a novel norm-based curriculum learning method. We use the norm (aka length or module) of a word embedding as a measure of 1) the difficulty of the sentence, 2) the competence of the model, and 3) the weight of the sentence. The norm-based sentence difficulty takes the advantages of both linguistically motivated and model-based sentence difficulties. It is easy to determine and contains learning-dependent features. The norm-based model competence makes NMT learn the curriculum in a fully automated way, while the norm-based sentence weight further enhances the learning of the vector representation of the NMT. Experimental results for the WMT'14 English-German and WMT'17 Chinese-English translation tasks demonstrate that the proposed method outperforms strong baselines in terms of BLEU score (+1.17/+1.56) and training speedup (2.22x/3.33x).",Neural Translation|norm-based difficulty|WMT'14 tasks|Norm-Based Learning,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.41.pdf -main.438,Learning Constraints for Structured Prediction Using Rectifier Networks,Xingyuan Pan|Maitrey Mehta|Vivek Srikumar,"Various natural language processing tasks are structured prediction problems where outputs are constructed with multiple interdependent decisions. Past work has shown that domain knowledge, framed as constraints over the output space, can help improve predictive accuracy. However, designing good constraints often relies on domain expertise. In this paper, we study the problem of learning such constraints. We frame the problem as that of training a two-layer rectifier network to identify valid structures or substructures, and show a construction for converting a trained network into a system of linear constraints over the inference variables. Our experiments on several NLP tasks show that the learned constraints can improve the prediction accuracy, especially when the number of training examples is small.",Structured Prediction|natural tasks|structured problems|construction,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.438.pdf -main.410,Clinical Reading Comprehension: A Thorough Analysis of the emrQA Dataset,Xiang Yue|Bernal Jimenez Gutierrez|Huan Sun,"Machine reading comprehension has made great progress in recent years owing to large-scale annotated datasets. In the clinical domain, however, creating such datasets is quite difficult due to the domain expertise required for annotation. Recently, Pampari et al. (EMNLP'18) tackled this issue by using expert-annotated question templates and existing i2b2 annotations to create emrQA, the first large-scale dataset for question answering (QA) based on clinical notes. 
In this paper, we provide an in-depth analysis of this dataset and the clinical reading comprehension (CliniRC) task. From our qualitative analysis, we find that (i) emrQA answers are often incomplete, and (ii) emrQA questions are often answerable without using domain knowledge. From our quantitative experiments, surprising results include that (iii) using a small sampled subset (5%-20%), we can obtain roughly equal performance compared to the model trained on the entire dataset, (iv) this performance is close to human expert's performance, and (v) BERT models do not beat the best performing base model. Following our analysis of the emrQA, we further explore two desired aspects of CliniRC systems: the ability to utilize clinical domain knowledge and to generalize to unseen questions and contexts. We argue that both should be considered when creating future datasets.",Clinical Comprehension|Machine comprehension|annotation|question answering,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.410.pdf -main.376,Enriched In-Order Linearization for Faster Sequence-to-Sequence Constituent Parsing,Daniel Fernández-González|Carlos Gómez-Rodríguez,"Sequence-to-sequence constituent parsing requires a linearization to represent trees as sequences. Top-down tree linearizations, which can be based on brackets or shift-reduce actions, have achieved the best accuracy to date. In this paper, we show that these results can be improved by using an in-order linearization instead. Based on this observation, we implement an enriched in-order shift-reduce linearization inspired by Vinyals et al. (2015)'s approach, achieving the best accuracy to date on the English PTB dataset among fully-supervised single-model sequence-to-sequence constituent parsers. Finally, we apply deterministic attention mechanisms to match the speed of state-of-the-art transition-based parsers, thus showing that sequence-to-sequence models can match them, not only in accuracy, but also in speed.",Sequence-to-sequence parsing|Enriched Linearization|Faster Parsing|Top-down linearizations,"Syntax: Tagging, Chunking and Parsing",Short,https://www.aclweb.org/anthology/2020.acl-main.376.pdf -main.69,Syn-QG: Syntactic and Shallow Semantic Rules for Question Generation,Kaustubh Dhole|Christopher D. Manning,"Question Generation (QG) is fundamentally a simple syntactic transformation; however, many aspects of semantics influence what questions are good to form. We implement this observation by developing Syn-QG, a set of transparent syntactic rules leveraging universal dependencies, shallow semantic parsing, lexical resources, and custom rules which transform declarative sentences into question-answer pairs. We utilize PropBank argument descriptions and VerbNet state predicates to incorporate shallow semantic content, which helps generate questions of a descriptive nature and produce inferential and semantically richer questions than existing systems. In order to improve syntactic fluency and eliminate grammatically incorrect questions, we employ back-translation over the output of these syntactic rules. 
A set of crowd-sourced evaluations shows that our system can generate a larger number of highly grammatical and relevant questions than previous QG systems and that back-translation drastically improves grammaticality at a slight cost of generating irrelevant questions.",Question Generation|syntactic transformation|crowd-sourced evaluations|generating questions,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.69.pdf -main.362,Graph-to-Tree Learning for Solving Math Word Problems,Jipeng Zhang|Lei Wang|Roy Ka-Wei Lee|Yi Bin|Yan Wang|Jie Shao|Ee-Peng Lim,"While the recent tree-based neural models have demonstrated promising results in generating solution expression for the math word problem (MWP), most of these models do not capture the relationships and order information among the quantities well. This results in poor quantity representations and incorrect solution expressions. In this paper, we propose Graph2Tree, a novel deep learning architecture that combines the merits of the graph-based encoder and tree-based decoder to generate better solution expressions. Included in our Graph2Tree framework are two graphs, namely the Quantity Cell Graph and Quantity Comparison Graph, which are designed to address limitations of existing methods by effectively representing the relationships and order information among the quantities in MWPs. We conduct extensive experiments on two available datasets. Our experiment results show that Graph2Tree outperforms the state-of-the-art baselines on two benchmark datasets significantly. We also discuss case studies and empirically examine Graph2Tree's effectiveness in translating the MWP text into solution expressions.",Solving Problems|Math Problems|math problem|quantity representations,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.362.pdf -main.404,Masking Actor Information Leads to Fairer Political Claims Detection,Erenay Dayanik|Sebastian Padó,"A central concern in Computational Social Sciences (CSS) is fairness: where the role of NLP is to scale up text analysis to large corpora, the quality of automatic analyses should be as independent as possible of textual properties. We analyze the performance of a state-of-the-art neural model on the task of political claims detection (i.e., the identification of forward-looking statements made by political actors) and identify a strong frequency bias: claims made by frequent actors are recognized better. We propose two simple debiasing methods which mask proper names and pronouns during training of the model, thus removing personal information bias. We find that (a) these methods significantly decrease frequency bias while keeping the overall performance stable; and (b) the resulting models improve when evaluated in an out-of-domain setting.",Masking Information|Fairer Detection|Computational Sciences|Computational,Computational Social Science and Social Media,Short,https://www.aclweb.org/anthology/2020.acl-main.404.pdf -main.374,SKEP: Sentiment Knowledge Enhanced Pre-training for Sentiment Analysis,Hao Tian|Can Gao|Xinyan Xiao|Hao Liu|Bolei He|Hua Wu|Haifeng Wang|Feng Wu,"Recently, sentiment analysis has seen remarkable advance with the help of pre-training approaches. However, sentiment knowledge, such as sentiment words and aspect-sentiment pairs, is ignored in the process of pre-training, despite the fact that they are widely used in traditional sentiment analysis approaches. 
In this paper, we introduce Sentiment Knowledge Enhanced Pre-training (SKEP) in order to learn a unified sentiment representation for multiple sentiment analysis tasks. With the help of automatically-mined knowledge, SKEP conducts sentiment masking and constructs three sentiment knowledge prediction objectives, so as to embed sentiment information at the word, polarity and aspect level into pre-trained sentiment representation. In particular, the prediction of aspect-sentiment pairs is converted into multi-label classification, aiming to capture the dependency between words in a pair. Experiments on three kinds of sentiment tasks show that SKEP significantly outperforms strong pre-training baseline, and achieves new state-of-the-art results on most of the test datasets. We release our code at https://github.com/baidu/Senta.",Sentiment Pre-training|Sentiment Analysis|pre-training|sentiment tasks,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.374.pdf -main.412,Improving Multi-hop Question Answering over Knowledge Graphs using Knowledge Base Embeddings,Apoorv Saxena|Aditay Tripathi|Partha Talukdar,"Knowledge Graphs (KG) are multi-relational graphs consisting of entities as nodes and relations among them as typed edges. Goal of the Question Answering over KG (KGQA) task is to answer natural language queries posed over the KG. Multi-hop KGQA requires reasoning over multiple edges of the KG to arrive at the right answer. KGs are often incomplete with many missing links, posing additional challenges for KGQA, especially for multi-hop KGQA. Recent research on multi-hop KGQA has attempted to handle KG sparsity using relevant external text, which isn't always readily available. In a separate line of research, KG embedding methods have been proposed to reduce KG sparsity by performing missing link prediction. Such KG embedding methods, even though highly relevant, have not been explored for multi-hop KGQA so far. We fill this gap in this paper and propose EmbedKGQA. EmbedKGQA is particularly effective in performing multi-hop KGQA over sparse KGs. EmbedKGQA also relaxes the requirement of answer selection from a pre-specified neighborhood, a sub-optimal constraint enforced by previous multi-hop KGQA methods. Through extensive experiments on multiple benchmark datasets, we demonstrate EmbedKGQA's effectiveness over other state-of-the-art baselines.",Multi-hop Answering|Question task|natural queries|multi-hop KGQA,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.412.pdf -main.406,"“Who said it, and Why?” Provenance for Natural Language Claims",Yi Zhang|Zachary Ives|Dan Roth,"In an era where generating content and publishing it is so easy, we are bombarded with information and are exposed to all kinds of claims, some of which do not always rank high on the truth scale. This paper suggests that the key to a longer-term, holistic, and systematic approach to navigating this information pollution is capturing the provenance of claims. To do that, we develop a formal definition of provenance graph for a given natural language claim, aiming to understand where the claim may come from and how it has evolved. To construct the graph, we model provenance inference, formulated mainly as an information extraction task and addressed via a textual entailment model. 
We evaluate our approach using two benchmark datasets, showing initial success in capturing the notion of provenance and its effectiveness on the application of claim verification.",Natural Claims|generating content|publishing|provenance inference,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.406.pdf -main.360,Successfully Applying the Stabilized Lottery Ticket Hypothesis to the Transformer Architecture,Christopher Brix|Parnia Bahar|Hermann Ney,"Sparse models require less memory for storage and enable a faster inference by reducing the necessary number of FLOPs. This is relevant both for time-critical and on-device computations using neural networks. The stabilized lottery ticket hypothesis states that networks can be pruned after none or few training iterations, using a mask computed based on the unpruned converged model. On the transformer architecture and the WMT 2014 English-to-German and English-to-French tasks, we show that stabilized lottery ticket pruning performs similar to magnitude pruning for sparsity levels of up to 85%, and propose a new combination of pruning techniques that outperforms all other techniques for even higher levels of sparsity. Furthermore, we confirm that the parameter's initial sign and not its specific value is the primary factor for successful training, and show that magnitude pruning cannot be used to find winning lottery tickets.",inference|time-critical computations|transformer architecture|WMT tasks,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.360.pdf -main.57,Learning Low-Resource End-To-End Goal-Oriented Dialog for Fast and Reliable System Deployment,Yinpei Dai|Hangyu Li|Chengguang Tang|Yongbin Li|Jian Sun|Xiaodan Zhu,"Existing end-to-end dialog systems perform less effectively when data is scarce. To obtain an acceptable success in real-life online services with only a handful of training examples, both fast adaptability and reliable performance are highly desirable for dialog systems. In this paper, we propose the Meta-Dialog System (MDS), which combines the advantages of both meta-learning approaches and human-machine collaboration. We evaluate our methods on a new extended-bAbI dataset and a transformed MultiWOZ dataset for low-resource goal-oriented dialog learning. Experimental results show that MDS significantly outperforms non-meta-learning baselines and can achieve more than 90% per-turn accuracies with only 10 dialogs on the extended-bAbI dataset.",real-life services|dialog systems|human-machine collaboration|low-resource learning,Dialogue and Interactive Systems,Short,https://www.aclweb.org/anthology/2020.acl-main.57.pdf -main.348,Meta-Transfer Learning for Code-Switched Speech Recognition,Genta Indra Winata|Samuel Cahyawijaya|Zhaojiang Lin|Zihan Liu|Peng Xu|Pascale Fung,"An increasing number of people in the world today speak a mixed-language as a result of being multilingual. However, building a speech recognition system for code-switching remains difficult due to the availability of limited resources and the expense and significant effort required to collect mixed-language data. We therefore propose a new learning method, meta-transfer learning, to transfer learn on a code-switched speech recognition system in a low-resource setting by judiciously extracting information from high-resource monolingual datasets. 
Our model learns to recognize individual languages, and transfer them so as to better recognize mixed-language speech by conditioning the optimization on the code-switching data. Based on experimental results, our model outperforms existing baselines on speech recognition and language modeling tasks, and is faster to converge.",Code-Switched Recognition|speech recognition|speech tasks|language tasks,Speech and Multimodality,Short,https://www.aclweb.org/anthology/2020.acl-main.348.pdf -main.43,A Formal Hierarchy of RNN Architectures,William Merrill|Gail Weiss|Yoav Goldberg|Roy Schwartz|Noah A. Smith|Eran Yahav,"We develop a formal hierarchy of the expressive capacity of RNN architectures. The hierarchy is based on two formal properties: space complexity, which measures the RNN's memory, and rational recurrence, defined as whether the recurrent update can be described by a weighted finite-state machine. We place several RNN variants within this hierarchy. For example, we prove the LSTM is not rational, which formally separates it from the related QRNN (Bradbury et al., 2016). We also show how these models' expressive capacity is expanded by stacking multiple layers or composing them with different pooling functions. Our results build on the theory of ``saturated"" RNNs (Merrill, 2019). While formally extending these findings to unsaturated RNNs is left to future work, we hypothesize that the practical learnable capacity of unsaturated RNNs obeys a similar hierarchy. We provide empirical results to support this conjecture. Experimental findings from training unsaturated networks on formal languages support this conjecture.",Formal Architectures|RNN architectures|weighted machine|LSTM,Theory and Formalism in NLP (Linguistic and Mathematical),Long,https://www.aclweb.org/anthology/2020.acl-main.43.pdf -main.94,Revisiting the Context Window for Cross-lingual Word Embeddings,Ryokan Ri|Yoshimasa Tsuruoka,"Existing approaches to mapping-based cross-lingual word embeddings are based on the assumption that the source and target embedding spaces are structurally similar. The structures of embedding spaces largely depend on the co-occurrence statistics of each word, which the choice of context window determines. Despite this obvious connection between the context window and mapping-based cross-lingual embeddings, their relationship has been underexplored in prior work. In this work, we provide a thorough evaluation, in various languages, domains, and tasks, of bilingual embeddings trained with different context windows. The highlight of our findings is that increasing the size of both the source and target window sizes improves the performance of bilingual lexicon induction, especially the performance on frequent nouns.",Cross-lingual Embeddings|mapping-based embeddings|bilingual induction|mapping-based embeddings,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.94.pdf -main.80,Modeling Code-Switch Languages Using Bilingual Parallel Corpus,Grandee Lee|Haizhou Li,"Language modeling is the technique to estimate the probability of a sequence of words. A bilingual language model is expected to model the sequential dependency for words across languages, which is difficult due to the inherent lack of suitable training data as well as diverse syntactic structure across languages. 
We propose a bilingual attention language model (BALM) that simultaneously performs language modeling objective with a quasi-translation objective to model both the monolingual as well as the cross-lingual sequential dependency. The attention mechanism learns the bilingual context from a parallel corpus. BALM achieves state-of-the-art performance on the SEAME code-switch database by reducing the perplexity of 20.5% over the best-reported result. We also apply BALM in bilingual lexicon induction, and language normalization tasks to validate the idea.",Modeling Languages|bilingual induction|language tasks|Language modeling,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.80.pdf -main.610,A Two-Stage Masked LM Method for Term Set Expansion,Guy Kushilevitz|Shaul Markovitch|Yoav Goldberg,"We tackle the task of Term Set Expansion (TSE): given a small seed set of example terms from a semantic class, finding more members of that class. The task is of great practical utility, and also of theoretical utility as it requires generalization from few examples. Previous approaches to the TSE task can be characterized as either distributional or pattern-based. We harness the power of neural masked language models (MLM) and propose a novel TSE algorithm, which combines the pattern-based and distributional approaches. Due to the small size of the seed set, fine-tuning methods are not effective, calling for more creative use of the MLM. The gist of the idea is to use the MLM to first mine for informative patterns with respect to the seed set, and then to obtain more members of the seed class by generalizing these patterns. Our method outperforms state-of-the-art TSE algorithms. Implementation is available at: https://github.com/ guykush/TermSetExpansion-MPB/",Term Expansion|TSE task|Two-Stage Method|TSE,Information Extraction,Short,https://www.aclweb.org/anthology/2020.acl-main.610.pdf -main.176,A Tale of Two Perplexities: Sensitivity of Neural Language Models to Lexical Retrieval Deficits in Dementia of the Alzheimer’s Type,Trevor Cohen|Serguei Pakhomov,"In recent years there has been a burgeoning interest in the use of computational methods to distinguish between elicited speech samples produced by patients with dementia, and those from healthy controls. The difference between perplexity estimates from two neural language models (LMs) - one trained on transcripts of speech produced by healthy participants and one trained on those with dementia - as a single feature for diagnostic classification of unseen transcripts has been shown to produce state-of-the-art performance. However, little is known about why this approach is effective, and on account of the lack of case/control matching in the most widely-used evaluation set of transcripts (DementiaBank), it is unclear if these approaches are truly diagnostic, or are sensitive to other variables. In this paper, we interrogate neural LMs trained on participants with and without dementia by using synthetic narratives previously developed to simulate progressive semantic dementia by manipulating lexical frequency. 
We find that perplexity of neural LMs is strongly and differentially associated with lexical frequency, and that using a mixture model resulting from interpolating control and dementia LMs improves upon the current state-of-the-art for models trained on transcript text exclusively.",Lexical Deficits|diagnostic classification|Neural Models|computational methods,Cognitive Modeling and Psycholinguistics,Long,https://www.aclweb.org/anthology/2020.acl-main.176.pdf -main.162,You Don't Have Time to Read This: An Exploration of Document Reading Time Prediction,Orion Weller|Jordan Hildebrandt|Ilya Reznik|Christopher Challis|E. Shannon Tass|Quinn Snell|Kevin Seppi,"Predicting reading time has been a subject of much previous work, focusing on how different words affect human processing, measured by reading time. However, previous work has dealt with a limited number of participants as well as word level only predictions (i.e. predicting the time to read a single word). We seek to extend these works by examining whether or not document level predictions are effective, given additional information such as subject matter, font characteristics, and readability metrics. We perform a novel experiment to examine how different features of text contribute to the time it takes to read, distributing and collecting data from over a thousand participants. We then employ a large number of machine learning methods to predict a user's reading time. We find that despite extensive research showing that word level reading time can be most effectively predicted by neural networks, larger scale text can be easily and most accurately predicted by one factor, the number of words.",Exploration Prediction|Predicting time|human processing|machine methods,Cognitive Modeling and Psycholinguistics,Short,https://www.aclweb.org/anthology/2020.acl-main.162.pdf -main.604,RikiNet: Reading Wikipedia Pages for Natural Question Answering,Dayiheng Liu|Yeyun Gong|Jie Fu|Yu Yan|Jiusheng Chen|Daxin Jiang|Jiancheng Lv|Nan Duan,"Reading long documents to answer open-domain questions remains challenging in natural language understanding. In this paper, we introduce a new model, called RikiNet, which reads Wikipedia pages for natural question answering. RikiNet contains a dynamic paragraph dual-attention reader and a multi-level cascaded answer predictor. The reader dynamically represents the document and question by utilizing a set of complementary attention mechanisms. The representations are then fed into the predictor to obtain the span of the short answer, the paragraph of the long answer, and the answer type in a cascaded manner. On the Natural Questions (NQ) dataset, a single RikiNet achieves 74.3 F1 and 57.9 F1 on long-answer and short-answer tasks. To our best knowledge, it is the first single model that outperforms the single human performance. Furthermore, an ensemble RikiNet obtains 76.1 F1 and 61.3 F1 on long-answer and short-answer tasks, achieving the best performance on the official NQ leaderboard.",Natural Answering|natural understanding|long-answer tasks|RikiNet,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.604.pdf -main.638,Multi-Domain Dialogue Acts and Response Co-Generation,Kai Wang|Junfeng Tian|Rui Wang|Xiaojun Quan|Jianxing Yu,"Generating fluent and informative responses is of critical importance for task-oriented dialogue systems. Existing pipeline approaches generally predict multiple dialogue acts first and use them to assist response generation. 
There are at least two shortcomings with such approaches. First, the inherent structures of multi-domain dialogue acts are neglected. Second, the semantic associations between acts and responses are not taken into account for response generation. To address these issues, we propose a neural co-generation model that generates dialogue acts and responses concurrently. Unlike those pipeline approaches, our act generation module preserves the semantic structures of multi-domain dialogue acts and our response generation module dynamically attends to different acts as needed. We train the two modules jointly using an uncertainty loss to adjust their task weights adaptively. Extensive experiments are conducted on the large-scale MultiWOZ dataset and the results show that our model achieves very favorable improvement over several state-of-the-art models in both automatic and human evaluations.",Generating responses|task-oriented systems|response generation|automatic evaluations,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.638.pdf -main.189,Active Imitation Learning with Noisy Guidance,Kianté Brantley|Hal Daumé III|Amr Sharaf,"Imitation learning algorithms provide state-of-the-art results on many structured prediction tasks by learning near-optimal search policies. Such algorithms assume training-time access to an expert that can provide the optimal action at any queried state; unfortunately, the number of such queries is often prohibitive, frequently rendering these approaches impractical. To combat this query complexity, we consider an active learning setting in which the learning algorithm has additional access to a much cheaper noisy heuristic that provides noisy guidance. Our algorithm, LEAQI, learns a difference classifier that predicts when the expert is likely to disagree with the heuristic, and queries the expert only when necessary. We apply LEAQI to three sequence labelling tasks, demonstrating significantly fewer queries to the expert and comparable (or better) accuracies over a passive approach.",Active Learning|structured tasks|sequence tasks|Imitation algorithms,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.189.pdf -main.764,Predicting Performance for Natural Language Processing Tasks,Mengzhou Xia|Antonios Anastasopoulos|Ruochen Xu|Yiming Yang|Graham Neubig,"Given the complexity of combinations of tasks, languages, and domains in natural language processing (NLP) research, it is computationally prohibitive to exhaustively test newly proposed models on each possible experimental setting. In this work, we attempt to explore the possibility of gaining plausible judgments of how well an NLP model can perform under an experimental setting, without actually training or testing the model. To do so, we build regression models to predict the evaluation score of an NLP experiment given the experimental settings as input. Experimenting on 9 different NLP tasks, we find that our predictors can produce meaningful predictions over unseen languages and different modeling architectures, outperforming reasonable baselines as well as human experts. Going further, we outline how our predictor can be used to find a small subset of representative experiments that should be run in order to obtain plausible predictions for all other experimental settings. 
(Code, data and logs are publicly available at https://github.com/xiamengzhou/NLPerf.)",Natural Tasks|natural research|NLP research|NLP tasks,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.764.pdf -main.770,Mind the Trade-off: Debiasing NLU Models without Degrading the In-distribution Performance,Prasetya Ajie Utama|Nafise Sadat Moosavi|Iryna Gurevych,"Models for natural language understanding (NLU) tasks often rely on the idiosyncratic biases of the dataset, which make them brittle against test cases outside the training distribution. Recently, several proposed debiasing methods are shown to be very effective in improving out-of-distribution performance. However, their improvements come at the expense of performance drop when models are evaluated on the in-distribution data, which contain examples with higher diversity. This seemingly inevitable trade-off may not tell us much about the changes in the reasoning and understanding capabilities of the resulting models on broader types of examples beyond the small subset represented in the out-of-distribution data. In this paper, we address this trade-off by introducing a novel debiasing method, called confidence regularization, which discourages models from exploiting biases while enabling them to receive enough incentive to learn from all the training examples. We evaluate our method on three NLU tasks and show that, in contrast to its predecessors, it improves the performance on out-of-distribution datasets (e.g., 7pp gain on HANS dataset) while maintaining the original in-distribution accuracy.",Debiasing Models|natural tasks|NLU tasks|debiasing methods,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.770.pdf -main.758,A Multi-Perspective Architecture for Semantic Code Search,Rajarshi Haldar|Lingfei Wu|JinJun Xiong|Julia Hockenmaier,"The ability to match pieces of code to their corresponding natural language descriptions and vice versa is fundamental for natural language search interfaces to software repositories. In this paper, we propose a novel multi-perspective cross-lingual neural framework for code-text matching, inspired in part by a previous model for monolingual text-to-text matching, to capture both global and local similarities. Our experiments on the CoNaLa dataset show that our proposed model yields better performance on this cross-lingual text-to-code matching task than previous approaches that map code and text to a single joint embedding space.",Semantic Search|code matching|monolingual matching|cross-lingual task,NLP Applications,Short,https://www.aclweb.org/anthology/2020.acl-main.758.pdf -main.200,To Pretrain or Not to Pretrain: Examining the Benefits of Pretrainng on Resource Rich Tasks,Sinong Wang|Madian Khabsa|Hao Ma,"Pretraining NLP models with variants of Masked Language Model (MLM) objectives has recently led to significant improvements on many tasks. This paper examines the benefits of pretrained models as a function of the number of training samples used in the downstream task. On several text classification tasks, we show that as the number of training examples grows into the millions, the accuracy gap between finetuning BERT-based model and training vanilla LSTM from scratch narrows to within 1%. 
Our findings indicate that MLM-based models might reach a diminishing return point as the supervised data size increases significantly.",text tasks|Pretrainng|Pretraining models|NLP models,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.200.pdf -main.566,Learning Efficient Dialogue Policy from Demonstrations through Shaping,Huimin Wang|Baolin Peng|Kam-Fai Wong,"Training a task-oriented dialogue agent with reinforcement learning is prohibitively expensive since it requires a large volume of interactions with users. Human demonstrations can be used to accelerate learning progress. However, how to effectively leverage demonstrations to learn dialogue policy remains less explored. In this paper, we present S^2Agent that efficiently learns dialogue policy from demonstrations through policy shaping and reward shaping. We use an imitation model to distill knowledge from demonstrations, based on which policy shaping estimates feedback on how the agent should act in policy space. Reward shaping is then incorporated to bonus state-actions similar to demonstrations explicitly in value space encouraging better exploration. The effectiveness of the proposed S^2Agent is demonstrated in three dialogue domains and a challenging domain adaptation task with both user simulator evaluation and human evaluation.",Demonstrations|learning progress|domain task|human evaluation,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.566.pdf -main.572,Connecting Embeddings for Knowledge Graph Entity Typing,Yu Zhao|anxiang zhang|Ruobing Xie|Kang Liu|Xiaojie WANG,"Knowledge graph (KG) entity typing aims at inferring possible missing entity type instances in KG, which is a very significant but still under-explored subtask of knowledge graph completion. In this paper, we propose a novel approach for KG entity typing which is trained by jointly utilizing local typing knowledge from existing entity type assertions and global triple knowledge in KGs. Specifically, we present two distinct knowledge-driven effective mechanisms of entity type inference. Accordingly, we build two novel embedding models to realize the mechanisms. Afterward, a joint model via connecting them is used to infer missing entity type instances, which favors inferences that agree with both entity type instances and triple knowledge in KGs. Experimental results on two real-world datasets (Freebase and YAGO) demonstrate the effectiveness of our proposed mechanisms and models for improving KG entity typing. The source code and data of this paper can be obtained from: https://github.com/Adam1679/ConnectE .",Connecting Embeddings|Knowledge Typing|knowledge completion|KG typing,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.572.pdf -main.214,Integrating Multimodal Information in Large Pretrained Transformers,Wasifur Rahman|Md Kamrul Hasan|Sangwu Lee|AmirAli Bagher Zadeh|Chengfeng Mao|Louis-Philippe Morency|Ehsan Hoque,"Recent Transformer-based contextual word representations, including BERT and XLNet, have shown state-of-the-art performance in multiple disciplines within NLP. Fine-tuning the trained contextual models on task-specific datasets has been the key to achieving superior performance downstream. While fine-tuning these pre-trained models is straightforward for lexical applications (applications with only language modality), it is not trivial for multimodal language (a growing area in NLP focused on modeling face-to-face communication). 
More specifically, this is due to the fact that pre-trained models don't have the necessary components to accept two extra modalities of vision and acoustic. In this paper, we proposed an attachment to BERT and XLNet called Multimodal Adaptation Gate (MAG). MAG allows BERT and XLNet to accept multimodal nonverbal data during fine-tuning. It does so by generating a shift to internal representation of BERT and XLNet; a shift that is conditioned on the visual and acoustic modalities. In our experiments, we study the commonly used CMU-MOSI and CMU-MOSEI datasets for multimodal sentiment analysis. Fine-tuning MAG-BERT and MAG-XLNet significantly boosts the sentiment analysis performance over previous baselines as well as language-only fine-tuning of BERT and XLNet. On the CMU-MOSI dataset, MAG-XLNet achieves human-level multimodal sentiment analysis performance for the first time in the NLP community.",NLP|lexical applications|modeling communication|multimodal analysis,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.acl-main.214.pdf -main.228,Simple and Effective Retrieve-Edit-Rerank Text Generation,Nabil Hossain|Marjan Ghazvininejad|Luke Zettlemoyer,"Retrieve-and-edit seq2seq methods typically retrieve an output from the training set and learn a model to edit it to produce the final output. We propose to extend this framework with a simple and effective post-generation ranking approach. Our framework (i) retrieves several potentially relevant outputs for each input, (ii) edits each candidate independently, and (iii) re-ranks the edited candidates to select the final output. We use a standard editing model with simple task-specific re-ranking approaches, and we show empirically that this approach outperforms existing, significantly more complex methodologies. Experiments on two machine translation (MT) datasets show new state-of-art results. We also achieve near state-of-art performance on the Gigaword summarization dataset, where our analyses show that there is significant room for performance improvement with better candidate output selection in future work.",Retrieve-Edit-Rerank Generation|candidate selection|Retrieve-and-edit methods|post-generation approach,Generation,Short,https://www.aclweb.org/anthology/2020.acl-main.228.pdf -main.599,Document Modeling with Graph Attention Networks for Multi-grained Machine Reading Comprehension,Bo Zheng|Haoyang Wen|Yaobo Liang|Nan Duan|Wanxiang Che|Daxin Jiang|Ming Zhou|Ting Liu,"Natural Questions is a new challenging machine reading comprehension benchmark with two-grained answers, which are a long answer (typically a paragraph) and a short answer (one or more entities inside the long answer). Despite the effectiveness of existing methods on this benchmark, they treat these two sub-tasks individually during training while ignoring their dependencies. To address this issue, we present a novel multi-grained machine reading comprehension framework that focuses on modeling documents at their hierarchical nature, which are different levels of granularity: documents, paragraphs, sentences, and tokens. We utilize graph attention networks to obtain different levels of representations so that they can be learned simultaneously. The long and short answers can be extracted from paragraph-level representation and token-level representation, respectively. In this way, we can model the dependencies between the two-grained answers to provide evidence for each other. 
We jointly train the two sub-tasks, and our experiments show that our approach significantly outperforms previous systems at both long and short answer criteria.",Document Modeling|Multi-grained Comprehension|machine comprehension|Graph Networks,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.599.pdf -main.598,Unsupervised Morphological Paradigm Completion,Huiming Jin|Liwei Cai|Yihui Peng|Chen Xia|Arya McCarthy|Katharina Kann,"We propose the task of unsupervised morphological paradigm completion. Given only raw text and a lemma list, the task consists of generating the morphological paradigms, i.e., all inflected forms, of the lemmas. From a natural language processing (NLP) perspective, this is a challenging unsupervised task, and high-performing systems have the potential to improve tools for low-resource languages or to assist linguistic annotators. From a cognitive science perspective, this can shed light on how children acquire morphological knowledge. We further introduce a system for the task, which generates morphological paradigms via the following steps: (i) EDIT TREE retrieval, (ii) additional lemma retrieval, (iii) paradigm size discovery, and (iv) inflection generation. We perform an evaluation on 14 typologically diverse languages. Our system outperforms trivial baselines with ease and, for some languages, even obtains a higher accuracy than minimally supervised systems.",unsupervised completion|unsupervised task|linguistic annotators|EDIT retrieval,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.acl-main.598.pdf -main.229,BabyWalk: Going Farther in Vision-and-Language Navigation by Taking Baby Steps,Wang Zhu|Hexiang Hu|Jiacheng Chen|Zhiwei Deng|Vihan Jain|Eugene Ie|Fei Sha,"Learning to follow instructions is of fundamental importance to autonomous agents for vision-and-language navigation (VLN). In this paper, we study how an agent can navigate long paths when learning from a corpus that consists of shorter ones. We show that existing state-of-the-art agents do not generalize well. To this end, we propose BabyWalk, a new VLN agent that is learned to navigate by decomposing long instructions into shorter ones (BabySteps) and completing them sequentially. A special design memory buffer is used by the agent to turn its past experiences into contexts for future steps. The learning process is composed of two phases. In the first phase, the agent uses imitation learning from demonstration to accomplish BabySteps. In the second phase, the agent uses curriculum-based reinforcement learning to maximize rewards on navigation tasks with increasingly longer instructions. We create two new benchmark datasets (of long navigation tasks) and use them in conjunction with existing ones to examine BabyWalk's generalization ability. Empirical results show that BabyWalk achieves state-of-the-art results on several metrics, in particular, is able to follow long instructions better. 
The codes and the datasets are released on our project page: https://github.com/Sha-Lab/babywalk.",Vision-and-Language Navigation|vision-and-language VLN|VLN|navigation tasks,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.229.pdf -main.573,Continual Relation Learning via Episodic Memory Activation and Reconsolidation,Xu Han|Yi Dai|Tianyu Gao|Yankai Lin|Zhiyuan Liu|Peng Li|Maosong Sun|Jie Zhou,"Continual relation learning aims to continually train a model on new data to learn incessantly emerging novel relations while avoiding catastrophically forgetting old relations. Some pioneering work has proved that storing a handful of historical relation examples in episodic memory and replaying them in subsequent training is an effective solution for such a challenging problem. However, these memory-based methods usually suffer from overfitting the few memorized examples of old relations, which may gradually cause inevitable confusion among existing relations. Inspired by the mechanism in human long-term memory formation, we introduce episodic memory activation and reconsolidation (EMAR) to continual relation learning. Every time neural models are activated to learn both new and memorized data, EMAR utilizes relation prototypes for memory reconsolidation exercise to keep a stable understanding of old relations. The experimental results show that EMAR could get rid of catastrophically forgetting old relations and outperform the state-of-the-art continual learning models.",Continual Learning|Reconsolidation|human formation|memory exercise,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.573.pdf -main.215,MultiQT: Multimodal learning for real-time question tracking in speech,Jakob D. Havtorn|Jan Latko|Joakim Edin|Lars Maaløe|Lasse Borgholt|Lorenzo Belgrano|Nicolai Jacobsen|Regitze Sdun|Željko Agić,"We address a challenging and practical task of labeling questions in speech in real time during telephone calls to emergency medical services in English, which embeds within a broader decision support system for emergency call-takers. We propose a novel multimodal approach to real-time sequence labeling in speech. Our model treats speech and its own textual representation as two separate modalities or views, as it jointly learns from streamed audio and its noisy transcription into text via automatic speech recognition. Our results show significant gains of jointly learning from the two modalities when compared to text or audio only, under adverse noise and limited volume of training data. The results generalize to medical symptoms detection where we observe a similar pattern of improvements with multimodal learning.",real-time speech|labeling speech|emergency services|real-time labeling,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.acl-main.215.pdf -main.201,Why Overfitting Isn't Always Bad: Retrofitting Cross-Lingual Word Embeddings to Dictionaries,Mozhi Zhang|Yoshinari Fujinuma|Michael J. Paul|Jordan Boyd-Graber,"Cross-lingual word embeddings (CLWE) are often evaluated on bilingual lexicon induction (BLI). Recent CLWE methods use linear projections, which underfit the training dictionary, to generalize on BLI. However, underfitting can hinder generalization to other downstream tasks that rely on words from the training dictionary. 
We address this limitation by retrofitting CLWE to the training dictionary, which pulls training translation pairs closer in the embedding space and overfits the training dictionary. This simple post-processing step often improves accuracy on two downstream tasks, despite lowering BLI test accuracy. We also retrofit to both the training dictionary and a synthetic dictionary induced from CLWE, which sometimes generalizes even better on downstream tasks. Our results confirm the importance of fully exploiting training dictionary in downstream tasks and explains why BLI is a flawed CLWE evaluation.",Dictionaries|BLI|generalization|downstream tasks,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.201.pdf -main.567,SAS: Dialogue State Tracking via Slot Attention and Slot Information Sharing,Jiaying Hu|Yan Yang|Chencai Chen|liang he|Zhou Yu,"Dialogue state tracker is responsible for inferring user intentions through dialogue history. Previous methods have difficulties in handling dialogues with long interaction context, due to the excessive information. We propose a Dialogue State Tracker with Slot Attention and Slot Information Sharing (SAS) to reduce redundant information’s interference and improve long dialogue context tracking. Specially, we first apply a Slot Attention to learn a set of slot-specific features from the original dialogue and then integrate them using a slot information sharing module. Our model yields a significantly improved performance compared to previous state-of-the-art models on the MultiWOZ dataset.",Dialogue Tracking|long tracking|SAS|Slot Sharing,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.567.pdf -main.759,Automated Topical Component Extraction Using Neural Network Attention Scores from Source-based Essay Scoring,Haoran Zhang|Diane Litman,"While automated essay scoring (AES) can reliably grade essays at scale, automated writing evaluation (AWE) additionally provides formative feedback to guide essay revision. However, a neural AES typically does not provide useful feature representations for supporting AWE. This paper presents a method for linking AWE and neural AES, by extracting Topical Components (TCs) representing evidence from a source text using the intermediate output of attention layers. We evaluate performance using a feature-based AES requiring TCs. Results show that performance is comparable whether using automatically or manually constructed TCs for 1) representing essays as rubric-based features, 2) grading essays.",Automated Extraction|automated evaluation|essay revision|AWE,NLP Applications,Short,https://www.aclweb.org/anthology/2020.acl-main.759.pdf -main.771,NILE : Natural Language Inference with Faithful Natural Language Explanations,Sawan Kumar|Partha Talukdar,"The recent growth in the popularity and success of deep learning models on NLP classification tasks has accompanied the need for generating some form of natural language explanation of the predicted labels. Such generated natural language (NL) explanations are expected to be faithful, i.e., they should correlate well with the model's internal decision making. In this work, we focus on the task of natural language inference (NLI) and address the following question: can we build NLI systems which produce labels with high accuracy, while also generating faithful explanations of its decisions? 
We propose Natural-language Inference over Label-specific Explanations (NILE), a novel NLI method which utilizes auto-generated label-specific NL explanations to produce labels along with its faithful explanation. We demonstrate NILE's effectiveness over previously reported methods through automated and human evaluation of the produced labels and explanations. Our evaluation of NILE also supports the claim that accurate systems capable of providing testable explanations of their decisions can be designed. We discuss the faithfulness of NILE's explanations in terms of sensitivity of the decisions to the corresponding explanations. We argue that explicit evaluation of faithfulness, in addition to label and explanation accuracy, is an important step in evaluating model's explanations. Further, we demonstrate that task-specific probes are necessary to establish such sensitivity.",Natural Inference|NLP tasks|internal making|NLI,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.771.pdf -main.765,ScriptWriter: Narrative-Guided Script Generation,Yutao Zhu|Ruihua Song|Zhicheng Dou|Jian-Yun Nie|Jin Zhou,"It is appealing to have a system that generates a story or scripts automatically from a storyline, even though this is still out of our reach. In dialogue systems, it would also be useful to drive dialogues by a dialogue plan. In this paper, we address a key problem involved in these applications - guiding a dialogue by a narrative. The proposed model ScriptWriter selects the best response among the candidates that fit the context as well as the given narrative. It keeps track of what in the narrative has been said and what is to be said. A narrative plays a different role than the context (i.e., previous utterances), which is generally used in current dialogue systems. Due to the unavailability of data for this new application, we construct a new large-scale data collection GraphMovie from a movie website where end-users can upload their narratives freely when watching a movie. Experimental results on the dataset show that our proposed approach based on narratives significantly outperforms the baselines that simply use the narrative as a kind of context.",Narrative-Guided Generation|dialogue systems|ScriptWriter|model ScriptWriter,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.765.pdf -main.188,Calibrating Structured Output Predictors for Natural Language Processing,Abhyuday Jagannatha|Hong Yu,"We address the problem of calibrating prediction confidence for output entities of interest in natural language processing (NLP) applications. It is important that NLP applications such as named entity recognition and question answering produce calibrated confidence scores for their predictions, especially if the applications are to be deployed in a safety-critical domain such as healthcare. However the output space of such structured prediction models are often too large to directly adapt binary or multi-class calibration methods. In this study, we propose a general calibration scheme for output entities of interest in neural network based structured prediction models. Our proposed method can be used with any binary class calibration scheme and a neural network model. Additionally, we show that our calibration method can also be used as an uncertainty-aware, entity-specific decoding step to improve the performance of the underlying model at no additional training cost or data requirements. 
We show that our method outperforms current calibration techniques for Named Entity Recognition, Part-of-speech tagging and Question Answering systems. We also observe an improvement in model performance from our decoding step across several tasks and benchmark datasets. Our method improves the calibration and model performance on out-of-domain test scenarios as well.",Natural Processing|natural applications|NLP applications|named recognition,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.188.pdf -main.639,Exploring Contextual Word-level Style Relevance for Unsupervised Style Transfer,Chulun Zhou|Liangyu Chen|Jiachen Liu|Xinyan Xiao|Jinsong Su|Sheng Guo|Hua Wu,"Unsupervised style transfer aims to change the style of an input sentence while preserving its original content without using parallel training data. In current dominant approaches, owing to the lack of fine-grained control on the influence from the target style, they are unable to yield desirable output sentences. In this paper, we propose a novel attentional sequence-to-sequence (Seq2seq) model that dynamically exploits the relevance of each output word to the target style for unsupervised style transfer. Specifically, we first pretrain a style classifier, where the relevance of each input word to the original style can be quantified via layer-wise relevance propagation. In a denoising auto-encoding manner, we train an attentional Seq2seq model to reconstruct input sentences and repredict word-level previously-quantified style relevance simultaneously. In this way, this model is endowed with the ability to automatically predict the style relevance of each output word. Then, we equip the decoder of this model with a neural style component to exploit the predicted wordlevel style relevance for better style transfer. Particularly, we fine-tune this model using a carefully-designed objective function involving style transfer, style relevance consistency, content preservation and fluency modeling loss terms. Experimental results show that our proposed model achieves state-of-the-art performance in terms of both transfer accuracy and content preservation.",Exploring Relevance|Contextual Relevance|Unsupervised Transfer|style transfer,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.639.pdf -main.163,A Generative Model for Joint Natural Language Understanding and Generation,Bo-Hsiang Tseng|Jianpeng Cheng|Yimai Fang|David Vandyke,"Natural language understanding (NLU) and natural language generation (NLG) are two fundamental and related tasks in building task-oriented dialogue systems with opposite objectives: NLU tackles the transformation from natural language to formal representations, whereas NLG does the reverse. A key to success in either task is parallel training data which is expensive to obtain at a large scale. In this work, we propose a generative model which couples NLU and NLG through a shared latent variable. This approach allows us to explore both spaces of natural language and formal representations, and facilitates information sharing through the latent space to eventually benefit NLU and NLG. Our model achieves state-of-the-art performance on two dialogue datasets with both flat and tree-structured formal representations. 
We also show that the model can be trained in a semi-supervised fashion by utilising unlabelled data to boost its performance.",Joint Generation|Natural understanding|natural generation|NLG,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.163.pdf -main.605,Parsing into Variable-in-situ Logico-Semantic Graphs,Yufei Chen|Weiwei Sun,"We propose variable-in-situ logico-semantic graphs to bridge the gap between semantic graph and logical form parsing. The new type of graph-based meaning representation allows us to include analysis for scope-related phenomena, such as quantification, negation and modality, in a way that is consistent with the state-of-the-art underspecification approach. Moreover, the well-formedness of such a graph is clear, since model-theoretic interpretation is available. We demonstrate the effectiveness of this new perspective by developing a new state-of-the-art semantic parser for English Resource Semantics. At the core of this parser is a novel neural graph rewriting system which combines the strengths of Hyperedge Replacement Grammar, a knowledge-intensive model, and Graph Neural Networks, a data-intensive model. Our parser achieves an accuracy of 92.39% in terms of elementary dependency match, which is a 2.88 point improvement over the best data-driven model in the literature. The output of our parser is highly coherent: at least 91% graphs are valid, in that they allow at least one sound scope-resolved logical form.",logical parsing|Parsing|variable-in-situ graphs|graph-based representation,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.605.pdf -main.611,FLAT: Chinese NER Using Flat-Lattice Transformer,Xiaonan Li|Hang Yan|Xipeng Qiu|Xuanjing Huang,"Recently, the character-word lattice structure has been proved to be effective for Chinese named entity recognition (NER) by incorporating the word information. However, since the lattice structure is complex and dynamic, the lattice-based models are hard to fully utilize the parallel computation of GPUs and usually have a low inference speed. In this paper, we propose FLAT: Flat-LAttice Transformer for Chinese NER, which converts the lattice structure into a flat structure consisting of spans. Each span corresponds to a character or latent word and its position in the original lattice. With the power of Transformer and well-designed position encoding, FLAT can fully leverage the lattice information and has an excellent parallel ability. Experiments on four datasets show FLAT outperforms other lexicon-based models in performance and efficiency.",Chinese NER|Chinese recognition|NER|FLAT,Information Extraction,Short,https://www.aclweb.org/anthology/2020.acl-main.611.pdf -main.177,Probing Linguistic Systematicity,Emily Goodwin|Koustuv Sinha|Timothy J. O'Donnell,"Recently, there has been much interest in the question of whether deep natural language understanding (NLU) models exhibit systematicity, generalizing such that units like words make consistent contributions to the meaning of the sentences in which they appear. There is accumulating evidence that neural models do not learn systematically. We examine the notion of systematicity from a linguistic perspective, defining a set of probing tasks and a set of metrics to measure systematic behaviour. We also identify ways in which network architectures can generalize non-systematically, and discuss why such forms of generalization may be unsatisfying. 
As a case study, we perform a series of experiments in the setting of natural language inference (NLI). We provide evidence that current state-of-the-art NLU systems do not generalize systematically, despite overall high performance.",Probing Systematicity|probing tasks|generalization|natural inference,Cognitive Modeling and Psycholinguistics,Long,https://www.aclweb.org/anthology/2020.acl-main.177.pdf -main.81,SpellGCN: Incorporating Phonological and Visual Similarities into Language Models for Chinese Spelling Check,Xingyi Cheng|Weidi Xu|Kunlong Chen|Shaohua Jiang|Feng Wang|Taifeng Wang|Wei Chu|Yuan Qi,"Chinese Spelling Check (CSC) is a task to detect and correct spelling errors in Chinese natural language. Existing methods have made attempts to incorporate the similarity knowledge between Chinese characters. However, they take the similarity knowledge as either an external input resource or just heuristic rules. This paper proposes to incorporate phonological and visual similarity knowledge into language models for CSC via a specialized graph convolutional network (SpellGCN). The model builds a graph over the characters, and SpellGCN is learned to map this graph into a set of inter-dependent character classifiers. These classifiers are applied to the representations extracted by another network, such as BERT, enabling the whole network to be end-to-end trainable. Experiments are conducted on three human-annotated datasets. Our method achieves superior performance against previous models by a large margin.",Chinese Check|spelling errors|spelling language|CSC,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.81.pdf -main.95,Moving Down the Long Tail of Word Sense Disambiguation with Gloss Informed Bi-encoders,Terra Blevins|Luke Zettlemoyer,"A major obstacle in Word Sense Disambiguation (WSD) is that word senses are not uniformly distributed, causing existing models to generally perform poorly on senses that are either rare or unseen during training. We propose a bi-encoder model that independently embeds (1) the target word with its surrounding context and (2) the dictionary definition, or gloss, of each sense. The encoders are jointly optimized in the same representation space, so that sense disambiguation can be performed by finding the nearest sense embedding for each target word embedding. Our system outperforms previous state-of-the-art models on English all-words WSD; these gains predominantly come from improved performance on rare senses, leading to a 31.1% error reduction on less frequent senses over prior work. This demonstrates that rare senses can be more effectively disambiguated by modeling their definitions.",Word Disambiguation|Word WSD|WSD|sense disambiguation,Semantics: Lexical,Long,https://www.aclweb.org/anthology/2020.acl-main.95.pdf -main.42,Opportunistic Decoding with Timely Correction for Simultaneous Translation,Renjie Zheng|Mingbo Ma|Baigong Zheng|Kaibo Liu|Liang Huang,"Simultaneous translation has many important application scenarios and attracts much attention from both academia and industry recently. Most existing frameworks, however, have difficulties in balancing between the translation quality and latency, i.e., the decoding policy is usually either too aggressive or too conservative. We propose an opportunistic decoding technique with timely correction ability, which always (over-)generates a certain amount of extra words at each step to keep the audience on track with the latest information. 
At the same time, it also corrects, in a timely fashion, the mistakes in the former overgenerated words when observing more source context to ensure high translation quality. Experiments show our technique achieves substantial reduction in latency and up to +3.1 increase in BLEU, with revision rate under 8% in Chinese-to-English and English-to-Chinese translation.",Simultaneous Translation|Chinese-to-English translation|Opportunistic Decoding|Timely Correction,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.42.pdf -main.349,Reasoning with Multimodal Sarcastic Tweets via Modeling Cross-Modality Contrast and Semantic Association,Nan Xu|Zhixiong Zeng|Wenji Mao,"Sarcasm is a sophisticated linguistic phenomenon to express the opposite of what one really means. With the rapid growth of social media, multimodal sarcastic tweets are widely posted on various social platforms. In multimodal context, sarcasm is no longer a pure linguistic phenomenon, and due to the nature of social media short text, the opposite is more often manifested via cross-modality expressions. Thus traditional text-based methods are insufficient to detect multimodal sarcasm. To reason with multimodal sarcastic tweets, in this paper, we propose a novel method for modeling cross-modality contrast in the associated context. Our method models both cross-modality contrast and semantic association by constructing the Decomposition and Relation Network (namely D&R Net). The decomposition network represents the commonality and discrepancy between image and text, and the relation network models the semantic association in cross-modality context. Experimental results on a public dataset demonstrate the effectiveness of our model in multimodal sarcasm detection.",Reasoning|sarcasm|multimodal detection|Sarcasm,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.acl-main.349.pdf -main.56,Gated Convolutional Bidirectional Attention-based Model for Off-topic Spoken Response Detection,Yefei Zha|Ruobing Li|Hui Lin,"Off-topic spoken response detection, the task aiming at predicting whether a response is off-topic for the corresponding prompt, is important for an automated speaking assessment system. In many real-world educational applications, off-topic spoken response detectors are required to achieve high recall for off-topic responses not only on seen prompts but also on prompts that are unseen during training. In this paper, we propose a novel approach for off-topic spoken response detection with high off-topic recall on both seen and unseen prompts. We introduce a new model, Gated Convolutional Bidirectional Attention-based Model (GCBiA), which applies bi-attention mechanism and convolutions to extract topic words of prompts and key-phrases of responses, and introduces gated unit and residual connections between major layers to better represent the relevance of responses and prompts. Moreover, a new negative sampling method is proposed to augment training data. 
Experiment results demonstrate that our novel approach can achieve significant improvements in detecting off-topic responses with extremely high on-topic recall, for both seen and unseen prompts.",Off-topic Detection|automated system|real-world applications|detecting responses,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.56.pdf -main.407,Compositionality and Generalization In Emergent Languages,Rahma Chaabouni|Eugene Kharitonov|Diane Bouchacourt|Emmanuel Dupoux|Marco Baroni,"Natural language allows us to refer to novel composite concepts by combining expressions denoting their parts according to systematic rules, a property known as compositionality. In this paper, we study whether the language emerging in deep multi-agent simulations possesses a similar ability to refer to novel primitive combinations, and whether it accomplishes this feat by strategies akin to human-language compositionality. Equipped with new ways to measure compositionality in emergent languages inspired by disentanglement in representation learning, we establish three main results: First, given sufficiently large input spaces, the emergent language will naturally develop the ability to refer to novel composite concepts. Second, there is no correlation between the degree of compositionality of an emergent language and its ability to generalize. Third, while compositionality is not necessary for generalization, it provides an advantage in terms of language transmission: The more compositional a language is, the more easily it will be picked up by new learners, even when the latter differ in architecture from the original agents. We conclude that compositionality does not arise from simple generalization pressure, but if an emergent language does chance upon it, it will be more likely to survive and thrive.",generalization|language transmission|representation learning|Compositionality,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.407.pdf -main.361,A Self-Training Method for Machine Reading Comprehension with Soft Evidence Extraction,Yilin Niu|Fangkai Jiao|Mantong Zhou|Ting Yao|Jingfang Xu|Minlie Huang,"Neural models have achieved great success on machine reading comprehension (MRC), many of which typically consist of two components: an evidence extractor and an answer predictor. The former seeks the most relevant information from a reference text, while the latter is to locate or generate answers from the extracted evidence. Despite the importance of evidence labels for training the evidence extractor, they are not cheaply accessible, particularly in many non-extractive MRC tasks such as YES/NO question answering and multi-choice MRC. To address this problem, we present a Self-Training method (STM), which supervises the evidence extractor with auto-generated evidence labels in an iterative process. At each iteration, a base MRC model is trained with golden answers and noisy evidence labels. The trained model will predict pseudo evidence labels as extra supervision in the next iteration. We evaluate STM on seven datasets over three MRC tasks. 
Experimental results demonstrate the improvement on existing MRC models, and we also analyze how and why such a self-training method works in MRC.",Machine Comprehension|Soft Extraction|machine|MRC,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.361.pdf -main.375,Do Neural Language Models Show Preferences for Syntactic Formalisms?,Artur Kulmizev|Vinit Ravishankar|Mostafa Abdou|Joakim Nivre,"Recent work on the interpretability of deep neural language models has concluded that many properties of natural language syntax are encoded in their representational spaces. However, such studies often suffer from limited scope by focusing on a single language and a single linguistic formalism. In this study, we aim to investigate the extent to which the semblance of syntactic structure captured by language models adheres to a surface-syntactic or deep syntactic style of analysis, and whether the patterns are consistent across different languages. We apply a probe for extracting directed dependency trees to BERT and ELMo models trained on 13 different languages, probing for two different syntactic annotation styles: Universal Dependencies (UD), prioritizing deep syntactic relations, and Surface-Syntactic Universal Dependencies (SUD), focusing on surface structure. We find that both models exhibit a preference for UD over SUD --- with interesting variations across languages and layers --- and that the strength of this preference is correlated with differences in tree shape.",probing|Neural Models|deep models|linguistic formalism,"Syntax: Tagging, Chunking and Parsing",Long,https://www.aclweb.org/anthology/2020.acl-main.375.pdf -main.413,Template-Based Question Generation from Retrieved Sentences for Improved Unsupervised Question Answering,Alexander Fabbri|Patrick Ng|Zhiguo Wang|Ramesh Nallapati|Bing Xiang,"Question Answering (QA) is in increasing demand as the amount of information available online and the desire for quick access to this content grows. A common approach to QA has been to fine-tune a pretrained language model on a task-specific labeled dataset. This paradigm, however, relies on scarce, and costly to obtain, large-scale human-labeled data. We propose an unsupervised approach to training QA models with generated pseudo-training data. We show that generating questions for QA training by applying a simple template on a related, retrieved sentence rather than the original context sentence improves downstream QA performance by allowing the model to learn more complex context-question relationships. Training a QA model on this data gives a relative improvement over a previous unsupervised model in F1 score on the SQuAD dataset by about 14%, and 20% when the answer is a named entity, achieving state-of-the-art performance on SQuAD for unsupervised QA.",Template-Based Generation|Unsupervised Answering|Question Answering|QA,Question Answering,Short,https://www.aclweb.org/anthology/2020.acl-main.413.pdf -main.31,Every Document Owns Its Structure: Inductive Text Classification via Graph Neural Networks,Yufeng Zhang|Xueli Yu|Zeyu Cui|Shu Wu|Zhongzhen Wen|Liang Wang,"Text classification is fundamental in natural language processing (NLP) and Graph Neural Networks (GNN) are recently applied in this task. However, the existing graph-based works can neither capture the contextual word relationships within each document nor fulfil the inductive learning of new words. Therefore in this work, to overcome such problems, we propose TextING for inductive text classification via GNN. 
We first build individual graphs for each document and then use GNN to learn the fine-grained word representations based on their local structure, which can also effectively produce embeddings for unseen words in the new document. Finally, the word nodes are aggregated as the document embedding. Extensive experiments on four benchmark datasets show that our method outperforms state-of-the-art text classification methods.",Inductive Classification|Text classification|natural processing|NLP,Information Retrieval and Text Mining,Short,https://www.aclweb.org/anthology/2020.acl-main.31.pdf -main.448,Tangled up in BLEU: Reevaluating the Evaluation of Automatic Machine Translation Evaluation Metrics,Nitika Mathur|Timothy Baldwin|Trevor Cohn,"Automatic metrics are fundamental for the development and evaluation of machine translation systems. Judging whether, and to what extent, automatic metrics concur with the gold standard of human evaluation is not a straightforward problem. We show that current methods for judging metrics are highly sensitive to the translations used for assessment, particularly the presence of outliers, which often leads to falsely confident conclusions about a metric's efficacy. Finally, we turn to pairwise system ranking, developing a method for thresholding performance improvement under an automatic metric against human judgements, which allows quantification of type I versus type II errors incurred, i.e., insignificant human differences in system quality that are accepted, and significant human differences that are rejected. Together, these findings suggest improvements to the protocols for metric evaluation and system performance evaluation in machine translation.",judging metrics|assessment|pairwise ranking|thresholding,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.448.pdf -main.25,Reverse Engineering Configurations of Neural Text Generation Models,Yi Tay|Dara Bahri|Che Zheng|Clifford Brunk|Donald Metzler|Andrew Tomkins,"Recent advances in neural text generation modeling have resulted in a number of societal concerns related to how such approaches might be used in malicious ways. It is therefore desirable to develop a deeper understanding of the fundamental properties of such models. The study of artifacts that emerge in machine generated text as a result of modeling choices is a nascent research area. To this end, the extent and degree to which these artifacts surface in generated text is still unclear. In the spirit of better understanding generative text models and their artifacts, we propose the new task of distinguishing which of several variants of a given model generated some piece of text. Specifically, we conduct an extensive suite of diagnostic tests to observe whether modeling choices (e.g., sampling methods, top-k probabilities, model architectures, etc.) leave detectable artifacts in the text they generate. Our key finding, which is backed by a rigorous set of experiments, is that such artifacts are present and that different modeling choices can be inferred by looking at generated text alone. This suggests that neural text generators may actually be more sensitive to various modeling choices than previously thought.",Reverse Models|neural modeling|Neural Models|generative models,Generation,Short,https://www.aclweb.org/anthology/2020.acl-main.25.pdf -main.312,Understanding Attention for Text Classification,Xiaobing Sun|Wei Lu,"Attention has been proven successful in many natural language processing (NLP) tasks. 
Recently, many researchers started to investigate the interpretability of attention on NLP tasks. Many existing approaches focused on examining whether the local attention weights could reflect the importance of input representations. In this work, we present a study on understanding the internal mechanism of attention by looking into the gradient update process, checking its behavior when approaching a local minimum during training. We propose to analyze for each word token the following two quantities: its polarity score and its attention score, where the latter is a global assessment on the token’s significance. We discuss conditions under which the attention mechanism may become more (or less) interpretable, and show how the interplay between the two quantities can contribute towards model performance.",Text Classification|natural tasks|NLP tasks|gradient process,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.312.pdf -main.474,Text and Causal Inference: A Review of Using Text to Remove Confounding from Causal Estimates,Katherine Keith|David Jensen|Brendan O'Connor,"Many applications of computational social science aim to infer causal conclusions from non-experimental data. Such observational data often contains confounders, variables that influence both potential causes and potential effects. Unmeasured or latent confounders can bias causal estimates, and this has motivated interest in measuring potential confounders from observed text. For example, an individual’s entire history of social media posts or the content of a news article could provide a rich measurement of multiple confounders. Yet, methods and applications for this problem are scattered across different communities and evaluation practices are inconsistent. This review is the first to gather and categorize these examples and provide a guide to data-processing and evaluation decisions. Despite increased attention on adjusting for confounding using text, there are still many open problems, which we highlight in this paper.",Text Inference|computational science|causal conclusions|causal estimates,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.474.pdf -main.460,The Summary Loop: Learning to Write Abstractive Summaries Without Examples,Philippe Laban|Andrew Hsi|John Canny|Marti A. Hearst,"This work presents a new approach to unsupervised abstractive summarization based on maximizing a combination of coverage and fluency for a given length constraint. It introduces a novel method that encourages the inclusion of key terms from the original document into the summary: key terms are masked out of the original document and must be filled in by a coverage model using the current generated summary. A novel unsupervised training procedure leverages this coverage model along with a fluency model to generate and score summaries. When tested on popular news summarization datasets, the method outperforms previous unsupervised methods by more than 2 R-1 points, and approaches results of competitive supervised methods. 
Our model attains higher levels of abstraction with copied passages roughly two times shorter than prior work, and learns to compress and merge sentences without supervision.",unsupervised summarization|coverage model|unsupervised procedure|fluency model,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.460.pdf -main.19,Fluent Response Generation for Conversational Question Answering,Ashutosh Baheti|Alan Ritter|Kevin Small,"Question answering (QA) is an important aspect of open-domain conversational agents, garnering specific research focus in the conversational QA (ConvQA) subtask. One notable limitation of recent ConvQA efforts is the response being answer span extraction from the target corpus, thus ignoring the natural language generation (NLG) aspect of high-quality conversational agents. In this work, we propose a method for situating QA responses within a SEQ2SEQ NLG approach to generate fluent grammatical answer responses while maintaining correctness. From a technical perspective, we use data augmentation to generate training data for an end-to-end system. Specifically, we develop Syntactic Transformations (STs) to produce question-specific candidate answer responses and rank them using a BERT-based classifier (Devlin et al., 2019). Human evaluation on SQuAD 2.0 data (Rajpurkar et al., 2018) demonstrate that the proposed model outperforms baseline CoQA and QuAC models in generating conversational responses. We further show our model's scalability by conducting tests on the CoQA dataset. The code and data are available at https://github.com/abaheti95/QADialogSystem.",Fluent Generation|Conversational Answering|Question answering|Question QA,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.19.pdf -main.306,Improving Multimodal Named Entity Recognition via Entity Span Detection with Unified Multimodal Transformer,Jianfei Yu|Jing Jiang|Li Yang|Rui Xia,"In this paper, we study Multimodal Named Entity Recognition (MNER) for social media posts. Existing approaches for MNER mainly suffer from two drawbacks: (1) despite generating word-aware visual representations, their word representations are insensitive to the visual context; (2) most of them ignore the bias brought by the visual context. To tackle the first issue, we propose a multimodal interaction module to obtain both image-aware word representations and word-aware visual representations. To alleviate the visual bias, we further propose to leverage purely text-based entity span detection as an auxiliary module, and design a Unified Multimodal Transformer to guide the final predictions with the entity span predictions. Experiments show that our unified approach achieves the new state-of-the-art performance on two benchmark datasets.",Multimodal Recognition|Multimodal MNER|Multimodal|MNER,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.306.pdf -main.138,NAT: Noise-Aware Training for Robust Neural Sequence Labeling,Marcin Namysl|Sven Behnke|Joachim Köhler,"Sequence labeling systems should perform reliably not only under ideal conditions but also with corrupted inputs---as these systems often process user-generated text or follow an error-prone upstream component. 
To this end, we formulate the noisy sequence labeling problem, where the input may undergo an unknown noising process and propose two Noise-Aware Training (NAT) objectives that improve robustness of sequence labeling performed on perturbed input: Our data augmentation method trains a neural model using a mixture of clean and noisy samples, whereas our stability training algorithm encourages the model to create a noise-invariant latent representation. We employ a vanilla noise model at training time. For evaluation, we use both the original data and its variants perturbed with real OCR errors and misspellings. Extensive experiments on English and German named entity recognition benchmarks confirmed that NAT consistently improved robustness of popular sequence labeling models, preserving accuracy on the original input. We make our code and data publicly available for the research community.",Noise-Aware Training|Robust Labeling|Sequence systems|noisy problem,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.138.pdf -main.676,Good-Enough Compositional Data Augmentation,Jacob Andreas,"We propose a simple data augmentation protocol aimed at providing a compositional inductive bias in conditional and unconditional sequence models. Under this protocol, synthetic training examples are constructed by taking real training examples and replacing (possibly discontinuous) fragments with other fragments that appear in at least one similar environment. The protocol is model-agnostic and useful for a variety of tasks. Applied to neural sequence-to-sequence models, it reduces error rate by as much as 87% on diagnostic tasks from the SCAN dataset and 16% on a semantic parsing task. Applied to n-gram language models, it reduces perplexity by roughly 1% on small corpora in several languages.",Good-Enough Augmentation|diagnostic tasks|semantic task|data protocol,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.676.pdf -main.110,Generating Counter Narratives against Online Hate Speech: Data and Strategies,Serra Sinem Tekiroğlu|Yi-Ling Chung|Marco Guerini,"Recently research has started focusing on avoiding undesired effects that come with content moderation, such as censorship and overblocking, when dealing with hatred online. The core idea is to directly intervene in the discussion with textual responses that are meant to counter the hate content and prevent it from further spreading. Accordingly, automation strategies, such as natural language generation, are beginning to be investigated. Still, they suffer from the lack of sufficient amount of quality data and tend to produce generic/repetitive responses. Being aware of the aforementioned limitations, we present a study on how to collect responses to hate effectively, employing large scale unsupervised language models such as GPT-2 for the generation of silver data, and the best annotation strategies/neural architectures that can be used for data filtering before expert validation/post-editing.",natural generation|generation data|data filtering|expert validation/post-editing,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.110.pdf -main.104,Hierarchy-Aware Global Model for Hierarchical Text Classification,Jie Zhou|Chunping Ma|Dingkun Long|Guangwei Xu|Ning Ding|Haoyu Zhang|Pengjun Xie|Gongshen Liu,"Hierarchical text classification is an essential yet challenging subtask of multi-label text classification with a taxonomic hierarchy. 
Existing methods have difficulties in modeling the hierarchical label structure in a global view. Furthermore, they cannot make full use of the mutual interactions between the text feature space and the label space. In this paper, we formulate the hierarchy as a directed graph and introduce hierarchy-aware structure encoders for modeling label dependencies. Based on the hierarchy encoder, we propose a novel end-to-end hierarchy-aware global model (HiAGM) with two variants. A multi-label attention variant (HiAGM-LA) learns hierarchy-aware label embeddings through the hierarchy encoder and conducts inductive fusion of label-aware text features. A text feature propagation model (HiAGM-TP) is proposed as the deductive variant that directly feeds text features into hierarchy encoders. Compared with previous works, both HiAGM-LA and HiAGM-TP achieve significant and consistent improvements on three benchmark datasets.",Hierarchical Classification|multi-label classification|inductive features|Hierarchy-Aware Model,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.acl-main.104.pdf -main.662,What Question Answering can Learn from Trivia Nerds,Jordan Boyd-Graber|Benjamin Börschinger,"In addition to the traditional task of machines answering questions, question answering (QA) research creates interesting, challenging questions that help systems how to answer questions and reveal the best systems. We argue that creating a QA dataset—and the ubiquitous leaderboard that goes with it—closely resembles running a trivia tournament: you write questions, have agents (either humans or machines) answer the questions, and declare a winner. However, the research community has ignored the hard-learned lessons from decades of the trivia community creating vibrant, fair, and effective question answering competitions. After detailing problems with existing QA datasets, we outline the key lessons—removing ambiguity, discriminating skill, and adjudicating disputes---that can transfer to QA research and how they might be implemented.",Question Answering|machines questions|QA|QA research,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.662.pdf -main.689,Learning a Multi-Domain Curriculum for Neural Machine Translation,Wei Wang|Ye Tian|Jiquan Ngiam|Yinfei Yang|Isaac Caswell|Zarana Parekh,"Most data selection research in machine translation focuses on improving a single domain. We perform data selection for multiple domains at once. This is achieved by carefully introducing instance-level domain-relevance features and automatically constructing a training curriculum to gradually concentrate on multi-domain relevant and noise-reduced data batches. Both the choice of features and the use of curriculum are crucial for balancing and improving all domains, including out-of-domain. In large-scale experiments, the multi-domain curriculum simultaneously reaches or outperforms the individual performance and brings solid gains over no-curriculum training.",Neural Translation|data selection|machine translation|multi-domain curriculum,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.689.pdf -main.702,Gender Gap in Natural Language Processing Research: Disparities in Authorship and Citations,Saif M. Mohammad,"Disparities in authorship and citations across genders can have substantial adverse consequences not just on the disadvantaged gender, but also on the field of study as a whole. 
In this work, we examine female first author percentages and the citations to their papers in Natural Language Processing. We find that only about 29% of first authors are female and only about 25% of last authors are female. Notably, this percentage has not improved since the mid 2000s. We also show that, on average, female first authors are cited less than male first authors, even when controlling for experience and area of research. We hope that recording citation and participation gaps across demographic groups will improve awareness of gender gaps and encourage more inclusiveness and fairness in research.",Gender Gap|Natural Research|Natural Processing|citations,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.702.pdf -main.716,From English to Code-Switching: Transfer Learning with Strong Morphological Clues,Gustavo Aguilar|Thamar Solorio,"Linguistic Code-switching (CS) is still an understudied phenomenon in natural language processing. The NLP community has mostly focused on monolingual and multi-lingual scenarios, but little attention has been given to CS in particular. This is partly because of the lack of resources and annotated data, despite its increasing occurrence in social media platforms. In this paper, we aim at adapting monolingual models to code-switched text in various tasks. Specifically, we transfer English knowledge from a pre-trained ELMo model to different code-switched language pairs (i.e., Nepali-English, Spanish-English, and Hindi-English) using the task of language identification. Our method, CS-ELMo, is an extension of ELMo with a simple yet effective position-aware attention mechanism inside its character convolutions. We show the effectiveness of this transfer learning step by outperforming multilingual BERT and homologous CS-unaware ELMo models and establishing a new state of the art in CS tasks, such as NER and POS tagging. Our technique can be expanded to more English-paired code-switched languages, providing more resources to the CS community.",natural processing|CS|language identification|CS tasks,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.716.pdf -main.528,Simplify the Usage of Lexicon in Chinese NER,Ruotian Ma|Minlong Peng|Qi Zhang|Zhongyu Wei|Xuanjing Huang,"Recently, many works have tried to augment the performance of Chinese named entity recognition (NER) using word lexicons. As a representative, Lattice-LSTM has achieved new benchmark results on several public Chinese NER datasets. However, Lattice-LSTM has a complex model architecture. This limits its application in many industrial areas where real-time NER responses are needed. In this work, we propose a simple but effective method for incorporating the word lexicon into the character representations. This method avoids designing a complicated sequence modeling architecture, and for any neural NER model, it requires only subtle adjustment of the character representation layer to introduce the lexicon information. Experimental studies on four benchmark Chinese NER datasets show that our method achieves an inference speed up to 6.15 times faster than those of state-of-the-art methods, along with a better performance. 
The experimental results also show that the proposed method can be easily incorporated with pre-trained models like BERT.",Chinese recognition|NER|Lattice-LSTM|complex architecture,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.528.pdf -main.266,A Probabilistic Generative Model for Typographical Analysis of Early Modern Printing,Kartik Goyal|Chris Dyer|Christopher Warren|Maxwell G'Sell|Taylor Berg-Kirkpatrick,"We propose a deep and interpretable probabilistic generative model to analyze glyph shapes in printed Early Modern documents. We focus on clustering extracted glyph images into underlying templates in the presence of multiple confounding sources of variance. Our approach introduces a neural editor model that first generates well-understood printing phenomena like spatial perturbations from template parameters via interpretable latent variables, and then modifies the result by generating a non-interpretable latent vector responsible for inking variations, jitter, noise from the archiving process, and other unforeseen phenomena associated with Early Modern printing. Critically, by introducing an inference network whose input is restricted to the visual residual between the observation and the interpretably-modified template, we are able to control and isolate what the vector-valued latent variable captures. We show that our approach outperforms rigid interpretable clustering baselines (cf. Ocular) and overly-flexible deep generative models (VAE) alike on the task of completely unsupervised discovery of typefaces in mixed-fonts documents.",Typographical Printing|clustering images|archiving process|Early printing,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.266.pdf -main.500,On the Importance of Diversity in Question Generation for QA,Md Arafat Sultan|Shubham Chandel|Ramón Fernandez Astudillo|Vittorio Castelli,"Automatic question generation (QG) has shown promise as a source of synthetic training data for question answering (QA). In this paper we ask: Is textual diversity in QG beneficial for downstream QA? Using top-p nucleus sampling to derive samples from a transformer-based question generator, we show that diversity-promoting QG indeed provides better QA training than likelihood maximization approaches such as beam search. We also show that standard QG evaluation metrics such as BLEU, ROUGE and METEOR are inversely correlated with diversity, and propose a diversity-aware intrinsic measure of overall QG quality that correlates well with extrinsic evaluation on QA.",Question Generation|QA|Automatic generation|Automatic QG,Question Answering,Short,https://www.aclweb.org/anthology/2020.acl-main.500.pdf -main.514,A Comprehensive Analysis of Preprocessing for Word Representation Learning in Affective Tasks,Nastaran Babanejad|Ameeta Agrawal|Aijun An|Manos Papagelis,"Affective tasks such as sentiment analysis, emotion classification, and sarcasm detection have been popular in recent years due to an abundance of user-generated data, accurate computational linguistic models, and a broad range of relevant applications in various domains. At the same time, many studies have highlighted the importance of text preprocessing, as an integral step to any natural language processing prediction model and downstream task. While preprocessing in affective systems is well-studied, preprocessing in word vector-based models applied to affective systems is not.
To address this limitation, we conduct a comprehensive analysis of the role of preprocessing techniques in affective analysis based on word vector models. Our analysis is the first of its kind and provides useful insights of the importance of each preprocessing technique when applied at the training phase, commonly ignored in pretrained word vector models, and/or at the downstream task phase.",Word Learning|Affective Tasks|sentiment analysis|emotion classification,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.514.pdf -main.272,Zero-shot Text Classification via Reinforced Self-training,Zhiquan Ye|Yuxia Geng|Jiaoyan Chen|Jingmin Chen|Xiaoxiao Xu|Suhang Zheng|Feng Wang|Jun Zhang|Huajun Chen,"Zero-shot learning has been a tough problem since no labeled data is available for unseen classes during training, especially for classes with low similarity. In this situation, transferring from seen classes to unseen classes is extremely hard. To tackle this problem, in this paper we propose a self-training based method to efficiently leverage unlabeled data. Traditional self-training methods use fixed heuristics to select instances from unlabeled data, whose performance varies among different datasets. We propose a reinforcement learning framework to learn data selection strategy automatically and provide more reliable selection. Experimental results on both benchmarks and a real-world e-commerce dataset show that our approach significantly outperforms previous methods in zero-shot text classification",Zero-shot Classification|Reinforced Self-training|Zero-shot learning|self-training method,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.272.pdf -main.299,A Span-based Linearization for Constituent Trees,Yang Wei|Yuanbin Wu|Man Lan,"We propose a novel linearization of a constituent tree, together with a new locally normalized model. For each split point in a sentence, our model computes the normalizer on all spans ending with that split point, and then predicts a tree span from them. Compared with global models, our model is fast and parallelizable. Different from previous local models, our linearization method is tied on the spans directly and considers more local features when performing span prediction, which is more interpretable and effective. Experiments on PTB (95.8 F1) and CTB (92.4 F1) show that our model significantly outperforms existing local models and efficiently achieves competitive results with global models.",PTB|CTB|Span-based Linearization|linearization tree,"Syntax: Tagging, Chunking and Parsing",Long,https://www.aclweb.org/anthology/2020.acl-main.299.pdf -main.298,Towards Better Non-Tree Argument Mining: Proposition-Level Biaffine Parsing with Task-Specific Parameterization,Gaku Morio|Hiroaki Ozaki|Terufumi Morishita|Yuta Koreeda|Kohsuke Yanai,"State-of-the-art argument mining studies have advanced the techniques for predicting argument structures. However, the technology for capturing non-tree-structured arguments is still in its infancy. In this paper, we focus on non-tree argument mining with a neural model. We jointly predict proposition types and edges between propositions. Our proposed model incorporates (i) task-specific parameterization (TSP) that effectively encodes a sequence of propositions and (ii) a proposition-level biaffine attention (PLBA) that can predict a non-tree argument consisting of edges. 
Experimental results show that both TSP and PLBA boost edge prediction performance compared to baselines.",Non-Tree Mining|predicting structures|edge prediction|Proposition-Level Parsing,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Short,https://www.aclweb.org/anthology/2020.acl-main.298.pdf -main.515,Diverse and Informative Dialogue Generation with Context-Specific Commonsense Knowledge Awareness,Sixing Wu|Ying Li|Dawei Zhang|Yang Zhou|Zhonghai Wu,"Generative dialogue systems tend to produce generic responses, which often leads to boring conversations. For alleviating this issue, recent studies proposed to retrieve and introduce knowledge facts from knowledge graphs. While this paradigm works to a certain extent, it usually retrieves knowledge facts only based on the entity word itself, without considering the specific dialogue context. Thus, the introduction of the context-irrelevant knowledge facts can impact the quality of generations. To this end, this paper proposes a novel commonsense knowledge-aware dialogue generation model, ConKADI. We design a Felicitous Fact mechanism to help the model focus on the knowledge facts that are highly relevant to the context; furthermore, two techniques, Context-Knowledge Fusion and Flexible Mode Fusion, are proposed to facilitate the integration of the knowledge in the ConKADI. We collect and build a large-scale Chinese dataset aligned with the commonsense knowledge for dialogue generation. Extensive evaluations over both an open-released English dataset and our Chinese dataset demonstrate that our approach ConKADI outperforms the state-of-the-art approach CCM in most experiments.",Diverse Generation|dialogue generation|Context-Specific Awareness|Generative systems,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.515.pdf -main.273,A Novel Graph-based Multi-modal Fusion Encoder for Neural Machine Translation,Yongjing Yin|Fandong Meng|Jinsong Su|Chulun Zhou|Zhengyuan Yang|Jie Zhou|Jiebo Luo,"Multi-modal neural machine translation (NMT) aims to translate source sentences into a target language paired with images. However, dominant multi-modal NMT models do not fully exploit fine-grained semantic correspondences between semantic units of different modalities, which have potential to refine multi-modal representation learning. To deal with this issue, in this paper, we propose a novel graph-based multi-modal fusion encoder for NMT. Specifically, we first represent the input sentence and image using a unified multi-modal graph, which captures various semantic relationships between multi-modal semantic units (words and visual objects). We then stack multiple graph-based multi-modal fusion layers that iteratively perform semantic interactions to learn node representations. Finally, these representations provide an attention-based context vector for the decoder. We evaluate our proposed encoder on the Multi30K datasets. Experimental results and in-depth analysis show the superiority of our multi-modal NMT model.",Neural Translation|multi-modal learning|NMT|Graph-based Encoder,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.273.pdf -main.267,Attentive Pooling with Learnable Norms for Text Representation,Chuhan Wu|Fangzhao Wu|Tao Qi|Xiaohui Cui|Yongfeng Huang,"Pooling is an important technique for learning text representations in many neural NLP models.
In conventional pooling methods such as average, max and attentive pooling, text representations are weighted summations of the L1 or L∞ norm of input features. However, their pooling norms are always fixed and may not be optimal for learning accurate text representations in different tasks. In addition, in many popular pooling methods such as max and attentive pooling, some features may be over-emphasized, while other useful ones are not fully exploited. In this paper, we propose an Attentive Pooling with Learnable Norms (APLN) approach for text representation. Different from existing pooling methods that use a fixed pooling norm, we propose to learn the norm in an end-to-end manner to automatically find the optimal ones for text representation in different tasks. In addition, we propose two methods to ensure the numerical stability of the model training. The first one is scale limiting, which re-scales the input to ensure non-negativity and alleviate the risk of exponential explosion. The second one is re-formulation, which decomposes the exponent operation to avoid computing the real-valued powers of the input and further accelerate the pooling operation. Experimental results on four benchmark datasets show that our approach can effectively improve the performance of attentive pooling.",Text Representation|text representations|model training|Pooling,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.267.pdf -main.501,Probabilistic Assumptions Matter: Improved Models for Distantly-Supervised Document-Level Question Answering,Hao Cheng|Ming-Wei Chang|Kenton Lee|Kristina Toutanova,"We address the problem of extractive question answering using document-level distant supervision, pairing questions and relevant documents with answer strings. We compare previously used probability space and distant supervision assumptions (assumptions on the correspondence between the weak answer string labels and possible answer mention spans). We show that these assumptions interact, and that different configurations provide complementary benefits. We demonstrate that a multi-objective model can efficiently combine the advantages of multiple assumptions and outperform the best individual formulation. Our approach outperforms previous state-of-the-art models by 4.3 points in F1 on TriviaQA-Wiki and 1.7 points in Rouge-L on NarrativeQA summaries.",Distantly-Supervised Answering|extractive answering|document-level super-vision|probability assumptions,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.501.pdf -main.529,AdvAug: Robust Adversarial Augmentation for Neural Machine Translation,Yong Cheng|Lu Jiang|Wolfgang Macherey|Jacob Eisenstein,"In this paper, we propose a new adversarial augmentation method for Neural Machine Translation (NMT). The main idea is to minimize the vicinal risk over virtual sentences sampled from two vicinity distributions, in which the crucial one is a novel vicinity distribution for adversarial sentences that describes a smooth interpolated embedding space centered around observed training sentence pairs. We then discuss our approach, AdvAug, to train NMT models using the embeddings of virtual sentences in sequence-to-sequence learning.
Experiments on Chinese-English, English-French, and English-German translation benchmarks show that AdvAug achieves significant improvements over the Transformer (up to 4.9 BLEU points), and substantially outperforms other data augmentation techniques (e.g., back-translation) without using extra corpora.",Robust Augmentation|Neural Translation|Neural NMT|Neural,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.529.pdf -main.717,"Learning Interpretable Relationships between Entities, Relations and Concepts via Bayesian Structure Learning on Open Domain Facts",Jingyuan Zhang|Mingming Sun|Yue Feng|Ping Li,"Concept graphs are created as universal taxonomies for text understanding in the open-domain knowledge. The nodes in concept graphs include both entities and concepts. The edges are from entities to concepts, showing that an entity is an instance of a concept. In this paper, we propose the task of learning interpretable relationships from open-domain facts to enrich and refine concept graphs. The Bayesian network structures are learned from open-domain facts as the interpretable relationships between relations of facts and concepts of entities. We conduct extensive experiments on public English and Chinese datasets. Compared to the state-of-the-art methods, the learned network structures help improve the identification of concepts for entities based on the relations of entities on both datasets.",text understanding|learning relationships|identification concepts|Bayesian Learning,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.717.pdf -main.703,"BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension",Mike Lewis|Yinhan Liu|Naman Goyal|Marjan Ghazvininejad|Abdelrahman Mohamed|Omer Levy|Veselin Stoyanov|Luke Zettlemoyer,"We present BART, a denoising autoencoder for pretraining sequence-to-sequence models. BART is trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text. It uses a standard Transformer-based neural machine translation architecture which, despite its simplicity, can be seen as generalizing BERT (due to the bidirectional encoder), GPT (with the left-to-right decoder), and other recent pretraining schemes. We evaluate a number of noising approaches, finding the best performance by both randomly shuffling the order of sentences and using a novel in-filling scheme, where spans of text are replaced with a single mask token. BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It matches the performance of RoBERTa on GLUE and SQuAD, and achieves new state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 3.5 ROUGE. BART also provides a 1.1 BLEU increase over a back-translation system for machine translation, with only target language pretraining. We also replicate other pretraining schemes within the BART framework, to understand their effect on end-task performance.",Natural Generation|Translation|Comprehension|pretraining models,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.703.pdf -main.688,"In Neural Machine Translation, What Does Transfer Learning Transfer?",Alham Fikri Aji|Nikolay Bogoychev|Kenneth Heafield|Rico Sennrich,"Transfer learning improves quality for low-resource machine translation, but it is unclear what exactly it transfers.
We perform several ablation studies that limit information transfer, then measure the quality impact across three language pairs to gain a black-box understanding of transfer learning. Word embeddings play an important role in transfer learning, particularly if they are properly aligned. Although transfer learning can be performed without embeddings, results are sub-optimal. In contrast, transferring only the embeddings but nothing else yields catastrophic results. We then investigate diagonal alignments with auto-encoders over real languages and randomly generated sequences, finding that even randomly generated sequences as parents yield noticeable but smaller gains. Finally, transfer learning can eliminate the need for a warm-up phase when training transformer models in high resource language pairs.",Neural Translation|Transfer Transfer|low-resource translation|information transfer,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.688.pdf -main.105,Keyphrase Generation for Scientific Document Retrieval,Florian Boudin|Ygor Gallina|Akiko Aizawa,"Sequence-to-sequence models have led to significant progress in keyphrase generation, but it remains unknown whether they are reliable enough to be beneficial for document retrieval. This study provides empirical evidence that such models can significantly improve retrieval performance, and introduces a new extrinsic evaluation framework that allows for a better understanding of the limitations of keyphrase generation models. Using this framework, we point out and discuss the difficulties encountered with supplementing documents with -not present in text- keyphrases, and generalizing models across domains. Our code is available at https://github.com/boudinfl/ir-using-kg",Keyphrase Generation|Scientific Retrieval|document retrieval|retrieval,Information Retrieval and Text Mining,Short,https://www.aclweb.org/anthology/2020.acl-main.105.pdf -main.663,What are the Goals of Distributional Semantics?,Guy Emerson,"Distributional semantic models have become a mainstay in NLP, providing useful features for downstream tasks. However, assessing long-term progress requires explicit long-term goals. In this paper, I take a broad linguistic perspective, looking at how well current models can deal with various semantic challenges. Given stark differences between models proposed in different subfields, a broad perspective is needed to see how we could integrate them. I conclude that, while linguistic insights can guide the design of model architectures, future progress will require balancing the often conflicting demands of linguistic expressiveness and computational tractability.",NLP|downstream tasks|assessing progress|Distributional Semantics,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.663.pdf -main.677,RAT-SQL: Relation-Aware Schema Encoding and Linking for Text-to-SQL Parsers,Bailin Wang|Richard Shin|Xiaodong Liu|Oleksandr Polozov|Matthew Richardson,"When translating natural language questions into SQL queries to answer questions from a database, contemporary semantic parsing models struggle to generalize to unseen database schemas. The generalization challenge lies in (a) encoding the database relations in an accessible way for the semantic parser, and (b) modeling alignment between database columns and their mentions in a given query. We present a unified framework, based on the relation-aware self-attention mechanism, to address schema encoding, schema linking, and feature representation within a text-to-SQL encoder.
On the challenging Spider dataset this framework boosts the exact match accuracy to 57.2%, surpassing its best counterparts by 8.7% absolute improvement. Further augmented with BERT, it achieves the new state-of-the-art performance of 65.6% on the Spider leaderboard. In addition, we observe qualitative improvements in the model's understanding of schema linking and alignment. Our implementation will be open-sourced at https://github.com/Microsoft/rat-sql.",generalization challenge|schema encoding|schema linking|RAT-SQL,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.677.pdf -main.111,KLEJ: Comprehensive Benchmark for Polish Language Understanding,Piotr Rybak|Robert Mroczkowski|Janusz Tracz|Ireneusz Gawlik,"In recent years, a series of Transformer-based models unlocked major improvements in general natural language understanding (NLU) tasks. Such a fast pace of research would not be possible without general NLU benchmarks, which allow for a fair comparison of the proposed methods. However, such benchmarks are available only for a handful of languages. To alleviate this issue, we introduce a comprehensive multi-task benchmark for the Polish language understanding, accompanied by an online leaderboard. It consists of a diverse set of tasks, adopted from existing datasets for named entity recognition, question-answering, textual entailment, and others. We also introduce a new sentiment analysis task for the e-commerce domain, named Allegro Reviews (AR). To ensure a common evaluation scheme and promote models that generalize to different NLU tasks, the benchmark includes datasets from varying domains and applications. Additionally, we release HerBERT, a Transformer-based model trained specifically for the Polish language, which has the best average performance and obtains the best results for three out of nine tasks. Finally, we provide an extensive evaluation, including several standard baselines and recently proposed, multilingual Transformer-based models.",Polish Understanding|general tasks|named recognition|textual entailment,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.111.pdf -main.139,Named Entity Recognition without Labelled Data: A Weak Supervision Approach,Pierre Lison|Jeremy Barnes|Aliaksandr Hubin|Samia Touileb,"Named Entity Recognition (NER) performance often degrades rapidly when applied to target domains that differ from the texts observed during training. When in-domain labelled data is available, transfer learning techniques can be used to adapt existing NER models to the target domain. But what should one do when there is no hand-labelled data for the target domain? This paper presents a simple but powerful approach to learn NER models in the absence of labelled data through weak supervision. The approach relies on a broad spectrum of labelling functions to automatically annotate texts from the target domain. These annotations are then merged together using a hidden Markov model which captures the varying accuracies and confusions of the labelling functions. A sequence labelling model can finally be trained on the basis of this unified annotation. 
We evaluate the approach on two English datasets (CoNLL 2003 and news articles from Reuters and Bloomberg) and demonstrate an improvement of about 7 percentage points in entity-level F1 scores compared to an out-of-domain neural NER model.",Named Recognition|NER|Weak Approach|transfer techniques,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.139.pdf -main.461,Unsupervised Opinion Summarization as Copycat-Review Generation,Arthur Bražinskas|Mirella Lapata|Ivan Titov,"Opinion summarization is the task of automatically creating summaries that reflect subjective information expressed in multiple documents, such as product reviews. While the majority of previous work has focused on the extractive setting, i.e., selecting fragments from input reviews to produce a summary, we let the model generate novel sentences and hence produce abstractive summaries. Recent progress in summarization has seen the development of supervised models which rely on large quantities of document-summary pairs. Since such training data is expensive to acquire, we instead consider the unsupervised setting, in other words, we do not use any summaries in training. We define a generative model for a review collection which capitalizes on the intuition that when generating a new review given a set of other reviews of a product, we should be able to control the “amount of novelty” going into the new review or, equivalently, vary the extent to which it deviates from the input. At test time, when generating summaries, we force the novelty to be minimal, and produce a text reflecting consensus opinions. We capture this intuition by defining a hierarchical variational autoencoder model. Both individual reviews and the products they correspond to are associated with stochastic latent codes, and the review generator (“decoder”) has direct access to the text of input reviews through the pointer-generator mechanism. Experiments on Amazon and Yelp datasets, show that setting at test time the review’s latent code to its mean, allows the model to produce fluent and coherent summaries reflecting common opinions.",Unsupervised Summarization|Copycat-Review Generation|Opinion summarization|automatically summaries,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.461.pdf -main.307,"Stock Embeddings Acquired from News Articles and Price History, and an Application to Portfolio Optimization",Xin Du|Kumiko Tanaka-Ishii,"Previous works that integrated news articles to better process stock prices used a variety of neural networks to predict price movements. The textual and price information were both encoded in the neural network, and it is therefore difficult to apply this approach in situations other than the original framework of the notoriously hard problem of price prediction. In contrast, this paper presents a method to encode the influence of news articles through a vector representation of stocks called a stock embedding. The stock embedding is acquired with a deep learning framework using both news articles and price history. Because the embedding takes the operational form of a vector, it is applicable to other financial problems besides price prediction. As one example application, we show the results of portfolio optimization using Reuters & Bloomberg headlines, producing a capital gain 2.8 times larger than that obtained with a baseline method using only stock price data. 
This suggests that the proposed stock embedding can leverage textual financial semantics to solve financial prediction problems.",Stock Embeddings|Portfolio Optimization|financial problems|price prediction,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.307.pdf -main.18,Few-Shot NLG with Pre-Trained Language Model,Zhiyu Chen|Harini Eavani|Wenhu Chen|Yinyin Liu|William Yang Wang,"Neural-based end-to-end approaches to natural language generation (NLG) from structured data or knowledge are data-hungry, making their adoption for real-world applications difficult with limited data. In this work, we propose the new task of few-shot natural language generation. Motivated by how humans tend to summarize tabular data, we propose a simple yet effective approach and show that it not only demonstrates strong performance but also provides good generalization across domains. The design of the model architecture is based on two aspects: content selection from input data and language modeling to compose coherent sentences, which can be acquired from prior knowledge. With just 200 training examples, across multiple domains, we show that our approach achieves very reasonable performances and outperforms the strongest baseline by an average of over 8.0 BLEU points improvement. Our code and data can be found at https://github.com/czyssrs/Few-Shot-NLG",natural generation|NLG|real-world applications|content selection,Generation,Short,https://www.aclweb.org/anthology/2020.acl-main.18.pdf -main.313,A Relational Memory-based Embedding Model for Triple Classification and Search Personalization,Dai Quoc Nguyen|Tu Nguyen|Dinh Phung,"Knowledge graph embedding methods often suffer from a limitation of memorizing valid triples to predict new ones for triple classification and search personalization problems. To this end, we introduce a novel embedding model, named R-MeN, that explores a relational memory network to encode potential dependencies in relationship triples. R-MeN considers each triple as a sequence of 3 input vectors that recurrently interact with a memory using a transformer self-attention mechanism. Thus R-MeN encodes new information from interactions between the memory and each input vector to return a corresponding vector. Consequently, R-MeN feeds these 3 returned vectors to a convolutional neural network-based decoder to produce a scalar score for the triple. Experimental results show that our proposed R-MeN obtains state-of-the-art results on SEARCH17 for the search personalization task, and on WN11 and FB13 for the triple classification task.",Triple Classification|Search Personalization|search problems|SEARCH17,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.313.pdf -main.475,Text-Based Ideal Points,Keyon Vafa|Suresh Naidu|David Blei,"Ideal point models analyze lawmakers' votes to quantify their political positions, or ideal points. But votes are not the only way to express a political position. Lawmakers also give speeches, release press statements, and post tweets. In this paper, we introduce the text-based ideal point model (TBIP), an unsupervised probabilistic topic model that analyzes texts to quantify the political positions of its authors. We demonstrate the TBIP with two types of politicized text data: U.S. Senate speeches and senator tweets. 
Though the model does not analyze their votes or political affiliations, the TBIP separates lawmakers by party, learns interpretable politicized topics, and infers ideal points close to the classical vote-based ideal points. One benefit of analyzing texts, as opposed to votes, is that the TBIP can estimate ideal points of anyone who authors political texts, including non-voting actors. To this end, we use it to study tweets from the 2020 Democratic presidential candidates. Using only the texts of their tweets, it identifies them along an interpretable progressive-to-moderate spectrum.",Ideal models|Lawmakers|text-based model|TBIP,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.475.pdf -main.24,Probabilistically Masked Language Model Capable of Autoregressive Generation in Arbitrary Word Order,Yi Liao|Xin Jiang|Qun Liu,"Masked language model and autoregressive language model are two types of language models. While pretrained masked language models such as BERT overwhelm the line of natural language understanding (NLU) tasks, autoregressive language models such as GPT are especially capable in natural language generation (NLG). In this paper, we propose a probabilistic masking scheme for the masked language model, which we call probabilistically masked language model (PMLM). We implement a specific PMLM with a uniform prior distribution on the masking ratio named u-PMLM. We prove that u-PMLM is equivalent to an autoregressive permutated language model. One main advantage of the model is that it supports text generation in arbitrary order with surprisingly good quality, which could potentially enable new applications over traditional unidirectional generation. Besides, the pretrained u-PMLM also outperforms BERT on a bunch of downstream NLU tasks.",Autoregressive Generation|natural tasks|natural generation|natural NLG,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.24.pdf -main.30,Contextualized Weak Supervision for Text Classification,Dheeraj Mekala|Jingbo Shang,"Weakly supervised text classification based on a few user-provided seed words has recently attracted much attention from researchers. Existing methods mainly generate pseudo-labels in a context-free manner (e.g., string matching), therefore, the ambiguous, context-dependent nature of human language has been long overlooked. In this paper, we propose a novel framework ConWea, providing contextualized weak supervision for text classification. Specifically, we leverage contextualized representations of word occurrences and seed word information to automatically differentiate multiple interpretations of the same word, and thus create a contextualized corpus. This contextualized corpus is further utilized to train the classifier and expand seed words in an iterative manner. This process not only adds new contextualized, highly label-indicative keywords but also disambiguates initial seed words, making our weak supervision fully contextualized. 
Extensive experiments and case studies on real-world datasets demonstrate the necessity and significant advantages of using contextualized weak supervision, especially when the class labels are fine-grained.",Text Classification|Weakly classification|string matching|Contextualized Supervision,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.acl-main.30.pdf -main.449,A Transformer-based Approach for Source Code Summarization,Wasi Ahmad|Saikat Chakraborty|Baishakhi Ray|Kai-Wei Chang,"Generating a readable summary that describes the functionality of a program is known as source code summarization. In this task, learning code representation by modeling the pairwise relationship between code tokens to capture their long-range dependencies is crucial. To learn code representation for summarization, we explore the Transformer model that uses a self-attention mechanism and has been shown to be effective in capturing long-range dependencies. In this work, we show that although the approach is simple, it outperforms the state-of-the-art techniques by a significant margin. We perform extensive analysis and ablation studies that reveal several important findings, e.g., the absolute encoding of source code tokens' position hinders, while relative encoding significantly improves the summarization performance. We have made our code publicly available (https://github.com/wasiahmad/NeuralCodeSum) to facilitate future research.",Source Summarization|summarization|ablation studies|Transformer-based Approach,Summarization,Short,https://www.aclweb.org/anthology/2020.acl-main.449.pdf -main.26,Review-based Question Generation with Adaptive Instance Transfer and Augmentation,Qian Yu|Lidong Bing|Qiong Zhang|Wai Lam|Luo Si,"While online reviews of products and services become an important information source, it remains inefficient for potential consumers to exploit verbose reviews for fulfilling their information need. We propose to explore question generation as a new way of review information exploitation, namely generating questions that can be answered by the corresponding review sentences. One major challenge of this generation task is the lack of training data, i.e. explicit mapping relation between the user-posed questions and review sentences. To obtain proper training instances for the generation model, we propose an iterative learning framework with adaptive instance transfer and augmentation. To generate to-the-point questions about the major aspects in reviews, related features extracted in an unsupervised manner are incorporated without the burden of aspect annotation. Experiments on data from various categories of a popular E-commerce site demonstrate the effectiveness of the framework, as well as the potentials of the proposed review-based question generation task.",Review-based Generation|Adaptive Transfer|Adaptive Augmentation|online services,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.26.pdf -main.339,Don't Eclipse Your Arts Due to Small Discrepancies: Boundary Repositioning with a Pointer Network for Aspect Extraction,Zhenkai Wei|Yu Hong|Bowei Zou|Meng Cheng|Jianmin Yao,"The current aspect extraction methods suffer from boundary errors. In general, these errors lead to a relatively minor difference between the extracted aspects and the ground-truth. However, they hurt the performance severely. In this paper, we propose to utilize a pointer network for repositioning the boundaries.
A recycling mechanism is used, which enables the training data to be collected without manual intervention. We conduct the experiments on the benchmark datasets SE14 of laptop and SE14-16 of restaurant. Experimental results show that our method achieves substantial improvements over the baseline, and outperforms state-of-the-art methods.",Boundary Repositioning|Aspect Extraction|Pointer Network|aspect methods,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Short,https://www.aclweb.org/anthology/2020.acl-main.339.pdf -main.32,Neural Topic Modeling with Bidirectional Adversarial Training,Rui Wang|Xuemeng Hu|Deyu Zhou|Yulan He|Yuxuan Xiong|Chenchen Ye|Haiyang Xu,"Recent years have witnessed a surge of interest in using neural topic models for automatic topic extraction from text, since they avoid the complicated mathematical derivations for model inference as in traditional topic models such as Latent Dirichlet Allocation (LDA). However, these models either typically assume improper prior (e.g. Gaussian or Logistic Normal) over latent topic space or could not infer topic distribution for a given document. To address these limitations, we propose a neural topic modeling approach, called Bidirectional Adversarial Topic (BAT) model, which represents the first attempt of applying bidirectional adversarial training for neural topic modeling. The proposed BAT builds a two-way projection between the document-topic distribution and the document-word distribution. It uses a generator to capture the semantic patterns from texts and an encoder for topic inference. Furthermore, to incorporate word relatedness information, the Bidirectional Adversarial Topic model with Gaussian (Gaussian-BAT) is extended from BAT. To verify the effectiveness of BAT and Gaussian-BAT, three benchmark corpora are used in our experiments. The experimental results show that BAT and Gaussian-BAT obtain more coherent topics, outperforming several competitive baselines. Moreover, when performing text clustering based on the extracted topics, our models outperform all the baselines, with more significant improvements achieved by Gaussian-BAT where an increase of nearly 6% is observed in accuracy.",automatic extraction|model inference|neural modeling|topic inference,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.acl-main.32.pdf -main.305,Dynamic Online Conversation Recommendation,Xingshan Zeng|Jing Li|Lu Wang|Zhiming Mao|Kam-Fai Wong,"Trending topics in social media content evolve over time, and it is therefore crucial to understand social media users and their interpersonal communications in a dynamic manner. Here we study dynamic online conversation recommendation, to help users engage in conversations that satisfy their evolving interests. While most prior work assumes static user interests, our model is able to capture the temporal aspects of user interests, and further handle future conversations that are unseen during training time. Concretely, we propose a neural architecture to exploit changes of user interactions and interests over time, to predict which discussions they are likely to enter. We conduct experiments on large-scale collections of Reddit conversations, and results on three subreddits show that our model significantly outperforms state-of-the-art models that make a static assumption of user interests.
We further evaluate on handling “cold start”, and observe consistently better performance by our model when considering various degrees of sparsity of user’s chatting history and conversation contexts. Lastly, analyses on our model outputs indicate user interest change, explaining the advantage and efficacy of our approach.",Dynamic Recommendation|neural architecture|Trending topics|social users,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.305.pdf -main.463,"Climbing towards NLU: On Meaning, Form, and Understanding in the Age of Data",Emily M. Bender|Alexander Koller,"The success of the large neural language models on many NLP tasks is exciting. However, we find that these successes sometimes lead to hype in which these models are being described as ``understanding'' language or capturing ``meaning''. In this position paper, we argue that a system trained only on form has a priori no way to learn meaning. In keeping with the ACL 2020 theme of ``Taking Stock of Where We've Been and Where We're Going'', we argue that a clear understanding of the distinction between form and meaning will help guide the field towards better science around natural language understanding.",NLP tasks|natural understanding|large models|NLU,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.463.pdf -main.477,Would you Rather? A New Benchmark for Learning Machine Alignment with Cultural Values and Social Preferences,Yi Tay|Donovan Ong|Jie Fu|Alvin Chan|Nancy Chen|Anh Tuan Luu|Chris Pal,"Understanding human preferences, along with cultural and social nuances, lives at the heart of natural language understanding. Concretely, we present a new task and corpus for learning alignments between machine and human preferences. Our newly introduced problem is concerned with predicting the preferable options from two sentences describing scenarios that may involve social, cultural, ethical, or moral situations. Our problem is framed as a natural language inference task with crowd-sourced preference votes by human players, obtained from a gamified voting platform. Along with the release of a new dataset of 200K data points, we benchmark several state-of-the-art neural models, along with BERT and friends on this task. Our experimental results show that current state-of-the-art NLP models still leave much room for improvement.",Machine Alignment|Understanding preferences|natural understanding|natural task,Computational Social Science and Social Media,Short,https://www.aclweb.org/anthology/2020.acl-main.477.pdf -main.311,Roles and Utilization of Attention Heads in Transformer-based Neural Language Models,Jae-young Jo|Sung-Hyon Myaeng,"Sentence encoders based on the transformer architecture have shown promising results on various natural language tasks. The main impetus lies in the pre-trained neural language models that capture long-range dependencies among words, owing to multi-head attention that is unique in the architecture. However, little is known for how linguistic properties are processed, represented, and utilized for downstream tasks among hundreds of attention heads inside the pre-trained transformer-based model. For the initial goal of examining the roles of attention heads in handling a set of linguistic features, we conducted a set of experiments with ten probing tasks and three downstream tasks on four pre-trained transformer families (GPT, GPT2, BERT, and ELECTRA).
Meaningful insights are shown through the lens of heat map visualization and utilized to propose a relatively simple sentence representation method that takes advantage of most influential attention heads, resulting in additional performance improvements on the downstream tasks.",Transformer-based Models|natural tasks|downstream tasks|probing tasks,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.311.pdf -main.488,Towards Debiasing Sentence Representations,Paul Pu Liang|Irene Mengze Li|Emily Zheng|Yao Chong Lim|Ruslan Salakhutdinov|Louis-Philippe Morency,"As natural language processing methods are increasingly deployed in real-world scenarios such as healthcare, legal systems, and social science, it becomes necessary to recognize the role they potentially play in shaping social biases and stereotypes. Previous work has revealed the presence of social biases in widely used word embeddings involving gender, race, religion, and other social constructs. While some methods were proposed to debias these word-level embeddings, there is a need to perform debiasing at the sentence-level given the recent shift towards new contextualized sentence representations such as ELMo and BERT. In this paper, we investigate the presence of social biases in sentence-level representations and propose a new method, Sent-Debias, to reduce these biases. We show that Sent-Debias is effective in removing biases, and at the same time, preserves performance on sentence-level downstream tasks such as sentiment analysis, linguistic acceptability, and natural language understanding. We hope that our work will inspire future research on characterizing and removing social biases from widely adopted sentence representations for fairer NLP.",Debiasing Representations|real-world scenarios|legal systems|debiasing,Ethics and NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.488.pdf -main.649,Predicting the Growth of Morphological Families from Social and Linguistic Factors,Valentin Hofmann|Janet Pierrehumbert|Hinrich Schütze,"We present the first study that examines the evolution of morphological families, i.e., sets of morphologically related words such as “trump”, “antitrumpism”, and “detrumpify”, in social media. We introduce the novel task of Morphological Family Expansion Prediction (MFEP) as predicting the increase in the size of a morphological family. We create a ten-year Reddit corpus as a benchmark for MFEP and evaluate a number of baselines on this benchmark. Our experiments demonstrate very good performance on MFEP.",evolution families|Morphological MFEP|MFEP|Growth Families,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.acl-main.649.pdf -main.661,Speech Translation and the End-to-End Promise: Taking Stock of Where We Are,Matthias Sperber|Matthias Paulik,"Over its three decade history, speech translation has experienced several shifts in its primary research themes; moving from loosely coupled cascades of speech recognition and machine translation, to exploring questions of tight coupling, and finally to end-to-end models that have recently attracted much attention. This paper provides a brief survey of these developments, along with a discussion of the main challenges of traditional approaches which stem from committing to intermediate representations from the speech recognizer, and from training cascaded models separately towards different objectives. 
Recent end-to-end modeling techniques promise a principled way of overcoming these issues by allowing joint training of all model components and removing the need for explicit intermediate representations. However, a closer look reveals that many end-to-end models fall short of solving these issues, due to compromises made to address data scarcity. This paper provides a unifying categorization and nomenclature that covers both traditional and recent approaches and that may help researchers by highlighting both trade-offs and open research questions.",Speech Translation|speech recognition|machine translation|data scarcity,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.661.pdf -main.107,Building a User-Generated Content North-African Arabizi Treebank: Tackling Hell,Djamé Seddah|Farah Essaidi|Amal Fethi|Matthieu Futeral|Benjamin Muller|Pedro Javier Ortiz Suárez|Benoît Sagot|Abhishek Srivastava,"We introduce the first treebank for a romanized user-generated content variety of Algerian, a North-African Arabic dialect known for its frequent usage of code-switching. Made of 1500 sentences, fully annotated in morpho-syntax and Universal Dependency syntax, with full translation at both the word and the sentence levels, this treebank is made freely available. It is supplemented with 50k unlabeled sentences collected from Common Crawl and web-crawled data using intensive data-mining techniques. Preliminary experiments demonstrate its usefulness for POS tagging and dependency parsing. We believe that what we present in this paper is useful beyond the low-resource language community. This is the first time that enough unlabeled and annotated data is provided for an emerging user-generated content dialectal language with rich morphology and code switching, making it a challenging test-bed for most recent NLP approaches.",POS tagging|dependency parsing|intensive techniques|NLP approaches,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.107.pdf -main.113,Multi-Hypothesis Machine Translation Evaluation,Marina Fomicheva|Lucia Specia|Francisco Guzmán,"Reliably evaluating Machine Translation (MT) through automated metrics is a long-standing problem. One of the main challenges is the fact that multiple outputs can be equally valid. Attempts to minimise this issue include metrics that relax the matching of MT output and reference strings, and the use of multiple references. The latter has been shown to significantly improve the performance of evaluation metrics. However, collecting multiple references is expensive and in practice a single reference is generally used. In this paper, we propose an alternative approach: instead of modelling linguistic variation in human reference we exploit the MT model uncertainty to generate multiple diverse translations and use these: (i) as surrogates to reference translations; (ii) to obtain a quantification of translation variability to either complement existing metric scores or (iii) replace references altogether.
We show that for a number of popular evaluation metrics our variability estimates lead to substantial improvements in correlation with human judgements of quality by up to 15%.",Multi-Hypothesis Evaluation|Machine MT|MT uncertainty|linguistic reference,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.113.pdf -main.675,Non-Linear Instance-Based Cross-Lingual Mapping for Non-Isomorphic Embedding Spaces,Goran Glavaš|Ivan Vulić,"We present InstaMap, an instance-based method for learning projection-based cross-lingual word embeddings. Unlike prior work, it deviates from learning a single global linear projection. InstaMap is a non-parametric model that learns a non-linear projection by iteratively: (1) finding a globally optimal rotation of the source embedding space relying on the Kabsch algorithm, and then (2) moving each point along an instance-specific translation vector estimated from the translation vectors of the point's nearest neighbours in the training dictionary. We report performance gains with InstaMap over four representative state-of-the-art projection-based models on bilingual lexicon induction across a set of 28 diverse language pairs. We note prominent improvements, especially for more distant language pairs (i.e., languages with non-isomorphic monolingual spaces).",bilingual induction|Non-Linear Mapping|InstaMap|instance-based method,Semantics: Lexical,Short,https://www.aclweb.org/anthology/2020.acl-main.675.pdf -main.729,Mapping Natural Language Instructions to Mobile UI Action Sequences,Yang Li|Jiacong He|Xin Zhou|Yuan Zhang|Jason Baldridge,"We present a new problem: grounding natural language instructions to mobile user interface actions, and contribute three new datasets for it. For full task evaluation, we create PixelHelp, a corpus that pairs English instructions with actions performed by people on a mobile UI emulator. To scale training, we decouple the language and action data by (a) annotating action phrase spans in How-To instructions and (b) synthesizing grounded descriptions of actions for mobile user interfaces. We use a Transformer to extract action phrase tuples from long-range natural language instructions. A grounding Transformer then contextually represents UI objects using both their content and screen position and connects them to object descriptions. Given a starting screen and instruction, our model achieves 70.59% accuracy on predicting complete ground-truth action sequences in PixelHelp.",full evaluation|scale training|predicting sequences|PixelHelp,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.729.pdf -main.715,Exploiting the Syntax-Model Consistency for Neural Relation Extraction,Amir Pouran Ben Veyseh|Franck Dernoncourt|Dejing Dou|Thien Huu Nguyen,"This paper studies the task of Relation Extraction (RE) that aims to identify the semantic relations between two entity mentions in text. In the deep learning models for RE, it has been beneficial to incorporate the syntactic structures from the dependency trees of the input sentences. In such models, the dependency trees are often used to directly structure the network architectures or to obtain the dependency relations between the word pairs to inject the syntactic information into the models via multi-task learning. The major problem with these approaches is the lack of generalization beyond the syntactic structures in the training data or the failure to capture the syntactic importance of the words for RE.
In order to overcome these issues, we propose a novel deep learning model for RE that uses the dependency trees to extract the syntax-based importance scores for the words, serving as a tree representation to introduce syntactic information into the models with greater generalization. In particular, we leverage Ordered-Neuron Long-Short Term Memory Networks (ON-LSTM) to infer the model-based importance scores for RE for every word in the sentences that are then regulated to be consistent with the syntax-based scores to enable syntactic information injection. We perform extensive experiments to demonstrate the effectiveness of the proposed method, leading to the state-of-the-art performance on three RE benchmark datasets.",Neural Extraction|Relation Extraction|RE|syntactic injection,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.715.pdf -main.701,"To Test Machine Comprehension, Start by Defining Comprehension",Jesse Dunietz|Greg Burnham|Akash Bharadwaj|Owen Rambow|Jennifer Chu-Carroll|Dave Ferrucci,"Many tasks aim to measure machine reading comprehension (MRC), often focusing on question types presumed to be difficult. Rarely, however, do task designers start by considering what systems should in fact comprehend. In this paper we make two key contributions. First, we argue that existing approaches do not adequately define comprehension; they are too unsystematic about what content is tested. Second, we present a detailed definition of comprehension—a ""Template of Understanding""—for a widely useful class of texts, namely short narratives. We then conduct an experiment that strongly suggests existing systems are not up to the task of narrative understanding as we define it.",Machine Comprehension|Defining Comprehension|MRC|narrative understanding,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.701.pdf -main.259,Verbal Multiword Expressions for Identification of Metaphor,Omid Rohanian|Marek Rei|Shiva Taslimipoor|Le An Ha,"Metaphor is a linguistic device in which a concept is expressed by mentioning another. Identifying metaphorical expressions, therefore, requires a non-compositional understanding of semantics. Multiword Expressions (MWEs), on the other hand, are linguistic phenomena with varying degrees of semantic opacity and their identification poses a challenge to computational models. This work is the first attempt at analysing the interplay of metaphor and MWEs processing through the design of a neural architecture whereby classification of metaphors is enhanced by informing the model of the presence of MWEs. To the best of our knowledge, this is the first ``MWE-aware"" metaphor identification system paving the way for further experiments on the complex interactions of these phenomena. The results and analyses show that this proposed architecture reaches state-of-the-art results on two different established metaphor datasets.",Verbal Expressions|Identification Metaphor|Identifying expressions|MWEs processing,Semantics: Lexical,Short,https://www.aclweb.org/anthology/2020.acl-main.259.pdf -main.271,Single Model Ensemble using Pseudo-Tags and Distinct Vectors,Ryosuke Kuwabara|Jun Suzuki|Hideki Nakayama,"Model ensemble techniques often increase task performance in neural networks; however, they require increased time, memory, and management effort. In this study, we propose a novel method that replicates the effects of a model ensemble with a single model.
Our approach creates K-virtual models within a single parameter space using K-distinct pseudo-tags and K-distinct vectors. Experiments on text classification and sequence labeling tasks on several datasets demonstrate that our method emulates or outperforms a traditional model ensemble with 1/K-times fewer parameters.",text tasks|Single Ensemble|Distinct Vectors|Model techniques,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.271.pdf -main.517,Learning to Customize Model Structures for Few-shot Dialogue Generation Tasks,Yiping Song|Zequn Liu|Wei Bi|Rui Yan|Ming Zhang,"Training the generative models with minimal corpus is one of the critical challenges for building open-domain dialogue systems. Existing methods tend to use the meta-learning framework which pre-trains the parameters on all non-target tasks then fine-tunes on the target task. However, fine-tuning distinguishes tasks from the parameter perspective but ignores the model-structure perspective, resulting in similar dialogue models for different tasks. In this paper, we propose an algorithm that can customize a unique dialogue model for each task in the few-shot setting. In our approach, each dialogue model consists of a shared module, a gating module, and a private module. The first two modules are shared among all the tasks, while the third one will differentiate into different network structures to better capture the characteristics of the corresponding task. The extensive experiments on two datasets show that our method outperforms all the baselines in terms of task consistency, response quality, and diversity.",Few-shot Tasks|open-domain systems|generative models|meta-learning framework,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.517.pdf -main.503,Selective Question Answering under Domain Shift,Amita Kamath|Robin Jia|Percy Liang,"To avoid giving wrong answers, question answering (QA) models need to know when to abstain from answering. Moreover, users often ask questions that diverge from the model's training data, making errors more likely and thus abstention more critical. In this work, we propose the setting of selective question answering under domain shift, in which a QA model is tested on a mixture of in-domain and out-of-domain data, and must answer (i.e., not abstain on) as many questions as possible while maintaining high accuracy. Abstention policies based solely on the model's softmax probabilities fare poorly, since models are overconfident on out-of-domain inputs. Instead, we train a calibrator to identify inputs on which the QA model errs, and abstain when it predicts an error is likely. Crucially, the calibrator benefits from observing the model's behavior on out-of-domain data, even if from a different domain than the test data. We combine this method with a SQuAD-trained QA model and evaluate on mixtures of SQuAD and five other QA datasets. 
Our method answers 56% of questions while maintaining 80% accuracy; in contrast, directly using the model's probabilities only answers 48% at 80% accuracy.",Selective Answering|Domain Shift|question models|QA models,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.503.pdf -main.265,Towards Understanding Gender Bias in Relation Extraction,Andrew Gaut|Tony Sun|Shirlyn Tang|Yuxin Huang|Jing Qian|Mai ElSherief|Jieyu Zhao|Diba Mirza|Elizabeth Belding|Kai-Wei Chang|William Yang Wang,"Recent developments in Neural Relation Extraction (NRE) have made significant strides towards Automated Knowledge Base Construction. While much attention has been dedicated towards improvements in accuracy, there have been no attempts in the literature to evaluate social biases exhibited in NRE systems. In this paper, we create WikiGenderBias, a distantly supervised dataset composed of over 45,000 sentences including a 10% human annotated test set for the purpose of analyzing gender bias in relation extraction systems. We find that when extracting spouse-of and hypernym (i.e., occupation) relations, an NRE system performs differently when the gender of the target entity is different. However, such disparity does not appear when extracting relations such as birthDate or birthPlace. We also analyze how existing bias mitigation techniques, such as name anonymization, word embedding debiasing, and data augmentation affect the NRE system in terms of maintaining the test performance and reducing biases. Unfortunately, because NRE models rely heavily on surface level cues, we find that existing bias mitigation approaches have a negative effect on NRE. Our analysis lays groundwork for future quantifying and mitigating bias in NRE.",Relation Extraction|Automated Construction|name anonymization|data augmentation,Ethics and NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.265.pdf -main.502,SCDE: Sentence Cloze Dataset with High Quality Distractors From Examinations,Xiang Kong|Varun Gangal|Eduard Hovy,"We introduce SCDE, a dataset to evaluate the performance of computational models through sentence prediction. SCDE is a human created sentence cloze dataset, collected from public school English examinations. Our task requires a model to fill up multiple blanks in a passage from a shared candidate set with distractors designed by English teachers. Experimental results demonstrate that this task requires the use of non-local, discourse-level context beyond the immediate sentence neighborhood. The blanks require joint solving and significantly impair each other’s context. Furthermore, through ablations, we show that the distractors are of high quality and make the task more challenging. Our experiments show that there is a significant performance gap between advanced models (72%) and humans (87%), encouraging future models to bridge this gap.",SCDE|computational models|sentence prediction|joint solving,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.502.pdf -main.264,Mitigating Gender Bias Amplification in Distribution by Posterior Regularization,Shengyu Jia|Tao Meng|Jieyu Zhao|Kai-Wei Chang,"Advanced machine learning techniques have boosted the performance of natural language processing. Nevertheless, recent studies, e.g., show that these techniques inadvertently capture the societal bias hidden in the corpus and further amplify it. However, their analysis is conducted only on models' top predictions.
In this paper, we investigate the gender bias amplification issue from the distribution perspective and demonstrate that the bias is amplified in the view of predicted probability distribution over labels. We further propose a bias mitigation approach based on posterior regularization. With little performance loss, our method can almost remove the bias amplification in the distribution. Our study sheds the light on understanding the bias amplification.",Mitigating Amplification|natural processing|gender issue|Posterior Regularization,Ethics and NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.264.pdf -main.270,Improving Transformer Models by Reordering their Sublayers,Ofir Press|Noah A. Smith|Omer Levy,"Multilayer transformer networks consist of interleaved self-attention and feedforward sublayers. Could ordering the sublayers in a different pattern lead to better performance? We generate randomly ordered transformers and train them with the language modeling objective. We observe that some of these models are able to achieve better performance than the interleaved baseline, and that those successful variants tend to have more self-attention at the bottom and more feedforward sublayers at the top. We propose a new transformer pattern that adheres to this property, the sandwich transformer, and show that it improves perplexity on multiple word-level and character-level language modeling benchmarks, at no cost in parameters, memory, or training time. However, the sandwich reordering pattern does not guarantee performance gains across every task, as we demonstrate on machine translation models. Instead, we suggest that further exploration of task-specific sublayer reorderings is needed in order to unlock additional gains.",task-specific reorderings|Transformer Models|Multilayer networks|randomly transformers,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.270.pdf -main.516,"Generate, Delete and Rewrite: A Three-Stage Framework for Improving Persona Consistency of Dialogue Generation",Haoyu Song|Yan Wang|Wei-Nan Zhang|Xiaojiang Liu|Ting Liu,"Maintaining a consistent personality in conversations is quite natural for human beings, but is still a non-trivial task for machines. The persona-based dialogue generation task is thus introduced to tackle the personality-inconsistent problem by incorporating explicit persona text into dialogue generation models. Despite the success of existing persona-based models on generating human-like responses, their one-stage decoding framework can hardly avoid the generation of inconsistent persona words. In this work, we introduce a three-stage framework that employs a generate-delete-rewrite mechanism to delete inconsistent words from a generated response prototype and further rewrite it to a personality-consistent one. We carry out evaluations by both human and automatic metrics. Experiments on the Persona-Chat dataset show that our approach achieves good performance.",Persona Generation|persona-based task|personality-inconsistent problem|generating responses,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.516.pdf -main.258,Predicting Degrees of Technicality in Automatic Terminology Extraction,Anna Hätty|Dominik Schlechtweg|Michael Dorna|Sabine Schulte im Walde,"While automatic term extraction is a well-researched area, computational approaches to distinguish between degrees of technicality are still understudied. 
We semi-automatically create a German gold standard of technicality across four domains, and illustrate the impact of a web-crawled general-language corpus on technicality prediction. When defining a classification approach that combines general-language and domain-specific word embeddings, we go beyond previous work and align vector spaces to gain comparative embeddings. We suggest two novel models to exploit general- vs. domain-specific comparisons: a simple neural network model with pre-computed comparative-embedding information as input, and a multi-channel model computing the comparison internally. Both models outperform previous approaches, with the multi-channel model performing best.",Automatic Extraction|technicality prediction|computational approaches|classification approach,Semantics: Lexical,Short,https://www.aclweb.org/anthology/2020.acl-main.258.pdf -main.700,Returning the N to NLP: Towards Contextually Personalized Classification Models,Lucie Flek,"Most NLP models today treat language as universal, even though socio- and psycholinguistic research shows that the communicated message is influenced by the characteristics of the speaker as well as the target audience. This paper surveys the landscape of personalization in natural language processing and related fields, and offers a path forward to mitigate the decades of deviation of the NLP tools from sociolinguistic findings, allowing to flexibly process the ``natural'' language of each user rather than enforcing a uniform NLP treatment. It outlines a possible direction to incorporate these aspects into neural NLP models by means of socially contextual personalization, and proposes to shift the focus of our evaluation strategies accordingly.",NLP|natural fields|Contextually Models|NLP models,Theme,Short,https://www.aclweb.org/anthology/2020.acl-main.700.pdf -main.714,Document-Level Event Role Filler Extraction using Multi-Granularity Contextualized Encoding,Xinya Du|Claire Cardie,"Few works in the literature of event extraction have gone beyond individual sentences to make extraction decisions. This is problematic when the information needed to recognize an event argument is spread across multiple sentences. We argue that document-level event extraction is a difficult task since it requires a view of a larger context to determine which spans of text correspond to event role fillers. We first investigate how end-to-end neural sequence models (with pre-trained language model representations) perform on document-level role filler extraction, as well as how the length of context captured affects the models’ performance. To dynamically aggregate information captured by neural representations learned at different levels of granularity (e.g., the sentence- and paragraph-level), we propose a novel multi-granularity reader. We evaluate our models on the MUC-4 event extraction dataset, and show that our best system performs substantially better than prior work.
We also report findings on the relationship between context length and neural model performance on the task.",Document-Level Extraction|event extraction|extraction decisions|Multi-Granularity Encoding,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.714.pdf -main.728,History for Visual Dialog: Do we really need it?,Shubham Agarwal|Trung Bui|Joon-Young Lee|Ioannis Konstas|Verena Rieser,"Visual Dialogue involves ""understanding'' the dialogue history (what has been discussed previously) and the current question (what is asked), in addition to grounding information in the image, to accurately generate the correct response. In this paper, we show that co-attention models which explicitly encode dialogue history outperform models that don't, achieving state-of-the-art performance (72 % NDCG on val set). However, we also expose shortcomings of the crowdsourcing dataset collection procedure, by showing that dialogue history is indeed only required for a small amount of the data, and that the current evaluation metric encourages generic replies. To that end, we propose a challenging subset (VisdialConv) of the VisdialVal set and the benchmark NDCG of 63%.",Visual Dialogue|Visual Dialog|co-attention models|crowdsourcing procedure,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.728.pdf -main.112,Learning and Evaluating Emotion Lexicons for 91 Languages,Sven Buechel|Susanna Rücker|Udo Hahn,"Emotion lexicons describe the affective meaning of words and thus constitute a centerpiece for advanced sentiment and emotion analysis. Yet, manually curated lexicons are only available for a handful of languages, leaving most languages of the world without such a precious resource for downstream applications. Even worse, their coverage is often limited both in terms of the lexical units they contain and the emotional variables they feature. In order to break this bottleneck, we here introduce a methodology for creating almost arbitrarily large emotion lexicons for any target language. Our approach requires nothing but a source language emotion lexicon, a bilingual word translation model, and a target language embedding model. Fulfilling these requirements for 91 languages, we are able to generate representationally rich high-coverage lexicons comprising eight emotional variables with more than 100k lexical entries each. We evaluated the automatically generated lexicons against human judgment from 26 datasets, spanning 12 typologically diverse languages, and found that our approach produces results in line with state-of-the-art monolingual approaches to lexicon creation and even surpasses human reliability for some languages and variables. Code and data are available at https://github.com/JULIELab/MEmoLon archived under DOI 10.5281/zenodo.3779901.",sentiment analysis|downstream applications|lexicon creation|bilingual model,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.112.pdf -main.674,Understanding Advertisements with BERT,Kanika Kalra|Bhargav Kurma|Silpa Vadakkeeveetil Sreelatha|Manasi Patwardhan|Shirish Karande,"We consider a task based on CVPR 2018 challenge dataset on advertisement (Ad) understanding. The task involves detecting the viewer's interpretation of an Ad image captured as text. Recent results have shown that the embedded scene-text in the image holds a vital cue for this task. Motivated by this, we fine-tune the base BERT model for a sentence-pair classification task.
Despite utilizing the scene-text as the only source of visual information, we could achieve a hit-or-miss accuracy of 84.95% on the challenge test data. To enable BERT to process other visual information, we append image captions to the scene-text. This achieves an accuracy of 89.69%, which is an improvement of 4.7%. This is the best reported result for this task.",Advertisements|advertisement understanding|sentence-pair task|BERT model,NLP Applications,Short,https://www.aclweb.org/anthology/2020.acl-main.674.pdf -main.660,From SPMRL to NMRL: What Did We Learn (and Unlearn) in a Decade of Parsing Morphologically-Rich Languages (MRLs)?,Reut Tsarfaty|Dan Bareket|Stav Klein|Amit Seker,"It has been exactly a decade since the first establishment of SPMRL, a research initiative unifying multiple research efforts to address the peculiar challenges of Statistical Parsing for Morphologically-Rich Languages (MRLs). Here we reflect on parsing MRLs in that decade, highlight the solutions and lessons learned for the architectural, modeling and lexical challenges in the pre-neural era, and argue that similar challenges re-emerge in neural architectures for MRLs. We then aim to offer a climax, suggesting that incorporating symbolic ideas proposed in SPMRL terms into nowadays neural architectures has the potential to push NLP for MRLs to a new level. We sketch strategies for designing Neural Models for MRLs (NMRL), and showcase preliminary support for these strategies via investigating the task of multi-tagging in Hebrew, a morphologically-rich, high-fusion language.",parsing MRLs|MRLs|NLP|multi-tagging,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.660.pdf -main.106,A Graph Auto-encoder Model of Derivational Morphology,Valentin Hofmann|Hinrich Schütze|Janet Pierrehumbert,"There has been little work on modeling the morphological well-formedness (MWF) of derivatives, a problem judged to be complex and difficult in linguistics. We present a graph auto-encoder that learns embeddings capturing information about the compatibility of affixes and stems in derivation. The auto-encoder models MWF in English surprisingly well by combining syntactic and semantic information with associative information from the mental lexicon.",Graph Morphology|graph auto-encoder|auto-encoder MWF|MWF,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.acl-main.106.pdf -main.648,2kenize: Tying Subword Sequences for Chinese Script Conversion,- Pranav A|Isabelle Augenstein,"Simplified Chinese to Traditional Chinese character conversion is a common preprocessing step in Chinese NLP. Despite this, current approaches have insufficient performance because they do not take into account that a simplified Chinese character can correspond to multiple traditional characters. Here, we propose a model that can disambiguate between mappings and convert between the two scripts. The model is based on subword segmentation, two language models, as well as a method for mapping between subword sequences. We further construct benchmark datasets for topic classification and script conversion. Our proposed method outperforms previous Chinese Character conversion approaches by 6 points in accuracy. These results are further confirmed in a downstream application, where 2kenize is used to convert pretraining dataset for topic classification.
An error analysis reveals that our method's particular strengths are in dealing with code mixing and named entities.",Chinese Conversion|Chinese NLP|mapping sequences|topic classification,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.acl-main.648.pdf -main.489,A Re-evaluation of Knowledge Graph Completion Methods,Zhiqing Sun|Shikhar Vashishth|Soumya Sanyal|Partha Talukdar|Yiming Yang,"Knowledge Graph Completion (KGC) aims at automatically predicting missing links for large-scale knowledge graphs. A vast number of state-of-the-art KGC techniques have got published at top conferences in several research fields, including data mining, machine learning, and natural language processing. However, we notice that several recent papers report very high performance, which largely outperforms previous state-of-the-art methods. In this paper, we find that this can be attributed to the inappropriate evaluation protocol used by them and propose a simple evaluation protocol to address this problem. The proposed protocol is robust to handle bias in the model, which can substantially affect the final results. We conduct extensive experiments and report performance of several existing methods using our protocol. The reproducible code has been made publicly available.",large-scale graphs|data mining|machine learning|natural processing,Interpretability and Analysis of Models for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.489.pdf -main.476,Understanding the Language of Political Agreement and Disagreement in Legislative Texts,Maryam Davoodi|Eric Waltenburg|Dan Goldwasser,"While national politics often receive the spotlight, the overwhelming majority of legislation proposed, discussed, and enacted is done at the state level. Despite this fact, there is little awareness of the dynamics that lead to adopting these policies. In this paper, we take the first step towards a better understanding of these processes and the underlying dynamics that shape them, using data-driven methods. We build a new large-scale dataset, from multiple data sources, connecting state bills and legislator information, geographical information about their districts, and donations and donors' information. We suggest a novel task, predicting the legislative body's vote breakdown for a given bill, according to different criteria of interest, such as gender, rural-urban and ideological splits. Finally, we suggest a shared relational embedding model, representing the interactions between the text of the bill and the legislative context in which it is presented. Our experiments show that providing this context helps improve the prediction over strong text-based models.",data-driven methods|shared model|text-based models|national politics,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.476.pdf -main.310,On the Robustness of Language Encoders against Grammatical Errors,Fan Yin|Quanyu Long|Tao Meng|Kai-Wei Chang,"We conduct a thorough study to diagnose the behaviors of pre-trained language encoders (ELMo, BERT, and RoBERTa) when confronted with natural grammatical errors. Specifically, we collect real grammatical errors from non-native speakers and conduct adversarial attacks to simulate these errors on clean text data. We use this approach to facilitate debugging models on downstream applications. Results confirm that the performance of all tested models is affected but the degree of impact varies. 
To interpret model behaviors, we further design a linguistic acceptability task to reveal their abilities in identifying ungrammatical sentences and the position of errors. We find that fixed contextual encoders with a simple classifier trained on the prediction of sentence correctness are able to locate error positions. We also design a cloze test for BERT and discover that BERT captures the interaction between errors and specific tokens in context. Our results shed light on understanding the robustness and behaviors of language encoders against grammatical errors.",downstream applications|linguistic task|Language Encoders|pre-trained encoders,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.310.pdf -main.304,Structure-Level Knowledge Distillation For Multilingual Sequence Labeling,Xinyu Wang|Yong Jiang|Nguyen Bach|Tao Wang|Fei Huang|Kewei Tu,"Multilingual sequence labeling is a task of predicting label sequences using a single unified model for multiple languages. Compared with relying on multiple monolingual models, using a multilingual model has the benefit of a smaller model size, easier in online serving, and generalizability to low-resource languages. However, current multilingual models still underperform individual monolingual models significantly due to model capacity limitations. In this paper, we propose to reduce the gap between monolingual models and the unified multilingual model by distilling the structural knowledge of several monolingual models (teachers) to the unified multilingual model (student). We propose two novel KD methods based on structure-level information: (1) approximately minimizes the distance between the student's and the teachers' structure-level probability distributions, (2) aggregates the structure-level knowledge to local distributions and minimizes the distance between two local probability distributions. Our experiments on 4 multilingual tasks with 25 datasets show that our approaches outperform several strong baselines and have stronger zero-shot generalizability than both the baseline model and teacher models.",Multilingual Labeling|predicting sequences|online serving|Structure-Level Distillation,"Syntax: Tagging, Chunking and Parsing",Long,https://www.aclweb.org/anthology/2020.acl-main.304.pdf -main.462,(Re)construing Meaning in NLP,Sean Trott|Tiago Timponi Torrent|Nancy Chang|Nathan Schneider,"Human speakers have an extensive toolkit of ways to express themselves. In this paper, we engage with an idea largely absent from discussions of meaning in natural language understanding—namely, that the way something is expressed reflects different ways of conceptualizing or construing the information being conveyed. We first define this phenomenon more precisely, drawing on considerable prior work in theoretical cognitive semantics and psycholinguistics. We then survey some dimensions of construed meaning and show how insights from construal could inform theoretical and practical work in NLP.",NLP|natural understanding|theoretical semantics|construed meaning,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.462.pdf -main.33,Text Classification with Negative Supervision,Sora Ohashi|Junya Takayama|Tomoyuki Kajiwara|Chenhui Chu|Yuki Arase,"Advanced pre-trained models for text representation have achieved state-of-the-art performance on various text classification tasks. However, the discrepancy between the semantic similarity of texts and labelling standards affects classifiers, i.e. 
leading to lower performance in cases where classifiers should assign different labels to semantically similar texts. To address this problem, we propose a simple multitask learning model that uses negative supervision. Specifically, our model encourages texts with different labels to have distinct representations. Comprehensive experiments show that our model outperforms the state-of-the-art pre-trained model on both single- and multi-label classifications, sentence and document classifications, and classifications in three different languages.",Text Classification|text representation|text tasks|single- classifications,Information Retrieval and Text Mining,Short,https://www.aclweb.org/anthology/2020.acl-main.33.pdf -main.338,Aspect Sentiment Classification with Document-level Sentiment Preference Modeling,Xiao Chen|Changlong Sun|Jingjing Wang|Shoushan Li|Luo Si|Min Zhang|Guodong Zhou,"In the literature, existing studies always consider Aspect Sentiment Classification (ASC) as an independent sentence-level classification problem aspect by aspect, which largely ignore the document-level sentiment preference information, though obviously such information is crucial for alleviating the information deficiency problem in ASC. In this paper, we explore two kinds of sentiment preference information inside a document, i.e., contextual sentiment consistency w.r.t. the same aspect (namely intra-aspect sentiment consistency) and contextual sentiment tendency w.r.t. all the related aspects (namely inter-aspect sentiment tendency). On the basis, we propose a Cooperative Graph Attention Networks (CoGAN) approach for cooperatively learning the aspect-related sentence representation. Specifically, two graph attention networks are leveraged to model above two kinds of document-level sentiment preference information respectively, followed by an interactive mechanism to integrate the two-fold preference. Detailed evaluation demonstrates the great advantage of the proposed approach to ASC over the state-of-the-art baselines. This justifies the importance of the document-level sentiment preference information to ASC and the effectiveness of our approach capturing such information.",Aspect Classification|ASC|independent problem|information problem,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.338.pdf -main.27,TAG : Type Auxiliary Guiding for Code Comment Generation,Ruichu Cai|Zhihao Liang|Boyan Xu|zijian li|Yuexing Hao|Yao Chen,"Existing leading code comment generation approaches with the structure-to-sequence framework ignores the type information of the interpretation of the code, e.g., operator, string, etc. However, introducing the type information into the existing framework is non-trivial due to the hierarchical dependence among the type information. In order to address the issues above, we propose a Type Auxiliary Guiding encoder-decoder framework for the code comment generation task which considers the source code as an N-ary tree with type information associated with each node. Specifically, our framework is featured with a Type-associated Encoder and a Type-restricted Decoder which enables adaptive summarization of the source code. We further propose a hierarchical reinforcement learning method to resolve the training difficulties of our proposed framework. 
Extensive evaluations demonstrate the state-of-the-art performance of our framework with both the auto-evaluated metrics and case studies.",Code Generation|code task|adaptive code|TAG,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.27.pdf -main.466,How Does NLP Benefit Legal System: A Summary of Legal Artificial Intelligence,Haoxi Zhong|Chaojun Xiao|Cunchao Tu|Tianyang Zhang|Zhiyuan Liu|Maosong Sun,"Legal Artificial Intelligence (LegalAI) focuses on applying the technology of artificial intelligence, especially natural language processing, to benefit tasks in the legal domain. In recent years, LegalAI has drawn increasing attention rapidly from both AI researchers and legal professionals, as LegalAI is beneficial to the legal system for liberating legal professionals from a maze of paperwork. Legal professionals often think about how to solve tasks from rule-based and symbol-based methods, while NLP researchers concentrate more on data-driven and embedding methods. In this paper, we introduce the history, the current state, and the future directions of research in LegalAI. We illustrate the tasks from the perspectives of legal professionals and NLP researchers and show several representative applications in LegalAI. We conduct experiments and provide an in-depth analysis of the advantages and disadvantages of existing works to explore possible future directions. You can find the implementation of our work from https://github.com/thunlp/CLAIM.",Legal Intelligence|natural processing|Legal System|artificial intelligence,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.466.pdf -main.300,An Empirical Comparison of Unsupervised Constituency Parsing Methods,Jun Li|Yifan Cao|Jiong Cai|Yong Jiang|Kewei Tu,"Unsupervised constituency parsing aims to learn a constituency parser from a training corpus without parse tree annotations. While many methods have been proposed to tackle the problem, including statistical and neural methods, their experimental results are often not directly comparable due to discrepancies in datasets, data preprocessing, lexicalization, and evaluation metrics. In this paper, we first examine experimental settings used in previous work and propose to standardize the settings for better comparability between methods. We then empirically compare several existing methods, including decade-old and newly proposed ones, under the standardized settings on English and Japanese, two languages with different branching tendencies. We find that recent models do not show a clear advantage over decade-old models in our experiments. We hope our work can provide new insights into existing methods and facilitate future empirical evaluation of unsupervised constituency parsing.",data preprocessing|empirical parsing|unsupervised parsing|Unsupervised Methods,"Syntax: Tagging, Chunking and Parsing",Short,https://www.aclweb.org/anthology/2020.acl-main.300.pdf -main.314,Do you have the right scissors? Tailoring Pre-trained Language Models via Monte-Carlo Methods,Ning Miao|Yuxuan Song|Hao Zhou|Lei Li,"It has been a common approach to pre-train a language model on a large corpus and fine-tune it on task-specific data. In practice, we observe that fine-tuning a pre-trained model on a small dataset may lead to over- and/or under-estimate problem. In this paper, we propose MC-Tailor, a novel method to alleviate the above issue in text generation tasks by truncating and transferring the probability mass from over-estimated regions to under-estimated ones. 
Experiments on a variety of text generation datasets show that MC-Tailor consistently and significantly outperforms the fine-tuning approach.",over- problem|text tasks|Tailoring Models|Monte-Carlo Methods,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.314.pdf -main.472,Hierarchical Modeling for User Personality Prediction: The Role of Message-Level Attention,Veronica Lynn|Niranjan Balasubramanian|H. Andrew Schwartz,"Not all documents are equally important. Language processing is increasingly finding use as a supplement for questionnaires to assess psychological attributes of consenting individuals, but most approaches neglect to consider whether all documents of an individual are equally informative. In this paper, we present a novel model that uses message-level attention to learn the relative weight of users' social media posts for assessing their five factor personality traits. We demonstrate that models with message-level attention outperform those with word-level attention, and ultimately yield state-of-the-art accuracies for all five traits by using both word and message attention in combination with past approaches (an average increase in Pearson r of 2.5%). In addition, examination of the high-signal posts identified by our model provides insight into the relationship between language and personality, helping to inform future work.",User Prediction|Language processing|Hierarchical Modeling|Message-Level Attention,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.472.pdf -main.23,Pre-train and Plug-in: Flexible Conditional Text Generation with Variational Auto-Encoders,Yu Duan|Canwen Xu|Jiaxin Pei|Jialong Han|Chenliang Li,"Conditional Text Generation has drawn much attention as a topic of Natural Language Generation (NLG) which provides the possibility for humans to control the properties of generated contents. Current conditional generation models cannot handle emerging conditions due to their joint end-to-end learning fashion. When a new condition is added, these techniques require full retraining. In this paper, we present a new framework named Pre-train and Plug-in Variational Auto-Encoder (PPVAE) towards flexible conditional text generation. PPVAE decouples the text generation module from the condition representation module to allow ""one-to-many'' conditional generation. When a fresh condition emerges, only a lightweight network needs to be trained and works as a plug-in for PPVAE, which is efficient and desirable for real-world applications. Extensive experiments demonstrate the superiority of PPVAE against the existing alternatives with better conditionality and diversity but less training effort.",Flexible Generation|Conditional Generation|Natural Generation|Natural NLG,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.23.pdf -main.37,Learning Source Phrase Representations for Neural Machine Translation,Hongfei Xu|Josef van Genabith|Deyi Xiong|Qiuhui Liu|Jingyi Zhang,"The Transformer translation model (Vaswani et al., 2017) based on a multi-head attention mechanism can be computed effectively in parallel and has significantly pushed forward the performance of Neural Machine Translation (NMT). Though intuitively the attentional network can connect distant words via shorter network paths than RNNs, empirical analysis demonstrates that it still has difficulty in fully capturing long-distance dependencies (Tang et al., 2018).
Considering that modeling phrases instead of words has significantly improved the Statistical Machine Translation (SMT) approach through the use of larger translation blocks (""phrases"") and its reordering ability, modeling NMT at phrase level is an intuitive proposal to help the model capture long-distance relationships. In this paper, we first propose an attentive phrase representation generation mechanism which is able to generate phrase representations from corresponding token representations. In addition, we incorporate the generated phrase representations into the Transformer translation model to enhance its ability to capture long-distance relationships. In our experiments, we obtain significant improvements on the WMT 14 English-German and English-French tasks on top of the strong Transformer baseline, which shows the effectiveness of our approach. Our approach helps Transformer Base models perform at the level of Transformer Big models, and even significantly better for long sentences, but with substantially fewer parameters and training steps. The fact that phrase representations help even in the big setting further supports our conjecture that they make a valuable contribution to long-distance relations.",Neural Translation|WMT tasks|Learning Representations|Transformer model,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.37.pdf -main.328,ChartDialogs: Plotting from Natural Language Instructions,Yutong Shao|Ndapa Nakashole,"This paper presents the problem of conversational plotting agents that carry out plotting actions from natural language instructions. To facilitate the development of such agents, we introduce ChartDialogs, a new multi-turn dialog dataset, covering a popular plotting library, matplotlib. The dataset contains over 15,000 dialog turns from 3,200 dialogs covering the majority of matplotlib plot types. Extensive experiments show the best-performing method achieving 61% plotting accuracy, demonstrating that the dataset presents a non-trivial challenge for future research on this task.",Natural Instructions|conversational agents|ChartDialogs|plotting library,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.328.pdf -main.499,Logic-Guided Data Augmentation and Regularization for Consistent Question Answering,Akari Asai|Hannaneh Hajishirzi,"Many natural language questions require qualitative, quantitative or logical comparisons between two entities or events. This paper addresses the problem of improving the accuracy and consistency of responses to comparison questions by integrating logic rules and neural models. Our method leverages logical and linguistic knowledge to augment labeled training data and then uses a consistency-based regularizer to train the model. Improving the global consistency of predictions, our approach achieves large improvements over previous methods in a variety of question answering (QA) tasks, including multiple-choice qualitative reasoning, cause-effect reasoning, and extractive machine reading comprehension. In particular, our method significantly improves the performance of RoBERTa-based models by 1-5% across datasets. We advance state of the art by around 5-8% on WIQA and QuaRel and reduce consistency violations by 58% on HotpotQA. 
We further demonstrate that our approach can learn effectively from limited data.",Logic-Guided Augmentation|Regularization|Consistent Answering|natural questions,Question Answering,Short,https://www.aclweb.org/anthology/2020.acl-main.499.pdf -main.102,Dynamic Memory Induction Networks for Few-Shot Text Classification,Ruiying Geng|Binhua Li|Yongbin Li|Jian Sun|Xiaodan Zhu,"This paper proposes Dynamic Memory Induction Networks (DMIN) for few-shot text classification. The model develops a dynamic routing mechanism over static memory, enabling it to better adapt to unseen classes, a critical capability for few-shot classification. The model also expands the induction process with supervised learning weights and query information to enhance the generalization ability of meta-learning. The proposed model brings forward the state-of-the-art performance significantly by 2~4% improvement on the miniRCV1 and ODIC datasets. Detailed analysis is further performed to show how the proposed network achieves the new performance.",Few-Shot Classification|few-short classification|Dynamic Networks|Dynamic DMIN,Information Retrieval and Text Mining,Short,https://www.aclweb.org/anthology/2020.acl-main.102.pdf -main.664,Improving Image Captioning with Better Use of Caption,Zhan Shi|Xu Zhou|Xipeng Qiu|Xiaodan Zhu,"Image captioning is a multimodal problem that has drawn extensive attention in both the natural language processing and computer vision community. In this paper, we present a novel image captioning architecture to better explore semantics available in captions and leverage that to enhance both image representation and caption generation. Our models first construct caption-guided visual relationship graphs that introduce beneficial inductive bias using weakly supervised multi-instance learning. The representation is then enhanced with neighbouring and contextual nodes with their textual and visual features. During generation, the model further incorporates visual relationships using multi-task learning for jointly predicting word and object/predicate tag sequences. We perform extensive experiments on the MSCOCO dataset, showing that the proposed framework significantly outperforms the baselines, resulting in the state-of-the-art performance under a wide range of evaluation metrics. The code of our paper has been made publicly available.",Image Captioning|multimodal problem|natural processing|computer community,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.664.pdf -main.670,SciREX: A Challenge Dataset for Document-Level Information Extraction,Sarthak Jain|Madeleine van Zuylen|Hannaneh Hajishirzi|Iz Beltagy,"Extracting information from full documents is an important problem in many domains, but most previous work focus on identifying relationships within a sentence or a paragraph. It is challenging to create a large-scale information extraction (IE) dataset at the document level since it requires an understanding of the whole document to annotate entities and their document-level relationships that usually span beyond sentences or even sections. In this paper, we introduce SciREX, a document level IE dataset that encompasses multiple IE tasks, including salient entity identification and document level N-ary relation identification from scientific articles. We annotate our dataset by integrating automatic and human annotations, leveraging existing scientific knowledge resources.
We develop a neural model as a strong baseline that extends previous state-of-the-art IE models to document-level IE. Analyzing the model performance shows a significant gap between human performance and current baselines, inviting the community to use our dataset as a challenge to develop document-level IE models. Our data and code are publicly available at https://github.com/allenai/SciREX .",Document-Level Extraction|IE tasks|salient identification|document identification,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.670.pdf -main.116,The SOFC-Exp Corpus and Neural Approaches to Information Extraction in the Materials Science Domain,Annemarie Friedrich|Heike Adel|Federico Tomazic|Johannes Hingerl|Renou Benteau|Anika Marusczyk|Lukas Lange,"This paper presents a new challenging information extraction task in the domain of materials science. We develop an annotation scheme for marking information on experiments related to solid oxide fuel cells in scientific publications, such as involved materials and measurement conditions. With this paper, we publish our annotation guidelines, as well as our SOFC-Exp corpus consisting of 45 open-access scholarly articles annotated by domain experts. A corpus and an inter-annotator agreement study demonstrate the complexity of the suggested named entity recognition and slot filling tasks as well as high annotation quality. We also present strong neural-network based models for a variety of tasks that can be addressed on the basis of our new data set. On all tasks, using BERT embeddings leads to large performance gains, but with increasing task complexity, adding a recurrent neural network on top seems beneficial. Our models will serve as competitive baselines in future work, and analysis of their performance highlights difficult cases when modeling the data and suggests promising research directions.",Information Extraction|information task|materials science|slot tasks,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.116.pdf -main.658,A Call for More Rigor in Unsupervised Cross-lingual Learning,Mikel Artetxe|Sebastian Ruder|Dani Yogatama|Gorka Labaka|Eneko Agirre,"We review motivations, definition, approaches, and methodology for unsupervised cross-lingual learning and call for a more rigorous position in each of them. An existing rationale for such research is based on the lack of parallel data for many of the world's languages. However, we argue that a scenario without any parallel data and abundant monolingual data is unrealistic in practice. We also discuss different training signals that have been used in previous work, which depart from the pure unsupervised setting. We then describe common methodological issues in tuning and evaluation of unsupervised cross-lingual models and present best practices. Finally, we provide a unified outlook for different types of research in this area (i.e., cross-lingual word embeddings, deep multilingual pretraining, and unsupervised machine translation) and argue for comparable evaluation of these models.",unsupervised learning|unsupervised setting|cross-lingual embeddings|deep pretraining,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.658.pdf -main.710,One Size Does Not Fit All: Generating and Evaluating Variable Number of Keyphrases,Xingdi Yuan|Tong Wang|Rui Meng|Khushboo Thaker|Peter Brusilovsky|Daqing He|Adam Trischler,"Different texts shall by nature correspond to different number of keyphrases. 
This desideratum is largely missing from existing neural keyphrase generation models. In this study, we address this problem from both modeling and evaluation perspectives. We first propose a recurrent generative model that generates multiple keyphrases as delimiter-separated sequences. Generation diversity is further enhanced with two novel techniques by manipulating decoder hidden states. In contrast to previous approaches, our model is capable of generating diverse keyphrases and controlling number of outputs. We further propose two evaluation metrics tailored towards the variable-number generation. We also introduce a new dataset StackEx that expands beyond the only existing genre (i.e., academic writing) in keyphrase generation tasks. With both previous and new evaluation metrics, our model outperforms strong baselines on all datasets.",modeling perspectives|variable-number generation|keyphrase tasks|neural models,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.710.pdf -main.704,BLEURT: Learning Robust Metrics for Text Generation,Thibault Sellam|Dipanjan Das|Ankur Parikh,"Text generation has made significant advances in the last few years. Yet, evaluation metrics have lagged behind, as the most popular choices (e.g., BLEU and ROUGE) may correlate poorly with human judgment. We propose BLEURT, a learned evaluation metric for English based on BERT. BLEURT can model human judgment with a few thousand possibly biased training examples. A key aspect of our approach is a novel pre-training scheme that uses millions of synthetic examples to help the model generalize. BLEURT provides state-of-the-art results on the last three years of the WMT Metrics shared task and the WebNLG data set. In contrast to a vanilla BERT-based approach, it yields superior results even when the training data is scarce and out-of-distribution.",Learning Metrics|Text Generation|WMT task|pre-training scheme,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.704.pdf -main.738,Active Learning for Coreference Resolution using Discrete Annotation,Belinda Z. Li|Gabriel Stanovsky|Luke Zettlemoyer,"We improve upon pairwise annotation for active learning in coreference resolution, by asking annotators to identify mention antecedents if a presented mention pair is deemed not coreferent. This simple modification, when combined with a novel mention clustering algorithm for selecting which examples to label, is much more efficient in terms of the performance obtained per annotation budget. In experiments with existing benchmark coreference datasets, we show that the signal from this additional question leads to significant performance gains per human-annotation hour. Future work can use our annotation protocol to effectively develop coreference models for new domains. Our code is publicly available.",Coreference Resolution|active resolution|Active Learning|Discrete Annotation,Semantics: Sentence Level,Short,https://www.aclweb.org/anthology/2020.acl-main.738.pdf -main.512,Entity-Aware Dependency-Based Deep Graph Attention Network for Comparative Preference Classification,Nianzu Ma|Sahisnu Mazumder|Hao Wang|Bing Liu,"This paper studies the task of comparative preference classification (CPC). Given two entities in a sentence, our goal is to classify whether the first (or the second) entity is preferred over the other or no comparison is expressed at all between the two entities. 
Existing works either do not learn entity-aware representations well and fail to deal with sentences involving multiple entity pairs or use sequential modeling approaches that are unable to capture long-range dependencies between the entities. Some also use traditional machine learning approaches that do not generalize well. This paper proposes a novel Entity-aware Dependency-based Deep Graph Attention Network (ED-GAT) that employs a multi-hop graph attention over a dependency graph sentence representation to leverage both the semantic information from word embeddings and the syntactic information from the dependency graph to solve the problem. Empirical evaluation shows that the proposed model achieves the state-of-the-art performance in comparative preference classification.",Comparative Classification|Entity-Aware Network|CPC|entity-aware representations,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Short,https://www.aclweb.org/anthology/2020.acl-main.512.pdf -main.274,A Relaxed Matching Procedure for Unsupervised BLI,Xu Zhao|Zihao Wang|Yong Zhang|Hao Wu,"Recently unsupervised Bilingual Lexicon Induction (BLI) without any parallel corpus has attracted much research interest. One of the crucial parts in methods for the BLI task is the matching procedure. Previous works impose a too strong constraint on the matching and lead to many counterintuitive translation pairings. Thus we propose a relaxed matching procedure to find a more precise matching between two languages. We also find that aligning source and target language embedding space bidirectionally will bring significant improvement. We follow the previous iterative framework to conduct experiments. Results on standard benchmark demonstrate the effectiveness of our proposed method, which substantially outperforms previous unsupervised methods.",Unsupervised BLI|unsupervised Induction(BLI|unsupervised|BLI task,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.274.pdf -main.260,Gender Bias in Multilingual Embeddings and Cross-Lingual Transfer,Jieyu Zhao|Subhabrata Mukherjee|Saghar Hosseini|Kai-Wei Chang|Ahmed Hassan Awadallah,"Multilingual representations embed words from many languages into a single semantic space such that words with similar meanings are close to each other regardless of the language. These embeddings have been widely used in various settings, such as cross-lingual transfer, where a natural language processing (NLP) model trained on one language is deployed to another language. While the cross-lingual transfer techniques are powerful, they carry gender bias from the source to target languages. In this paper, we study gender bias in multilingual embeddings and how it affects transfer learning for NLP applications. We create a multilingual dataset for bias analysis and propose several ways for quantifying bias in multilingual representations from both the intrinsic and extrinsic perspectives. Experimental results show that the magnitude of bias in the multilingual representations changes differently when we align the embeddings to different target spaces and that the alignment direction can also have an influence on the bias in transfer learning.
We further provide recommendations for using the multilingual word representations for downstream tasks.",cross-lingual transfer|multilingual embeddings|NLP applications|bias analysis,Ethics and NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.260.pdf -main.506,Not All Claims are Created Equal: Choosing the Right Statistical Approach to Assess Hypotheses,Erfan Sadeqi Azer|Daniel Khashabi|Ashish Sabharwal|Dan Roth,"Empirical research in Natural Language Processing (NLP) has adopted a narrow set of principles for assessing hypotheses, relying mainly on p-value computation, which suffers from several known issues. While alternative proposals have been well-debated and adopted in other fields, they remain rarely discussed or used within the NLP community. We address this gap by contrasting various hypothesis assessment techniques, especially those not commonly used in the field (such as evaluations based on Bayesian inference). Since these statistical techniques differ in the hypotheses they can support, we argue that practitioners should first decide their target hypothesis before choosing an assessment method. This is crucial because common fallacies, misconceptions, and misinterpretation surrounding hypothesis assessment methods often stem from a discrepancy between what one would like to claim versus what the method used actually assesses. Our survey reveals that these issues are omnipresent in the NLP research community. As a step forward, we provide best practices and guidelines tailored to NLP research, as well as an easy-to-use package for Bayesian assessment of hypotheses, complementing existing tools.",Natural Processing|NLP|NLP research|Bayesian hypotheses,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.506.pdf -main.248,Topological Sort for Sentence Ordering,Shrimai Prabhumoye|Ruslan Salakhutdinov|Alan W Black,"Sentence ordering is the task of arranging the sentences of a given text in the correct order. Recent work using deep neural networks for this task has framed it as a sequence prediction problem. In this paper, we propose a new framing of this task as a constraint solving problem and introduce a new technique to solve it. Additionally, we propose a human evaluation for this task. The results on both automatic and human metrics across four different datasets show that this new technique is better at capturing coherence in documents.",Sentence Ordering|sequence problem|constraint problem|Topological Sort,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.248.pdf -main.249,Weight Poisoning Attacks on Pretrained Models,Keita Kurita|Paul Michel|Graham Neubig,"Recently, NLP has seen a surge in the usage of large pre-trained models. Users download weights of models pre-trained on large datasets, then fine-tune the weights on a task of their choice. This raises the question of whether downloading untrusted pre-trained weights can pose a security threat. In this paper, we show that it is possible to construct ``weight poisoning'' attacks where pre-trained weights are injected with vulnerabilities that expose ``backdoors'' after fine-tuning, enabling the attacker to manipulate the model prediction simply by injecting an arbitrary keyword. We show that by applying a regularization method which we call RIPPLe and an initialization procedure we call Embedding Surgery, such attacks are possible even with limited knowledge of the dataset and fine-tuning procedure. 
Our experiments on sentiment classification, toxicity detection, and spam detection show that this attack is widely applicable and poses a serious threat. Finally, we outline practical defenses against such attacks. Code to reproduce our experiments is available at https://github.com/neulab/RIPPLe.",Weight Attacks|sentiment classification|toxicity detection|spam detection,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.249.pdf -main.261,"Give Me Convenience and Give Her Death: Who Should Decide What Uses of NLP are Appropriate, and on What Basis?",Kobi Leins|Jey Han Lau|Timothy Baldwin,"As part of growing NLP capabilities, coupled with an awareness of the ethical dimensions of research, questions have been raised about whether particular datasets and tasks should be deemed off-limits for NLP research. We examine this question with respect to a paper on automatic legal sentencing from EMNLP 2019 which was a source of some debate, in asking whether the paper should have been allowed to be published, who should have been charged with making such a decision, and on what basis. We focus in particular on the role of data statements in ethically assessing research, but also discuss the topic of dual use, and examine the outcomes of similar debates in other scientific disciplines.",NLP research|automatic sentencing|ethically research|NLP,Ethics and NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.261.pdf -main.507,STARC: Structured Annotations for Reading Comprehension,Yevgeni Berzak|Jonathan Malmaud|Roger Levy,"We present STARC (Structured Annotations for Reading Comprehension), a new annotation framework for assessing reading comprehension with multiple choice questions. Our framework introduces a principled structure for the answer choices and ties them to textual span annotations. The framework is implemented in OneStopQA, a new high-quality dataset for evaluation and analysis of reading comprehension in English. We use this dataset to demonstrate that STARC can be leveraged for a key new application for the development of SAT-like reading comprehension materials: automatic annotation quality probing via span ablation experiments. We further show that it enables in-depth analyses and comparisons between machine and human reading comprehension behavior, including error distributions and guessing ability. Our experiments also reveal that the standard multiple choice dataset in NLP, RACE, is limited in its ability to measure reading comprehension. 47% of its questions can be guessed by machines without accessing the passage, and 18% are unanimously judged by humans as not having a unique correct answer. OneStopQA provides an alternative test set for reading comprehension which alleviates these shortcomings and has a substantially higher human ceiling performance.",Reading Comprehension|Structured Comprehension|evaluation comprehension|SAT-like materials,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.507.pdf -main.513,OpinionDigest: A Simple Framework for Opinion Summarization,Yoshihiko Suhara|Xiaolan Wang|Stefanos Angelidis|Wang-Chiew Tan,"We present OpinionDigest, an abstractive opinion summarization framework, which does not rely on gold-standard summaries for training. The framework uses an Aspect-based Sentiment Analysis model to extract opinion phrases from reviews, and trains a Transformer model to reconstruct the original reviews from these extractions. 
At summarization time, we merge extractions from multiple reviews and select the most popular ones. The selected opinions are used as input to the trained Transformer model, which verbalizes them into an opinion summary. OpinionDigest can also generate customized summaries, tailored to specific user needs, by filtering the selected opinions according to their aspect and/or sentiment. Automatic evaluation on Yelp data shows that our framework outperforms competitive baselines. Human studies on two corpora verify that OpinionDigest produces informative summaries and shows promising customization capabilities.",Opinion Summarization|Automatic data|Human studies|OpinionDigest,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Short,https://www.aclweb.org/anthology/2020.acl-main.513.pdf -main.275,Dynamic Programming Encoding for Subword Segmentation in Neural Machine Translation,Xuanli He|Gholamreza Haffari|Mohammad Norouzi,"This paper introduces Dynamic Programming Encoding (DPE), a new segmentation algorithm for tokenizing sentences into subword units. We view the subword segmentation of output sentences as a latent variable that should be marginalized out for learning and inference. A mixed character-subword transformer is proposed, which enables exact log marginal likelihood estimation and exact MAP inference to find target segmentations with maximum posterior probability. DPE uses a lightweight mixed character-subword transformer as a means of pre-processing parallel data to segment output sentences using dynamic programming. Empirical results on machine translation suggest that DPE is effective for segmenting output sentences and can be combined with BPE dropout for stochastic segmentation of source sentences. DPE achieves an average improvement of 0.9 BLEU over BPE (Sennrich et al., 2016) and an average improvement of 0.55 BLEU over BPE dropout (Provilkov et al., 2019) on several WMT datasets including English <=> (German, Romanian, Estonian, Finnish, Hungarian).",Subword Segmentation|Neural Translation|learning|inference,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.275.pdf -main.739,Beyond Possession Existence: Duration and Co-Possession,Dhivya Chinnappa|Srikala Murugan|Eduardo Blanco,"This paper introduces two tasks: determining (a) the duration of possession relations and (b) co-possessions, i.e., whether multiple possessors possess a possessee at the same time. We present new annotations on top of corpora annotating possession existence and experimental results. Regarding possession duration, we derive the time spans we work with empirically from annotations indicating lower and upper bounds. Regarding co-possessions, we use a binary label. Cohen's kappa coefficients indicate substantial agreement, and experimental results show that text is more useful than the image for solving these tasks.",Possession Existence|Duration|Co-Possession|duration relations,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.739.pdf -main.705,Distilling Knowledge Learned in BERT for Text Generation,Yen-Chun Chen|Zhe Gan|Yu Cheng|Jingzhou Liu|Jingjing Liu,"Large-scale pre-trained language model such as BERT has achieved great success in language understanding tasks. However, it remains an open question how to utilize BERT for language generation. In this paper, we present a novel approach, Conditional Masked Language Modeling (C-MLM), to enable the finetuning of BERT on target generation tasks. 
The finetuned BERT (teacher) is exploited as extra supervision to improve conventional Seq2Seq models (student) for better text generation performance. By leveraging BERT's idiosyncratic bidirectional nature, distilling knowledge learned in BERT can encourage auto-regressive Seq2Seq models to plan ahead, imposing global sequence-level supervision for coherent text generation. Experiments show that the proposed approach significantly outperforms strong Transformer baselines on multiple language generation tasks such as machine translation and text summarization. Our proposed model also achieves new state of the art on IWSLT German-English and English-Vietnamese MT datasets.",Text Generation|language tasks|language generation|generation tasks,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.705.pdf -main.711,"R^3: Reverse, Retrieve, and Rank for Sarcasm Generation with Commonsense Knowledge",Tuhin Chakrabarty|Debanjan Ghosh|Smaranda Muresan|Nanyun Peng,"We propose an unsupervised approach for sarcasm generation based on a non-sarcastic input sentence. Our method employs a retrieve-and-edit framework to instantiate two major characteristics of sarcasm: reversal of valence and semantic incongruity with the context, which could include shared commonsense or world knowledge between the speaker and the listener. While prior works on sarcasm generation predominantly focus on context incongruity, we show that combining valence reversal and semantic incongruity based on the commonsense knowledge generates sarcasm of higher quality. Human evaluation shows that our system generates sarcasm better than humans 34% of the time, and better than a reinforced hybrid baseline 90% of the time.",Sarcasm Generation|unsupervised approach|retrieve-and-edit framework|Human evaluation,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.711.pdf -main.659,A Tale of a Probe and a Parser,Rowan Hall Maudslay|Josef Valvoda|Tiago Pimentel|Adina Williams|Ryan Cotterell,"Measuring what linguistic information is encoded in neural models of language has become popular in NLP. Researchers approach this enterprise by training “probes”—supervised models designed to extract linguistic structure from another model’s output. One such probe is the structural probe (Hewitt and Manning, 2019), designed to quantify the extent to which syntactic information is encoded in contextualised word representations. The structural probe has a novel design, unattested in the parsing literature, the precise benefit of which is not immediately obvious. To explore whether syntactic probes would do better to make use of existing techniques, we compare the structural probe to a more traditional parser with an identical lightweight parameterisation. The parser outperforms structural probe on UUAS in seven of nine analysed languages, often by a substantial amount (e.g. by 11.1 points in English). Under a second less common metric, however, there is the opposite trend—the structural probe outperforms the parser. This begs the question: which metric should we prefer?",NLP|parsing literature|Parser|neural language,Theme,Short,https://www.aclweb.org/anthology/2020.acl-main.659.pdf -main.671,Contrastive Self-Supervised Learning for Commonsense Reasoning,Tassilo Klein|Moin Nabi,"We propose a self-supervised method to solve Pronoun Disambiguation and Winograd Schema Challenge problems. 
Our approach exploits the characteristic structure of training corpora related to so-called ``trigger'' words, which are responsible for flipping the answer in pronoun disambiguation. We achieve such commonsense reasoning by constructing pair-wise contrastive auxiliary predictions. To this end, we leverage a mutual exclusive loss regularized by a contrastive margin. Our architecture is based on the recently introduced transformer networks, BERT, that exhibits strong performance on many NLP benchmarks. Empirical results show that our method alleviates the limitation of current supervised approaches for commonsense reasoning. This study opens up avenues for exploiting inexpensive self-supervision to achieve performance gain in commonsense reasoning tasks.",Commonsense Reasoning|Pronoun problems|pronoun disambiguation|commonsense tasks,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.671.pdf -main.117,The TechQA Dataset,Vittorio Castelli|Rishav Chakravarti|Saswati Dana|Anthony Ferritto|Radu Florian|Martin Franz|Dinesh Garg|Dinesh Khandelwal|Scott McCarley|Michael McCawley|Mohamed Nasr|Lin Pan|Cezar Pendus|John Pitrelli|Saurabh Pujar|Salim Roukos|Andrzej Sakrajda|Avi Sil|Rosario Uceda-Sosa|Todd Ward|Rong Zhang,"We introduce TECHQA, a domain-adaptation question answering dataset for the technical support domain. The TECHQA corpus highlights two real-world issues from the automated customer support domain. First, it contains actual questions posed by users on a technical forum, rather than questions generated specifically for a competition or a task. Second, it has a real-world size – 600 training, 310 dev, and 490 evaluation question/answer pairs – thus reflecting the cost of creating large labeled datasets with actual data. Hence, TECHQA is meant to stimulate research in domain adaptation rather than as a resource to build QA systems from scratch. TECHQA was obtained by crawling the IBMDeveloper and DeveloperWorks forums for questions with accepted answers provided in an IBM Technote—a technical document that addresses a specific technical issue. We also release a collection of the 801,998 Technotes available on the web as of April 4, 2019 as a companion resource that can be used to learn representations of the IT domain language.",real-world issues|domain adaptation|representations language|TECHQA,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.117.pdf -main.103,Exclusive Hierarchical Decoding for Deep Keyphrase Generation,Wang Chen|Hou Pong Chan|Piji Li|Irwin King,"Keyphrase generation (KG) aims to summarize the main ideas of a document into a set of keyphrases. A new setting is recently introduced into this problem, in which, given a document, the model needs to predict a set of keyphrases and simultaneously determine the appropriate number of keyphrases to produce. Previous work in this setting employs a sequential decoding process to generate keyphrases. However, such a decoding method ignores the intrinsic hierarchical compositionality existing in the keyphrase set of a document. Moreover, previous work tends to generate duplicated keyphrases, which wastes time and computing resources. To overcome these limitations, we propose an exclusive hierarchical decoding framework that includes a hierarchical decoding process and either a soft or a hard exclusion mechanism. The hierarchical decoding process is to explicitly model the hierarchical compositionality of a keyphrase set. 
Both the soft and the hard exclusion mechanisms keep track of previously-predicted keyphrases within a window size to enhance the diversity of the generated keyphrases. Extensive experiments on multiple KG benchmark datasets demonstrate the effectiveness of our method to generate less duplicated and more accurate keyphrases.",Deep Generation|Keyphrase generation|Exclusive Decoding|KG,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.acl-main.103.pdf -main.665,Shape of Synth to Come: Why We Should Use Synthetic Data for English Surface Realization,Henry Elder|Robert Burke|Alexander O'Connor|Jennifer Foster,"The Surface Realization Shared Tasks of 2018 and 2019 were Natural Language Generation shared tasks with the goal of exploring approaches to surface realization from Universal-Dependency-like trees to surface strings for several languages. In the 2018 shared task there was very little difference in the absolute performance of systems trained with and without additional, synthetically created data, and a new rule prohibiting the use of synthetic data was introduced for the 2019 shared task. Contrary to the findings of the 2018 shared task, we show, in experiments on the English 2018 dataset, that the use of synthetic data can have a substantial positive effect -- an improvement of almost 8 BLEU points for a previously state-of-the-art system. We analyse the effects of synthetic data, and we argue that its use should be encouraged rather than prohibited so that future research efforts continue to explore systems that can take advantage of such data.",English Realization|Surface Tasks|Natural tasks|2018 task,Generation,Short,https://www.aclweb.org/anthology/2020.acl-main.665.pdf -main.498,Crossing Variational Autoencoders for Answer Retrieval,Wenhao Yu|Lingfei Wu|Qingkai Zeng|Shu Tao|Yu Deng|Meng Jiang,"Answer retrieval is to find the most aligned answer from a large set of candidates given a question. Learning vector representations of questions/answers is the key factor. Question-answer alignment and question/answer semantics are two important signals for learning the representations. Existing methods learned semantic representations with dual encoders or dual variational auto-encoders. The semantic information was learned from language models or question-to-question (answer-to-answer) generative processes. However, the alignment and semantics were too separate to capture the aligned semantics between question and answer. In this work, we propose to cross variational auto-encoders by generating questions with aligned answers and generating answers with aligned questions. Experiments show that our method outperforms the state-of-the-art answer retrieval method on SQuAD.",Answer Retrieval|vector questions/answers|Question-answer alignment|SQuAD,Question Answering,Short,https://www.aclweb.org/anthology/2020.acl-main.498.pdf -main.329,GLUECoS: An Evaluation Benchmark for Code-Switched NLP,Simran Khanuja|Sandipan Dandapat|Anirudh Srinivasan|Sunayana Sitaram|Monojit Choudhury,"Code-switching is the use of more than one language in the same conversation or utterance. Recently, multilingual contextual embedding models, trained on multiple monolingual corpora, have shown promising results on cross-lingual and multilingual tasks. We present an evaluation benchmark, GLUECoS, for code-switched languages, that spans several NLP tasks in English-Hindi and English-Spanish. 
Specifically, our evaluation benchmark includes Language Identification from text, POS tagging, Named Entity Recognition, Sentiment Analysis, Question Answering and a new task for code-switching, Natural Language Inference. We present results on all these tasks using cross-lingual word embedding models and multilingual models. In addition, we fine-tune multilingual models on artificially generated code-switched data. Although multilingual models perform significantly better than cross-lingual models, our results show that in most tasks, across both language pairs, multilingual models fine-tuned on code-switched data perform best, showing that multilingual models can be further optimized for code-switching tasks.",Code-Switched NLP|cross-lingual tasks|NLP tasks|Language Identification,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.329.pdf -main.36,Jointly Masked Sequence-to-Sequence Model for Non-Autoregressive Neural Machine Translation,Junliang Guo|Linli Xu|Enhong Chen,"The masked language model has received remarkable attention due to its effectiveness on various natural language processing tasks. However, few works have adopted this technique in the sequence-to-sequence models. In this work, we introduce a jointly masked sequence-to-sequence model and explore its application on non-autoregressive neural machine translation~(NAT). Specifically, we first empirically study the functionalities of the encoder and the decoder in NAT models, and find that the encoder takes a more important role than the decoder regarding the translation quality. Therefore, we propose to train the encoder more rigorously by masking the encoder input while training. As for the decoder, we propose to train it based on the consecutive masking of the decoder input with an n-gram loss function to alleviate the problem of translating duplicate words. The two types of masks are applied to the model jointly at the training stage. We conduct experiments on five benchmark machine translation tasks, and our model can achieve 27.69/32.24 BLEU scores on WMT14 English-German/German-English tasks with 5+ times speed up compared with an autoregressive model.",Non-Autoregressive Translation|natural tasks|non-autoregressive translation~(NAT|non-autoregressive,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.36.pdf -main.22,Neural Syntactic Preordering for Controlled Paraphrase Generation,Tanya Goyal|Greg Durrett,"Paraphrasing natural language sentences is a multifaceted process: it might involve replacing individual words or short phrases, local rearrangement of content, or high-level restructuring like topicalization or passivization. Past approaches struggle to cover this space of paraphrase possibilities in an interpretable manner. Our work, inspired by pre-ordering literature in machine translation, uses syntactic transformations to softly ""reorder'' the source sentence and guide our neural paraphrasing model. First, given an input sentence, we derive a set of feasible syntactic rearrangements using an encoder-decoder model. This model operates over a partially lexical, partially syntactic view of the sentence and can reorder big chunks. Next, we use each proposed rearrangement to produce a sequence of position embeddings, which encourages our final encoder-decoder paraphrase model to attend to the source words in a particular order. 
Our evaluation, both automatic and human, shows that the proposed system retains the quality of the baseline approaches while giving a substantial increase in the diversity of the generated paraphrases.",Controlled Generation|Paraphrasing sentences|machine translation|Neural Preordering,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.22.pdf -main.315,Enhancing Pre-trained Chinese Character Representation with Word-aligned Attention,Yanzeng Li|Bowen Yu|Xue Mengge|Tingwen Liu,"Most Chinese pre-trained models take character as the basic unit and learn representation according to character's external contexts, ignoring the semantics expressed in the word, which is the smallest meaningful utterance in Chinese. Hence, we propose a novel word-aligned attention to exploit explicit word information, which is complementary to various character-based Chinese pre-trained language models. Specifically, we devise a pooling mechanism to align the character-level attention to the word level and propose to alleviate the potential issue of segmentation error propagation by multi-source information fusion. As a result, word and character information are explicitly integrated at the fine-tuning procedure. Experimental results on five Chinese NLP benchmark tasks demonstrate that our method achieves significant improvements against BERT, ERNIE and BERT-wwm.",segmentation propagation|Pre-trained Representation|Chinese models|word-aligned attention,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.315.pdf -main.473,Measuring Forecasting Skill from Text,Shi Zong|Alan Ritter|Eduard Hovy,"People vary in their ability to make accurate predictions about the future. Prior studies have shown that some individuals can predict the outcome of future events with consistently better accuracy. This leads to a natural question: what makes some forecasters better than others? In this paper we explore connections between the language people use to describe their predictions and their forecasting skill. Datasets from two different forecasting domains are explored: (1) geopolitical forecasts from Good Judgment Open, an online prediction forum and (2) a corpus of company earnings forecasts made by financial analysts. We present a number of linguistic metrics which are computed over text associated with people's predictions about the future including: uncertainty, readability, and emotion. By studying linguistic factors associated with predictions, we are able to shed some light on the approach taken by skilled forecasters. Furthermore, we demonstrate that it is possible to accurately predict forecasting skill using a model that is based solely on language. This could potentially be useful for identifying accurate predictions or potentially skilled forecasters earlier.",Measuring Skill|forecasting domains|geopolitical forecasts|corpus forecasts,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.473.pdf -main.467,Intermediate-Task Transfer Learning with Pretrained Language Models: When and Why Does It Work?,Yada Pruksachatkun|Jason Phang|Haokun Liu|Phu Mon Htut|Xiaoyi Zhang|Richard Yuanzhe Pang|Clara Vania|Katharina Kann|Samuel R. Bowman,"While pretrained models such as BERT have shown large gains across natural language understanding tasks, their performance can be improved by further training the model on a data-rich intermediate task, before fine-tuning it on a target task. 
However, it is still poorly understood when and why intermediate-task training is beneficial for a given target task. To investigate this, we perform a large-scale study on the pretrained RoBERTa model with 110 intermediate-target task combinations. We further evaluate all trained models with 25 probing tasks meant to reveal the specific skills that drive transfer. We observe that intermediate tasks requiring high-level inference and reasoning abilities tend to work best. We also observe that target task performance is strongly correlated with higher-level abilities such as coreference resolution. However, we fail to observe more granular correlations between probing and target task performance, highlighting the need for further work on broad-coverage probing benchmarks. We also observe evidence that the forgetting of knowledge learned during pretraining may limit our analysis, highlighting the need for further work on transfer learning methods in these settings.",Intermediate-Task Learning|natural tasks|data-rich task|intermediate-task training,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.467.pdf -main.301,Efficient Constituency Parsing by Pointing,Thanh-Tung Nguyen|Xuan-Phi Nguyen|Shafiq Joty|Xiaoli Li,"We propose a novel constituency parsing model that casts the parsing problem into a series of pointing tasks. Specifically, our model estimates the likelihood of a span being a legitimate tree constituent via the pointing score corresponding to the boundary words of the span. Our parsing model supports efficient top-down decoding and our learning objective is able to enforce structural consistency without resorting to the expensive CKY inference. The experiments on the standard English Penn Treebank parsing task show that our method achieves 92.78 F1 without using pre-trained models, which is higher than all the existing methods with similar time complexity. Using pre-trained BERT, our model achieves 95.48 F1, which is competitive with the state-of-the-art while being faster. Our approach also establishes new state-of-the-art in Basque and Swedish in the SPMRL shared tasks on multilingual constituency parsing.",parsing problem|pointing tasks|English task|SPMRL tasks,"Syntax: Tagging, Chunking and Parsing",Long,https://www.aclweb.org/anthology/2020.acl-main.301.pdf -main.471,Detecting Perceived Emotions in Hurricane Disasters,Shrey Desai|Cornelia Caragea|Junyi Jessy Li,"Natural disasters (e.g., hurricanes) affect millions of people each year, causing widespread destruction in their wake. People have recently taken to social media websites (e.g., Twitter) to share their sentiments and feelings with the larger community. Consequently, these platforms have become instrumental in understanding and perceiving emotions at scale. In this paper, we introduce HurricaneEmo, an emotion dataset of 15,000 English tweets spanning three hurricanes: Harvey, Irma, and Maria. We present a comprehensive study of fine-grained emotions and propose classification tasks to discriminate between coarse-grained emotion groups. Our best BERT model, even after task-guided pre-training which leverages unlabeled Twitter data, achieves only 68% accuracy (averaged across all groups). 
HurricaneEmo serves not only as a challenging benchmark for models but also as a valuable resource for analyzing emotions in disaster-centric domains.",Hurricane Disasters|Natural disasters|classification tasks|HurricaneEmo,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.471.pdf -main.317,SAFER: A Structure-free Approach for Certified Robustness to Adversarial Word Substitutions,Mao Ye|Chengyue Gong|Qiang Liu,"State-of-the-art NLP models can often be fooled by human-unaware transformations such as synonymous word substitution. For security reasons, it is of critical importance to develop models with certified robustness that can provably guarantee that the prediction cannot be altered by any possible synonymous word substitution. In this work, we propose a certified robust method based on a new randomized smoothing technique, which constructs a stochastic ensemble by applying random word substitutions on the input sentences, and leverages the statistical properties of the ensemble to provably certify the robustness. Our method is simple and structure-free in that it only requires the black-box queries of the model outputs, and hence can be applied to any pre-trained models (such as BERT) and any types of models (word-level or subword-level). Our method significantly outperforms recent state-of-the-art methods for certified robustness on both IMDB and Amazon text classification tasks. To the best of our knowledge, we are the first work to achieve certified robustness on large systems such as BERT with practically meaningful certified accuracy.",Certified Robustness|Adversarial Substitutions|human-unaware transformations|ensemble,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.317.pdf -main.303,Representations of Syntax [MASK] Useful: Effects of Constituency and Dependency Structure in Recursive LSTMs,Michael Lepori|Tal Linzen|R. Thomas McCoy,"Sequence-based neural networks show significant sensitivity to syntactic structure, but they still perform less well on syntactic tasks than tree-based networks. Such tree-based networks can be provided with a constituency parse, a dependency parse, or both. We evaluate which of these two representational schemes more effectively introduces biases for syntactic structure that increase performance on the subject-verb agreement prediction task. We find that a constituency-based network generalizes more robustly than a dependency-based one, and that combining the two types of structure does not yield further improvement. Finally, we show that the syntactic robustness of sequential models can be substantially improved by fine-tuning on a small amount of constructed data, suggesting that data augmentation is a viable alternative to explicit constituency structure for imparting the syntactic biases that sequential models are lacking.",syntactic tasks|subject-verb task|fine-tuning|data augmentation,"Syntax: Tagging, Chunking and Parsing",Short,https://www.aclweb.org/anthology/2020.acl-main.303.pdf -main.465,How Can We Accelerate Progress Towards Human-like Linguistic Generalization?,Tal Linzen,"This position paper describes and critiques the Pretraining-Agnostic Identically Distributed (PAID) evaluation paradigm, which has become a central tool for measuring progress in natural language understanding.
This paradigm consists of three stages: (1) pre-training of a word prediction model on a corpus of arbitrary size; (2) fine-tuning (transfer learning) on a training set representing a classification task; (3) evaluation on a test set drawn from the same distribution as that training set. This paradigm favors simple, low-bias architectures, which, first, can be scaled to process vast amounts of data, and second, can capture the fine-grained statistical properties of a particular data set, regardless of whether those properties are likely to generalize to examples of the task outside the data set. This contrasts with humans, who learn language from several orders of magnitude less data than the systems favored by this evaluation paradigm, and generalize to new tasks in a consistent way. We advocate for supplementing or replacing PAID with paradigms that reward architectures that generalize as quickly and robustly as humans.",natural understanding|classification task|Pretraining-Agnostic paradigm|pre-training,Theme,Short,https://www.aclweb.org/anthology/2020.acl-main.465.pdf -main.34,Content Word Aware Neural Machine Translation,Kehai Chen|Rui Wang|Masao Utiyama|Eiichiro Sumita,"Neural machine translation (NMT) encodes the source sentence in a universal way to generate the target sentence word-by-word. However, NMT does not consider the importance of word in the sentence meaning, for example, some words (i.e., content words) express more important meaning than others (i.e., function words). To address this limitation, we first utilize word frequency information to distinguish between content and function words in a sentence, and then design a content word-aware NMT to improve translation performance. Empirical results on the WMT14 English-to-German, WMT14 English-to-French, and WMT17 Chinese-to-English translation tasks show that the proposed methods can significantly improve the performance of Transformer-based NMT.",Content Translation|Neural translation|translation|WMT17 tasks,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.34.pdf -main.20,Generating Diverse and Consistent QA pairs from Contexts with Information-Maximizing Hierarchical Conditional VAEs,Dong Bok Lee|Seanie Lee|Woo Tae Jeong|Donghwan Kim|Sung Ju Hwang,"One of the most crucial challenges in question answering (QA) is the scarcity of labeled data, since it is costly to obtain question-answer (QA) pairs for a target text domain with human annotation. An alternative approach to tackle the problem is to use automatically generated QA pairs from either the problem context or from large amount of unstructured texts (e.g. Wikipedia). In this work, we propose a hierarchical conditional variational autoencoder (HCVAE) for generating QA pairs given unstructured texts as contexts, while maximizing the mutual information between generated QA pairs to ensure their consistency. We validate our Information Maximizing Hierarchical Conditional Variational AutoEncoder (Info-HCVAE) on several benchmark datasets by evaluating the performance of the QA model (BERT-base) using only the generated QA pairs (QA-based evaluation) or by using both the generated and human-labeled pairs (semi-supervised learning) for training, against state-of-the-art baseline models. 
The results show that our model obtains impressive performance gains over all baselines on both tasks, using only a fraction of data for training.",question answering|QA|QA|Information-Maximizing VAEs,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.20.pdf -main.459,Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset,Revanth Rameshkumar|Peter Bailey,"This paper describes the Critical Role Dungeons and Dragons Dataset (CRD3) and related analyses. Critical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game. The dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding abstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player collaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail, and semantic ties to the previous dialogues. In addition, we provide a data augmentation method that produces 34,243 summary-dialogue chunk pairs to support current neural ML approaches, and we provide an abstractive summarization benchmark and evaluation.",abstractive evaluation|CRD3|open-ended game|data method,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.459.pdf -main.115,PuzzLing Machines: A Challenge on Learning From Small Data,Gözde Gül Şahin|Yova Kementchedjhieva|Phillip Rust|Iryna Gurevych,"Deep neural models have repeatedly proved excellent at memorizing surface patterns from large datasets for various ML and NLP benchmarks. They struggle to achieve human-like thinking, however, because they lack the skill of iterative reasoning upon knowledge. To expose this problem in a new light, we introduce a challenge on learning from small data, PuzzLing Machines, which consists of Rosetta Stone puzzles from Linguistic Olympiads for high school students. These puzzles are carefully designed to contain only the minimal amount of parallel text necessary to deduce the form of unseen expressions. Solving them does not require external information (e.g., knowledge bases, visual signals) or linguistic expertise, but meta-linguistic awareness and deductive skills. Our challenge contains around 100 puzzles covering a wide range of linguistic phenomena from 81 languages. We show that both simple statistical algorithms and state-of-the-art deep neural models perform inadequately on this challenge, as expected. We hope that this benchmark, available at https://ukplab.github.io/PuzzLing-Machines/, inspires further efforts towards a new paradigm in NLP---one that is grounded in human-like reasoning and understanding.",Learning|memorizing patterns|Solving them|NLP,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.115.pdf -main.673,Improving Disentangled Text Representation Learning with Information-Theoretic Guidance,Pengyu Cheng|Martin Renqiang Min|Dinghan Shen|Christopher Malon|Yizhe Zhang|Yitong Li|Lawrence Carin,"Learning disentangled representations of natural language is essential for many NLP tasks, e.g., conditional text generation, style transfer, personalized dialogue systems, etc. Similar problems have been studied extensively for other forms of data, such as images and videos.
However, the discrete nature of natural language makes the disentangling of textual representations more challenging (e.g., the manipulation over the data space cannot be easily achieved). Inspired by information theory, we propose a novel method that effectively manifests disentangled representations of text, without any supervision on semantics. A new mutual information upper bound is derived and leveraged to measure dependence between style and content. By minimizing this upper bound, the proposed method induces style and content embeddings into two independent low-dimensional spaces. Experiments on both conditional text generation and text-style transfer demonstrate the high quality of our disentangled representation in terms of content and style preservation.",Learning language|NLP tasks|conditional generation|style transfer,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.673.pdf -main.667,A Two-Step Approach for Implicit Event Argument Detection,Zhisong Zhang|Xiang Kong|Zhengzhong Liu|Xuezhe Ma|Eduard Hovy,"In this work, we explore the implicit event argument detection task, which studies event arguments beyond sentence boundaries. The addition of cross-sentence argument candidates imposes great challenges for modeling. To reduce the number of candidates, we adopt a two-step approach, decomposing the problem into two sub-problems: argument head-word detection and head-to-span expansion. Evaluated on the recent RAMS dataset (Ebner et al., 2020), our model achieves overall better performance than a strong sequence labeling baseline. We further provide detailed error analysis, presenting where the model mainly makes errors and indicating directions for future improvements. It remains a challenge to detect implicit arguments, calling for more future work of document-level modeling for this task.",Implicit Detection|implicit task|modeling|argument detection,Information Extraction,Short,https://www.aclweb.org/anthology/2020.acl-main.667.pdf -main.101,Towards Faithful Neural Table-to-Text Generation with Content-Matching Constraints,Zhenyi Wang|Xiaoyang Wang|Bang An|Dong Yu|Changyou Chen,"Text generation from a knowledge base aims to translate knowledge triples to natural language descriptions. Most existing methods ignore the faithfulness between a generated text description and the original table, leading to generated information that goes beyond the content of the table. In this paper, for the first time, we propose a novel Transformer-based generation framework to achieve the goal. The core techniques in our method to enforce faithfulness include a new table-text optimal-transport matching loss and a table-text embedding similarity loss based on the Transformer model. Furthermore, to evaluate faithfulness, we propose a new automatic metric specialized to the table-to-text generation problem. We also provide detailed analysis on each component of our model in our experiments. Automatic and human evaluations show that our framework can significantly outperform state-of-the-art by a large margin.",Faithful Generation|Text generation|table-to-text problem|Transformer-based framework,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.101.pdf -main.129,Learning Dialog Policies from Weak Demonstrations,Gabriel Gordon-Hall|Philip John Gorinski|Shay B. Cohen,"Deep reinforcement learning is a promising approach to training a dialog manager, but current methods struggle with the large state and action spaces of multi-domain dialog systems. 
Building upon Deep Q-learning from Demonstrations (DQfD), an algorithm that scores highly in difficult Atari games, we leverage dialog data to guide the agent to successfully respond to a user's requests. We make progressively fewer assumptions about the data needed, using labeled, reduced-labeled, and even unlabeled data to train expert demonstrators. We introduce Reinforced Fine-tune Learning, an extension to DQfD, enabling us to overcome the domain gap between the datasets and the environment. Experiments in a challenging multi-domain dialog system framework validate our approaches, and get high success rates even when trained on out-of-domain data.",Weak Demonstrations|dialog manager|multi-domain systems|expert demonstrators,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.129.pdf -main.698,"Negated and Misprimed Probes for Pretrained Language Models: Birds Can Talk, But Cannot Fly",Nora Kassner|Hinrich Schütze,"Building on Petroni et al. 2019, we propose two new probing tasks analyzing factual knowledge stored in Pretrained Language Models (PLMs). (1) Negation. We find that PLMs do not distinguish between negated (``Birds cannot [MASK]'') and non-negated (``Birds can [MASK]'') cloze questions. (2) Mispriming. Inspired by priming methods in human psychology, we add ``misprimes'' to cloze questions (``Talk? Birds can [MASK]''). We find that PLMs are easily distracted by misprimes. These results suggest that PLMs still have a long way to go to adequately learn human-like factual knowledge.",Pretrained Models|probing tasks|Negation|Pretrained Models,Theme,Short,https://www.aclweb.org/anthology/2020.acl-main.698.pdf -main.707,Iterative Edit-Based Unsupervised Sentence Simplification,Dhruv Kumar|Lili Mou|Lukasz Golab|Olga Vechtomova,"We present a novel iterative, edit-based approach to unsupervised sentence simplification. Our model is guided by a scoring function involving fluency, simplicity, and meaning preservation. Then, we iteratively perform word and phrase-level edits on the complex sentence. Compared with previous approaches, our model does not require a parallel training set, but is more controllable and interpretable. Experiments on Newsela and WikiLarge datasets show that our approach is nearly as effective as state-of-the-art supervised approaches.",Iterative Simplification|unsupervised simplification|iterative approach|word edits,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.707.pdf -main.713,A Joint Neural Model for Information Extraction with Global Features,Ying Lin|Heng Ji|Fei Huang|Lingfei Wu,"Most existing joint neural models for Information Extraction (IE) use local task-specific classifiers to predict labels for individual instances (e.g., trigger, relation) regardless of their interactions. For example, a victim of a die event is likely to be a victim of an attack event in the same sentence. In order to capture such cross-subtask and cross-instance inter-dependencies, we propose a joint neural framework, OneIE, that aims to extract the globally optimal IE result as a graph from an input sentence. OneIE performs end-to-end IE in four stages: (1) Encoding a given sentence as contextualized word representations; (2) Identifying entity mentions and event triggers as nodes; (3) Computing label scores for all nodes and their pairwise links using local classifiers; (4) Searching for the globally optimal graph with a beam decoder. 
At the decoding stage, we incorporate global features to capture the cross-subtask and cross-instance interactions. Experiments show that adding global features improves the performance of our model and achieves new state of-the-art on all subtasks. In addition, as OneIE does not use any language-specific feature, we prove it can be easily applied to new languages or trained in a multilingual manner.",Information Extraction|Information IE|IE|end-to-end IE,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.713.pdf -main.505,Transformers to Learn Hierarchical Contexts in Multiparty Dialogue for Span-based Question Answering,Changmao Li|Jinho D. Choi,"We introduce a novel approach to transformers that learns hierarchical representations in multiparty dialogue. First, three language modeling tasks are used to pre-train the transformers, token- and utterance-level language modeling and utterance order prediction, that learn both token and utterance embeddings for better understanding in dialogue contexts. Then, multi-task learning between the utterance prediction and the token span prediction is applied to fine-tune for span-based question answering (QA). Our approach is evaluated on the FriendsQA dataset and shows improvements of 3.8% and 1.4% over the two state-of-the-art transformer models, BERT and RoBERTa, respectively.",Span-based Answering|language tasks|token- modeling|utterance prediction,Question Answering,Short,https://www.aclweb.org/anthology/2020.acl-main.505.pdf -main.263,It’s Morphin’ Time! Combating Linguistic Discrimination with Inflectional Perturbations,Samson Tan|Shafiq Joty|Min-Yen Kan|Richard Socher,"Training on only perfect Standard English corpora predisposes pre-trained neural networks to discriminate against minorities from non-standard linguistic backgrounds (e.g., African American Vernacular English, Colloquial Singapore English, etc.). We perturb the inflectional morphology of words to craft plausible and semantically similar adversarial examples that expose these biases in popular NLP models, e.g., BERT and Transformer, and show that adversarially fine-tuning them for a single epoch significantly improves robustness without sacrificing performance on clean data.",Linguistic Discrimination|Inflectional Perturbations|pre-trained networks|NLP models,Ethics and NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.263.pdf -main.277,Learning to Recover from Multi-Modality Errors for Non-Autoregressive Neural Machine Translation,Qiu Ran|Yankai Lin|Peng Li|Jie Zhou,"Non-autoregressive neural machine translation (NAT) predicts the entire target sequence simultaneously and significantly accelerates inference process. However, NAT discards the dependency information in a sentence, and thus inevitably suffers from the multi-modality problem: the target tokens may be provided by different possible translations, often causing token repetitions or missing. To alleviate this problem, we propose a novel semi-autoregressive model RecoverSAT in this work, which generates a translation as a sequence of segments. The segments are generated simultaneously while each segment is predicted token-by-token. By dynamically determining segment length and deleting repetitive segments, RecoverSAT is capable of recovering from repetitive and missing token errors. 
Experimental results on three widely-used benchmark datasets show that our proposed model achieves more than 4 times speedup while maintaining comparable performance compared with the corresponding autoregressive model.",Non-Autoregressive Translation|Non-Autoregressive |inference process|multi-modality problem,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.277.pdf -main.511,Efficient Pairwise Annotation of Argument Quality,Lukas Gienapp|Benno Stein|Matthias Hagen|Martin Potthast,"We present an efficient annotation framework for argument quality, a feature difficult to be measured reliably as per previous work. A stochastic transitivity model is combined with an effective sampling strategy to infer high-quality labels with low effort from crowdsourced pairwise judgments. The model's capabilities are showcased by compiling Webis-ArgQuality-20, an argument quality corpus that comprises scores for rhetorical, logical, dialectical, and overall quality inferred from a total of 41,859 pairwise judgments among 1,271 arguments. With up to 93% cost savings, our approach significantly outperforms existing annotation procedures. Furthermore, novel insight into argument quality is provided through statistical analysis, and a new aggregation method to infer overall quality from individual quality dimensions is proposed.",argument quality|annotation framework|stochastic model|sampling strategy,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.511.pdf -main.539,LogicalFactChecker: Leveraging Logical Operations for Fact Checking with Graph Module Network,Wanjun Zhong|Duyu Tang|Zhangyin Feng|Nan Duan|Ming Zhou|Ming Gong|Linjun Shou|Daxin Jiang|Jiahai Wang|Jian Yin,"Verifying the correctness of a textual statement requires not only semantic reasoning about the meaning of words, but also symbolic reasoning about logical operations like count, superlative, aggregation, etc. In this work, we propose LogicalFactChecker, a neural network approach capable of leveraging logical operations for fact checking. It achieves the state-of-the-art performance on TABFACT, a large-scale, benchmark dataset built for verifying a textual statement with semi-structured tables. This is achieved by a graph module network built upon the Transformer-based architecture. With a textual statement and a table as the input, LogicalFactChecker automatically derives a program (a.k.a. logical form) of the statement in a semantic parsing manner. A heterogeneous graph is then constructed to capture not only the structures of the table and the program, but also the connections between inputs with different modalities. Such a graph reveals the related contexts of each word in the statement, the table and the program. The graph is used to obtain graph-enhanced contextual representations of words in Transformer-based architecture. After that, a program-driven module network is further introduced to exploit the hierarchical structure of the program, where semantic compositionality is dynamically modeled along the program structure with a set of function-specific modules. 
Ablation experiments suggest that both the heterogeneous graph and the module network are important to obtain strong results.",Fact Checking|LogicalFactChecker|Graph Network|semantic reasoning,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.539.pdf -main.288,"ECPE-2D: Emotion-Cause Pair Extraction based on Joint Two-Dimensional Representation, Interaction and Prediction",Zixiang Ding|Rui Xia|Jianfei Yu,"In recent years, a new interesting task, called emotion-cause pair extraction (ECPE), has emerged in the area of text emotion analysis. It aims at extracting the potential pairs of emotions and their corresponding causes in a document. To solve this task, the existing research employed a two-step framework, which first extracts individual emotion set and cause set, and then pair the corresponding emotions and causes. However, such a pipeline of two steps contains some inherent flaws: 1) the modeling does not aim at extracting the final emotion-cause pair directly; 2) the errors from the first step will affect the performance of the second step. To address these shortcomings, in this paper we propose a new end-to-end approach, called ECPE-Two-Dimensional (ECPE-2D), to represent the emotion-cause pairs by a 2D representation scheme. A 2D transformer module and two variants, window-constrained and cross-road 2D transformers, are further proposed to model the interactions of different emotion-cause pairs. The 2D representation, interaction, and prediction are integrated into a joint framework. In addition to the advantages of joint modeling, the experimental results on the benchmark emotion cause corpus show that our approach improves the F1 score of the state-of-the-art from 61.28% to 68.89%.",Prediction|emotion-cause extraction|text analysis|Joint Representation,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.288.pdf -main.289,Effective Inter-Clause Modeling for End-to-End Emotion-Cause Pair Extraction,Penghui Wei|Jiahao Zhao|Wenji Mao,"Emotion-cause pair extraction aims to extract all emotion clauses coupled with their cause clauses from a given document. Previous work employs two-step approaches, in which the first step extracts emotion clauses and cause clauses separately, and the second step trains a classifier to filter out negative pairs. However, such pipeline-style system for emotion-cause pair extraction is suboptimal because it suffers from error propagation and the two steps may not adapt to each other well. In this paper, we tackle emotion-cause pair extraction from a ranking perspective, i.e., ranking clause pair candidates in a document, and propose a one-step neural approach which emphasizes inter-clause modeling to perform end-to-end extraction. It models the interrelations between the clauses in a document to learn clause representations with graph attention, and enhances clause pair representations with kernel-based relative position embedding for effective ranking. Experimental results show that our approach significantly outperforms the current two-step systems, especially in the condition of extracting multiple pairs in one document.",End-to-End Extraction|Emotion-cause extraction|ranking|Inter-Clause Modeling,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.289.pdf -main.538,Incorporating External Knowledge through Pre-training for Natural Language to Code Generation,Frank F. 
Xu|Zhengbao Jiang|Pengcheng Yin|Bogdan Vasilescu|Graham Neubig,"Open-domain code generation aims to generate code in a general-purpose programming language (such as Python) from natural language (NL) intents. Motivated by the intuition that developers usually retrieve resources on the web when writing code, we explore the effectiveness of incorporating two varieties of external knowledge into NL-to-code generation: automatically mined NL-code pairs from the online programming QA forum StackOverflow and programming language API documentation. Our evaluations show that combining the two sources with data augmentation and retrieval-based data re-sampling improves the current state-of-the-art by up to 2.2% absolute BLEU score on the code generation testbed CoNaLa. The code and resources are available at https://github.com/neulab/external-knowledge-codegen.",Open-domain generation|NL-to-code generation|data augmentation|retrieval-based re-sampling,Semantics: Sentence Level,Short,https://www.aclweb.org/anthology/2020.acl-main.538.pdf -main.276,Geometry-aware domain adaptation for unsupervised alignment of word embeddings,Pratik Jawanpuria|Mayank Meghwanshi|Bamdev Mishra,"We propose a novel manifold based geometric approach for learning unsupervised alignment of word embeddings between the source and the target languages. Our approach formulates the alignment learning problem as a domain adaptation problem over the manifold of doubly stochastic matrices. This viewpoint arises from the aim to align the second order information of the two language spaces. The rich geometry of the doubly stochastic manifold allows to employ efficient Riemannian conjugate gradient algorithm for the proposed formulation. Empirically, the proposed approach outperforms state-of-the-art optimal transport based approach on the bilingual lexicon induction task across several language pairs. The performance improvement is more significant for distant language pairs.",unsupervised embeddings|alignment problem|domain problem|bilingual task,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.276.pdf -main.510,Cross-Lingual Unsupervised Sentiment Classification with Multi-View Transfer Learning,Hongliang Fei|Ping Li,"Recent neural network models have achieved impressive performance on sentiment classification in English as well as other languages. Their success heavily depends on the availability of a large amount of labeled data or parallel corpus. In this paper, we investigate an extreme scenario of cross-lingual sentiment classification, in which the low-resource language does not have any labels or parallel corpus. We propose an unsupervised cross-lingual sentiment classification model named multi-view encoder-classifier (MVEC) that leverages an unsupervised machine translation (UMT) system and a language discriminator. Unlike previous language model (LM) based fine-tuning approaches that adjust parameters solely based on the classification error on training data, we employ the encoder-decoder framework of a UMT as a regularization component on the shared network parameters. In particular, the cross-lingual encoder of our model learns a shared representation, which is effective for both reconstructing input sentences of two languages and generating more representative views from the input for classification. 
Extensive experiments on five language pairs verify that our model significantly outperforms other models for 8/11 sentiment classification tasks.",Cross-Lingual Classification|sentiment classification|unsupervised system|classification,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.510.pdf -main.504,The Cascade Transformer: an Application for Efficient Answer Sentence Selection,Luca Soldaini|Alessandro Moschitti,"Large transformer-based language models have been shown to be very effective in many classification tasks. However, their computational complexity prevents their use in applications requiring the classification of a large set of candidates. While previous works have investigated approaches to reduce model size, relatively little attention has been paid to techniques to improve batch throughput during inference. In this paper, we introduce the Cascade Transformer, a simple yet effective technique to adapt transformer-based models into a cascade of rankers. Each ranker is used to prune a subset of candidates in a batch, thus dramatically increasing throughput at inference time. Partial encodings from the transformer model are shared among rerankers, providing further speed-up. When compared to a state-of-the-art transformer model, our approach reduces computation by 37% with almost no impact on accuracy, as measured on two English Question Answering datasets.",Efficient Selection|Answer Selection|classification tasks|classification,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.504.pdf -main.262,Is Your Classifier Actually Biased? Measuring Fairness under Uncertainty with Bernstein Bounds,Kawin Ethayarajh,"Most NLP datasets are not annotated with protected attributes such as gender, making it difficult to measure classification bias using standard measures of fairness (e.g., equal opportunity). However, manually annotating a large dataset with a protected attribute is slow and expensive. Instead of annotating all the examples, can we annotate a subset of them and use that sample to estimate the bias? While it is possible to do so, the smaller this annotated sample is, the less certain we are that the estimate is close to the true bias. In this work, we propose using Bernstein bounds to represent this uncertainty about the bias estimate as a confidence interval. We provide empirical evidence that a 95% confidence interval derived this way consistently bounds the true bias. In quantifying this uncertainty, our method, which we call Bernstein-bounded unfairness, helps prevent classifiers from being deemed biased or unbiased when there is insufficient evidence to make either claim. Our findings suggest that the datasets currently used to measure specific biases are too small to conclusively identify bias except in the most egregious cases. For example, consider a co-reference resolution system that is 5% more accurate on gender-stereotypical sentences -- to claim it is biased with 95% confidence, we need a bias-specific dataset that is 3.8 times larger than WinoBias, the largest available.",Classifier|Bernstein Bounds|classifiers|co-reference system,Ethics and NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.262.pdf -main.712,Structural Information Preserving for Graph-to-Text Generation,Linfeng Song|Ante Wang|Jinsong Su|Yue Zhang|Kun Xu|Yubin Ge|Dong Yu,"The task of graph-to-text generation aims at producing sentences that preserve the meaning of input graphs. 
As a crucial defect, the current state-of-the-art models may mess up or even drop the core structural information of input graphs when generating outputs. We propose to tackle this problem by leveraging richer training signals that can guide our model for preserving input information. In particular, we introduce two types of autoencoding losses, each individually focusing on different aspects (a.k.a. views) of input graphs. The losses are then back-propagated to better calibrate our model via multi-task training. Experiments on two benchmarks for graph-to-text generation show the effectiveness of our approach over a state-of-the-art baseline.",Structural Preserving|Graph-to-Text Generation|multi-task training|meaning graphs,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.712.pdf -main.706,ESPRIT: Explaining Solutions to Physical Reasoning Tasks,Nazneen Fatema Rajani|Rui Zhang|Yi Chern Tan|Stephan Zheng|Jeremy Weiss|Aadit Vyas|Abhijit Gupta|Caiming Xiong|Richard Socher|Dragomir Radev,"Neural networks lack the ability to reason about qualitative physics and so cannot generalize to scenarios and tasks unseen during training. We propose ESPRIT, a framework for commonsense reasoning about qualitative physics in natural language that generates interpretable descriptions of physical events. We use a two-step approach of first identifying the pivotal physical events in an environment and then generating natural language descriptions of those events using a data-to-text approach. Our framework learns to generate explanations of how the physical simulation will causally evolve so that an agent or a human can easily reason about a solution using those interpretable descriptions. Human evaluations indicate that ESPRIT produces crucial fine-grained details and has high coverage of physical concepts compared to even human annotations. Dataset, code and documentation are available at https://github.com/salesforce/esprit.",Physical Tasks|commonsense reasoning|ESPRIT|Neural networks,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.706.pdf -main.699,On Forgetting to Cite Older Papers: An Analysis of the EMNLP Anthology,Marcel Bollmann|Desmond Elliott,"The field of natural language processing is experiencing a period of unprecedented growth, and with it a surge of published papers. This represents an opportunity for us to take stock of how we cite the work of other researchers, and whether this growth comes at the expense of ""forgetting"" about older literature. In this paper, we address this question through bibliographic analysis. By looking at the age of outgoing citations in papers published at selected EMNLP venues between 2010 and 2019, we find that there is indeed a tendency for recent papers to cite more recent work, but the rate at which papers older than 15 years are cited has remained relatively stable.",natural processing|bibliographic analysis|outgoing citations|EMNLP Anthology,Theme,Short,https://www.aclweb.org/anthology/2020.acl-main.699.pdf -main.128,Few-shot Slot Tagging with Collapsed Dependency Transfer and Label-enhanced Task-adaptive Projection Network,Yutai Hou|Wanxiang Che|Yongkui Lai|Zhihan Zhou|Yijia Liu|Han Liu|Ting Liu,"In this paper, we explore the slot tagging with only a few labeled support sentences (a.k.a. few-shot). Few-shot slot tagging faces a unique challenge compared to the other fewshot classification problems as it calls for modeling the dependencies between labels. 
But it is hard to apply previously learned label dependencies to an unseen domain, due to the discrepancy of label sets. To tackle this, we introduce a collapsed dependency transfer mechanism into the conditional random field (CRF) to transfer abstract label dependency patterns as transition scores. In the few-shot setting, the emission score of CRF can be calculated as a word’s similarity to the representation of each label. To calculate such similarity, we propose a Label-enhanced Task-Adaptive Projection Network (L-TapNet) based on the state-of-the-art few-shot classification model – TapNet, by leveraging label name semantics in representing labels. Experimental results show that our model significantly outperforms the strongest few-shot learning baseline by 14.64 F1 scores in the one-shot setting.",slot tagging|Few-shot tagging|fewshot problems|Few-shot Tagging,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.128.pdf -main.666,Toward Better Storylines with Sentence-Level Language Models,Daphne Ippolito|David Grangier|Douglas Eck|Chris Callison-Burch,"We propose a sentence-level language model which selects the next sentence in a story from a finite set of fluent alternatives. Since it does not need to model fluency, the sentence-level language model can focus on longer range dependencies, which are crucial for multi-sentence coherence. Rather than dealing with individual words, our method treats the story so far as a list of pre-trained sentence embeddings and predicts an embedding for the next sentence, which is more efficient than predicting word embeddings. Notably this allows us to consider a large number of candidates for the next sentence during training. We demonstrate the effectiveness of our approach with state-of-the-art accuracy on the unsupervised Story Cloze task and with promising results on larger-scale next sentence prediction tasks.",unsupervised task|larger-scale tasks|Sentence-Level Models|sentence-level model,Generation,Short,https://www.aclweb.org/anthology/2020.acl-main.666.pdf -main.100,Expertise Style Transfer: A New Task Towards Better Communication between Experts and Laymen,Yixin Cao|Ruihao Shui|Liangming Pan|Min-Yen Kan|Zhiyuan Liu|Tat-Seng Chua,"The curse of knowledge can impede communication between experts and laymen. We propose a new task of expertise style transfer and contribute a manually annotated dataset with the goal of alleviating such cognitive biases. Solving this task not only simplifies the professional language, but also improves the accuracy and expertise level of laymen descriptions using simple words. This is a challenging task, unaddressed in previous work, as it requires the models to have expert intelligence in order to modify text with a deep understanding of domain knowledge and structures. We establish the benchmark performance of five state-of-the-art models for style transfer and text simplification. The results demonstrate a significant gap between machine and human performance. We also discuss the challenges of automatic evaluation, to provide insights into future research directions. 
The dataset is publicly available at https://srhthu.github.io/expertise-style-transfer/.",Expertise Transfer|style transfer|text simplification|automatic evaluation,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.100.pdf -main.114,Multimodal Quality Estimation for Machine Translation,Shu Okabe|Frédéric Blain|Lucia Specia,"We propose approaches to Quality Estimation (QE) for Machine Translation that explore both text and visual modalities for Multimodal QE. We compare various multimodality integration and fusion strategies. For both sentence-level and document-level predictions, we show that state-of-the-art neural and feature-based QE frameworks obtain better results when using the additional modality.",Multimodal Estimation|Machine Translation|Quality Estimation|Quality QE,Resources and Evaluation,Short,https://www.aclweb.org/anthology/2020.acl-main.114.pdf -main.672,Do Transformers Need Deep Long-Range Memory?,Jack Rae|Ali Razavi,"Deep attention models have advanced the modelling of sequential data across many domains. For language modelling in particular, the Transformer-XL --- a Transformer augmented with a long-range memory of past activations --- has been shown to be state-of-the-art across a variety of well-studied benchmarks. The Transformer-XL incorporates a long-range memory at every layer of the network, which renders its state to be thousands of times larger than RNN predecessors. However it is unclear whether this is necessary. We perform a set of interventions to show that comparable performance can be obtained with 6X fewer long range memories and better performance can be obtained by limiting the range of attention in lower layers of the network.",modelling data|language modelling|Transformers|Deep models,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.672.pdf -main.21,Learning to Ask More: Semi-Autoregressive Sequential Question Generation under Dual-Graph Interaction,Zi Chai|Xiaojun Wan,"Traditional Question Generation (TQG) aims to generate a question given an input passage and an answer. When there is a sequence of answers, we can perform Sequential Question Generation (SQG) to produce a series of interconnected questions. Since the frequently occurred information omission and coreference between questions, SQG is rather challenging. Prior works regarded SQG as a dialog generation task and recurrently produced each question. However, they suffered from problems caused by error cascades and could only capture limited context dependencies. To this end, we generate questions in a semi-autoregressive way. Our model divides questions into different groups and generates each group of them in parallel. During this process, it builds two graphs focusing on information from passages, answers respectively and performs dual-graph interaction to get information for generation. Besides, we design an answer-aware attention mechanism and the coarse-to-fine generation scenario. Experiments on our new dataset containing 81.9K questions show that our model substantially outperforms prior works.",Semi-Autoregressive Generation|Question Generation|dialog task|generation,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.21.pdf -main.458,Optimizing the Factual Correctness of a Summary: A Study of Summarizing Radiology Reports,Yuhao Zhang|Derek Merck|Emily Tsai|Christopher D. Manning|Curtis Langlotz,"Neural abstractive summarization models are able to generate summaries which have high overlap with human references. 
However, existing models are not optimized for factual correctness, a critical metric in real-world applications. In this work, we develop a general framework where we evaluate the factual correctness of a generated summary by fact-checking it automatically against its reference using an information extraction module. We further propose a training strategy which optimizes a neural summarization model with a factual correctness reward via reinforcement learning. We apply the proposed method to the summarization of radiology reports, where factual correctness is a key requirement. On two separate datasets collected from hospitals, we show via both automatic and human evaluation that the proposed approach substantially improves the factual correctness and overall quality of outputs over a competitive neural summarization system, producing radiology summaries that approach the quality of human-authored ones.",Summarizing Reports|real-world applications|summarization reports|Neural models,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.458.pdf -main.35,Evaluating Explanation Methods for Neural Machine Translation,Jierui Li|Lemao Liu|Huayang Li|Guanlin Li|Guoping Huang|Shuming Shi,"Recently many efforts have been devoted to interpreting the black-box NMT models, but little progress has been made on metrics to evaluate explanation methods. Word Alignment Error Rate can be used as such a metric that matches human understanding, however, it can not measure explanation methods on those target words that are not aligned to any source word. This paper thereby makes an initial attempt to evaluate explanation methods from an alternative viewpoint. To this end, it proposes a principled metric based on fidelity in regard to the predictive behavior of the NMT model. As the exact computation for this metric is intractable, we employ an efficient approach as its approximation. On six standard translation tasks, we quantitatively evaluate several explanation methods in terms of the proposed metric and we reveal some valuable findings for these explanation methods in our experiments.",Neural Translation|translation tasks|Explanation Methods|black-box models,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.35.pdf -main.302,Efficient Second-Order TreeCRF for Neural Dependency Parsing,Yu Zhang|Zhenghua Li|Min Zhang,"In the deep learning (DL) era, parsing models are extremely simplified with little hurt on performance, thanks to the remarkable capability of multi-layer BiLSTMs in context representation. As the most popular graph-based dependency parser due to its high efficiency and performance, the biaffine parser directly scores single dependencies under the arc-factorization assumption, and adopts a very simple local token-wise cross-entropy training loss. This paper for the first time presents a second-order TreeCRF extension to the biaffine parser. For a long time, the complexity and inefficiency of the inside-outside algorithm hinder the popularity of TreeCRF. To address this issue, we propose an effective way to batchify the inside and Viterbi algorithms for direct large matrix operation on GPUs, and to avoid the complex outside algorithm via efficient back-propagation. 
Experiments and analysis on 27 datasets from 13 languages clearly show that techniques developed before the DL era, such as structural learning (global TreeCRF loss) and high-order modeling are still useful, and can further boost parsing performance over the state-of-the-art biaffine parser, especially for partially annotated training data. We release our code at https://github.com/yzhangcs/crfpar.",Neural Parsing|deep era|context representation|direct operation,"Syntax: Tagging, Chunking and Parsing",Long,https://www.aclweb.org/anthology/2020.acl-main.302.pdf -main.464,Examining Citations of Natural Language Processing Literature,Saif M. Mohammad,"We extracted information from the EMNLP Anthology (AA) and Google Scholar (GS) to examine trends in citations of NLP papers. We explore questions such as: how well cited are papers of different types (journal articles, conference papers, demo papers, etc.)? how well cited are papers from different areas of within NLP? etc. Notably, we show that only about 56% of the papers in AA are cited ten or more times. CL Journal has the most cited papers, but its citation dominance has lessened in recent years. On average, long papers get almost three times as many citations as short papers; and papers on sentiment classification, anaphora resolution, and entity recognition have the highest median citations. The analyses presented here, and the associated dataset of NLP papers mapped to citations, have a number of uses including: understanding how the field is growing and quantifying the impact of different types of papers.",NLP|AA|sentiment classification|anaphora resolution,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.464.pdf -main.470,Balancing Objectives in Counseling Conversations: Advancing Forwards or Looking Backwards,Justine Zhang|Cristian Danescu-Niculescu-Mizil,"Throughout a conversation, participants make choices that can orient the flow of the interaction. Such choices are particularly salient in the consequential domain of crisis counseling, where a difficulty for counselors is balancing between two key objectives: advancing the conversation towards a resolution, and empathetically addressing the crisis situation. In this work, we develop an unsupervised methodology to quantify how counselors manage this balance. Our main intuition is that if an utterance can only receive a narrow range of appropriate replies, then its likely aim is to advance the conversation forwards, towards a target within that range. Likewise, an utterance that can only appropriately follow a narrow range of possible utterances is likely aimed backwards at addressing a specific situation within that range. By applying this intuition, we can map each utterance to a continuous orientation axis that captures the degree to which it is intended to direct the flow of the conversation forwards or backwards. This unsupervised method allows us to characterize counselor behaviors in a large dataset of crisis counseling conversations, where we show that known counseling strategies intuitively align with this axis. 
We also illustrate how our measure can be indicative of a conversation's progress, as well as its effectiveness.",Counseling Conversations|crisis counseling|unsupervised methodology|unsupervised method,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.470.pdf -main.316,On the Encoder-Decoder Incompatibility in Variational Text Modeling and Beyond,Chen Wu|Prince Zizhuang Wang|William Yang Wang,"Variational autoencoders (VAEs) combine latent variables with amortized variational inference, whose optimization usually converges into a trivial local optimum termed posterior collapse, especially in text modeling. By tracking the optimization dynamics, we observe the encoder-decoder incompatibility that leads to poor parameterizations of the data manifold. We argue that the trivial local optimum may be avoided by improving the encoder and decoder parameterizations since the posterior network is part of a transition map between them. To this end, we propose Coupled-VAE, which couples a VAE model with a deterministic autoencoder with the same structure and improves the encoder and decoder parameterizations via encoder weight sharing and decoder signal matching. We apply the proposed Coupled-VAE approach to various VAE models with different regularization, posterior family, decoder structure, and optimization strategy. Experiments on benchmark datasets (i.e., PTB, Yelp, and Yahoo) show consistently improved results in terms of probability estimation and richness of the latent space. We also generalize our method to conditional language modeling and propose Coupled-CVAE, which largely improves the diversity of dialogue generation on the Switchboard dataset.",Encoder-Decoder Incompatibility|Variational Modeling|text modeling|probability estimation,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.316.pdf -main.10,Slot-consistent NLG for Task-oriented Dialogue Systems with Iterative Rectification Network,Yangming Li|Kaisheng Yao|Libo Qin|Wanxiang Che|Xiaolong Li|Ting Liu,"Data-driven approaches using neural networks have achieved promising performances in natural language generation (NLG). However, neural generators are prone to make mistakes, e.g., neglecting an input slot value and generating a redundant slot value. Prior works refer this to hallucination phenomenon. In this paper, we study slot consistency for building reliable NLG systems with all slot values of input dialogue act (DA) properly generated in output sentences. We propose Iterative Rectification Network (IRN) for improving general NLG systems to produce both correct and fluent responses. It applies a bootstrapping algorithm to sample training candidates and uses reinforcement learning to incorporate discrete reward related to slot inconsistency into training. Comprehensive studies have been conducted on multiple benchmark datasets, showing that the proposed methods have significantly reduced the slot error rate (ERR) for all strong baselines. 
Human evaluations also have confirmed its effectiveness.",Task-oriented Systems|natural generation|natural NLG|NLG,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.10.pdf -main.469,What Does BERT with Vision Look At?,Liunian Harold Li|Mark Yatskar|Da Yin|Cho-Jui Hsieh|Kai-Wei Chang,"Pre-trained visually grounded language models such as ViLBERT, LXMERT, and UNITER have achieved significant performance improvement on vision-and-language tasks but what they learn during pre-training remains unclear. In this work, we demonstrate that certain attention heads of a visually grounded language model actively ground elements of language to image regions. Specifically, some heads can map entities to image regions, performing the task known as entity grounding. Some heads can even detect the syntactic relations between non-entity words and image regions, tracking, for example, associations between verbs and regions corresponding to their arguments. We denote this ability as syntactic grounding. We verify grounding both quantitatively and qualitatively, using Flickr30K Entities as a testbed.",vision-and-language tasks|pre-training|entity grounding|Pre-trained models,Theme,Short,https://www.aclweb.org/anthology/2020.acl-main.469.pdf -main.333,Towards Holistic and Automatic Evaluation of Open-Domain Dialogue Generation,Bo Pang|Erik Nijkamp|Wenjuan Han|Linqi Zhou|Yixian Liu|Kewei Tu,"Open-domain dialogue generation has gained increasing attention in Natural Language Processing. Its evaluation requires a holistic means. Human ratings are deemed as the gold standard. As human evaluation is inefficient and costly, an automated substitute is highly desirable. In this paper, we propose holistic evaluation metrics that capture different aspects of open-domain dialogues. Our metrics consist of (1) GPT-2 based context coherence between sentences in a dialogue, (2) GPT-2 based fluency in phrasing, (3) n-gram based diversity in responses to augmented queries, and (4) textual-entailment-inference based logical self-consistency. The empirical validity of our metrics is demonstrated by strong correlations with human judgments. We open source the code and relevant materials.",Holistic Generation|Open-domain generation|Natural Processing|human evaluation,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.333.pdf -main.455,Fact-based Content Weighting for Evaluating Abstractive Summarisation,Xinnuo Xu|Ondřej Dušek|Jingyi Li|Verena Rieser|Ioannis Konstas,"Abstractive summarisation is notoriously hard to evaluate since standard word-overlap-based metrics are insufficient. We introduce a new evaluation metric which is based on fact-level content weighting, i.e. relating the facts of the document to the facts of the summary. We follow the assumption that a good summary will reflect all relevant facts, i.e. the ones present in the ground truth (human-generated reference summary). We confirm this hypothesis by showing that our weightings are highly correlated to human perception and compare favourably to the recent manual highlight-based metric of Hardy et al.
(2019).",Fact-based Weighting|Evaluating Summarisation|Abstractive Summarisation|fact-level weighting,Summarization,Short,https://www.aclweb.org/anthology/2020.acl-main.455.pdf -main.441,Adversarial NLI: A New Benchmark for Natural Language Understanding,Yixin Nie|Adina Williams|Emily Dinan|Mohit Bansal|Jason Weston|Douwe Kiela,"We introduce a new large-scale NLI benchmark dataset, collected via an iterative, adversarial human-and-model-in-the-loop procedure. We show that training models on this new dataset leads to state-of-the-art performance on a variety of popular NLI benchmarks, while posing a more difficult challenge with its new test set. Our analysis sheds light on the shortcomings of current state-of-the-art models, and shows that non-expert annotators are successful at finding their weaknesses. The data collection method can be applied in a never-ending learning scenario, becoming a moving target for NLU, rather than a static benchmark that will quickly saturate.",Adversarial NLI|Natural Understanding|never-ending scenario|NLU,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.441.pdf -main.38,Lipschitz Constrained Parameter Initialization for Deep Transformers,Hongfei Xu|Qiuhui Liu|Josef van Genabith|Deyi Xiong|Jingyi Zhang,"The Transformer translation model employs residual connection and layer normalization to ease the optimization difficulties caused by its multi-layer encoder/decoder structure. Previous research shows that even with residual connection and layer normalization, deep Transformers still have difficulty in training, and particularly Transformer models with more than 12 encoder/decoder layers fail to converge. In this paper, we first empirically demonstrate that a simple modification made in the official implementation, which changes the computation order of residual connection and layer normalization, can significantly ease the optimization of deep Transformers. We then compare the subtle differences in computation order in considerable detail, and present a parameter initialization method that leverages the Lipschitz constraint on the initialization of Transformer parameters that effectively ensures training convergence. In contrast to findings in previous research we further demonstrate that with Lipschitz parameter initialization, deep Transformers with the original computation order can converge, and obtain significant BLEU improvements with up to 24 layers. In contrast to previous research which focuses on deep encoders, our approach additionally enables Transformers to also benefit from deep decoders.",Lipschitz Initialization|Deep Transformers|Transformer model|layer normalization,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.38.pdf -main.327,Automatic Machine Translation Evaluation using Source Language Inputs and Cross-lingual Language Model,Kosuke Takahashi|Katsuhito Sudoh|Satoshi Nakamura,"We propose an automatic evaluation method of machine translation that uses source language sentences regarded as additional pseudo references. The proposed method evaluates a translation hypothesis in a regression model. The model takes the paired source, reference, and hypothesis sentence all together as an input. A pretrained large scale cross-lingual language model encodes the input to sentence-pair vectors, and the model predicts a human evaluation score with those vectors. 
Our experiments show that our proposed method using Cross-lingual Language Model (XLM) trained with a translation language modeling (TLM) objective achieves a higher correlation with human judgments than a baseline method that uses only hypothesis and reference sentences. Additionally, using source sentences in our proposed method is confirmed to improve the evaluation performance.",Automatic Evaluation|machine translation|Cross-lingual Model|regression model,Resources and Evaluation,Short,https://www.aclweb.org/anthology/2020.acl-main.327.pdf -main.496,Rationalizing Text Matching: Learning Sparse Alignments via Optimal Transport,Kyle Swanson|Lili Yu|Tao Lei,"Selecting input features of top relevance has become a popular method for building self-explaining models. In this work, we extend this selective rationalization approach to text matching, where the goal is to jointly select and align text pieces, such as tokens or sentences, as a justification for the downstream prediction. Our approach employs optimal transport (OT) to find a minimal cost alignment between the inputs. However, directly applying OT often produces dense and therefore uninterpretable alignments. To overcome this limitation, we introduce novel constrained variants of the OT problem that result in highly sparse alignments with controllable sparsity. Our model is end-to-end differentiable using the Sinkhorn algorithm for OT and can be trained without any alignment annotations. We evaluate our model on the StackExchange, MultiNews, e-SNLI, and MultiRC datasets. Our model achieves very sparse rationale selections with high fidelity while preserving prediction accuracy compared to strong attention baseline models.",Rationalizing Matching|text matching|downstream prediction|constrained problem,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.496.pdf -main.482,ZPR2: Joint Zero Pronoun Recovery and Resolution using Multi-Task Learning and BERT,Linfeng Song|Kun Xu|Yue Zhang|Jianshu Chen|Dong Yu,"Zero pronoun recovery and resolution aim at recovering the dropped pronoun and pointing out its anaphoric mentions, respectively. We propose to better explore their interaction by solving both tasks together, while the previous work treats them separately. For zero pronoun resolution, we study this task in a more realistic setting, where no parsing trees or only automatic trees are available, while most previous work assumes gold trees. Experiments on two benchmarks show that joint modeling significantly outperforms our baseline that already beats the previous state of the arts.",Joint Resolution|Zero recovery|resolution|zero resolution,Discourse and Pragmatics,Short,https://www.aclweb.org/anthology/2020.acl-main.482.pdf -main.119,AMR Parsing via Graph-Sequence Iterative Inference,Deng Cai|Wai Lam,"We propose a new end-to-end model that treats AMR parsing as a series of dual decisions on the input sequence and the incrementally constructed graph. At each time step, our model performs multiple rounds of attention, reasoning, and composition that aim to answer two critical questions: (1) which part of the input sequence to abstract; and (2) where in the output graph to construct the new concept. We show that the answers to these two questions are mutually causalities. We design a model based on iterative inference that helps achieve better answers in both perspectives, leading to greatly improved parsing accuracy. 
Our experimental results significantly outperform all previously reported Smatch scores by large margins. Remarkably, without the help of any large-scale pre-trained language model (e.g., BERT), our model already surpasses previous state-of-the-art using BERT. With the help of BERT, we can push the state-of-the-art results to 80.2% on LDC2017T10 (AMR 2.0) and 75.4% on LDC2014T12 (AMR 1.0).",AMR parsing|AMR Parsing|Graph-Sequence Inference|end-to-end model,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.119.pdf -main.657,Premise Selection in Natural Language Mathematical Texts,Deborah Ferreira|André Freitas,"The discovery of supporting evidence for addressing complex mathematical problems is a semantically challenging task, which is still unexplored in the field of natural language processing for mathematical text. The natural language premise selection task consists in using conjectures written in both natural language and mathematical formulae to recommend premises that most likely will be useful to prove a particular statement. We propose an approach to solve this task as a link prediction problem, using Deep Convolutional Graph Neural Networks. This paper also analyses how different baselines perform in this task and shows that a graph structure can provide higher F1-score, especially when considering multi-hop premise selection.",Premise Selection|complex problems|semantically task|natural processing,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.657.pdf -main.131,You Impress Me: Dialogue Generation via Mutual Persona Perception,Qian Liu|Yihong Chen|Bei Chen|Jian-Guang Lou|Zixuan Chen|Bin Zhou|Dongmei Zhang,"Despite the continuing efforts to improve the engagingness and consistency of chit-chat dialogue systems, the majority of current work simply focus on mimicking human-like responses, leaving understudied the aspects of modeling understanding between interlocutors. The research in cognitive science, instead, suggests that understanding is an essential signal for a high-quality chit-chat conversation. Motivated by this, we propose P^2 Bot, a transmitter-receiver based framework with the aim of explicitly modeling understanding. Specifically, P^2 Bot incorporates mutual persona perception to enhance the quality of personalized dialogue generation. Experiments on a large public dataset, Persona-Chat, demonstrate the effectiveness of our approach, with a considerable boost over the state-of-the-art baselines across both automatic metrics and human evaluations.",Dialogue Generation|mimicking responses|cognitive science|understanding,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.131.pdf -main.125,Self-Attention Guided Copy Mechanism for Abstractive Summarization,Song Xu|Haoran Li|Peng Yuan|Youzheng Wu|Xiaodong He|Bowen Zhou,"Copy module has been widely equipped in the recent abstractive summarization models, which facilitates the decoder to extract words from the source into the summary. Generally, the encoder-decoder attention is served as the copy distribution, while how to guarantee that important words in the source are copied remains a challenge. In this work, we propose a Transformer-based model to enhance the copy mechanism. Specifically, we identify the importance of each source word based on the degree centrality with a directed graph built by the self-attention layer in the Transformer. 
We use the centrality of each source word to guide the copy process explicitly. Experimental results show that the self-attention graph provides useful guidance for the copy distribution. Our proposed models significantly outperform the baseline methods on the CNN/Daily Mail dataset and the Gigaword dataset.",Abstractive Summarization|copy mechanism|copy process|copy distribution,Summarization,Short,https://www.aclweb.org/anthology/2020.acl-main.125.pdf -main.643,Multimodal Neural Graph Memory Networks for Visual Question Answering,Mahmoud Khademi,"We introduce a new neural network architecture, Multimodal Neural Graph Memory Networks (MN-GMN), for visual question answering. Our novel approach uses graph structure with different region features as node attributes and applies a recently proposed powerful graph neural network model, Graph Network (GN), to reason about objects and their interactions in the scene context. The input module of the MN-GMN generates a set of visual features plus a set of region-grounded captions (RGCs) for the image. The RGCs capture object attributes and their relationships. Two GNs are constructed from the input module using visual features and RGCs. Each node of the GNs iteratively computes a question-guided contextualized representation of the visual/textual information assigned to it. To combine the information from both GNs, each node writes the updated representations to an external spatial memory. The final states of the memory cells are fed into an answer module to predict an answer. Experiments show that MN-GMN rivals the state-of-the-art models on the Visual7W, VQA-v2.0, and CLEVR datasets.",Multimodal Networks|Visual Answering|neural architecture|Multimodal Networks,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.643.pdf -main.694,Variational Neural Machine Translation with Normalizing Flows,Hendra Setiawan|Matthias Sperber|Udhyakumar Nallasamy|Matthias Paulik,"Variational Neural Machine Translation (VNMT) is an attractive framework for modeling the generation of target translations, conditioned not only on the source sentence but also on some latent random variables. The latent variable modeling may introduce useful statistical dependencies that can improve translation accuracy. Unfortunately, learning informative latent variables is non-trivial, as the latent space can be prohibitively large, and the latent codes are prone to be ignored by many translation models at training time. Previous works impose strong assumptions on the distribution of the latent code and limit the choice of the NMT architecture. In this paper, we propose to apply the VNMT framework to the state-of-the-art Transformer and introduce a more flexible approximate posterior based on normalizing flows. We demonstrate the efficacy of our proposal under both in-domain and out-of-domain conditions, significantly outperforming strong baselines.",Variational Translation|Variational VNMT|Variational|generation translations,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.694.pdf -main.680,Temporally-Informed Analysis of Named Entity Recognition,Shruti Rijhwani|Daniel Preotiuc-Pietro,"Natural language processing models often have to make predictions on text data that evolves over time as a result of changes in language use or the information described in the text. However, evaluation results on existing data sets are seldom reported by taking the timestamp of the document into account. 
We analyze and propose methods that make better use of temporally-diverse training data, with a focus on the task of named entity recognition. To support these experiments, we introduce a novel data set of English tweets annotated with named entities. We empirically demonstrate the effect of temporal drift on performance, and how the temporal information of documents can be used to obtain better models compared to those that disregard temporal information. Our analysis gives insights into why this information is useful, in the hope of informing potential avenues of improvement for named entity recognition as well as other NLP tasks under similar experimental setups.",named recognition|NLP tasks|Natural models|language use,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.680.pdf -main.723,A Prioritization Model for Suicidality Risk Assessment,Han-Chin Shing|Philip Resnik|Douglas Oard,"We reframe suicide risk assessment from social media as a ranking problem whose goal is maximizing detection of severely at-risk individuals given the time available. Building on measures developed for resource-bounded document retrieval, we introduce a well founded evaluation paradigm, and demonstrate using an expert-annotated test collection that meaningful improvements over plausible cascade model baselines can be achieved using an approach that jointly ranks individuals and their social media posts.",Suicidality Assessment|suicide assessment|ranking problem|resource-bounded retrieval,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.acl-main.723.pdf -main.737,Phonetic and Visual Priors for Decipherment of Informal Romanization,Maria Ryskina|Matthew R. Gormley|Taylor Berg-Kirkpatrick,"Informal romanization is an idiosyncratic process used by humans in informal digital communication to encode non-Latin script languages into Latin character sets found on common keyboards. Character substitution choices differ between users but have been shown to be governed by the same main principles observed across a variety of languages---namely, character pairs are often associated through phonetic or visual similarity. We propose a noisy-channel WFST cascade model for deciphering the original non-Latin script from observed romanized text in an unsupervised fashion. We train our model directly on romanized data from two languages: Egyptian Arabic and Russian. We demonstrate that adding inductive bias through phonetic and visual priors on character mappings substantially improves the model's performance on both languages, yielding results much closer to the supervised skyline. Finally, we introduce a new dataset of romanized Russian, collected from a Russian social network website and partially annotated for our experiments.",Decipherment Romanization|Informal romanization|idiosyncratic process|noisy-channel model,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.acl-main.737.pdf -main.509,Agreement Prediction of Arguments in Cyber Argumentation for Detecting Stance Polarity and Intensity,Joseph Sirrianni|Xiaoqing Liu|Douglas Adams,"In online debates, users express different levels of agreement/disagreement with one another's arguments and ideas. Often levels of agreement/disagreement are implicit in the text, and must be predicted to analyze collective opinions. Existing stance detection methods predict the polarity of a post's stance toward a topic or post, but don't consider the stance's degree of intensity. 
We introduce a new research problem, stance polarity and intensity prediction in response relationships between posts. This problem is challenging because differences in stance intensity are often subtle and require nuanced language understanding. Cyber argumentation research has shown that incorporating both stance polarity and intensity data in online debates leads to better discussion analysis. We explore five different learning models: Ridge-M regression, Ridge-S regression, SVR-RF-R, pkudblab-PIP, and T-PAN-PIP for predicting stance polarity and intensity in argumentation. These models are evaluated using a new dataset for stance polarity and intensity prediction collected using a cyber argumentation platform. The SVR-RF-R model performs best for prediction of stance polarity with an accuracy of 70.43% and intensity with RMSE of 0.596. This work is the first to train models for predicting a post's stance polarity and intensity in one combined value in cyber argumentation with reasonably good accuracy.",Cyber Argumentation|Detecting Polarity|online debates|intensity relationships,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.509.pdf -main.247,Span Selection Pre-training for Question Answering,Michael Glass|Alfio Gliozzo|Rishav Chakravarti|Anthony Ferritto|Lin Pan|G P Shrivatsa Bhargav|Dinesh Garg|Avi Sil,"BERT (Bidirectional Encoder Representations from Transformers) and related pre-trained Transformers have provided large gains across many language understanding tasks, achieving a new state-of-the-art (SOTA). BERT is pretrained on two auxiliary tasks: Masked Language Model and Next Sentence Prediction. In this paper we introduce a new pre-training task inspired by reading comprehension to better align the pre-training from memorization to understanding. Span Selection PreTraining (SSPT) poses cloze-like training instances, but rather than draw the answer from the model’s parameters, it is selected from a relevant passage. We find significant and consistent improvements over both BERT-BASE and BERT-LARGE on multiple Machine Reading Comprehension (MRC) datasets. Specifically, our proposed model has strong empirical evidence as it obtains SOTA results on Natural Questions, a new benchmark MRC dataset, outperforming BERT-LARGE by 3 F1 points on short answer prediction. We also show significant impact in HotpotQA, improving answer prediction F1 by 4 points and supporting fact prediction F1 by 1 point and outperforming the previous best system. Moreover, we show that our pre-training approach is particularly effective when training data is limited, improving the learning curve by a large amount.",Question Answering|language tasks|Next Prediction|pre-training task,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.247.pdf -main.521,IMoJIE: Iterative Memory-Based Joint Open Information Extraction,Keshav Kolluru|Samarth Aggarwal|Vipul Rathore|Mausam -|Soumen Chakrabarti,"While traditional systems for Open Information Extraction were statistical and rule-based, recently neural models have been introduced for the task. Our work builds upon CopyAttention, a sequence generation OpenIE model (Cui et. al. 18). Our analysis reveals that CopyAttention produces a constant number of extractions per sentence, and its extracted tuples often express redundant information. We present IMoJIE, an extension to CopyAttention, which produces the next extraction conditioned on all previously extracted tuples. 
This approach overcomes both shortcomings of CopyAttention, resulting in a variable number of diverse extractions per sentence. We train IMoJIE on training data bootstrapped from extractions of several non-neural systems, which have been automatically filtered to reduce redundancy and noise. IMoJIE outperforms CopyAttention by about 18 F1 pts, and a BERT-based strong baseline by 2 F1 pts, establishing a new state of the art for the task.",Iterative Extraction|Open Extraction|IMoJIE|Iterative ,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.521.pdf -main.535,Paraphrase Generation by Learning How to Edit from Samples,Amirhossein Kazemnejad|Mohammadreza Salehi|Mahdieh Soleymani Baghshah,"Neural sequence to sequence text generation has been proved to be a viable approach to paraphrase generation. Despite promising results, paraphrases generated by these models mostly suffer from lack of quality and diversity. To address these problems, we propose a novel retrieval-based method for paraphrase generation. Our model first retrieves a paraphrase pair similar to the input sentence from a pre-defined index. With its novel editor module, the model then paraphrases the input sequence by editing it using the extracted relations between the retrieved pair of sentences. In order to have fine-grained control over the editing process, our model uses the newly introduced concept of Micro Edit Vectors. It both extracts and exploits these vectors using the attention mechanism in the Transformer architecture. Experimental results show the superiority of our paraphrase generation method in terms of both automatic metrics, and human evaluation of relevance, grammaticality, and diversity of generated paraphrases.",Paraphrase Generation|Neural sequence|sequence generation|retrieval-based method,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.535.pdf -main.253,On The Evaluation of Machine Translation SystemsTrained With Back-Translation,Sergey Edunov|Myle Ott|Marc'Aurelio Ranzato|Michael Auli,"Back-translation is a widely used data augmentation technique which leverages target monolingual data. However, its effectiveness has been challenged since automatic metrics such as BLEU only show significant improvements for test examples where the source itself is a translation, or translationese. This is believed to be due to translationese inputs better matching the back-translated training data. In this work, we show that this conjecture is not empirically supported and that back-translation improves translation quality of both naturally occurring text as well as translationese according to professional human translators. We provide empirical evidence to support the view that back-translation is preferred by humans because it produces more fluent outputs. BLEU cannot capture human preferences because references are translationese when source sentences are natural text. We recommend complementing BLEU with a language model score to measure fluency.",Machine SystemsTrained|translation|translationese|fluency,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.253.pdf -main.284,Improving Segmentation for Technical Support Problems,Kushal Chauhan|Abhirut Gupta,"Technical support problems are often long and complex. They typically contain user descriptions of the problem, the setup, and steps for attempted resolution. Often they also contain various non-natural language text elements like outputs of commands, snippets of code, error messages or stack traces. 
These elements contain potentially crucial information for problem resolution. However, they cannot be correctly parsed by tools designed for natural language. In this paper, we address the problem of segmentation for technical support questions. We formulate the problem as a sequence labelling task, and study the performance of state of the art approaches. We compare this against an intuitive contextual sentence-level classification baseline, and a state of the art supervised text-segmentation approach. We also introduce a novel component of combining contextual embeddings from multiple language models pre-trained on different data sources, which achieves a marked improvement over using embeddings from a single pre-trained language model. Finally, we also demonstrate the usefulness of such segmentation with improvements on the downstream task of answer retrieval.",Segmentation|Technical Problems|attempted resolution|problem resolution,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.284.pdf -main.290,Embarrassingly Simple Unsupervised Aspect Extraction,Stéphan Tulkens|Andreas van Cranenburgh,"We present a simple but effective method for aspect identification in sentiment analysis. Our unsupervised method only requires word embeddings and a POS tagger, and is therefore straightforward to apply to new domains and languages. We introduce Contrastive Attention (CAt), a novel single-head attention mechanism based on an RBF kernel, which gives a considerable boost in performance and makes the model interpretable. Previous work relied on syntactic features and complex neural models. We show that given the simplicity of current benchmark datasets for aspect extraction, such complex models are not needed. The code to reproduce the experiments reported in this paper is available at https://github.com/clips/cat.",Embarrassingly Extraction|aspect identification|sentiment analysis|aspect extraction,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Short,https://www.aclweb.org/anthology/2020.acl-main.290.pdf -main.291,Enhancing Cross-target Stance Detection with Transferable Semantic-Emotion Knowledge,Bowen Zhang|Min Yang|Xutao Li|Yunming Ye|Xiaofei Xu|Kuai Dai,"Stance detection is an important task, which aims to classify the attitude of an opinionated text towards a given target. Remarkable success has been achieved when sufficient labeled training data is available. However, annotating sufficient data is labor-intensive, which establishes significant barriers for generalizing the stance classifier to the data with new targets. In this paper, we proposed a Semantic-Emotion Knowledge Transferring (SEKT) model for cross-target stance detection, which uses the external knowledge (semantic and emotion lexicons) as a bridge to enable knowledge transfer across different targets. Specifically, a semantic-emotion heterogeneous graph is constructed from external semantic and emotion lexicons, which is then fed into a graph convolutional network to learn multi-hop semantic connections between words and emotion tags. Then, the learned semantic-emotion graph representation, which serves as prior knowledge bridging the gap between the source and target domains, is fully integrated into the bidirectional long short-term memory (BiLSTM) stance classifier by adding a novel knowledge-aware memory unit to the BiLSTM cell. 
Extensive experiments on a large real-world dataset demonstrate the superiority of SEKT against the state-of-the-art baseline methods.",Cross-target Detection|Stance detection|knowledge transfer|stance classifier,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.291.pdf -main.285,MOOCCube: A Large-scale Data Repository for NLP Applications in MOOCs,Jifan Yu|Gan Luo|Tong Xiao|Qingyang Zhong|Yuquan Wang|Wenzheng Feng|Junyi Luo|Chenyu Wang|Lei Hou|Juanzi Li|Zhiyuan Liu|Jie Tang,"The prosperity of Massive Open Online Courses (MOOCs) provides fodder for many NLP and AI research for education applications, e.g., course concept extraction, prerequisite relation discovery, etc. However, the publicly available datasets of MOOC are limited in size with few types of data, which hinders advanced models and novel attempts in related topics. Therefore, we present MOOCCube, a large-scale data repository of over 700 MOOC courses, 100k concepts, 8 million student behaviors with an external resource. Moreover, we conduct a prerequisite discovery task as an example application to show the potential of MOOCCube in facilitating relevant research. The data repository is now available at http://moocdata.cn/data/MOOCCube.",NLP Applications|NLP MOOCs|NLP research|education applications,NLP Applications,Short,https://www.aclweb.org/anthology/2020.acl-main.285.pdf -main.534,Neural-DINF: A Neural Network based Framework for Measuring Document Influence,Jie Tan|Changlin Yang|Ying Li|Siliang Tang|Chen Huang|Yueting Zhuang,"Measuring the scholarly impact of a document without citations is an important and challenging problem. Existing approaches such as Document Influence Model (DIM) are based on dynamic topic models, which only consider the word frequency change. In this paper, we use both frequency changes and word semantic shifts to measure document influence by developing a neural network framework. Our model has three steps. Firstly, we train the word embeddings for different time periods. Subsequently, we propose an unsupervised method to align vectors for different time periods. Finally, we compute the influence value of documents. Our experimental results show that our model outperforms DIM.",Measuring Influence|Measuring impact|Neural-DINF|Neural Framework,NLP Applications,Short,https://www.aclweb.org/anthology/2020.acl-main.534.pdf -main.252,Leveraging Monolingual Data with Self-Supervision for Multilingual Neural Machine Translation,Aditya Siddhant|Ankur Bapna|Yuan Cao|Orhan Firat|Mia Chen|Sneha Kudugunta|Naveen Arivazhagan|Yonghui Wu,"Over the last few years two promising research directions in low-resource neural machine translation (NMT) have emerged. The first focuses on utilizing high-resource languages to improve the quality of low-resource languages via multilingual NMT. The second direction employs monolingual data with self-supervision to pre-train translation models, followed by fine-tuning on small amounts of supervised data. In this work, we join these two lines of research and demonstrate the efficacy of monolingual data with self-supervision in multilingual NMT. We offer three major results: (i) Using monolingual data significantly boosts the translation quality of low-resource languages in multilingual models. (ii) Self-supervision improves zero-shot translation quality in multilingual models. 
(iii) Leveraging monolingual data with self-supervision provides a viable path towards adding new languages to multilingual models, getting up to 33 BLEU on ro-en translation without any parallel data or back-translation.",Multilingual Translation|Multilingual |low-resource translation|low-resource NMT,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.252.pdf -main.246,Showing Your Work Doesn't Always Work,Raphael Tang|Jaejun Lee|Ji Xin|Xinyu Liu|Yaoliang Yu|Jimmy Lin,"In natural language processing, a recently popular line of work explores how to best report the experimental results of neural networks. One exemplar publication, titled ""Show Your Work: Improved Reporting of Experimental Results"" (Dodge et al., 2019), advocates for reporting the expected validation effectiveness of the best-tuned model, with respect to the computational budget. In the present work, we critically examine this paper. As far as statistical generalizability is concerned, we find unspoken pitfalls and caveats with this approach. We analytically show that their estimator is biased and uses error-prone assumptions. We find that the estimator favors negative errors and yields poor bootstrapped confidence intervals. We derive an unbiased alternative and bolster our claims with empirical evidence from statistical simulation. Our codebase is at https://github.com/castorini/meanmax.",natural processing|neural networks|statistical simulation|statistical generalizability,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.246.pdf -main.520,An Effective Transition-based Model for Discontinuous NER,Xiang Dai|Sarvnaz Karimi|Ben Hachey|Cecile Paris,"Unlike widely used Named Entity Recognition (NER) data sets in generic domains, biomedical NER data sets often contain mentions consisting of discontinuous spans. Conventional sequence tagging techniques encode Markov assumptions that are efficient but preclude recovery of these mentions. We propose a simple, effective transition-based model with generic neural encoding for discontinuous NER. Through extensive experiments on three biomedical data sets, we show that our model can effectively recognize discontinuous mentions without sacrificing the accuracy on continuous mentions.",Discontinuous NER|Transition-based Model|sequence techniques|generic encoding,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.520.pdf -main.508,WinoWhy: A Deep Diagnosis of Essential Commonsense Knowledge for Answering Winograd Schema Challenge,Hongming Zhang|Xinran Zhao|Yangqiu Song,"In this paper, we present the first comprehensive categorization of essential commonsense knowledge for answering the Winograd Schema Challenge (WSC). For each of the questions, we invite annotators to first provide reasons for making correct decisions and then categorize them into six major knowledge categories. By doing so, we better understand the limitation of existing methods (i.e., what kind of knowledge cannot be effectively represented or inferred with existing methods) and shed some light on the commonsense knowledge that we need to acquire in the future for better commonsense reasoning. Moreover, to investigate whether current WSC models can understand the commonsense or they simply solve the WSC questions based on the statistical bias of the dataset, we leverage the collected reasons to develop a new task called WinoWhy, which requires models to distinguish plausible reasons from very similar but wrong reasons for all WSC questions. 
Experimental results prove that even though pre-trained language representation models have achieved promising progress on the original WSC dataset, they are still struggling at WinoWhy. Further experiments show that even though supervised models can achieve better performance, the performance of these models can be sensitive to the dataset distribution. WinoWhy and all codes are available at: https://github.com/HKUST-KnowComp/WinoWhy.",Deep Knowledge|Answering Challenge|WinoWhy|commonsense reasoning,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.508.pdf -main.736,"Joint Diacritization, Lemmatization, Normalization, and Fine-Grained Morphological Tagging",Nasser Zalmout|Nizar Habash,"The written forms of Semitic languages are both highly ambiguous and morphologically rich: a word can have multiple interpretations and is one of many inflected forms of the same concept or lemma. This is further exacerbated for dialectal content, which is more prone to noise and lacks a standard orthography. The morphological features can be lexicalized, like lemmas and diacritized forms, or non-lexicalized, like gender, number, and part-of-speech tags, among others. Joint modeling of the lexicalized and non-lexicalized features can identify more intricate morphological patterns, which provide better context modeling, and further disambiguate ambiguous lexical choices. However, the different modeling granularity can make joint modeling more difficult. Our approach models the different features jointly, whether lexicalized (on the character-level), or non-lexicalized (on the word-level). We use Arabic as a test case, and achieve state-of-the-art results for Modern Standard Arabic with 20% relative error reduction, and Egyptian Arabic with 11% relative error reduction.",Joint features|joint modeling|Lemmatization|Normalization,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.acl-main.736.pdf -main.722,Soft Gazetteers for Low-Resource Named Entity Recognition,Shruti Rijhwani|Shuyan Zhou|Graham Neubig|Jaime Carbonell,"Traditional named entity recognition models use gazetteers (lists of entities) as features to improve performance. Although modern neural network models do not require such hand-crafted features for strong performance, recent work has demonstrated their utility for named entity recognition on English data. However, designing such features for low-resource languages is challenging, because exhaustive entity gazetteers do not exist in these languages. To address this problem, we propose a method of ``soft gazetteers'' that incorporates ubiquitously available information from English knowledge bases, such as Wikipedia, into neural named entity recognition models through cross-lingual entity linking. Our experiments on four low-resource languages show an average improvement of 4 points in F1 score.",Low-Resource Recognition|named recognition|'|Soft Gazetteers,Information Extraction,Short,https://www.aclweb.org/anthology/2020.acl-main.722.pdf -main.681,Towards Open Domain Event Trigger Identification using Adversarial Domain Adaptation,Aakanksha Naik|Carolyn Rose,"We tackle the task of building supervised event trigger identification models which can generalize better across domains. Our work leverages the adversarial domain adaptation (ADA) framework to introduce domain-invariance. ADA uses adversarial training to construct representations that are predictive for trigger identification, but not predictive of the example's domain. 
It requires no labeled data from the target domain, making it completely unsupervised. Experiments with two domains (English literature and news) show that ADA leads to an average F1 score improvement of 3.9 on out-of-domain data. Our best performing model (BERT-A) reaches 44-49 F1 across both domains, using no labeled target data. Preliminary experiments reveal that finetuning on 1% labeled data, followed by self-training leads to substantial improvement, reaching 51.5 and 67.2 F1 on literature and news respectively.",Open Identification|trigger identification|Adversarial Adaptation|supervised models,Information Extraction,Short,https://www.aclweb.org/anthology/2020.acl-main.681.pdf -main.695,The Paradigm Discovery Problem,Alexander Erdmann|Micha Elsner|Shijie Wu|Ryan Cotterell|Nizar Habash,"This work treats the paradigm discovery problem (PDP), the task of learning an inflectional morphological system from unannotated sentences. We formalize the PDP and develop evaluation metrics for judging systems. Using currently available resources, we construct datasets for the task. We also devise a heuristic benchmark for the PDP and report empirical results on five diverse languages. Our benchmark system first makes use of word embeddings and string similarity to cluster forms by cell and by paradigm. Then, we bootstrap a neural transducer on top of the clustered data to predict words to realize the empty paradigm slots. An error analysis of our system suggests clustering by cell across different inflection classes is the most pressing challenge for future work.",Paradigm Problem|judging systems|PDP|inflectional system,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.acl-main.695.pdf -main.124,SUPERT: Towards New Frontiers in Unsupervised Evaluation Metrics for Multi-Document Summarization,Yang Gao|Wei Zhao|Steffen Eger,"We study unsupervised multi-document summarization evaluation metrics, which require neither human-written reference summaries nor human annotations (e.g. preferences, ratings, etc.). We propose SUPERT, which rates the quality of a summary by measuring its semantic similarity with a pseudo reference summary, i.e. selected salient sentences from the source documents, using contextualized embeddings and soft token alignment techniques. Compared to the state-of-the-art unsupervised evaluation metrics, SUPERT correlates better with human ratings by 18- 39%. Furthermore, we use SUPERT as rewards to guide a neural-based reinforcement learning summarizer, yielding favorable performance compared to the state-of-the-art unsupervised summarizers. All source code is available at https://github.com/yg211/acl20-ref-free-eval.",Multi-Document Summarization|SUPERT|contextualized embeddings|soft techniques,Summarization,Short,https://www.aclweb.org/anthology/2020.acl-main.124.pdf -main.642,Aligned Dual Channel Graph Convolutional Network for Visual Question Answering,Qingbao Huang|Jielong Wei|Yi Cai|Changmeng Zheng|Junying Chen|Ho-fung Leung|Qing Li,"Visual question answering aims to answer the natural language question about a given image. Existing graph-based methods only focus on the relations between objects in an image and neglect the importance of the syntactic dependency relations between words in a question. 
To simultaneously capture the relations between objects in an image and the syntactic dependency relations between words in a question, we propose a novel dual channel graph convolutional network (DC-GCN) for better combining visual and textual advantages. The DC-GCN model consists of three parts: an I-GCN module to capture the relations between objects in an image, a Q-GCN module to capture the syntactic dependency relations between words in a question, and an attention alignment module to align image representations and question representations. Experimental results show that our model achieves comparable performance with the state-of-the-art approaches.",Visual Answering|image representations|question representations|Aligned Network,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.642.pdf -main.656,Generating Fact Checking Explanations,Pepa Atanasova|Jakob Grue Simonsen|Christina Lioma|Isabelle Augenstein,"Most existing work on automated fact checking is concerned with predicting the veracity of claims based on metadata, social network spread, language used in claims, and, more recently, evidence supporting or denying claims. A crucial piece of the puzzle that is still missing is to understand how to automate the most elaborate part of the process -- generating justifications for verdicts on claims. This paper provides the first study of how these explanations can be generated automatically based on available claim context, and how this task can be modelled jointly with veracity prediction. Our results indicate that optimising both objectives at the same time, rather than training them separately, improves the performance of a fact checking system. The results of a manual evaluation further suggest that the informativeness, coverage and overall quality of the generated explanations are also improved in the multi-task model.",Generating Explanations|automated checking|predicting claims|generating justifications,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.656.pdf -main.130,MuTual: A Dataset for Multi-Turn Dialogue Reasoning,Leyang Cui|Yu Wu|Shujie Liu|Yue Zhang|Ming Zhou,"Non-task oriented dialogue systems have achieved great success in recent years due to largely accessible conversation data and the development of deep learning techniques. Given a context, current systems are able to yield a relevant and fluent response, but sometimes make logical mistakes because of weak reasoning capabilities. To facilitate the conversation reasoning research, we introduce MuTual, a novel dataset for Multi-Turn dialogue Reasoning, consisting of 8,860 manually annotated dialogues based on Chinese student English listening comprehension exams. Compared to previous benchmarks for non-task oriented dialogue systems, MuTual is much more challenging since it requires a model that be able to handle various reasoning problems. Empirical results show that state-of-the-art methods only reach 71%, which is far behind human performance of 94%, indicating that there is ample room for improving reasoning ability.",Multi-Turn Reasoning|conversation reasoning|conversation research|reasoning problems,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.130.pdf -main.118,iSarcasm: A Dataset of Intended Sarcasm,Silviu Oprea|Walid Magdy,"We consider the distinction between intended and perceived sarcasm in the context of textual sarcasm detection. 
The former occurs when an utterance is sarcastic from the perspective of its author, while the latter occurs when the utterance is interpreted as sarcastic by the audience. We show the limitations of previous labelling methods in capturing intended sarcasm and introduce the iSarcasm dataset of tweets labeled for sarcasm directly by their authors. Examining the state-of-the-art sarcasm detection models on our dataset showed low performance compared to previously studied datasets, which indicates that these datasets might be biased or obvious and sarcasm could be a phenomenon under-studied computationally thus far. By providing the iSarcasm dataset, we aim to encourage future NLP research to develop methods for detecting sarcasm in text as intended by the authors of the text, not as labeled under assumptions that we demonstrate to be sub-optimal.",textual detection|NLP research|iSarcasm|labelling methods,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.118.pdf -main.483,Contextualizing Hate Speech Classifiers with Post-hoc Explanation,Brendan Kennedy|Xisen Jin|Aida Mostafazadeh Davani|Morteza Dehghani|Xiang Ren,"Hate speech classifiers trained on imbalanced datasets struggle to determine if group identifiers like ""gay"" or ""black"" are used in offensive or prejudiced ways. Such biases manifest in false positives when these identifiers are present, due to models' inability to learn the contexts which constitute a hateful usage of identifiers. We extract post-hoc explanations from fine-tuned BERT classifiers to detect bias towards identity terms. Then, we propose a novel regularization technique based on these explanations that encourages models to learn from the context of group identifiers in addition to the identifiers themselves. Our approach improved over baselines in limiting false positives on out-of-domain data while maintaining and in cases improving in-domain performance.",Contextualizing Classifiers|Post-hoc Explanation|Hate classifiers|fine-tuned classifiers,Ethics and NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.483.pdf -main.497,Benefits of Intermediate Annotations in Reading Comprehension,Dheeru Dua|Sameer Singh|Matt Gardner,"Complex compositional reading comprehension datasets require performing latent sequential decisions that are learned via supervision from the final answer. A large combinatorial space of possible decision paths that result in the same answer, compounded by the lack of intermediate supervision to help choose the right path, makes the learning particularly hard for this task. In this work, we study the benefits of collecting intermediate reasoning supervision along with the answer during data collection. We find that these intermediate annotations can provide two-fold benefits. First, we observe that for any collection budget, spending a fraction of it on intermediate annotations results in improved model performance, for two complex compositional datasets: DROP and Quoref. 
Second, these annotations encourage the model to learn the correct latent reasoning steps, helping combat some of the biases introduced during the data collection process.",Reading Comprehension|data collection|data process|Intermediate Annotations,Question Answering,Short,https://www.aclweb.org/anthology/2020.acl-main.497.pdf -main.440,A Recipe for Creating Multimodal Aligned Datasets for Sequential Tasks,Angela Lin|Sudha Rao|Asli Celikyilmaz|Elnaz Nouri|Chris Brockett|Debadeepta Dey|Bill Dolan,"Many high-level procedural tasks can be decomposed into sequences of instructions that vary in their order and choice of tools. In the cooking domain, the web offers many, partially-overlapping, text and video recipes (i.e. procedures) that describe how to make the same dish (i.e. high-level task). Aligning instructions for the same dish across different sources can yield descriptive visual explanations that are far richer semantically than conventional textual instructions, providing commonsense insight into how real-world procedures are structured. Learning to align these different instruction sets is challenging because: a) different recipes vary in their order of instructions and use of ingredients; and b) video instructions can be noisy and tend to contain far more information than text instructions. To address these challenges, we use an unsupervised alignment algorithm that learns pairwise alignments between instructions of different recipes for the same dish. We then use a graph algorithm to derive a joint alignment between multiple text and multiple video recipes for the same dish. We release the Microsoft Research Multimodal Aligned Recipe Corpus containing ~150K pairwise alignments between recipes across 4262 dishes with rich commonsense information.",Sequential Tasks|high-level tasks|cooking domain|high-level task,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.440.pdf -main.326,"On Exposure Bias, Hallucination and Domain Shift in Neural Machine Translation",Chaojun Wang|Rico Sennrich,"The standard training algorithm in neural machine translation (NMT) suffers from exposure bias, and alternative algorithms have been proposed to mitigate this. However, the practical impact of exposure bias is under debate. In this paper, we link exposure bias to another well-known problem in NMT, namely the tendency to generate hallucinations under domain shift. In experiments on three datasets with multiple test domains, we show that exposure bias is partially to blame for hallucinations, and that training with Minimum Risk Training, which avoids exposure bias, can mitigate this. Our analysis explains why exposure bias is more problematic under domain shift, and also links exposure bias to the beam search problem, i.e. performance deterioration with increasing beam size. Our results provide a new justification for methods that reduce exposure bias: even if they do not increase performance on in-domain test sets, they can increase model robustness to domain shift.",Domain Translation|neural translation|NMT|beam problem,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.326.pdf -main.39,Location Attention for Extrapolation to Longer Sequences,Yann Dubois|Gautier Dagan|Dieuwke Hupkes|Elia Bruni,"Neural networks are surprisingly good at interpolating and perform remarkably well when the training set examples resemble those in the test set. 
However, they are often unable to extrapolate patterns beyond the seen data, even when the abstractions required for such patterns are simple. In this paper, we first review the notion of extrapolation, why it is important and how one could hope to tackle it. We then focus on a specific type of extrapolation which is especially useful for natural language processing: generalization to sequences that are longer than the training ones. We hypothesize that models with a separate content- and location-based attention are more likely to extrapolate than those with common attention mechanisms. We empirically support our claim for recurrent seq2seq models with our proposed attention on variants of the Lookup Table task. This sheds light on some striking failures of neural models for sequences and on possible methods to approaching such issues.",Extrapolation|natural processing|generalization|Lookup task,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.39.pdf -main.332,That is a Known Lie: Detecting Previously Fact-Checked Claims,Shaden Shaar|Nikolay Babulkov|Giovanni Da San Martino|Preslav Nakov,"The recent proliferation of ''fake news'' has triggered a number of responses, most notably the emergence of several manual fact-checking initiatives. As a result and over time, a large number of fact-checked claims have been accumulated, which increases the likelihood that a new claim in social media or a new statement by a politician might have already been fact-checked by some trusted fact-checking organization, as viral claims often come back after a while in social media, and politicians like to repeat their favorite statements, true or false, over and over again. As manual fact-checking is very time-consuming (and fully automatic fact-checking has credibility issues), it is important to try to save this effort and to avoid wasting time on claims that have already been fact-checked. Interestingly, despite the importance of the task, it has been largely ignored by the research community so far. Here, we aim to bridge this gap. In particular, we formulate the task and we discuss how it relates to, but also differs from, previous work. We further create a specialized dataset, which we release to the research community. Finally, we present learning-to-rank experiments that demonstrate sizable improvements over state-of-the-art retrieval and textual similarity approaches.",Detecting Claims|manual initiatives|manual fact-checking|retrieval,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.332.pdf -main.454,FEQA: A Question Answering Evaluation Framework for Faithfulness Assessment in Abstractive Summarization,Esin Durmus|He He|Mona Diab,"Neural abstractive summarization models are prone to generate content inconsistent with the source document, i.e. unfaithful. Existing automatic metrics do not capture such mistakes effectively. We tackle the problem of evaluating faithfulness of a generated summary given its source document. We first collected human annotations of faithfulness for outputs from numerous models on two datasets. We find that current models exhibit a trade-off between abstractiveness and faithfulness: outputs with less word overlap with the source document are more likely to be unfaithful. Next, we propose an automatic question answering (QA) based metric for faithfulness, FEQA, which leverages recent advances in reading comprehension. 
Given question-answer pairs generated from the summary, a QA model extracts answers from the document; non-matched answers indicate unfaithful information in the summary. Among metrics based on word overlap, embedding similarity, and learned language understanding models, our QA-based metric has significantly higher correlation with human faithfulness scores, especially on highly abstractive summaries.",Faithfulness Assessment|Abstractive Summarization|evaluating summary|reading comprehension,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.454.pdf -main.11,Span-ConveRT: Few-shot Span Extraction for Dialog with Pretrained Conversational Representations,Samuel Coope|Tyler Farghly|Daniela Gerz|Ivan Vulić|Matthew Henderson,"We introduce Span-ConveRT, a light-weight model for dialog slot-filling which frames the task as a turn-based span extraction task. This formulation allows for a simple integration of conversational knowledge coded in large pretrained conversational models such as ConveRT (Henderson et al., 2019). We show that leveraging such knowledge in Span-ConveRT is especially useful for few-shot learning scenarios: we report consistent gains over 1) a span extractor that trains representations from scratch in the target domain, and 2) a BERT-based span extractor. In order to inspire more work on span extraction for the slot-filling task, we also release RESTAURANTS-8K, a new challenging data set of 8,198 utterances, compiled from actual conversations in the restaurant booking domain.",Dialog|dialog slot-filling|turn-based task|few-shot scenarios,Dialogue and Interactive Systems,Short,https://www.aclweb.org/anthology/2020.acl-main.11.pdf -main.468,Predictive Biases in Natural Language Processing Models: A Conceptual Framework and Overview,Deven Santosh Shah|H. Andrew Schwartz|Dirk Hovy,"An increasing number of natural language processing papers address the effect of bias on predictions, introducing mitigation techniques at different parts of the standard NLP pipeline (data and models). However, these works have been conducted individually, without a unifying framework to organize efforts within the field. This situation leads to repetitive approaches, and focuses overly on bias symptoms/effects, rather than on their origins, which could limit the development of effective countermeasures. In this paper, we propose a unifying predictive bias framework for NLP. We summarize the NLP literature and suggest general mathematical definitions of predictive bias. We differentiate two consequences of bias: outcome disparities and error disparities, as well as four potential origins of biases: label bias, selection bias, model overamplification, and semantic bias. Our framework serves as an overview of predictive bias in NLP, integrating existing work into a single structure, and providing a conceptual baseline for improved frameworks.",NLP|Natural Models|Conceptual Framework|mitigation techniques,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.468.pdf -main.318,A Graph-based Coarse-to-fine Method for Unsupervised Bilingual Lexicon Induction,Shuo Ren|Shujie Liu|Ming Zhou|Shuai Ma,"Unsupervised bilingual lexicon induction is the task of inducing word translations from monolingual corpora of two languages. Recent methods are mostly based on unsupervised cross-lingual word embeddings, the key to which is to find initial solutions of word translations, followed by the learning and refinement of mappings between the embedding spaces of two languages. 
However, previous methods find initial solutions just based on word-level information, which may be (1) limited and inaccurate, and (2) prone to contain some noise introduced by the insufficiently pre-trained embeddings of some words. To deal with those issues, in this paper, we propose a novel graph-based paradigm to induce bilingual lexicons in a coarse-to-fine way. We first build a graph for each language with its vertices representing different words. Then we extract word cliques from the graphs and map the cliques of two languages. Based on that, we induce the initial word translation solution with the central words of the aligned cliques. This coarse-to-fine approach not only leverages clique-level information, which is richer and more accurate, but also effectively reduces the bad effect of the noise in the pre-trained embeddings. Finally, we take the initial solution as the seed to learn cross-lingual embeddings, from which we induce bilingual lexicons. Experiments show that our approach improves the performance of bilingual lexicon induction compared with previous methods.",Unsupervised Induction|inducing translations|bilingual induction|Graph-based Method,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.318.pdf -main.13,A Complete Shift-Reduce Chinese Discourse Parser with Robust Dynamic Oracle,Shyh-Shiun Hung|Hen-Hsen Huang|Hsin-Hsi Chen,"This work proposes a standalone, complete Chinese discourse parser for practical applications. We approach Chinese discourse parsing from a variety of aspects and improve the shift-reduce parser not only by integrating the pre-trained text encoder, but also by employing novel training strategies. We revise the dynamic-oracle procedure for training the shift-reduce parser, and apply unsupervised data augmentation to enhance rhetorical relation recognition. Experimental results show that our Chinese discourse parser achieves the state-of-the-art performance.",Chinese parsing|rhetorical recognition|Shift-Reduce Parser|Robust Oracle,Discourse and Pragmatics,Short,https://www.aclweb.org/anthology/2020.acl-main.13.pdf -main.324,Knowledge Distillation for Multilingual Unsupervised Neural Machine Translation,Haipeng Sun|Rui Wang|Kehai Chen|Masao Utiyama|Eiichiro Sumita|Tiejun Zhao,"Unsupervised neural machine translation (UNMT) has recently achieved remarkable results for several language pairs. However, it can only translate between a single language pair and cannot produce translation results for multiple language pairs at the same time. That is, research on multilingual UNMT has been limited. In this paper, we empirically introduce a simple method to translate between thirteen languages using a single encoder and a single decoder, making use of multilingual data to improve UNMT for all language pairs. On the basis of the empirical findings, we propose two knowledge distillation methods to further enhance multilingual UNMT performance. 
Our experiments on a dataset with English translated to and from twelve other languages (including three language families and six language branches) show remarkable results, surpassing strong unsupervised individual baselines while achieving promising performance between non-English language pairs in zero-shot translation scenarios and alleviating poor performance in low-resource language pairs.",Multilingual Translation|Unsupervised translation|Unsupervised UNMT|multilingual UNMT,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.324.pdf -main.442,Beyond Accuracy: Behavioral Testing of NLP Models with CheckList,Marco Tulio Ribeiro|Tongshuang Wu|Carlos Guestrin|Sameer Singh,"Although measuring held-out accuracy has been the primary approach to evaluate generalization, it often overestimates the performance of NLP models, while alternative approaches for evaluating models either focus on individual tasks or on specific behaviors. Inspired by principles of behavioral testing in software engineering, we introduce CheckList, a task-agnostic methodology for testing NLP models. CheckList includes a matrix of general linguistic capabilities and test types that facilitate comprehensive test ideation, as well as a software tool to generate a large and diverse number of test cases quickly. We illustrate the utility of CheckList with tests for three tasks, identifying critical failures in both commercial and state-of-art models. In a user study, a team responsible for a commercial sentiment analysis model found new and actionable bugs in an extensively tested model. In another user study, NLP practitioners with CheckList created twice as many tests, and found almost three times as many bugs as users without it.",measuring accuracy|generalization|behavioral testing|software engineering,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.442.pdf -main.456,Hooks in the Headline: Learning to Generate Headlines with Controlled Styles,Di Jin|Zhijing Jin|Joey Tianyi Zhou|Lisa Orii|Peter Szolovits,"Current summarization systems only produce plain, factual headlines, far from the practical needs for the exposure and memorableness of the articles. We propose a new task, Stylistic Headline Generation (SHG), to enrich the headlines with three style options (humor, romance and clickbait), thus attracting more readers. With no style-specific article-headline pair (only a standard headline summarization dataset and mono-style corpora), our method TitleStylist generates stylistic headlines by combining the summarization and reconstruction tasks into a multitasking framework. We also introduced a novel parameter sharing scheme to further disentangle the style from text. Through both automatic and human evaluation, we demonstrate that TitleStylist can generate relevant, fluent headlines with three target styles: humor, romance, and clickbait. The attraction score of our model generated headlines outperforms the state-of-the-art summarization model by 9.68%, even outperforming human-written references.",Stylistic Generation|summarization tasks|automatic evaluation|summarization systems,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.456.pdf -main.330,"MATINF: A Jointly Labeled Large-Scale Dataset for Classification, Question Answering and Summarization",Canwen Xu|Jiaxin Pei|Hongtao Wu|Yiyu Liu|Chenliang Li,"Recently, large-scale datasets have vastly facilitated the development in nearly all domains of Natural Language Processing. 
However, there is currently no cross-task dataset in NLP, which hinders the development of multi-task learning. We propose MATINF, the first jointly labeled large-scale dataset for classification, question answering and summarization. MATINF contains 1.07 million question-answer pairs with human-labeled categories and user-generated question descriptions. Based on such rich information, MATINF is applicable for three major NLP tasks, including classification, question answering, and summarization. We benchmark existing methods and a novel multi-task baseline over MATINF to inspire further research. Our comprehensive comparison and experiments over MATINF and other datasets demonstrate the merits held by MATINF.",Classification|Question Answering|Summarization|Natural Processing,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.330.pdf -main.481,PeTra: A Sparsely Supervised Memory Model for People Tracking,Shubham Toshniwal|Allyson Ettinger|Kevin Gimpel|Karen Livescu,"We propose PeTra, a memory-augmented neural network designed to track entities in its memory slots. PeTra is trained using sparse annotation from the GAP pronoun resolution dataset and outperforms a prior memory model on the task while using a simpler architecture. We empirically compare key modeling choices, finding that we can simplify several aspects of the design of the memory module while retaining strong performance. To measure the people tracking capability of memory models, we (a) propose a new diagnostic evaluation based on counting the number of unique entities in text, and (b) conduct a small scale human evaluation to compare evidence of people tracking in the memory logs of PeTra relative to a previous approach. PeTra is highly effective in both evaluations, demonstrating its ability to track people in its memory despite being trained with limited annotation.",People Tracking|PeTra|Sparsely Model|memory-augmented network,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.acl-main.481.pdf -main.495,Obtaining Faithful Interpretations from Compositional Neural Networks,Sanjay Subramanian|Ben Bogin|Nitish Gupta|Tomer Wolfson|Sameer Singh|Jonathan Berant|Matt Gardner,"Neural module networks (NMNs) are a popular approach for modeling compositionality: they achieve high accuracy when applied to problems in language and vision, while reflecting the compositional structure of the problem in the network architecture. However, prior work implicitly assumed that the structure of the network modules, describing the abstract reasoning process, provides a faithful explanation of the model's reasoning; that is, that all modules perform their intended behaviour. In this work, we propose and conduct a systematic evaluation of the intermediate outputs of NMNs on NLVR2 and DROP, two datasets which require composing multiple reasoning steps. We find that the intermediate outputs differ from the expected output, illustrating that the network structure does not provide a faithful explanation of model behaviour. To remedy that, we train the model with auxiliary supervision and propose particular choices for module architecture that yield much better faithfulness, at a minimal cost to accuracy.",vision|abstract process|Compositional Networks|Neural networks,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.495.pdf -main.668,Machine Reading of Historical Events,Or Honovich|Lucas Torroba Hennigen|Omri Abend|Shay B. 
Cohen,"Machine reading is an ambitious goal in NLP that subsumes a wide range of text understanding capabilities. Within this broad framework, we address the task of machine reading the time of historical events, compile datasets for the task, and develop a model for tackling it. Given a brief textual description of an event, we show that good performance can be achieved by extracting relevant sentences from Wikipedia, and applying a combination of task-specific and general-purpose feature embeddings for the classification. Furthermore, we establish a link between the historical event ordering task and the event focus time task from the information retrieval literature, showing they also provide a challenging test case for machine reading algorithms.",Machine Events|Machine reading|NLP|classification,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.668.pdf -main.640,Heterogeneous Graph Transformer for Graph-to-Sequence Learning,Shaowei Yao|Tianming Wang|Xiaojun Wan,"The graph-to-sequence (Graph2Seq) learning aims to transduce graph-structured representations to word sequences for text generation. Recent studies propose various models to encode graph structure. However, most previous works ignore the indirect relations between distance nodes, or treat indirect relations and direct relations in the same way. In this paper, we propose the Heterogeneous Graph Transformer to independently model the different relations in the individual subgraphs of the original graph, including direct relations, indirect relations and multiple possible relations between nodes. Experimental results show that our model strongly outperforms the state of the art on all four standard benchmarks of AMR-to-text generation and syntax-based neural machine translation.",Graph-to-Sequence Learning|text generation|AMR-to-text generation|syntax-based translation,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.640.pdf -main.126,Beyond User Self-Reported Likert Scale Ratings: A Comparison Model for Automatic Dialog Evaluation,Weixin Liang|James Zou|Zhou Yu,"Open Domain dialog system evaluation is one of the most important challenges in dialog research. Existing automatic evaluation metrics, such as BLEU are mostly reference-based. They calculate the difference between the generated response and a limited number of available references. Likert-score based self-reported user rating is widely adopted by social conversational systems, such as Amazon Alexa Prize chatbots. However, self-reported user rating suffers from bias and variance among different users. To alleviate this problem, we formulate dialog evaluation as a comparison task. We also propose an automatic evaluation model CMADE (Comparison Model for Automatic Dialog Evaluation) that automatically cleans self-reported user ratings as it trains on them. Specifically, we first use a self-supervised method to learn better dialog feature representation, and then use KNN and Shapley to remove confusing samples. Our experiments show that CMADE achieves 89.2% accuracy in the dialog comparison task.",Automatic Evaluation|Open evaluation|dialog research|dialog evaluation,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.126.pdf -main.132,Bridging Anaphora Resolution as Question Answering,Yufang Hou,"Most previous studies on bridging anaphora resolution (Poesio et al., 2004; Hou et al., 2013b; Hou, 2018a) use the pairwise model to tackle the problem and assume that the gold mention information is given. 
In this paper, we cast bridging anaphora resolution as question answering based on context. This allows us to find the antecedent for a given anaphor without knowing any gold mention information (except the anaphor itself). We present a question answering framework (BARQA) for this task, which leverages the power of transfer learning. Furthermore, we propose a novel method to generate a large amount of “quasi-bridging” training data. We show that our model pre-trained on this dataset and fine-tuned on a small amount of in-domain dataset achieves new state-of-the-art results for bridging anaphora resolution on two bridging corpora (ISNotes (Markert et al., 2012) and BASHI (Rösiger, 2018)).",Anaphora Resolution|Question Answering|bridging resolution|pairwise model,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.acl-main.132.pdf -main.654,Multi-source Meta Transfer for Low Resource Multiple-Choice Question Answering,Ming Yan|Hao Zhang|Di Jin|Joey Tianyi Zhou,"Multiple-choice question answering (MCQA) is one of the most challenging tasks in machine reading comprehension since it requires more advanced reading comprehension skills such as logical reasoning, summarization, and arithmetic operations. Unfortunately, most existing MCQA datasets are small in size, which increases the difficulty of model learning and generalization. To address this challenge, we propose a multi-source meta transfer (MMT) for low-resource MCQA. In this framework, we first extend meta learning by incorporating multiple training sources to learn a generalized feature representation across domains. To bridge the distribution gap between training sources and the target, we further introduce the meta transfer that can be integrated into the multi-source meta training. More importantly, the proposed MMT is independent of backbone language models. Extensive experiments demonstrate the superiority of MMT over state-of-the-arts, and continuous improvements can be achieved on different backbone networks on both supervised and unsupervised domain adaptation settings.",Multi-source Transfer|Low Answering|Multiple-choice answering|machine comprehension,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.654.pdf -main.683,Cross-Modality Relevance for Reasoning on Language and Vision,Chen Zheng|Quan Guo|Parisa Kordjamshidi,"This work deals with the challenge of learning and reasoning over language and vision data for the related downstream tasks such as visual question answering (VQA) and natural language for visual reasoning (NLVR). We design a novel cross-modality relevance module that is used in an end-to-end framework to learn the relevance representation between components of various input modalities under the supervision of a target task, which is more generalizable to unobserved data compared to merely reshaping the original representation space. In addition to modeling the relevance between the textual entities and visual entities, we model the higher-order relevance between entity relations in the text and object relations in the image. Our proposed approach shows competitive performance on two different language and vision tasks using public benchmarks and improves the state-of-the-art published results.
The learned alignments of input spaces and their relevance representations by NLVR task boost the training efficiency of VQA task.",Cross-Modality Relevance|Language Vision|visual answering|VQA,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.683.pdf -main.697,Automated Evaluation of Writing – 50 Years and Counting,Beata Beigman Klebanov|Nitin Madnani,"In this theme paper, we focus on Automated Writing Evaluation (AWE), using Ellis Page’s seminal 1966 paper to frame the presentation. We discuss some of the current frontiers in the field and offer some thoughts on the emergent uses of this technology.",Automated Writing|Automated Evaluation|Automated AWE|Automated,Theme,Long,https://www.aclweb.org/anthology/2020.acl-main.697.pdf -main.708,Logical Natural Language Generation from Open-Domain Tables,Wenhu Chen|Jianshu Chen|Yu Su|Zhiyu Chen|William Yang Wang,"Neural natural language generation (NLG) models have recently shown remarkable progress in fluency and coherence. However, existing studies on neural NLG are primarily focused on surface-level realizations with limited emphasis on logical inference, an important aspect of human thinking and language. In this paper, we suggest a new NLG task where a model is tasked with generating natural language statements that can be logically entailed by the facts in an open-domain semi-structured table. To facilitate the study of the proposed logical NLG problem, we use the existing TabFact dataset featured with a wide range of logical/symbolic inferences as our testbed, and propose new automatic metrics to evaluate the fidelity of generation models w.r.t.\ logical inference. The new task poses challenges to the existing monotonic generation frameworks due to the mismatch between sequence order and logical order. In our experiments, we comprehensively survey different generation architectures (LSTM, Transformer, Pre-Trained LM) trained with different algorithms (RL, Adversarial Training, Coarse-to-Fine) on the dataset and made following observations: 1) Pre-Trained LM can significantly boost both the fluency and logical fidelity metrics, 2) RL and Adversarial Training are trading fluency for fidelity, 3) Coarse-to-Fine generation can help partially alleviate the fidelity issue while maintaining high language fluency. The code and data are available at https://github.com/wenhuchen/LogicNLG.",Logical Generation|neural NLG|surface-level realizations|logical inference,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.708.pdf -main.734,Improving Chinese Word Segmentation with Wordhood Memory Networks,Yuanhe Tian|Yan Song|Fei Xia|Tong Zhang|Yonggang Wang,"Contextual features always play an important role in Chinese word segmentation (CWS). Wordhood information, being one of the contextual features, is proved to be useful in many conventional character-based segmenters. However, this feature receives less attention in recent neural models and it is also challenging to design a framework that can properly integrate wordhood information from different wordhood measures to existing neural frameworks. In this paper, we therefore propose a neural framework, WMSeg, which uses memory networks to incorporate wordhood information with several popular encoder-decoder combinations for CWS. Experimental results on five benchmark datasets indicate the memory mechanism successfully models wordhood information for neural segmenters and helps WMSeg achieve state-of-the-art performance on all those datasets. 
Further experiments and analyses also demonstrate the robustness of our proposed framework with respect to different wordhood measures and the efficiency of wordhood information in cross-domain experiments.",Chinese Segmentation|character-based segmenters|cross-domain experiments|Wordhood Networks,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.acl-main.734.pdf -main.720,Sources of Transfer in Multilingual Named Entity Recognition,David Mueller|Nicholas Andrews|Mark Dredze,"Named-entities are inherently multilingual, and annotations in any given language may be limited. This motivates us to consider polyglot named-entity recognition (NER), where one model is trained using annotated data drawn from more than one language. However, a straightforward implementation of this simple idea does not always work in practice: naive training of NER models using annotated data drawn from multiple languages consistently underperforms models trained on monolingual data alone, despite having access to more training data. The starting point of this paper is a simple solution to this problem, in which polyglot models are fine-tuned on monolingual data to consistently and significantly outperform their monolingual counterparts. To explain this phenomena, we explore the sources of multilingual transfer in polyglot NER models and examine the weight structure of polyglot models compared to their monolingual counterparts. We find that polyglot models efficiently share many parameters across languages and that fine-tuning may utilize a large number of those parameters.",Multilingual Recognition|polyglot recognition|multilingual transfer|naive models,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.720.pdf -main.278,On the Inference Calibration of Neural Machine Translation,Shuo Wang|Zhaopeng Tu|Shuming Shi|Yang Liu,"Confidence calibration, which aims to make model predictions equal to the true correctness measures, is important for neural machine translation (NMT) because it is able to offer useful indicators of translation errors in the generated output. While prior studies have shown that NMT models trained with label smoothing are well-calibrated on the ground-truth training data, we find that miscalibration still remains a severe challenge for NMT during inference due to the discrepancy between training and inference. By carefully designing experiments on three language pairs, our work provides in-depth analyses of the correlation between calibration and translation performance as well as linguistic properties of miscalibration and reports a number of interesting findings that might help humans better analyze, understand and improve NMT models. Based on these observations, we further propose a new graduated label smoothing method that can improve both inference calibration and translation performance.",Inference Translation|neural translation|NMT|inference,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.278.pdf -main.250,schuBERT: Optimizing Elements of BERT,Ashish Khetan|Zohar Karnin,"Transformers have gradually become a key component for many state-of-the-art natural language representation models. A recent Transformer based model- BERT achieved state-of-the-art results on various natural language processing tasks, including GLUE, SQuAD v1.1, and SQuAD v2.0. This model however is computationally prohibitive and has a huge number of parameters.
In this work we revisit the architecture choices of BERT in efforts to obtain a lighter model. We focus on reducing the number of parameters yet our methods can be applied towards other objectives such as FLOPs or latency. We show that much efficient light BERT models can be obtained by reducing algorithmically chosen correct architecture design dimensions rather than reducing the number of Transformer encoder layers. In particular, our schuBERT gives 6.6% higher average accuracy on GLUE and SQuAD datasets as compared to BERT with three encoder layers while having the same number of parameters.",Optimizing BERT|natural tasks|schuBERT|Transformers,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.250.pdf -main.536,Emerging Cross-lingual Structure in Pretrained Language Models,Alexis Conneau|Shijie Wu|Haoran Li|Luke Zettlemoyer|Veselin Stoyanov,"We study the problem of multilingual masked language modeling, i.e. the training of a single model on concatenated text from multiple languages, and present a detailed study of several factors that influence why these models are so effective for cross-lingual transfer. We show, contrary to what was previously hypothesized, that transfer is possible even when there is no shared vocabulary across the monolingual corpora and also when the text comes from very different domains. The only requirement is that there are some shared parameters in the top layers of the multi-lingual encoder. To better understand this result, we also show that representations from monolingual BERT models in different languages can be aligned post-hoc quite effectively, strongly suggesting that, much like for non-contextual word embeddings, there are universal latent symmetries in the learned embedding spaces. For multilingual masked language modeling, these symmetries are automatically discovered and aligned during the joint training process.",multilingual modeling|cross-lingual transfer|transfer|Cross-lingual Models,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.536.pdf -main.522,Improving Event Detection via Open-domain Trigger Knowledge,Meihan Tong|Bin Xu|Shuai Wang|Yixin Cao|Lei Hou|Juanzi Li|Jun Xie,"Event Detection (ED) is a fundamental task in automatically structuring texts. Due to the small scale of training data, previous methods perform poorly on unseen/sparsely labeled trigger words and are prone to overfitting densely labeled trigger words. To address the issue, we propose a novel Enrichment Knowledge Distillation (EKD) model to leverage external open-domain trigger knowledge to reduce the in-built biases to frequent trigger words in annotations. Experiments on benchmark ACE2005 show that our model outperforms nine strong baselines, is especially effective for unseen/sparsely labeled trigger words. The source code is released on https://github.com/shuaiwa16/ekd.git.",Event Detection|ED|automatically texts|structuring texts,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.522.pdf -main.244,Pretrained Transformers Improve Out-of-Distribution Robustness,Dan Hendrycks|Xiaoyuan Liu|Eric Wallace|Adam Dziedzic|Rishabh Krishnan|Dawn Song,"Although pretrained Transformers such as BERT achieve high accuracy on in-distribution examples, do they generalize to new distributions? We systematically measure out-of-distribution (OOD) generalization for seven NLP datasets by constructing a new robustness benchmark with realistic distribution shifts.
We measure the generalization of previous models including bag-of-words models, ConvNets, and LSTMs, and we show that pretrained Transformers' performance declines are substantially smaller. Pretrained transformers are also more effective at detecting anomalous or OOD examples, while many previous models are frequently worse than chance. We examine which factors affect robustness, finding that larger models are not necessarily more robust, distillation can be harmful, and more diverse pretraining data can enhance robustness. Finally, we show where future work can improve OOD robustness.",out-of-distribution generalization|detecting examples|distillation|Pretrained,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.244.pdf -main.293,Modelling Context and Syntactical Features for Aspect-based Sentiment Analysis,Minh Hieu Phan|Philip O. Ogunbona,"The aspect-based sentiment analysis (ABSA) consists of two conceptual tasks, namely an aspect extraction and an aspect sentiment classification. Rather than considering the tasks separately, we build an end-to-end ABSA solution. Previous works in ABSA tasks did not fully leverage the importance of syntactical information. Hence, the aspect extraction model often failed to detect the boundaries of multi-word aspect terms. On the other hand, the aspect sentiment classifier was unable to account for the syntactical correlation between aspect terms and the context words. This paper explores the grammatical aspect of the sentence and employs the self-attention mechanism for syntactical learning. We combine part-of-speech embeddings, dependency-based embeddings and contextualized embeddings (e.g. BERT, RoBERTa) to enhance the performance of the aspect extractor. We also propose the syntactic relative distance to de-emphasize the adverse effects of unrelated words, having weak syntactic connection with the aspect terms. This increases the accuracy of the aspect sentiment classifier. Our solutions outperform the state-of-the-art models on SemEval-2014 dataset in both two subtasks.",Modelling Context|Aspect-based Analysis|aspect extraction|aspect classification,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.293.pdf -main.287,Analyzing the Persuasive Effect of Style in News Editorial Argumentation,Roxanne El Baff|Henning Wachsmuth|Khalid Al Khatib|Benno Stein,"News editorials argue about political issues in order to challenge or reinforce the stance of readers with different ideologies. Previous research has investigated such persuasive effects for argumentative content. In contrast, this paper studies how important the style of news editorials is to achieve persuasion. To this end, we first compare content- and style-oriented classifiers on editorials from the liberal NYTimes with ideology-specific effect annotations. We find that conservative readers are resistant to NYTimes style, but on liberals, style even has more impact than content. 
Focusing on liberals, we then cluster the leads, bodies, and endings of editorials, in order to learn about writing style patterns of effective argumentation.",News Argumentation|persuasion|content- classifiers|Persuasive Style,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Short,https://www.aclweb.org/anthology/2020.acl-main.287.pdf -main.286,Towards Interpretable Clinical Diagnosis with Bayesian Network Ensembles Stacked on Entity-Aware CNNs,Jun Chen|Xiaoya Dai|Quan Yuan|Chao Lu|Haifeng Huang,"The automatic text-based diagnosis remains a challenging task for clinical use because it requires appropriate balance between accuracy and interpretability. In this paper, we attempt to propose a solution by introducing a novel framework that stacks Bayesian Network Ensembles on top of Entity-Aware Convolutional Neural Networks (CNN) towards building an accurate yet interpretable diagnosis system. The proposed framework takes advantage of the high accuracy and generality of deep neural networks as well as the interpretability of Bayesian Networks, which is critical for AI-empowered healthcare. The evaluation conducted on the real Electronic Medical Record (EMR) documents from hospitals and annotated by professional doctors proves that, the proposed framework outperforms the previous automatic diagnosis methods in accuracy performance and the diagnosis explanation of the framework is reasonable.",Interpretable Diagnosis|automatic diagnosis|clinical use|accurate system,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.286.pdf -main.292,KinGDOM: Knowledge-Guided DOMain Adaptation for Sentiment Analysis,Deepanway Ghosal|Devamanyu Hazarika|Abhinaba Roy|Navonil Majumder|Rada Mihalcea|Soujanya Poria,"Cross-domain sentiment analysis has received significant attention in recent years, prompted by the need to combat the domain gap between different applications that make use of sentiment analysis. In this paper, we take a novel perspective on this task by exploring the role of external commonsense knowledge. We introduce a new framework, KinGDOM, which utilizes the ConceptNet knowledge graph to enrich the semantics of a document by providing both domain-specific and domain-general background concepts. These concepts are learned by training a graph convolutional autoencoder that leverages inter-domain concepts in a domain-invariant manner. Conditioning a popular domain-adversarial baseline method with these learned concepts helps improve its performance over state-of-the-art approaches, demonstrating the efficacy of our proposed framework.",Sentiment Analysis|Cross-domain analysis|KinGDOM|Knowledge-Guided Adaptation,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.292.pdf -main.523,Improving Low-Resource Named Entity Recognition using Joint Sentence and Token Labeling,Canasai Kruengkrai|Thien Hai Nguyen|Sharifah Mahani Aljunied|Lidong Bing,"Exploiting sentence-level labels, which are easy to obtain, is one of the plausible methods to improve low-resource named entity recognition (NER), where token-level labels are costly to annotate. Current models for jointly learning sentence and token labeling are limited to binary classification. We present a joint model that supports multi-class classification and introduce a simple variant of self-attention that allows the model to learn scaling factors. 
Our model produces 3.78%, 4.20%, 2.08% improvements in F1 over the BiLSTM-CRF baseline on e-commerce product titles in three different low-resource languages: Vietnamese, Thai, and Indonesian, respectively.",Low-Resource Recognition|low-resource NER|NER|binary classification,Information Extraction,Short,https://www.aclweb.org/anthology/2020.acl-main.523.pdf -main.245,Robust Encodings: A Framework for Combating Adversarial Typos,Erik Jones|Robin Jia|Aditi Raghunathan|Percy Liang,"Despite excellent performance on many tasks, NLP systems are easily fooled by small adversarial perturbations of inputs. Existing procedures to defend against such perturbations are either (i) heuristic in nature and susceptible to stronger attacks or (ii) provide guaranteed robustness to worst-case attacks, but are incompatible with state-of-the-art models like BERT. In this work, we introduce robust encodings (RobEn): a simple framework that confers guaranteed robustness, without making compromises on model architecture. The core component of RobEn is an encoding function, which maps sentences to a smaller, discrete space of encodings. Systems using these encodings as a bottleneck confer guaranteed robustness with standard training, and the same encodings can be used across multiple tasks. We identify two desiderata to construct robust encoding functions: perturbations of a sentence should map to a small set of encodings (stability), and models using encodings should still perform well (fidelity). We instantiate RobEn to defend against a large family of adversarial typos. Across six tasks from GLUE, our instantiation of RobEn paired with BERT achieves an average robust accuracy of 71.3% against all adversarial typos in the family considered, while previous work using a typo-corrector achieves only 35.3% accuracy against a simple greedy attack.",Robust Encodings|NLP systems|RobEn|model architecture,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.245.pdf -main.251,ENGINE: Energy-Based Inference Networks for Non-Autoregressive Machine Translation,Lifu Tu|Richard Yuanzhe Pang|Sam Wiseman|Kevin Gimpel,"We propose to train a non-autoregressive machine translation model to minimize the energy defined by a pretrained autoregressive model. In particular, we view our non-autoregressive translation system as an inference network (Tu and Gimpel, 2018) trained to minimize the autoregressive teacher energy. This contrasts with the popular approach of training a non-autoregressive model on a distilled corpus consisting of the beam-searched outputs of such a teacher model. Our approach, which we call ENGINE (ENerGy-based Inference NEtworks), achieves state-of-the-art non-autoregressive results on the IWSLT 2014 DE-EN and WMT 2016 RO-EN datasets, approaching the performance of autoregressive models.",Energy-Based Networks|Non-Autoregressive Translation|Non-Autoregressive |ENGINE,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.251.pdf -main.537,FastBERT: a Self-distilling BERT with Adaptive Inference Time,Weijie Liu|Peng Zhou|Zhiruo Wang|Zhe Zhao|Haotang Deng|QI JU,"Pre-trained language models like BERT have proven to be highly performant. However, they are often computationally expensive in many practical scenarios, for such heavy models can hardly be readily implemented with limited resources. To improve their efficiency with an assured model performance, we propose a novel speed-tunable FastBERT with adaptive inference time. 
The speed at inference can be flexibly adjusted under varying demands, while redundant calculation of samples is avoided. Moreover, this model adopts a unique self-distillation mechanism at fine-tuning, further enabling a greater computational efficacy with minimal loss in performance. Our model achieves promising results in twelve English and Chinese datasets. It is able to speed up by a wide range from 1 to 12 times than BERT if given different speedup thresholds to make a speed-performance tradeoff.",inference|FastBERT|Self-distilling BERT|Pre-trained models,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.537.pdf -main.279,Camouflaged Chinese Spam Content Detection with Semi-supervised Generative Active Learning,Zhuoren Jiang|Zhe Gao|Yu Duan|Yangyang Kang|Changlong Sun|Qiong Zhang|Xiaozhong Liu,"We propose a Semi-supervIsed GeNerative Active Learning (SIGNAL) model to address the imbalance, efficiency, and text camouflage problems of Chinese text spam detection task. A “self-diversity” criterion is proposed for measuring the “worthiness” of a candidate for annotation. A semi-supervised variational autoencoder with masked attention learning approach and a character variation graph-enhanced augmentation procedure are proposed for data augmentation. The preliminary experiment demonstrates the proposed SIGNAL model is not only sensitive to spam sample selection, but also can improve the performance of a series of conventional active learning models for Chinese spam detection task. To the best of our knowledge, this is the first work to integrate active learning and semi-supervised generative learning for text spam detection.",Camouflaged Detection|text problems|Chinese task|annotation,NLP Applications,Short,https://www.aclweb.org/anthology/2020.acl-main.279.pdf -main.721,ZeroShotCeres: Zero-Shot Relation Extraction from Semi-Structured Webpages,Colin Lockard|Prashant Shiralkar|Xin Luna Dong|Hannaneh Hajishirzi,"In many documents, such as semi-structured webpages, textual semantics are augmented with additional information conveyed using visual elements including layout, font size, and color. Prior work on information extraction from semi-structured websites has required learning an extraction model specific to a given template via either manually labeled or distantly supervised data from that template. In this work, we propose a solution for ""zero-shot"" open-domain relation extraction from webpages with a previously unseen template, including from websites with little overlap with existing sources of knowledge for distant supervision and websites in entirely new subject verticals. Our model uses a graph neural network-based approach to build a rich representation of text fields on a webpage and the relationships between them, enabling generalization to new templates. Experiments show this approach provides a 31% F1 gain over a baseline for zero-shot extraction in a new subject vertical.",Zero-Shot Extraction|information extraction|distant supervision|generalization,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.721.pdf -main.735,Joint Chinese Word Segmentation and Part-of-speech Tagging via Two-way Attentions of Auto-analyzed Knowledge,Yuanhe Tian|Yan Song|Xiang Ao|Fei Xia|Xiaojun Quan|Tong Zhang|Yonggang Wang,"Chinese word segmentation (CWS) and part-of-speech (POS) tagging are important fundamental tasks for Chinese language processing, where joint learning of them is an effective one-step solution for both tasks. 
Previous studies for joint CWS and POS tagging mainly follow the character-based tagging paradigm with introducing contextual information such as n-gram features or sentential representations from recurrent neural models. However, for many cases, the joint tagging needs not only modeling from context features but also knowledge attached to them (e.g., syntactic relations among words); limited efforts have been made by existing research to meet such needs. In this paper, we propose a neural model named TwASP for joint CWS and POS tagging following the character-based sequence labeling paradigm, where a two-way attention mechanism is used to incorporate both context feature and their corresponding syntactic knowledge for each input character. Particularly, we use existing language processing toolkits to obtain the auto-analyzed syntactic knowledge for the context, and the proposed attention module can learn and benefit from them although their quality may not be perfect. Our experiments illustrate the effectiveness of the two-way attentions for joint CWS and POS tagging, where state-of-the-art performance is achieved on five benchmark datasets.",Chinese Segmentation|Part-of-speech Tagging|Chinese processing|joint tagging,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.acl-main.735.pdf -main.709,Neural CRF Model for Sentence Alignment in Text Simplification,Chao Jiang|Mounica Maddela|Wuwei Lan|Yang Zhong|Wei Xu,"The success of a text simplification system heavily depends on the quality and quantity of complex-simple sentence pairs in the training corpus, which are extracted by aligning sentences between parallel articles. To evaluate and improve sentence alignment quality, we create two manually annotated sentence-aligned datasets from two commonly used text simplification corpora, Newsela and Wikipedia. We propose a novel neural CRF alignment model which not only leverages the sequential nature of sentences in parallel documents but also utilizes a neural sentence pair model to capture semantic similarity. Experiments demonstrate that our proposed approach outperforms all the previous work on monolingual sentence alignment task by more than 5 points in F1. We apply our CRF aligner to construct two new text simplification datasets, Newsela-Auto and Wiki-Auto, which are much larger and of better quality compared to the existing datasets. A Transformer-based seq2seq model trained on our datasets establishes a new state-of-the-art for text simplification in both automatic and human evaluation.",Sentence Alignment|Text Simplification|monolingual task|automatic evaluation,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.709.pdf -main.696,Supervised Grapheme-to-Phoneme Conversion of Orthographic Schwas in Hindi and Punjabi,Aryaman Arora|Luke Gessler|Nathan Schneider,"Hindi grapheme-to-phoneme (G2P) conversion is mostly trivial, with one exception: whether a schwa represented in the orthography is pronounced or unpronounced (deleted). Previous work has attempted to predict schwa deletion in a rule-based fashion using prosodic or phonetic analysis. We present the first statistical schwa deletion classifier for Hindi, which relies solely on the orthography as the input and outperforms previous approaches. We trained our model on a newly-compiled pronunciation lexicon extracted from various online dictionaries. 
Our best Hindi model achieves state of the art performance, and also achieves good performance on a closely related language, Punjabi, without modification.",Hindi conversion|schwa deletion|Supervised Schwas|rule-based fashion,"Phonology, Morphology and Word Segmentation",Short,https://www.aclweb.org/anthology/2020.acl-main.696.pdf -main.682,CompGuessWhat?!: A Multi-task Evaluation Framework for Grounded Language Learning,Alessandro Suglia|Ioannis Konstas|Andrea Vanzo|Emanuele Bastianelli|Desmond Elliott|Stella Frank|Oliver Lemon,"Approaches to Grounded Language Learning are commonly focused on a single task-based final performance measure which may not depend on desirable properties of the learned hidden representations, such as their ability to predict object attributes or generalize to unseen situations. To remedy this, we present GroLLA, an evaluation framework for Grounded Language Learning with Attributes based on three sub-tasks: 1) Goal-oriented evaluation; 2) Object attribute prediction evaluation; and 3) Zero-shot evaluation. We also propose a new dataset CompGuessWhat?! as an instance of this framework for evaluating the quality of learned neural representations, in particular with respect to attribute grounding. To this end, we extend the original GuessWhat?! dataset by including a semantic layer on top of the perceptual one. Specifically, we enrich the VisualGenome scene graphs associated with the GuessWhat?! images with several attributes from resources such as VISA and ImSitu. We then compare several hidden state representations from current state-of-the-art approaches to Grounded Language Learning. By using diagnostic classifiers, we show that current models' learned representations are not expressive enough to encode object attributes (average F1 of 44.27). In addition, they do not learn strategies nor representations that are robust enough to perform well when novel scenes or objects are involved in gameplay (zero-shot best accuracy 50.06%).",Grounded Learning|Goal-oriented evaluation|Object evaluation|Zero-shot evaluation,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.682.pdf -main.133,Dialogue Coherence Assessment Without Explicit Dialogue Act Labels,Mohsen Mesgar|Sebastian Bücker|Iryna Gurevych,"Recent dialogue coherence models use the coherence features designed for monologue texts, e.g. nominal entities, to represent utterances and then explicitly augment them with dialogue-relevant features, e.g., dialogue act labels. It indicates two drawbacks, (a) semantics of utterances are limited to entity mentions, and (b) the performance of coherence models strongly relies on the quality of the input dialogue act labels. We address these issues by introducing a novel approach to dialogue coherence assessment. We use dialogue act prediction as an auxiliary task in a multi-task learning scenario to obtain informative utterance representations for coherence assessment. Our approach alleviates the need for explicit dialogue act labels during evaluation. The results of our experiments show that our model substantially (more than 20 accuracy points) outperforms its strong competitors on the DailyDialogue corpus, and performs on par with them on the SwitchBoard corpus for ranking dialogues concerning their coherence. 
We release our source code.",Dialogue Assessment|auxiliary task|multi-task scenario|informative representations,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.acl-main.133.pdf -main.655,Fine-grained Fact Verification with Kernel Graph Attention Network,Zhenghao Liu|Chenyan Xiong|Maosong Sun|Zhiyuan Liu,"Fact Verification requires fine-grained natural language inference capability that finds subtle clues to identify the syntactical and semantically correct but not well-supported claims. This paper presents Kernel Graph Attention Network (KGAT), which conducts more fine-grained fact verification with kernel-based attentions. Given a claim and a set of potential evidence sentences that form an evidence graph, KGAT introduces node kernels, which better measure the importance of the evidence node, and edge kernels, which conduct fine-grained evidence propagation in the graph, into Graph Attention Networks for more accurate fact verification. KGAT achieves a 70.38% FEVER score and significantly outperforms existing fact verification models on FEVER, a large-scale benchmark for fact verification. Our analyses illustrate that, compared to dot-product attentions, the kernel-based attention concentrates more on relevant evidence sentences and meaningful clues in the evidence graph, which is the main source of KGAT's effectiveness. All source codes of this work are available at https://github.com/thunlp/KernelGAT.",Fine-grained Verification|Fact Verification|fine-grained propagation|Kernel Network,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.655.pdf -main.641,Neural Data-to-Text Generation via Jointly Learning the Segmentation and Correspondence,Xiaoyu Shen|Ernie Chang|Hui Su|Cheng Niu|Dietrich Klakow,"The neural attention model has achieved great success in data-to-text generation tasks. Though usually excelling at producing fluent text, it suffers from the problem of information missing, repetition and ``hallucination''. Due to the black-box nature of the neural attention architecture, avoiding these problems in a systematic way is non-trivial. To address this concern, we propose to explicitly segment target text into fragment units and align them with their data correspondences. The segmentation and correspondence are jointly learned as latent variables without any human annotations. We further impose a soft statistical constraint to regularize the segmental granularity. The resulting architecture maintains the same expressive power as neural attention models, while being able to generate fully interpretable outputs with several times less computational cost. On both E2E and WebNLG benchmarks, we show the proposed model consistently outperforms its neural attention counterparts.",Neural Generation|Segmentation|data-to-text tasks|neural model,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.641.pdf -main.127,Conversational Word Embedding for Retrieval-Based Dialog System,Wentao Ma|Yiming Cui|Ting Liu|Dong Wang|Shijin Wang|Guoping Hu,"Human conversations contain many types of information, e.g., knowledge, common sense, and language habits. In this paper, we propose a conversational word embedding method named PR-Embedding, which utilizes the conversation pairs to learn word embedding. 
Different from previous works, PR-Embedding uses the vectors from two different semantic spaces to represent the words in post and reply. To catch the information among the pair, we first introduce the word alignment model from statistical machine translation to generate the cross-sentence window, then train the embedding on word-level and sentence-level. We evaluate the method on single-turn and multi-turn response selection tasks for retrieval-based dialog systems. The experiment results show that PR-Embedding can improve the quality of the selected response.",Conversational Embedding|Retrieval-Based System|single-turn tasks|retrieval-based systems,Dialogue and Interactive Systems,Short,https://www.aclweb.org/anthology/2020.acl-main.127.pdf -main.669,Revisiting Unsupervised Relation Extraction,Thy Thy Tran|Phong Le|Sophia Ananiadou,"Unsupervised relation extraction (URE) extracts relations between named entities from raw text without manually-labelled data and existing knowledge bases (KBs). URE methods can be categorised into generative and discriminative approaches, which rely either on hand-crafted features or surface form. However, we demonstrate that by using only named entities to induce relation types, we can outperform existing methods on two popular datasets. We conduct a comparison and evaluation of our findings with other URE techniques, to ascertain the important features in URE. We conclude that entity types provide a strong inductive bias for URE.",Unsupervised Extraction|URE|URE|URE methods,Information Extraction,Short,https://www.aclweb.org/anthology/2020.acl-main.669.pdf -main.494,Generating Hierarchical Explanations on Text Classification via Feature Interaction Detection,Hanjie Chen|Guangtao Zheng|Yangfeng Ji,"Generating explanations for neural networks has become crucial for their applications in real-world with respect to reliability and trustworthiness. In natural language processing, existing methods usually provide important features which are words or phrases selected from an input text as an explanation, but ignore the interactions between them. It poses challenges for humans to interpret an explanation and connect it to model prediction. In this work, we build hierarchical explanations by detecting feature interactions. Such explanations visualize how words and phrases are combined at different levels of the hierarchy, which can help users understand the decision-making of black-box models. The proposed method is evaluated with three neural text classifiers (LSTM, CNN, and BERT) on two benchmark datasets, via both automatic and human evaluations. Experiments show the effectiveness of the proposed method in providing explanations that are both faithful to models and interpretable to humans.",Text Classification|Generating explanations|natural processing|model prediction,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.494.pdf -main.480,Implicit Discourse Relation Classification: We Need to Talk about Evaluation,Najoung Kim|Song Feng|Chulaka Gunasekara|Luis Lastras,"Implicit relation classification on Penn Discourse TreeBank (PDTB) 2.0 is a common benchmark task for evaluating the understanding of discourse relations. However, the lack of consistency in preprocessing and evaluation poses challenges to fair comparison of results in the literature. In this work, we highlight these inconsistencies and propose an improved evaluation protocol.
Paired with this protocol, we report strong baseline results from pretrained sentence encoders, which set the new state-of-the-art for PDTB 2.0. Furthermore, this work is the first to explore fine-grained relation classification on PDTB 3.0. We expect our work to serve as a point of comparison for future work, and also as an initiative to discuss models of larger context and possible data augmentations for downstream transferability.",Implicit Classification|Evaluation|understanding relations|PDTB 2.0,Discourse and Pragmatics,Short,https://www.aclweb.org/anthology/2020.acl-main.480.pdf -main.457,Knowledge Graph-Augmented Abstractive Summarization with Semantic-Driven Cloze Reward,Luyang Huang|Lingfei Wu|Lu Wang,"Sequence-to-sequence models for abstractive summarization have been studied extensively, yet the generated summaries commonly suffer from fabricated content, and are often found to be near-extractive. We argue that, to address these issues, the summarizer should acquire semantic interpretation over input, e.g., via structured representation, to allow the generation of more informative summaries. In this paper, we present ASGARD, a novel framework for Abstractive Summarization with Graph-Augmentation and semantic-driven RewarD. We propose the use of dual encoders---a sequential document encoder and a graph-structured encoder---to maintain the global context and local characteristics of entities, complementing each other. We further design a reward based on a multiple choice cloze test to drive the model to better capture entity interactions. Results show that our models produce significantly higher ROUGE scores than a variant without knowledge graph as input on both New York Times and CNN/Daily Mail datasets. We also obtain better or comparable performance compared to systems that are fine-tuned from large pretrained language models. Human judges further rate our model outputs as more informative and containing fewer unfaithful errors.",Knowledge Summarization|abstractive summarization|semantic interpretation|generation summaries,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.457.pdf -main.331,MIND: A Large-scale Dataset for News Recommendation,Fangzhao Wu|Ying Qiao|Jiun-Hung Chen|Chuhan Wu|Tao Qi|Jianxun Lian|Danyang Liu|Xing Xie|Jianfeng Gao|Winnie Wu|Ming Zhou,"News recommendation is an important technique for personalized news service. Compared with product and movie recommendations which have been comprehensively studied, the research on news recommendation is much more limited, mainly due to the lack of a high-quality benchmark dataset. In this paper, we present a large-scale dataset named MIND for news recommendation. Constructed from the user click logs of Microsoft News, MIND contains 1 million users and more than 160k English news articles, each of which has rich textual content such as title, abstract and body. We demonstrate MIND a good testbed for news recommendation through a comparative study of several state-of-the-art news recommendation methods which are originally developed on different proprietary datasets. Our results show the performance of news recommendation highly relies on the quality of news content understanding and user interest modeling. Many natural language processing techniques such as effective text representation methods and pre-trained language models can effectively improve the performance of news recommendation. 
The MIND dataset will be available at https://msnews.github.io.",News Recommendation|personalized service|news understanding|MIND,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.331.pdf -main.325,Lexically Constrained Neural Machine Translation with Levenshtein Transformer,Raymond Hendy Susanto|Shamil Chollampatt|Liling Tan,"This paper proposes a simple and effective algorithm for incorporating lexical constraints in neural machine translation. Previous work either required re-training existing models with the lexical constraints or incorporating them during beam search decoding with significantly higher computational overheads. Leveraging the flexibility and speed of a recently proposed Levenshtein Transformer model (Gu et al., 2019), our method injects terminology constraints at inference time without any impact on decoding speed. Our method does not require any modification to the training procedure and can be easily applied at runtime with custom dictionaries. Experiments on English-German WMT datasets show that our approach improves an unconstrained baseline and previous approaches.",Lexically Translation|neural translation|Levenshtein Transformer|beam decoding,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.325.pdf -main.443,Code and Named Entity Recognition in StackOverflow,Jeniya Tabassum|Mounica Maddela|Wei Xu|Alan Ritter,"There is an increasing interest in studying natural language and computer code together, as large corpora of programming texts become readily available on the Internet. For example, StackOverflow currently has over 15 million programming related questions written by 8.5 million users. Meanwhile, there is still a lack of fundamental NLP techniques for identifying code tokens or software-related named entities that appear within natural language sentences. In this paper, we introduce a new named entity recognition (NER) corpus for the computer programming domain, consisting of 15,372 sentences annotated with 20 fine-grained entity types. We also present the SoftNER model that combines contextual information with domain specific knowledge using an attention network. The code token recognizer combined with an entity segmentation model we proposed, consistently improves the performance of the named entity tagger. Our proposed SoftNER tagger outperforms the BiLSTM-CRF model with an absolute increase of +9.73 F-1 score on StackOverflow data. We have published our code and data at: https://github.com/jeniyat/StackOverflowNER",Named Recognition|computer domain|StackOverflow|NLP techniques,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.443.pdf -main.12,Zero-Shot Transfer Learning with Synthesized Data for Multi-Domain Dialogue State Tracking,Giovanni Campagna|Agata Foryciarz|Mehrad Moradshahi|Monica Lam,Zero-shot transfer learning for multi-domain dialogue state tracking can allow us to handle new domains without incurring the high cost of data acquisition. This paper proposes a new zero-shot transfer learning technique for dialogue state tracking where the in-domain training data are all synthesized from an abstract dialogue model and the ontology of the domain. We show that data augmentation through synthesized data can improve the accuracy of zero-shot learning for both the TRADE model and the BERT-based SUMBT model on the MultiWOZ 2.1 dataset.
We show training with only synthesized in-domain data on the SUMBT model can reach about 2/3 of the accuracy obtained with the full training dataset. We improve the zero-shot learning state of the art on average across domains by 21%.,Multi-Domain Tracking|data acquisition|dialogue tracking|data augmentation,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.12.pdf -main.319,A Reinforced Generation of Adversarial Examples for Neural Machine Translation,Wei Zou|Shujian Huang|Jun Xie|Xinyu Dai|Jiajun Chen,"Neural machine translation systems tend to fail on less decent inputs despite its significant efficacy, which may significantly harm the credibility of these systems—fathoming how and when neural-based systems fail in such cases is critical for industrial maintenance. Instead of collecting and analyzing bad cases using limited handcrafted error features, here we investigate this issue by generating adversarial examples via a new paradigm based on reinforcement learning. Our paradigm could expose pitfalls for a given performance metric, e.g., BLEU, and could target any given neural machine translation architecture. We conduct experiments of adversarial attacks on two mainstream neural machine translation architectures, RNN-search, and Transformer. The results show that our method efficiently produces stable attacks with meaning-preserving adversarial examples. We also present a qualitative and quantitative analysis for the preference pattern of the attack, demonstrating its capability of pitfall exposure.",Reinforced Examples|Neural Translation|Neural |industrial maintenance,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.319.pdf -main.447,S2ORC: The Semantic Scholar Open Research Corpus,Kyle Lo|Lucy Wang|Mark Neumann|Rodney Kinney|Daniel Weld,"We introduce S2ORC, a large corpus of 81.1M English-language academic papers spanning many academic disciplines. The corpus consists of rich metadata, paper abstracts, resolved bibliographic references, as well as structured full text for 8.1M open access papers. Full text is annotated with automatically-detected inline mentions of citations, figures, and tables, each linked to their corresponding paper objects. In S2ORC, we aggregate papers from hundreds of academic publishers and digital archives into a unified source, and create the largest publicly-available collection of machine-readable academic text to date. We hope this resource will facilitate research and development of tools and tasks for text mining over academic text.",text mining|S2ORC|bibliographic references|inline citations,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.447.pdf -main.321,A Simple and Effective Unified Encoder for Document-Level Machine Translation,Shuming Ma|Dongdong Zhang|Ming Zhou,"Most of the existing models for document-level machine translation adopt dual-encoder structures. The representation of the source sentences and the document-level contexts (In this work, document-level contexts denote the surrounding sentences of the current source sentence.) are modeled with two separate encoders. Although these models can make use of the document-level contexts, they do not fully model the interaction between the contexts and the source sentences, and can not directly adapt to the recent pre-training models (e.g., BERT) which encodes multiple sentences with a single encoder. 
In this work, we propose a simple and effective unified encoder that can outperform the baseline models of dual-encoder models in terms of BLEU and METEOR scores. Moreover, the pre-training models can further boost the performance of our proposed model.",Document-Level Translation|Unified Encoder|encoders|pre-training models,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.321.pdf -main.335,Biomedical Entity Representations with Synonym Marginalization,Mujeen Sung|Hwisang Jeon|Jinhyuk Lee|Jaewoo Kang,"Biomedical named entities often play important roles in many biomedical text mining tools. However, due to the incompleteness of provided synonyms and numerous variations in their surface forms, normalization of biomedical entities is very challenging. In this paper, we focus on learning representations of biomedical entities solely based on the synonyms of entities. To learn from the incomplete synonyms, we use a model-based candidate selection and maximize the marginal likelihood of the synonyms present in top candidates. Our model-based candidates are iteratively updated to contain more difficult negative samples as our model evolves. In this way, we avoid the explicit pre-selection of negative samples from more than 400K candidates. On four biomedical entity normalization datasets having three different entity types (disease, chemical, adverse reaction), our model BioSyn consistently outperforms previous state-of-the-art models almost reaching the upper bound on each dataset.",Biomedical Representations|normalization entities|learning entities|Synonym Marginalization,Semantics: Lexical,Long,https://www.aclweb.org/anthology/2020.acl-main.335.pdf -main.453,Exploring Content Selection in Summarization of Novel Chapters,Faisal Ladhak|Bryan Li|Yaser Al-Onaizan|Kathy McKeown,"We present a new summarization task, generating summaries of novel chapters using summary/chapter pairs from online study guides. This is a harder task than the news summarization task, given the chapter length as well as the extreme paraphrasing and generalization found in the summaries. We focus on extractive summarization, which requires the creation of a gold-standard set of extractive summaries. We present a new metric for aligning reference summary sentences with chapter sentences to create gold extracts and also experiment with different alignment methods. Our experiments demonstrate significant improvement over prior alignment approaches for our task as shown through automatic metrics and a crowd-sourced pyramid analysis.",Exploring Selection|Content Selection|Summarization Chapters|summarization task,Summarization,Short,https://www.aclweb.org/anthology/2020.acl-main.453.pdf -main.16,Cross-modal Language Generation using Pivot Stabilization for Web-scale Language Coverage,Ashish V. Thapliyal|Radu Soricut,"Cross-modal language generation tasks such as image captioning are directly hurt in their ability to support non-English languages by the trend of data-hungry models combined with the lack of non-English annotations. We investigate potential solutions for combining existing language-generation annotations in English with translation capabilities in order to create solutions at web-scale in both domain and language coverage. 
We describe an approach called Pivot-Language Generation Stabilization (PLuGS), which leverages directly at training time both existing English annotations (gold data) as well as their machine-translated versions (silver data); at run-time, it generates first an English caption and then a corresponding target-language caption. We show that PLuGS models outperform other candidate solutions in evaluations performed over 5 different target languages, under a large-domain testset using images from the Open Images dataset. Furthermore, we find an interesting effect where the English captions generated by the PLuGS models are better than the captions generated by the original, monolingual English model.",Cross-modal Generation|Web-scale Coverage|Cross-modal tasks|Pivot Stabilization,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.16.pdf -main.309,An Analysis of the Utility of Explicit Negative Examples to Improve the Syntactic Abilities of Neural Language Models,Hiroshi Noji|Hiroya Takamura,"We explore the utilities of explicit negative examples in training neural language models. Negative examples here are incorrect words in a sentence, such as barks in *The dogs barks. Neural language models are commonly trained only on positive examples, a set of sentences in the training data, but recent studies suggest that the models trained in this way are not capable of robustly handling complex syntactic constructions, such as long-distance agreement. In this paper, we first demonstrate that appropriately using negative examples about particular constructions (e.g., subject-verb agreement) will boost the model's robustness on them in English, with a negligible loss of perplexity. The key to our success is an additional margin loss between the log-likelihoods of a correct word and an incorrect word. We then provide a detailed analysis of the trained models. One of our findings is the difficulty of object-relative clauses for RNNs. We find that even with our direct learning signals the models still suffer from resolving agreement across an object-relative clause. Augmentation of training sentences involving the constructions somewhat helps, but the accuracy still does not reach the level of subject-relative clauses. Although not directly cognitively appealing, our method can be a tool to analyze the true architectural limitation of neural models on challenging linguistic constructions.",resolving agreement|Augmentation|Augmentation sentences|Syntactic Models,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.309.pdf -main.484,Double-Hard Debias: Tailoring Word Embeddings for Gender Bias Mitigation,Tianlu Wang|Xi Victoria Lin|Nazneen Fatema Rajani|Bryan McCann|Vicente Ordonez|Caiming Xiong,"Word embeddings derived from human-generated corpora inherit strong gender bias which can be further amplified by downstream models. Some commonly adopted debiasing approaches, including the seminal Hard Debias algorithm, apply post-processing procedures that project pre-trained word embeddings into a subspace orthogonal to an inferred gender subspace. We discover that semantic-agnostic corpus regularities such as word frequency captured by the word embeddings negatively impact the performance of these algorithms. We propose a simple but effective technique, Double Hard Debias, which purifies the word embeddings against such corpus regularities prior to inferring and removing the gender subspace. 
Experiments on three bias mitigation benchmarks show that our approach preserves the distributional semantics of the pre-trained word embeddings while reducing gender bias to a significantly larger degree than prior approaches.",Tailoring Embeddings|Gender Mitigation|Double-Hard Debias|downstream models,Ethics and NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.484.pdf -main.490,Cross-Linguistic Syntactic Evaluation of Word Prediction Models,Aaron Mueller|Garrett Nicolai|Panayiota Petrou-Zeniou|Natalia Talmina|Tal Linzen,"A range of studies have concluded that neural word prediction models can distinguish grammatical from ungrammatical sentences with high accuracy. However, these studies are based primarily on monolingual evidence from English. To investigate how these models' ability to learn syntax varies by language, we introduce CLAMS (Cross-Linguistic Assessment of Models on Syntax), a syntactic evaluation suite for monolingual and multilingual models. CLAMS includes subject-verb agreement challenge sets for English, French, German, Hebrew and Russian, generated from grammars we develop. We use CLAMS to evaluate LSTM language models as well as monolingual and multilingual BERT. Across languages, monolingual LSTMs achieved high accuracy on dependencies without attractors, and generally poor accuracy on agreement across object relative clauses. On other constructions, agreement accuracy was generally higher in languages with richer morphology. Multilingual models generally underperformed monolingual models. Multilingual BERT showed high syntactic accuracy on English, but noticeable deficiencies in other languages.",Cross-Linguistic Syntax|Syntax|Cross-Linguistic Models|neural models,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.490.pdf -main.123,Improving Truthfulness of Headline Generation,Kazuki Matsumaru|Sho Takase|Naoaki Okazaki,"Most studies on abstractive summarization report ROUGE scores between system and reference summaries. However, we have a concern about the truthfulness of generated summaries: whether all facts of a generated summary are mentioned in the source text. This paper explores improving the truthfulness in headline generation on two popular datasets. Analyzing headlines generated by the state-of-the-art encoder-decoder model, we show that the model sometimes generates untruthful headlines. We conjecture that one of the reasons lies in untruthful supervision data used for training the model. In order to quantify the truthfulness of article-headline pairs, we consider the textual entailment of whether an article entails its headline. After confirming quite a few untruthful instances in the datasets, this study hypothesizes that removing untruthful instances from the supervision data may remedy the problem of the untruthful behaviors of the model. Building a binary classifier that predicts an entailment relation between an article and its headline, we filter out untruthful instances from the supervision data. 
Experimental results demonstrate that the headline generation model trained on filtered supervision data shows no clear difference in ROUGE scores but remarkable improvements in automatic and manual evaluations of the generated headlines.",Truthfulness Generation|abstractive summarization|headline generation|automatic headlines,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.123.pdf -main.645,CamemBERT: a Tasty French Language Model,Louis Martin|Benjamin Muller|Pedro Javier Ortiz Suárez|Yoann Dupont|Laurent Romary|Éric de la Clergerie|Djamé Seddah|Benoît Sagot,"Pretrained language models are now ubiquitous in Natural Language Processing. Despite their success, most available models have either been trained on English data or on the concatenation of data in multiple languages. This makes practical use of such models --in all languages except English-- very limited. In this paper, we investigate the feasibility of training monolingual Transformer-based language models for other languages, taking French as an example and evaluating our language models on part-of-speech tagging, dependency parsing, named entity recognition and natural language inference tasks. We show that the use of web crawled data is preferable to the use of Wikipedia data. More surprisingly, we show that a relatively small web crawled dataset (4GB) leads to results that are as good as those obtained using larger datasets (130+GB). Our best performing model CamemBERT reaches or improves the state of the art in all four downstream tasks.",Natural Processing|part-of-speech tagging|dependency parsing|named recognition,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.645.pdf -main.651,ClarQ: A large-scale and diverse dataset for Clarification Question Generation,Vaibhav Kumar|Alan W Black,"Question answering and conversational systems are often baffled and need help clarifying certain ambiguities. However, limitations of existing datasets hinder the development of large-scale models capable of generating and utilising clarification questions. In order to overcome these limitations, we devise a novel bootstrapping framework (based on self-supervision) that assists in the creation of a diverse, large-scale dataset of clarification questions based on post-comment tuples extracted from stackexchange. The framework utilises a neural network based architecture for classifying clarification questions. It is a two-step method where the first aims to increase the precision of the classifier and second aims to increase its recall. We quantitatively demonstrate the utility of the newly created dataset by applying it to the downstream task of question-answering. The final dataset, ClarQ, consists of ~2M examples distributed across 173 domains of stackexchange. We release this dataset in order to foster research into the field of clarification question generation with the larger goal of enhancing dialog and question answering systems.",Clarification Generation|Question answering|classifying questions|downstream question-answering,Question Answering,Short,https://www.aclweb.org/anthology/2020.acl-main.651.pdf -main.137,In Layman’s Terms: Semi-Open Relation Extraction from Scientific Texts,Ruben Kruiper|Julian Vincent|Jessica Chen-Burger|Marc Desmulliez|Ioannis Konstas,"Information Extraction (IE) from scientific texts can be used to guide readers to the central information in scientific documents. 
But narrow IE systems extract only a fraction of the information captured, and Open IE systems do not perform well on the long and complex sentences encountered in scientific texts. In this work we combine the output of both types of systems to achieve Semi-Open Relation Extraction, a new task that we explore in the Biology domain. First, we present the Focused Open Biological Information Extraction (FOBIE) dataset and use FOBIE to train a state-of-the-art narrow scientific IE system to extract trade-off relations and arguments that are central to biology texts. We then run both the narrow IE system and a state-of-the-art Open IE system on a corpus of 10K open-access scientific biological texts. We show that a significant amount (65%) of erroneous and uninformative Open IE extractions can be filtered using narrow IE extractions. Furthermore, we show that the retained extractions are significantly more often informative to a reader.",Semi-Open Extraction|Information|IE|narrow systems,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.137.pdf -main.679,The Sensitivity of Language Models and Humans to Winograd Schema Perturbations,Mostafa Abdou|Vinit Ravishankar|Maria Barrett|Yonatan Belinkov|Desmond Elliott|Anders Søgaard,"Large-scale pretrained language models are the major driving force behind recent improvements in performance on the Winograd Schema Challenge, a widely employed test of commonsense reasoning ability. We show, however, with a new diagnostic dataset, that these models are sensitive to linguistic perturbations of the Winograd examples that minimally affect human understanding. Our results highlight interesting differences between humans and language models: language models are more sensitive to number or gender alternations and synonym replacements than humans, and humans are more stable and consistent in their predictions, maintain a much higher absolute performance, and perform better on non-associative instances than associative ones.",human understanding|Language Models|Winograd Perturbations|Large-scale models,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.679.pdf -main.686,HAT: Hardware-Aware Transformers for Efficient Natural Language Processing,Hanrui Wang|Zhanghao Wu|Zhijian Liu|Han Cai|Ligeng Zhu|Chuang Gan|Song Han,"Transformers are ubiquitous in Natural Language Processing (NLP) tasks, but they are difficult to be deployed on hardware due to the intensive computation. To enable low-latency inference on resource-constrained hardware platforms, we propose to design Hardware-Aware Transformers (HAT) with neural architecture search. We first construct a large design space with arbitrary encoder-decoder attention and heterogeneous layers. Then we train a SuperTransformer that covers all candidates in the design space, and efficiently produces many SubTransformers with weight sharing. Finally, we perform an evolutionary search with a hardware latency constraint to find a specialized SubTransformer dedicated to run fast on the target hardware. Extensive experiments on four machine translation tasks demonstrate that HAT can discover efficient models for different hardware (CPU, GPU, IoT device). When running WMT’14 translation task on Raspberry Pi-4, HAT can achieve 3× speedup, 3.7× smaller size over baseline Transformer; 2.7× speedup, 3.6× smaller size over Evolved Transformer with 12,041× less search cost and no performance loss.
HAT is open-sourced at https://github.com/mit-han-lab/hardware-aware-transformers.",Natural Processing|Natural tasks|low-latency inference|machine tasks,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.686.pdf -main.692,Unsupervised Domain Clusters in Pretrained Language Models,Roee Aharoni|Yoav Goldberg,"The notion of ``in-domain data'' in NLP is often over-simplistic and vague, as textual data varies in many nuanced linguistic aspects such as topic, style or level of formality. In addition, domain labels are many times unavailable, making it challenging to build domain-specific systems. We show that massive pre-trained language models implicitly learn sentence representations that cluster by domains without supervision -- suggesting a simple data-driven definition of domains in textual data. We harness this property and propose domain data selection methods based on such models, which require only a small set of in-domain monolingual data. We evaluate our data selection methods for neural machine translation across five diverse domains, where they outperform an established approach as measured by both BLEU and precision and recall with respect to an oracle selection.",NLP|data-driven domains|neural translation|Unsupervised Clusters,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.692.pdf -main.731,Unsupervised Multimodal Neural Machine Translation with Pseudo Visual Pivoting,Po-Yao Huang|Junjie Hu|Xiaojun Chang|Alexander Hauptmann,"Unsupervised machine translation (MT) has recently achieved impressive results with monolingual corpora only. However, it is still challenging to associate source-target sentences in the latent space. As people speak different languages biologically share similar visual systems, the potential of achieving better alignment through visual content is promising yet under-explored in unsupervised multimodal MT (MMT). In this paper, we investigate how to utilize visual content for disambiguation and promoting latent space alignment in unsupervised MMT. Our model employs multimodal back-translation and features pseudo visual pivoting in which we learn a shared multilingual visual-semantic embedding space and incorporate visually-pivoted captioning as additional weak supervision. The experimental results on the widely used Multi30K dataset show that the proposed model significantly improves over the state-of-the-art methods and generalizes well when images are not available at the testing time.",Unsupervised Translation|Unsupervised MT|MT|alignment,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.731.pdf -main.725,Empower Entity Set Expansion via Language Model Probing,Yunyi Zhang|Jiaming Shen|Jingbo Shang|Jiawei Han,"Entity set expansion, aiming at expanding a small seed entity set with new entities belonging to the same semantic class, is a critical task that benefits many downstream NLP and IR applications, such as question answering, query understanding, and taxonomy construction. Existing set expansion methods bootstrap the seed entity set by adaptively selecting context features and extracting new entities. A key challenge for entity set expansion is to avoid selecting ambiguous context features which will shift the class semantics and lead to accumulative errors in later iterations. In this study, we propose a novel iterative set expansion framework that leverages automatically generated class names to address the semantic drift issue. 
In each iteration, we select one positive and several negative class names by probing a pre-trained language model, and further score each candidate entity based on selected class names. Experiments on two datasets show that our framework generates high-quality class names and outperforms previous state-of-the-art methods significantly.",Empower Expansion|Entity expansion|NLP applications|question answering,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.acl-main.725.pdf -main.719,Rationalizing Medical Relation Prediction from Corpus-level Statistics,Zhen Wang|Jennifer Lee|Simon Lin|Huan Sun,"Nowadays, the interpretability of machine learning models is becoming increasingly important, especially in the medical domain. Aiming to shed some light on how to rationalize medical relation prediction, we present a new interpretable framework inspired by existing theories on how human memory works, e.g., theories of recall and recognition. Given the corpus-level statistics, i.e., a global co-occurrence graph of a clinical text corpus, to predict the relations between two entities, we first recall rich contexts associated with the target entities, and then recognize relational interactions between these contexts to form model rationales, which will contribute to the final prediction. We conduct experiments on a real-world public clinical dataset and show that our framework can not only achieve competitive predictive performance against a comprehensive list of neural baseline models, but also present rationales to justify its prediction. We further collaborate with medical experts deeply to verify the usefulness of our model rationales for clinical decision making.",Medical Prediction|recognition|clinical making|machine models,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.719.pdf -main.533,"Worse WER, but Better BLEU? Leveraging Word Embedding as Intermediate in Multitask End-to-End Speech Translation",Shun-Po Chuang|Tzu-Wei Sung|Alexander H. Liu|Hung-yi Lee,"Speech translation (ST) aims to learn transformations from speech in the source language to the text in the target language. Previous works show that multitask learning improves the ST performance, in which the recognition decoder generates the text of the source language, and the translation decoder obtains the final translations based on the output of the recognition decoder. Because whether the output of the recognition decoder has the correct semantics is more critical than its accuracy, we propose to improve the multitask ST model by utilizing word embedding as the intermediate.",Speech translation|Word Embedding|ST|multitask learning,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.533.pdf -main.255,Breaking Through the 80% Glass Ceiling: Raising the State of the Art in Word Sense Disambiguation by Incorporating Knowledge Graph Information,Michele Bevilacqua|Roberto Navigli,"Neural architectures are the current state of the art in Word Sense Disambiguation (WSD). However, they make limited use of the vast amount of relational information encoded in Lexical Knowledge Bases (LKB). We present Enhanced WSD Integrating Synset Embeddings and Relations (EWISER), a neural supervised architecture that is able to tap into this wealth of knowledge by embedding information from the LKB graph within the neural architecture, and to exploit pretrained synset embeddings, enabling the network to predict synsets that are not in the training set. 
As a result, we set a new state of the art on almost all the evaluation settings considered, also breaking through, for the first time, the 80% ceiling on the concatenation of all the standard all-words English WSD evaluation benchmarks. On multilingual all-words WSD, we report state-of-the-art results by training on nothing but English.",Word Disambiguation|Word WSD|WSD|Enhanced WSD,Semantics: Lexical,Long,https://www.aclweb.org/anthology/2020.acl-main.255.pdf -main.241,Orthogonal Relation Transforms with Graph Context Modeling for Knowledge Graph Embedding,Yun Tang|Jing Huang|Guangtao Wang|Xiaodong He|Bowen Zhou,"Distance-based knowledge graph embeddings have shown substantial improvement on the knowledge graph link prediction task, from TransE to the latest state-of-the-art RotatE. However, complex relations such as N-to-1, 1-to-N and N-to-N still remain challenging to predict. In this work, we propose a novel distance-based approach for knowledge graph link prediction. First, we extend the RotatE from 2D complex domain to high dimensional space with orthogonal transforms to model relations. The orthogonal transform embedding for relations keeps the capability for modeling symmetric/anti-symmetric, inverse and compositional relations while achieves better modeling capacity. Second, the graph context is integrated into distance scoring functions directly. Specifically, graph context is explicitly modeled via two directed context representations. Each node embedding in knowledge graph is augmented with two context representations, which are computed from the neighboring outgoing and incoming nodes/edges respectively. The proposed approach improves prediction accuracy on the difficult N-to-1, 1-to-N and N-to-N cases. Our experimental results show that it achieves state-of-the-art results on two common benchmarks FB15k-237 and WNRR-18, especially on FB15k-237 which has many high in-degree nodes.",Knowledge Embedding|knowledge task|knowledge prediction|Orthogonal Transforms,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.241.pdf -main.527,Relabel the Noise: Joint Extraction of Entities and Relations via Cooperative Multiagents,Daoyuan Chen|Yaliang Li|Kai Lei|Ying Shen,"Distant supervision based methods for entity and relation extraction have received increasing popularity due to the fact that these methods require light human annotation efforts. In this paper, we consider the problem of shifted label distribution, which is caused by the inconsistency between the noisy-labeled training set subject to external knowledge graph and the human-annotated test set, and exacerbated by the pipelined entity-then-relation extraction manner with noise propagation. We propose a joint extraction approach to address this problem by re-labeling noisy instances with a group of cooperative multiagents. To handle noisy instances in a fine-grained manner, each agent in the cooperative group evaluates the instance by calculating a continuous confidence score from its own perspective; To leverage the correlations between these two extraction tasks, a confidence consensus module is designed to gather the wisdom of all agents and re-distribute the noisy training set with confidence-scored labels. Further, the confidences are used to adjust the training losses of extractors. 
Experimental results on two real-world datasets verify the benefits of re-labeling noisy instance, and show that the proposed model significantly outperforms the state-of-the-art entity and relation extraction methods.",entity extraction|re-labeling instances|extraction tasks|re-labeling instance,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.527.pdf -main.269,How Does Selective Mechanism Improve Self-Attention Networks?,Xinwei Geng|Longyue Wang|Xing Wang|Bing Qin|Ting Liu|Zhaopeng Tu,"Self-attention networks (SANs) with selective mechanism has produced substantial improvements in various NLP tasks by concentrating on a subset of input words. However, the underlying reasons for their strong performance have not been well explained. In this paper, we bridge the gap by assessing the strengths of selective SANs (SSANs), which are implemented with a flexible and universal Gumbel-Softmax. Experimental results on several representative NLP tasks, including natural language inference, semantic role labelling, and machine translation, show that SSANs consistently outperform the standard SANs. Through well-designed probing experiments, we empirically validate that the improvement of SSANs can be attributed in part to mitigating two commonly-cited weaknesses of SANs: word order encoding and structure modeling. Specifically, the selective mechanism improves SANs by paying more attention to content words that contribute to the meaning of the sentence.",NLP tasks|natural inference|semantic labelling|machine translation,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.269.pdf -main.296,SpanMlt: A Span-based Multi-Task Learning Framework for Pair-wise Aspect and Opinion Terms Extraction,He Zhao|Longtao Huang|Rong Zhang|Quan Lu|Hui Xue,"Aspect terms extraction and opinion terms extraction are two key problems of fine-grained Aspect Based Sentiment Analysis (ABSA). The aspect-opinion pairs can provide a global profile about a product or service for consumers and opinion mining systems. However, traditional methods can not directly output aspect-opinion pairs without given aspect terms or opinion terms. Although some recent co-extraction methods have been proposed to extract both terms jointly, they fail to extract them as pairs. To this end, this paper proposes an end-to-end method to solve the task of Pair-wise Aspect and Opinion Terms Extraction (PAOTE). Furthermore, this paper treats the problem from a perspective of joint term and relation extraction rather than under the sequence tagging formulation performed in most prior works. We propose a multi-task learning framework based on shared spans, where the terms are extracted under the supervision of span boundaries. Meanwhile, the pair-wise relations are jointly identified using the span representations. Extensive experiments show that our model consistently outperforms state-of-the-art methods.",Pair-wise Extraction|Aspect extraction|opinion extraction|fine-grained Analysis,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.296.pdf -main.282,HyperCore: Hyperbolic and Co-graph Representation for Automatic ICD Coding,Pengfei Cao|Yubo Chen|Kang Liu|Jun Zhao|Shengping Liu|Weifeng Chong,"The International Classification of Diseases (ICD) provides a standardized way for classifying diseases, which endows each disease with a unique code. ICD coding aims to assign proper ICD codes to a medical record. 
Since manual coding is very laborious and prone to errors, many methods have been proposed for the automatic ICD coding task. However, most of existing methods independently predict each code, ignoring two important characteristics: Code Hierarchy and Code Co-occurrence. In this paper, we propose a Hyperbolic and Co-graph Representation method (HyperCore) to address the above problem. Specifically, we propose a hyperbolic representation method to leverage the code hierarchy. Moreover, we propose a graph convolutional network to utilize the code co-occurrence. Experimental results on two widely used datasets demonstrate that our proposed model outperforms previous state-of-the-art methods.",Automatic Coding|International Diseases|manual coding|automatic task,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.282.pdf -main.283,Hyperbolic Capsule Networks for Multi-Label Classification,Boli Chen|Xin Huang|Lin Xiao|Liping Jing,"Although deep neural networks are effective at extracting high-level features, classification methods usually encode an input into a vector representation via simple feature aggregation operations (e.g. pooling). Such operations limit the performance. For instance, a multi-label document may contain several concepts. In this case, one vector can not sufficiently capture its salient and discriminative content. Thus, we propose Hyperbolic Capsule Networks (HyperCaps) for Multi-Label Classification (MLC), which have two merits. First, hyperbolic capsules are designed to capture fine-grained document information for each label, which has the ability to characterize complicated structures among labels and documents. Second, Hyperbolic Dynamic Routing (HDR) is introduced to aggregate hyperbolic capsules in a label-aware manner, so that the label-level discriminative information can be preserved along the depth of neural networks. To efficiently handle large-scale MLC datasets, we additionally present a new routing method to adaptively adjust the capsule number during routing. Extensive experiments are conducted on four benchmark datasets. Compared with the state-of-the-art methods, HyperCaps significantly improves the performance of MLC especially on tail labels.",Multi-Label Classification|MLC|routing|Hyperbolic Networks,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.283.pdf -main.297,Syntax-Aware Opinion Role Labeling with Dependency Graph Convolutional Networks,Bo Zhang|Yue Zhang|Rui Wang|Zhenghua Li|Min Zhang,"Opinion role labeling (ORL) is a fine-grained opinion analysis task and aims to answer “who expressed what kind of sentiment towards what?”. Due to the scarcity of labeled data, ORL remains challenging for data-driven methods. In this work, we try to enhance neural ORL models with syntactic knowledge by comparing and integrating different representations. We also propose dependency graph convolutional networks (DEPGCN) to encode parser information at different processing levels. In order to compensate for parser inaccuracy and reduce error propagation, we introduce multi-task learning (MTL) to train the parser and the ORL model simultaneously. We verify our methods on the benchmark MPQA corpus. The experimental results show that syntactic information is highly valuable for ORL, and our final MTL model effectively boosts the F1 score by 9.29 over the syntax-agnostic baseline. In addition, we find that the contributions from syntactic knowledge do not fully overlap with contextualized word representations (BERT). 
Our best model achieves 4.34 higher F1 score than the current state-of-the-art.",Syntax-Aware Labeling|Opinion labeling|ORL|opinion task,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.297.pdf -main.268,Estimating the influence of auxiliary tasks for multi-task learning of sequence tagging tasks,Fynn Schröder|Chris Biemann,"Multi-task learning (MTL) and transfer learning (TL) are techniques to overcome the issue of data scarcity when training state-of-the-art neural networks. However, finding beneficial auxiliary datasets for MTL or TL is a time- and resource-consuming trial-and-error approach. We propose new methods to automatically assess the similarity of sequence tagging datasets to identify beneficial auxiliary data for MTL or TL setups. Our methods can compute the similarity between any two sequence tagging datasets, i.e., they do not need to be annotated with the same tagset or multiple labels in parallel. Additionally, our methods take tokens and their labels into account, which is more robust than only using either of them as an information source, as conducted in prior work. We empirically show that our similarity measures correlate with the change in test score of neural networks that use the auxiliary dataset for MTL to increase the main task performance. We provide an efficient, open-source implementation.",multi-task tasks|MTL|TL|MTL setups,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.268.pdf -main.240,Masked Language Model Scoring,Julian Salazar|Davis Liang|Toan Q. Nguyen|Katrin Kirchhoff,"Pretrained masked language models (MLMs) require finetuning for most NLP tasks. Instead, we evaluate MLMs out of the box via their pseudo-log-likelihood scores (PLLs), which are computed by masking tokens one by one. We show that PLLs outperform scores from autoregressive language models like GPT-2 in a variety of tasks. By rescoring ASR and NMT hypotheses, RoBERTa reduces an end-to-end LibriSpeech model's WER by 30% relative and adds up to +1.7 BLEU on state-of-the-art baselines for low-resource translation pairs, with further gains from domain adaptation. We attribute this success to PLL's unsupervised expression of linguistic acceptability without a left-to-right bias, greatly improving on scores from GPT-2 (+10 points on island effects, NPI licensing in BLiMP). One can finetune MLMs to give scores without masking, enabling computation in a single inference pass. In all, PLLs and their associated pseudo-perplexities (PPPLs) enable plug-and-play use of the growing number of pretrained MLMs; e.g., we use a single cross-lingual model to rescore translations in multiple languages. We release our library for language model scoring at https://github.com/awslabs/mlm-scoring.",Masked Scoring|NLP tasks|domain adaptation|language scoring,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.240.pdf -main.526,ReInceptionE: Relation-Aware Inception Network with Joint Local-Global Structural Information for Knowledge Graph Embedding,Zhiwen Xie|Guangyou Zhou|Jin Liu|Jimmy Xiangji Huang,"The goal of Knowledge graph embedding (KGE) is to learn how to represent the low dimensional vectors for entities and relations based on the observed triples. The conventional shallow models are limited to their expressiveness.
ConvE (Dettmers et al., 2018) takes advantage of CNN and improves the expressive power with parameter efficient operators by increasing the interactions between head and relation embeddings. However, there is no structural information in the embedding space of ConvE, and the performance is still limited by the number of interactions. The recent KBGAT (Nathani et al., 2019) provides another way to learn embeddings by adaptively utilizing structural information. In this paper, we take the benefits of ConvE and KBGAT together and propose a Relation-aware Inception network with joint local-global structural information for knowledge graph Embedding (ReInceptionE). Specifically, we first explore the Inception network to learn query embedding, which aims to further increase the interactions between head and relation embeddings. Then, we propose to use a relation-aware attention mechanism to enrich the query embedding with the local neighborhood and global entity information. Experimental results on both WN18RR and FB15k-237 datasets demonstrate that ReInceptionE achieves competitive performance compared with state-of-the-art methods.",Relation-Aware Network|Knowledge Embedding|ReInceptionE|Knowledge embedding,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.526.pdf -main.532,Tagged Back-translation Revisited: Why Does It Really Work?,Benjamin Marie|Raphael Rubino|Atsushi Fujita,"In this paper, we show that neural machine translation (NMT) systems trained on large back-translated data overfit some of the characteristics of machine-translated texts. Such NMT systems better translate human-produced translations, i.e., translationese, but may largely worsen the translation quality of original texts. Our analysis reveals that adding a simple tag to back-translations prevents this quality degradation and improves on average the overall translation quality by helping the NMT system to distinguish back-translated data from original parallel data during training. We also show that, in contrast to high-resource configurations, NMT systems trained in low-resource settings are much less vulnerable to overfit back-translations. We conclude that the back-translations in the training data should always be tagged especially when the origin of the text to be translated is unknown.",Tagged Revisited|neural systems|NMT systems|back-translations,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.532.pdf -main.254,Simultaneous Translation Policies: From Fixed to Adaptive,Baigong Zheng|Kaibo Liu|Renjie Zheng|Mingbo Ma|Hairong Liu|Liang Huang,"Adaptive policies are better than fixed policies for simultaneous translation, since they can flexibly balance the tradeoff between translation quality and latency based on the current context information. But previous methods on obtaining adaptive policies either rely on complicated training process, or underperform simple fixed policies. We design an algorithm to achieve adaptive policies via a simple heuristic composition of a set of fixed policies. 
Experiments on Chinese -> English and German -> English show that our adaptive policies can outperform fixed ones by up to 4 BLEU points for the same latency, and more surprisingly, it even surpasses the BLEU score of full-sentence translation in the greedy mode (and very close to beam mode), but with much lower latency.",simultaneous translation|full-sentence translation|Simultaneous Policies|Adaptive policies,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.254.pdf -main.718,Multi-Sentence Argument Linking,Seth Ebner|Patrick Xia|Ryan Culkin|Kyle Rawlins|Benjamin Van Durme,"We present a novel document-level model for finding argument spans that fill an event's roles, connecting related ideas in sentence-level semantic role labeling and coreference resolution. Because existing datasets for cross-sentence linking are small, development of our neural model is supported through the creation of a new resource, Roles Across Multiple Sentences (RAMS), which contains 9,124 annotated events across 139 types. We demonstrate strong performance of our model on RAMS and other event-related datasets.",sentence-level labeling|coreference resolution|cross-sentence linking|Multi-Sentence Linking,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.718.pdf -main.724,CluHTM - Semantic Hierarchical Topic Modeling based on CluWords,Felipe Viegas|Washington Cunha|Christian Gomes|Antônio Pereira|Leonardo Rocha|Marcos Goncalves,"Hierarchical Topic modeling (HTM) exploits latent topics and relationships among them as a powerful tool for data analysis and exploration. Despite advantages over traditional topic modeling, HTM poses its own challenges, such as (1) topic incoherence, (2) unreasonable (hierarchical) structure, and (3) issues related to the definition of the ``ideal'' number of topics and depth of the hierarchy. In this paper, we advance the state-of-the-art on HTM by means of the design and evaluation of CluHTM, a novel non-probabilistic hierarchical matrix factorization aimed at solving the specific issues of HTM. CluHTM's novel contributions include: (i) the exploration of richer text representation that encapsulates both, global (dataset level) and local semantic information -- when combined, these pieces of information help to solve the topic incoherence problem as well as issues related to the unreasonable structure; (ii) the exploitation of a stability analysis metric for defining the number of topics and the ``shape'' the hierarchical structure. In our evaluation, considering twelve datasets and seven state-of-the-art baselines, CluHTM outperformed the baselines in the vast majority of the cases, with gains of around 500% over the strongest state-of-the-art baselines. We also provide qualitative and quantitative statistical analyses of why our solution works so well.",data analysis|data exploration|exploration|HTM,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.acl-main.724.pdf -main.730,TVQA+: Spatio-Temporal Grounding for Video Question Answering,Jie Lei|Licheng Yu|Tamara Berg|Mohit Bansal,"We present the task of Spatio-Temporal Video Question Answering, which requires intelligent systems to simultaneously retrieve relevant moments and detect referenced visual concepts (people and objects) to answer natural language questions about videos. We first augment the TVQA dataset with 310.8K bounding boxes, linking depicted objects to visual concepts in questions and answers. We name this augmented version as TVQA+. 
We then propose Spatio-Temporal Answerer with Grounded Evidence (STAGE), a unified framework that grounds evidence in both spatial and temporal domains to answer questions about videos. Comprehensive experiments and analyses demonstrate the effectiveness of our framework and how the rich annotations in our TVQA+ dataset can contribute to the question answering task. Moreover, by performing this joint task, our model is able to produce insightful and interpretable spatio-temporal attention visualizations.",Spatio-Temporal Grounding|Video Answering|Spatio-Temporal Answering|Spatio-Temporal Evidence,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.730.pdf -main.693,Using Context in Neural Machine Translation Training Objectives,Danielle Saunders|Felix Stahlberg|Bill Byrne,"We present Neural Machine Translation (NMT) training using document-level metrics with batch-level documents. Previous sequence-objective approaches to NMT training focus exclusively on sentence-level metrics like sentence BLEU which do not correspond to the desired evaluation metric, typically document BLEU. Meanwhile research into document-level NMT training focuses on data or model architecture rather than training procedure. We find that each of these lines of research has a clear space in it for the other, and propose merging them with a scheme that allows a document-level evaluation metric to be used in the NMT training objective. We first sample pseudo-documents from sentence samples. We then approximate the expected document BLEU gradient with Monte Carlo sampling for use as a cost function in Minimum Risk Training (MRT). This two-level sampling procedure gives NMT performance gains over sequence MRT and maximum-likelihood training. We demonstrate that training is more robust for document-level metrics than with sequence metrics. We further demonstrate improvements on NMT with TER and Grammatical Error Correction (GEC) using GLEU, both metrics used at the document level for evaluations.",Neural training|NMT training|document-level training|NMT objective,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.693.pdf -main.687,Hard-Coded Gaussian Attention for Neural Machine Translation,Weiqiu You|Simeng Sun|Mohit Iyyer,"Recent work has questioned the importance of the Transformer's multi-headed attention for achieving high translation quality. We push further in this direction by developing a ``hard-coded'' attention variant without any learned parameters. Surprisingly, replacing all learned self-attention heads in the encoder and decoder with fixed, input-agnostic Gaussian distributions minimally impacts BLEU scores across four different language pairs. However, additionally, hard-coding cross attention (which connects the decoder to the encoder) significantly lowers BLEU, suggesting that it is more important than self-attention. Much of this BLEU drop can be recovered by adding just a single learned cross attention head to an otherwise hard-coded Transformer. 
Taken as a whole, our results offer insight into which components of the Transformer are actually important, which we hope will guide future work into the development of simpler and more efficient attention-based models.",Neural Translation|Hard-Coded Attention|variant|encoder,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.687.pdf -main.678,Temporal Common Sense Acquisition with Minimal Supervision,Ben Zhou|Qiang Ning|Daniel Khashabi|Dan Roth,"Temporal common sense (e.g., duration and frequency of events) is crucial for understanding natural language. However, its acquisition is challenging, partly because such information is often not expressed explicitly in text, and human annotation on such concepts is costly. This work proposes a novel sequence modeling approach that exploits explicit and implicit mentions of temporal common sense, extracted from a large corpus, to build TacoLM, a temporal common sense language model. Our method is shown to give quality predictions of various dimensions of temporal common sense (on UDST and a newly collected dataset from RealNews). It also produces representations of events for relevant tasks such as duration comparison, parent-child relations, event coreference and temporal QA (on TimeBank, HiEVE and MCTACO) that are better than using the standard BERT. Thus, it will be an important component of temporal NLP.",Temporal Acquisition|human annotation|temporal NLP|Minimal Supervision,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.678.pdf -main.650,Semi-supervised Contextual Historical Text Normalization,Peter Makarov|Simon Clematide,"Historical text normalization, the task of mapping historical word forms to their modern counterparts, has recently attracted a lot of interest (Bollmann, 2019; Tang et al., 2018; Lusetti et al., 2018; Bollmann et al., 2018;Robertson and Goldwater, 2018; Bollmannet al., 2017; Korchagina, 2017). Yet, virtually all approaches suffer from the two limitations: 1) They consider a fully supervised setup, often with impractically large manually normalized datasets; 2) Normalization happens on words in isolation. By utilizing a simple generative normalization model and obtaining powerful contextualization from the target-side language model, we train accurate models with unlabeled historical data. In realistic training scenarios, our approach often leads to reduction in manually normalized data at the same accuracy levels.",Semi-supervised Normalization|Historical normalization|mapping forms|generative model,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.acl-main.650.pdf -main.136,A Novel Cascade Binary Tagging Framework for Relational Triple Extraction,Zhepei Wei|Jianlin Su|Yue Wang|Yuan Tian|Yi Chang,"Extracting relational triples from unstructured text is crucial for large-scale knowledge graph construction. However, few existing works excel in solving the overlapping triple problem where multiple relational triples in the same sentence share the same entities. In this work, we introduce a fresh perspective to revisit the relational triple extraction task and propose a novel cascade binary tagging framework (CasRel) derived from a principled problem formulation. Instead of treating relations as discrete labels as in previous works, our new framework models relations as functions that map subjects to objects in a sentence, which naturally handles the overlapping problem. 
Experiments show that the CasRel framework already outperforms state-of-the-art methods even when its encoder module uses a randomly initialized BERT encoder, showing the power of the new tagging framework. It enjoys further performance boost when employing a pre-trained BERT encoder, outperforming the strongest baseline by 17.5 and 30.2 absolute gain in F1-score on two public datasets NYT and WebNLG, respectively. In-depth analysis on different scenarios of overlapping triples shows that the method delivers consistent performance gain across all these scenarios. The source code and data are released online.",Relational Extraction|large-scale construction|overlapping problem|relational task,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.136.pdf -main.122,Examining the State-of-the-Art in News Timeline Summarization,Demian Gholipour Ghalandari|Georgiana Ifrim,"Previous work on automatic news timeline summarization (TLS) leaves an unclear picture about how this task can generally be approached and how well it is currently solved. This is mostly due to the focus on individual subtasks, such as date selection and date summarization, and to the previous lack of appropriate evaluation metrics for the full TLS task. In this paper, we compare different TLS strategies using appropriate evaluation frameworks, and propose a simple and effective combination of methods that improves over the state-of-the-art on all tested benchmarks. For a more robust evaluation, we also present a new TLS dataset, which is larger and spans longer time periods than previous datasets.",News Summarization|automatic summarization|automatic TLS|date selection,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.122.pdf -main.644,Refer360°: A Referring Expression Recognition Dataset in 360° Images,Volkan Cirik|Taylor Berg-Kirkpatrick|Louis-Philippe Morency,"We propose a novel large-scale referring expression recognition dataset, Refer360°, consisting of 17,137 instruction sequences and ground-truth actions for completing these instructions in 360° scenes. Refer360° differs from existing related datasets in three ways. First, we propose a more realistic scenario where instructors and the followers have partial, yet dynamic, views of the scene – followers continuously modify their field-of-view (FoV) while interpreting instructions that specify a final target location. Second, instructions to find the target location consist of multiple steps for followers who will start at random FoVs. As a result, intermediate instructions are strongly grounded in object references, and followers must identify intermediate FoVs to find the final target location correctly. Third, the target locations are neither restricted to predefined objects nor chosen by annotators; instead, they are distributed randomly across scenes. This “point anywhere” approach leads to more linguistically complex instructions, as shown in our analyses.
Our examination of the dataset shows that Refer360° manifests linguistically rich phenomena in a language grounding task that poses novel challenges for computational modeling of language, vision, and navigation.",linguistically phenomena|language task|computational language|vision,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.644.pdf -main.491,Evaluating Explainable AI: Which Algorithmic Explanations Help Users Predict Model Behavior?,Peter Hase|Mohit Bansal,"Algorithmic approaches to interpreting machine learning models have proliferated in recent years. We carry out human subject tests that are the first of their kind to isolate the effect of algorithmic explanations on a key aspect of model interpretability, simulatability, while avoiding important confounding experimental factors. A model is simulatable when a person can predict its behavior on new inputs. Through two kinds of simulation tests involving text and tabular data, we evaluate five explanations methods: (1) LIME, (2) Anchor, (3) Decision Boundary, (4) a Prototype model, and (5) a Composite approach that combines explanations from each method. Clear evidence of method effectiveness is found in very few cases: LIME improves simulatability in tabular classification, and our Prototype method is effective in counterfactual simulation tests. We also collect subjective ratings of explanations, but we do not find that ratings are predictive of how helpful explanations are. Our results provide the first reliable and comprehensive estimates of how explanations influence simulatability across a variety of explanation methods and data domains. We show that (1) we need to be careful about the metrics we use to evaluate explanation methods, and (2) there is significant room for improvement in current methods.",Evaluating AI|Explainable AI|Algorithmic approaches|machine models,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.491.pdf -main.485,"Language (Technology) is Power: A Critical Survey of ""Bias"" in NLP",Su Lin Blodgett|Solon Barocas|Hal Daumé III|Hanna Wallach,"We survey 146 papers analyzing ""bias"" in NLP systems, finding that their motivations are often vague, inconsistent, and lacking in normative reasoning, despite the fact that analyzing ""bias"" is an inherently normative process. We further find that these papers' proposed quantitative techniques for measuring or mitigating ""bias"" are poorly matched to their motivations and do not engage with the relevant literature outside of NLP. Based on these findings, we describe the beginnings of a path forward by proposing three recommendations that should guide work analyzing ""bias"" in NLP systems. These recommendations rest on a greater recognition of the relationships between language and social hierarchies, encouraging researchers and practitioners to articulate their conceptualizations of ""bias""---i.e., what kinds of system behaviors are harmful, in what ways, to whom, and why, as well as the normative reasoning underlying these statements---and to center work around the lived experiences of members of communities affected by NLP systems, while interrogating and reimagining the power relations between technologists and such communities.",NLP|NLP systems|normative reasoning|normative process,Ethics and NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.485.pdf -main.308,What Was Written vs. 
Who Read It: News Media Profiling Using Text Analysis and Social Media Context,Ramy Baly|Georgi Karadzhov|Jisun An|Haewoon Kwak|Yoan Dinkov|Ahmed Ali|James Glass|Preslav Nakov,"Predicting the political bias and the factuality of reporting of entire news outlets are critical elements of media profiling, which is an understudied but an increasingly important research direction. The present level of proliferation of fake, biased, and propagandistic content online has made it impossible to fact-check every single suspicious claim, either manually or automatically. Thus, it has been proposed to profile entire news outlets and to look for those that are likely to publish fake or biased content. This makes it possible to detect likely “fake news” the moment they are published, by simply checking the reliability of their source. From a practical perspective, political bias and factuality of reporting have a linguistic aspect but also a social context. Here, we study the impact of both, namely (i) what was written (i.e., what was published by the target medium, and how it describes itself in Twitter) vs. (ii) who reads it (i.e., analyzing the target medium’s audience on social media). We further study (iii) what was written about the target medium (in Wikipedia). The evaluation results show that what was written matters most, and we further show that putting all information sources together yields huge improvements over the current state-of-the-art.",News Profiling|media profiling|Text Analysis|political bias,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.308.pdf -main.17,Fact-based Text Editing,Hayate Iso|Chao Qiao|Hang Li,"We propose a novel text editing task, referred to as fact-based text editing, in which the goal is to revise a given document to better describe the facts in a knowledge base (e.g., several triples). The task is important in practice because reflecting the truth is a common requirement in text editing. First, we propose a method for automatically generating a dataset for research on fact-based text editing, where each instance consists of a draft text, a revised text, and several facts represented in triples. We apply the method into two public table-to-text datasets, obtaining two new datasets consisting of 233k and 37k instances, respectively. Next, we propose a new neural network architecture for fact-based text editing, called FactEditor, which edits a draft text by referring to given facts using a buffer, a stream, and a memory. A straightforward approach to address the problem would be to employ an encoder-decoder model. Our experimental results on the two datasets show that FactEditor outperforms the encoder-decoder approach in terms of fidelity and fluency. The results also show that FactEditor conducts inference faster than the encoder-decoder approach.",Fact-based Editing|text task|text editing|automatically dataset,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.17.pdf -main.334,BiRRE: Learning Bidirectional Residual Relation Embeddings for Supervised Hypernymy Detection,Chengyu Wang|Xiaofeng He,"The hypernymy detection task has been addressed under various frameworks. Previously, the design of unsupervised hypernymy scores has been extensively studied. In contrast, supervised classifiers, especially distributional models, leverage the global contexts of terms to make predictions, but are more likely to suffer from ``lexical memorization''. 
In this work, we revisit supervised distributional models for hypernymy detection. Rather than taking embeddings of two terms as classification inputs, we introduce a representation learning framework named Bidirectional Residual Relation Embeddings (BiRRE). In this model, a term pair is represented by a BiRRE vector as features for hypernymy classification, which models the possibility of a term being mapped to another in the embedding space by hypernymy relations. A Latent Projection Model with Negative Regularization (LPMNR) is proposed to simulate how hypernyms and hyponyms are generated by neural language models, and to generate BiRRE vectors based on bidirectional residuals of projections. Experiments verify BiRRE outperforms strong baselines over various evaluation frameworks.",Supervised Detection|hypernymy task|unsupervised scores|hypernymy detection,Semantics: Lexical,Long,https://www.aclweb.org/anthology/2020.acl-main.334.pdf -main.452,Discrete Optimization for Unsupervised Sentence Summarization with Word-Level Extraction,Raphael Schumann|Lili Mou|Yao Lu|Olga Vechtomova|Katja Markert,"Automatic sentence summarization produces a shorter version of a sentence, while preserving its most important information. A good summary is characterized by language fluency and high information overlap with the source sentence. We model these two aspects in an unsupervised objective function, consisting of language modeling and semantic similarity metrics. We search for a high-scoring summary by discrete optimization. Our proposed method achieves a new state-of-the art for unsupervised sentence summarization according to ROUGE scores. Additionally, we demonstrate that the commonly reported ROUGE F1 metric is sensitive to summary length. Since this is unwillingly exploited in recent work, we emphasize that future evaluation should explicitly group summarization systems by output length brackets.",Unsupervised Summarization|Word-Level Extraction|Automatic summarization|Discrete Optimization,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.452.pdf -main.446,More Diverse Dialogue Datasets via Diversity-Informed Data Collection,Katherine Stasaski|Grace Hui Yang|Marti A. Hearst,"Automated generation of conversational dialogue using modern neural architectures has made notable advances. However, these models are known to have a drawback of often producing uninteresting, predictable responses; this is known as the diversity problem. We introduce a new strategy to address this problem, called Diversity-Informed Data Collection. Unlike prior approaches, which modify model architectures to solve the problem, this method uses dynamically computed corpus-level statistics to determine which conversational participants to collect data from. Diversity-Informed Data Collection produces significantly more diverse data than baseline data collection methods, and better results on two downstream tasks: emotion classification and dialogue generation. 
This method is generalizable and can be used with other corpus-level metrics.",Automated dialogue|diversity problem|Diversity-Informed Collection|emotion classification,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.446.pdf -main.320,A Retrieve-and-Rewrite Initialization Method for Unsupervised Machine Translation,Shuo Ren|Yu Wu|Shujie Liu|Ming Zhou|Shuai Ma,"The commonly used framework for unsupervised machine translation builds initial translation models of both translation directions, and then performs iterative back-translation to jointly boost their translation performance. The initialization stage is very important since bad initialization may wrongly squeeze the search space, and too much noise introduced in this stage may hurt the final performance. In this paper, we propose a novel retrieval and rewriting based method to better initialize unsupervised translation models. We first retrieve semantically comparable sentences from monolingual corpora of two languages and then rewrite the target side to minimize the semantic gap between the source and retrieved targets with a designed rewriting model. The rewritten sentence pairs are used to initialize SMT models which are used to generate pseudo data for two NMT models, followed by the iterative back-translation. Experiments show that our method can build better initial unsupervised translation models and improve the final translation performance by over 4 BLEU scores. Our code is released at https://github.com/Imagist-Shuo/RRforUNMT.git.",Unsupervised Translation|translation|Retrieve-and-Rewrite Method|translation models,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.320.pdf -main.450,Asking and Answering Questions to Evaluate the Factual Consistency of Summaries,Alex Wang|Kyunghyun Cho|Mike Lewis,"Practical applications of abstractive summarization models are limited by frequent factual inconsistencies with respect to their input. Existing automatic evaluation metrics for summarization are largely insensitive to such errors. We propose QAGS (pronounced ``kags''), an automatic evaluation protocol that is designed to identify factual inconsistencies in a generated summary. QAGS is based on the intuition that if we ask questions about a summary and its source, we will receive similar answers if the summary is factually consistent with the source. To evaluate QAGS, we collect human judgments of factual consistency on model-generated summaries for the CNN/DailyMail (Hermann et al., 2015) and XSUM (Narayan et al., 2018) summarization datasets. QAGS has substantially higher correlations with these judgments than other automatic evaluation metrics. Also, QAGS offers a natural form of interpretability: The answers and questions generated while computing QAGS indicate which tokens of a summary are inconsistent and why. We believe QAGS is a promising tool in automatically generating usable and factually consistent text. Code for QAGS will be available at https://github.com/W4ngatang/qags.",summarization|automatic protocol|automatically text|abstractive models,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.450.pdf -main.29,A Joint Model for Document Segmentation and Segment Labeling,Joe Barrow|Rajiv Jain|Vlad Morariu|Varun Manjunatha|Douglas Oard|Philip Resnik,"Text segmentation aims to uncover latent structure by dividing text from a document into coherent sections. 
Where previous work on text segmentation considers the tasks of document segmentation and segment labeling separately, we show that the tasks contain complementary information and are best addressed jointly. We introduce Segment Pooling LSTM (S-LSTM), which is capable of jointly segmenting a document and labeling segments. In support of joint training, we develop a method for teaching the model to recover from errors by aligning the predicted and ground truth segments. We show that S-LSTM reduces segmentation error by 30% on average, while also improving segment labeling.",Document Labeling|Text segmentation|document segmentation|segment labeling,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.acl-main.29.pdf -main.336,Hypernymy Detection for Low-Resource Languages via Meta Learning,Changlong Yu|Jialong Han|Haisong Zhang|Wilfred Ng,"Hypernymy detection, a.k.a, lexical entailment, is a fundamental sub-task of many natural language understanding tasks. Previous explorations mostly focus on monolingual hypernymy detection on high-resource languages, e.g., English, but few investigate the low-resource scenarios. This paper addresses the problem of low-resource hypernymy detection by combining high-resource languages. We extensively compare three joint training paradigms and for the first time propose applying meta learning to relieve the low-resource issue. Experiments demonstrate the superiority of our method among the three settings, which substantially improves the performance of extremely low-resource languages by preventing over-fitting on small datasets.",Hypernymy Detection|lexical entailment|natural tasks|monolingual detection,Semantics: Lexical,Short,https://www.aclweb.org/anthology/2020.acl-main.336.pdf -main.322,Does Multi-Encoder Help? A Case Study on Context-Aware Neural Machine Translation,Bei Li|Hui Liu|Ziyang Wang|Yufan Jiang|Tong Xiao|Jingbo Zhu|Tongran Liu|Changliang Li,"In encoder-decoder neural models, multiple encoders are in general used to represent the contextual information in addition to the individual sentence. In this paper, we investigate multi-encoder approaches in document-level neural machine translation (NMT). Surprisingly, we find that the context encoder does not only encode the surrounding sentences but also behaves as a noise generator. This makes us rethink the real benefits of multi-encoder in context-aware translation - some of the improvements come from robust training. We compare several methods that introduce noise and/or well-tuned dropout setup into the training of these encoders. Experimental results show that noisy training plays an important role in multi-encoder-based NMT, especially when the training data is small. Also, we establish a new state-of-the-art on IWSLT Fr-En task by careful use of noise generation and dropout methods.",Context-Aware Translation|document-level translation|document-level NMT|document-level,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.322.pdf -main.444,Dialogue-Based Relation Extraction,Dian Yu|Kai Sun|Claire Cardie|Dong Yu,"We present the first human-annotated dialogue-based relation extraction (RE) dataset DialogRE, aiming to support the prediction of relation(s) between two arguments that appear in a dialogue. We further offer DialogRE as a platform for studying cross-sentence RE as most facts span multiple sentences. 
We argue that speaker-related information plays a critical role in the proposed task, based on an analysis of similarities and differences between dialogue-based and traditional RE tasks. Considering the timeliness of communication in a dialogue, we design a new metric to evaluate the performance of RE methods in a conversational setting and investigate the performance of several representative RE methods on DialogRE. Experimental results demonstrate that a speaker-aware extension on the best-performing model leads to gains in both the standard and conversational evaluation settings. DialogRE is available at https://dataset.org/dialogre/.",Dialogue-Based Extraction|cross-sentence RE|dialogue-based tasks|conversational settings,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.444.pdf -main.15,A Study of Non-autoregressive Model for Sequence Generation,Yi Ren|Jinglin Liu|Xu Tan|Zhou Zhao|Sheng Zhao|Tie-Yan Liu,"Non-autoregressive (NAR) models generate all the tokens of a sequence in parallel, resulting in faster generation speed compared to their autoregressive (AR) counterparts but at the cost of lower accuracy. Different techniques including knowledge distillation and source-target alignment have been proposed to bridge the gap between AR and NAR models in various tasks such as neural machine translation (NMT), automatic speech recognition (ASR), and text to speech (TTS). With the help of those techniques, NAR models can catch up with the accuracy of AR models in some tasks but not in some others. In this work, we conduct a study to understand the difficulty of NAR sequence generation and try to answer: (1) Why NAR models can catch up with AR models in some tasks but not all? (2) Why techniques like knowledge distillation and source-target alignment can help NAR models. Since the main difference between AR and NAR models is that NAR models do not use dependency among target tokens while AR models do, intuitively the difficulty of NAR sequence generation heavily depends on the strongness of dependency among target tokens. To quantify such dependency, we propose an analysis model called CoMMA to characterize the difficulty of different NAR sequence generation tasks. We have several interesting findings: 1) Among the NMT, ASR and TTS tasks, ASR has the most target-token dependency while TTS has the least. 2) Knowledge distillation reduces the target-token dependency in target sequence and thus improves the accuracy of NAR models. 3) Source-target alignment constraint encourages dependency of a target token on source tokens and thus eases the training of NAR models.",Sequence Generation|AR|neural translation|automatic recognition,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.15.pdf -main.478,Discourse as a Function of Event: Profiling Discourse Structure in News Articles around the Main Event,Prafulla Kumar Choubey|Aaron Lee|Ruihong Huang|Lu Wang,"Understanding discourse structures of news articles is vital to effectively contextualize the occurrence of a news event. To enable computational modeling of news structures, we apply an existing theory of functional discourse structure for news articles that revolves around the main event and create a human-annotated corpus of 802 documents spanning over four domains and three media sources. Next, we propose several document-level neural-network models to automatically construct news content structures. 
Finally, we demonstrate that incorporating system predicted news structures yields new state-of-the-art performance for event coreference resolution. The news documents we annotated are openly available and the annotations are publicly released for future research.",Profiling Structure|computational structures|event resolution|theory structure,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.acl-main.478.pdf -main.493,Finding Universal Grammatical Relations in Multilingual BERT,Ethan A. Chi|John Hewitt|Christopher D. Manning,"Recent work has found evidence that Multilingual BERT (mBERT), a transformer-based multilingual masked language model, is capable of zero-shot cross-lingual transfer, suggesting that some aspects of its representations are shared cross-lingually. To better understand this overlap, we extend recent work on finding syntactic trees in neural networks' internal representations to the multilingual setting. We show that subspaces of mBERT representations recover syntactic tree distances in languages other than English, and that these subspaces are approximately shared across languages. Motivated by these results, we present an unsupervised analysis method that provides evidence mBERT learns representations of syntactic dependency labels, in the form of clusters which largely agree with the Universal Dependencies taxonomy. This evidence suggests that even without explicit supervision, multilingual masked language models learn certain linguistic universals.",zero-shot transfer|Multilingual BERT|Multilingual mBERT|Multilingual,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.493.pdf -main.487,Social Biases in NLP Models as Barriers for Persons with Disabilities,Ben Hutchinson|Vinodkumar Prabhakaran|Emily Denton|Kellie Webster|Yu Zhong|Stephen Denuyl,"Building equitable and inclusive NLP technologies demands consideration of whether and how social attitudes are represented in ML models. In particular, representations encoded in models often inadvertently perpetuate undesirable social biases from the data on which they are trained. In this paper, we present evidence of such undesirable biases towards mentions of disability in two different English language models: toxicity prediction and sentiment analysis. Next, we demonstrate that the neural embeddings that are the critical first step in most NLP pipelines similarly contain undesirable biases towards mentions of disability. We end by highlighting topical biases in the discourse about disability which may contribute to the observed model biases; for instance, gun violence, homelessness, and drug addiction are over-represented in texts discussing mental illness.",toxicity prediction|NLP Models|equitable technologies|ML models,Ethics and NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.487.pdf -main.134,Fast and Accurate Non-Projective Dependency Tree Linearization,Xiang Yu|Simon Tannert|Ngoc Thang Vu|Jonas Kuhn,"We propose a graph-based method to tackle the dependency tree linearization task. We formulate the task as a Traveling Salesman Problem (TSP), and use a biaffine attention model to calculate the edge costs. We facilitate the decoding by solving the TSP for each subtree and combining the solution into a projective tree. We then design a transition system as post-processing, inspired by non-projective transition-based parsing, to obtain non-projective sentences. 
Our proposed method outperforms the state-of-the-art linearizer while being 10 times faster in training and decoding.",dependency task|Traveling TSP|TSP|decoding,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.134.pdf -main.652,DoQA - Accessing Domain-Specific FAQs via Conversational QA,Jon Ander Campos|Arantxa Otegi|Aitor Soroa|Jan Deriu|Mark Cieliebak|Eneko Agirre,"The goal of this work is to build conversational Question Answering (QA) interfaces for the large body of domain-specific information available in FAQ sites. We present DoQA, a dataset with 2,437 dialogues and 10,917 QA pairs. The dialogues are collected from three Stack Exchange sites using the Wizard of Oz method with crowdsourcing. Compared to previous work, DoQA comprises well-defined information needs, leading to more coherent and natural conversations with less factoid questions and is multi-domain. In addition, we introduce a more realistic information retrieval (IR) scenario where the system needs to find the answer in any of the FAQ documents. The results of an existing, strong, system show that, thanks to transfer learning from a Wikipedia QA dataset and fine tuning on a single FAQ domain, it is possible to build high quality conversational QA systems for FAQs without in-domain training data. The good results carry over into the more challenging IR scenario. In both cases, there is still ample room for improvement, as indicated by the higher human upperbound.",DoQA FAQs|conversational interfaces|information scenario|IR scenario,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.652.pdf -main.646,Effective Estimation of Deep Generative Language Models,Tom Pelsmaeker|Wilker Aziz,"Advances in variational inference enable parameterisation of probabilistic models by deep neural networks. This combines the statistical transparency of the probabilistic modelling framework with the representational power of deep learning. Yet, due to a problem known as posterior collapse, it is difficult to estimate such models in the context of language modelling effectively. We concentrate on one such model, the variational auto-encoder, which we argue is an important building block in hierarchical probabilistic models of language. This paper contributes a sober view of the problem, a survey of techniques to address it, novel techniques, and extensions to the model. To establish a ranking of techniques, we perform a systematic comparison using Bayesian optimisation and find that many techniques perform reasonably similar, given enough resources. Still, a favourite can be named based on convenience. We also make several empirical observations and recommendations of best practices that should help researchers interested in this exciting field.",Estimation Models|parameterisation models|posterior collapse|language modelling,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.646.pdf -main.120,A Large-Scale Multi-Document Summarization Dataset from the Wikipedia Current Events Portal,Demian Gholipour Ghalandari|Chris Hokamp|Nghia The Pham|John Glover|Georgiana Ifrim,"Multi-document summarization (MDS) aims to compress the content in large document collections into short summaries and has important applications in story clustering for newsfeeds, presentation of search results, and timeline generation. However, there is a lack of datasets that realistically address such use cases at a scale large enough for training supervised models for this task. 
This work presents a new dataset for MDS that is large both in the total number of document clusters and in the size of individual clusters. We build this dataset by leveraging the Wikipedia Current Events Portal (WCEP), which provides concise and neutral human-written summaries of news events, with links to external source articles. We also automatically extend these source articles by looking for related articles in the Common Crawl archive. We provide a quantitative analysis of the dataset and empirical results for several state-of-the-art MDS techniques.",Multi-document summarization|story clustering|presentation results|timeline generation,Summarization,Short,https://www.aclweb.org/anthology/2020.acl-main.120.pdf -main.108,Crawling and Preprocessing Mailing Lists At Scale for Dialog Analysis,Janek Bevendorff|Khalid Al Khatib|Martin Potthast|Benno Stein,"This paper introduces the Webis Gmane Email Corpus 2019, the largest publicly available and fully preprocessed email corpus to date. We crawled more than 153 million emails from 14,699 mailing lists and segmented them into semantically consistent components using a new neural segmentation model. With 96% accuracy on 15 classes of email segments, our model achieves state-of-the-art performance while being more efficient to train than previous ones. All data, code, and trained models are made freely available alongside the paper.",Preprocessing Lists|Dialog Analysis|Crawling|semantically components,Resources and Evaluation,Short,https://www.aclweb.org/anthology/2020.acl-main.108.pdf -main.691,"Translationese as a Language in ""Multilingual"" NMT",Parker Riley|Isaac Caswell|Markus Freitag|David Grangier,"Machine translation has an undesirable propensity to produce ``translationese"" artifacts, which can lead to higher BLEU scores while being liked less by human raters. Motivated by this, we model translationese and original (i.e. natural) text as separate languages in a multilingual model, and pose the question: can we perform zero-shot translation between original source text and original target text? There is no data with original source and original target, so we train a sentence-level classifier to distinguish translationese from original target text, and use this classifier to tag the training data for an NMT model. Using this technique we bias the model to produce more natural outputs at test time, yielding gains in human evaluation scores on both accuracy and fluency. Additionally, we demonstrate that it is possible to bias the model to produce translationese and game the BLEU score, increasing it while decreasing human-rated quality. We analyze these outputs using metrics measuring the degree of translationese, and present an analysis of the volatility of heuristic-based train-data tagging.",Translationese|Machine translation|zero-shot translation|Multilingual NMT,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.691.pdf -main.685,Multi-agent Communication meets Natural Language: Synergies between Functional and Structural Language Learning,Angeliki Lazaridou|Anna Potapenko|Olivier Tieleman,"We present a method for combining multi-agent communication and traditional data-driven approaches to natural language learning, with an end goal of teaching agents to communicate with humans in natural language. Our starting point is a language model that has been trained on generic, not task-specific language data. 
We then place this model in a multi-agent self-play environment that generates task-specific rewards used to adapt or modulate the model, turning it into a task-conditional language model. We introduce a new way for combining the two types of learning based on the idea of reranking language model samples, and show that this method outperforms others in communicating with humans in a visual referential communication task. Finally, we present a taxonomy of different types of language drift that can occur alongside a set of measures to detect them.",Multi-agent Communication|natural learning|visual task|Functional Learning,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.685.pdf -main.726,Feature Projection for Improved Text Classification,Qi Qin|Wenpeng Hu|Bing Liu,"In classification, there are usually some good features that are indicative of class labels. For example, in sentiment classification, words like good and nice are indicative of the positive sentiment and words like bad and terrible are indicative of the negative sentiment. However, there are also many common features (e.g., words) that are not indicative of any specific class (e.g., voice and screen, which are common to both sentiment classes and are not discriminative for classification). Although deep learning has made significant progresses in generating discriminative features through its powerful representation learning, we believe there is still room for improvement. In this paper, we propose a novel angle to further improve this representation learning, i.e., feature projection. This method projects existing features into the orthogonal space of the common features. The resulting projection is thus perpendicular to the common features and more discriminative for classification. We apply this new method to improve CNN, RNN, Transformer, and Bert based text classification and obtain markedly better results.",Text Classification|classification|sentiment classification|Bert classification,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.acl-main.726.pdf -main.732,A Multitask Learning Approach for Diacritic Restoration,Sawsan Alqahtani|Ajay Mishra|Mona Diab,"In many languages like Arabic, diacritics are used to specify pronunciations as well as meanings. Such diacritics are often omitted in written text, increasing the number of possible pronunciations and meanings for a word. This results in a more ambiguous text making computational processing on such text more difficult. Diacritic restoration is the task of restoring missing diacritics in the written text. Most state-of-the-art diacritic restoration models are built on character level information which helps generalize the model to unseen data, but presumably lose useful information at the word level. Thus, to compensate for this loss, we investigate the use of multi-task learning to jointly optimize diacritic restoration with related NLP problems namely word segmentation, part-of-speech tagging, and syntactic diacritization. We use Arabic as a case study since it has sufficient data resources for tasks that we consider in our joint modeling. Our joint models significantly outperform the baselines and are comparable to the state-of-the-art models that are more complex relying on morphological analyzers and/or a lot more data (e.g. 
dialectal data).",Diacritic Restoration|computational processing|restoring diacritics|NLP problems,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.acl-main.732.pdf -main.524,Multi-Cell Compositional LSTM for NER Domain Adaptation,Chen Jia|Yue Zhang,"Cross-domain NER is a challenging yet practical problem. Entity mentions can be highly different across domains. However, the correlations between entity types can be relatively more stable across domains. We investigate a multi-cell compositional LSTM structure for multi-task learning, modeling each entity type using a separate cell state. With the help of entity typed units, cross-domain knowledge transfer can be made in an entity type level. Theoretically, the resulting distinct feature distributions for each entity type make it more powerful for cross-domain transfer. Empirically, experiments on four few-shot and zero-shot datasets show our method significantly outperforms a series of multi-task learning methods and achieves the best results.",NER Adaptation|Cross-domain NER|multi-task learning|cross-domain transfer,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.524.pdf -main.242,Posterior Calibrated Training on Sentence Classification Tasks,Taehee Jung|Dongyeop Kang|Hua Cheng|Lucas Mentch|Thomas Schaaf,"Most classification models work by first predicting a posterior probability distribution over all classes and then selecting that class with the largest estimated probability. In many settings however, the quality of posterior probability itself (e.g., 65% chance having diabetes), gives more reliable information than the final predicted class alone. When these methods are shown to be poorly calibrated, most fixes to date have relied on posterior calibration, which rescales the predicted probabilities but often has little impact on final classifications. Here we propose an end-to-end training procedure called posterior calibrated (PosCal) training that directly optimizes the objective while minimizing the difference between the predicted and empirical posterior probabilities. We show that PosCal not only helps reduce the calibration error but also improve task performance by penalizing drops in performance of both objectives. Our PosCal achieves about 2.5% of task performance gain and 16.1% of calibration error reduction on GLUE (Wang et al., 2018) compared to the baseline. We achieved the comparable task performance with 13.2% calibration error reduction on xSLUE (Kang and Hovy, 2019), but not outperforming the two-stage calibration baseline. PosCal training can be easily extendable to any types of classification tasks as a form of regularization term. Also, PosCal has the advantage that it incrementally tracks needed statistics for the calibration objective during the training process, making efficient use of large training sets.",Sentence Tasks|classifications|xSLUE|classification tasks,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.242.pdf -main.256,Glyph2Vec: Learning Chinese Out-of-Vocabulary Word Embedding from Glyphs,Hong-You Chen|SZ-HAN YU|Shou-de Lin,"Chinese NLP applications that rely on large text often contain huge amounts of vocabulary which are sparse in corpus. We show that characters’ written form, Glyphs, in ideographic languages could carry rich semantics. We present a multi-modal model, Glyph2Vec, to tackle Chinese out-of-vocabulary word embedding problem.
Glyph2Vec extracts visual features from word glyphs to expand current word embedding space for out-of-vocabulary word embedding, without the need of accessing any corpus, which is useful for improving Chinese NLP systems, especially for low-resource scenarios. Experiments across different applications show the significant effectiveness of our model.",Chinese Embedding|Chinese applications|Chinese problem|out-of-vocabulary embedding,Semantics: Lexical,Short,https://www.aclweb.org/anthology/2020.acl-main.256.pdf -main.530,Contextual Neural Machine Translation Improves Translation of Cataphoric Pronouns,KayYen Wong|Sameen Maruf|Gholamreza Haffari,"The advent of context-aware NMT has resulted in promising improvements in the overall translation quality and specifically in the translation of discourse phenomena such as pronouns. Previous works have mainly focused on the use of past sentences as context with a focus on anaphora translation. In this work, we investigate the effect of future sentences as context by comparing the performance of a contextual NMT model trained with the future context to the one trained with the past context. Our experiments and evaluation, using generic and pronoun-focused automatic metrics, show that the use of future context not only achieves significant improvements over the context-agnostic Transformer, but also demonstrates comparable and in some cases improved performance over its counterpart trained on past context. We also perform an evaluation on a targeted cataphora test suite and report significant gains over the context-agnostic Transformer in terms of BLEU.",Translation Pronouns|translation phenomena|anaphora translation|Contextual Translation,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.530.pdf -main.518,Video-Grounded Dialogues with Pretrained Generation Language Models,Hung Le|Steven C.H. Hoi,"Pre-trained language models have shown remarkable success in improving various downstream NLP tasks due to their ability to capture dependencies in textual data and generate natural responses. In this paper, we leverage the power of pre-trained language models for improving video-grounded dialogue, which is very challenging and involves complex features of different dynamics: (1) Video features which can extend across both spatial and temporal dimensions; and (2) Dialogue features which involve semantic dependencies over multiple dialogue turns. We propose a framework by extending GPT-2 models to tackle these challenges by formulating video-grounded dialogue tasks as a sequence-to-sequence task, combining both visual and textual representation into a structured sequence, and fine-tuning a large pre-trained GPT-2 network. Our framework allows fine-tuning language models to capture dependencies across multiple modalities over different levels of information: spatio-temporal level in video and token-sentence level in dialogue context. 
We achieve promising improvement on the Audio-Visual Scene-Aware Dialogues (AVSD) benchmark from DSTC7, which supports a potential direction in this line of research.",downstream tasks|video-grounded tasks|sequence-to-sequence task|Pretrained Models,Dialogue and Interactive Systems,Short,https://www.aclweb.org/anthology/2020.acl-main.518.pdf -main.281,Hiring Now: A Skill-Aware Multi-Attention Model for Job Posting Generation,Liting Liu|Jie Liu|Wenzheng Zhang|Ziming Chi|Wenxuan Shi|Yalou Huang,"Writing a good job posting is a critical step in the recruiting process, but the task is often more difficult than many people think. It is challenging to specify the level of education, experience, relevant skills per the company information and job description. To this end, we propose a novel task of Job Posting Generation (JPG) which is cast as a conditional text generation problem to generate job requirements according to the job descriptions. To deal with this task, we devise a data-driven global Skill-Aware Multi-Attention generation model, named SAMA. Specifically, to model the complex mapping relationships between input and output, we design a hierarchical decoder that we first label the job description with multiple skills, then we generate a complete text guided by the skill labels. At the same time, to exploit the prior knowledge about the skills, we further construct a skill knowledge graph to capture the global prior knowledge of skills and refine the generated results. The proposed approach is evaluated on real-world job posting data. Experimental results clearly demonstrate the effectiveness of the proposed method.",Job Generation|recruiting process|conditional problem|Skill-Aware Model,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.281.pdf -main.295,Relational Graph Attention Network for Aspect-based Sentiment Analysis,Kai Wang|Weizhou Shen|Yunyi Yang|Xiaojun Quan|Rui Wang,"Aspect-based sentiment analysis aims to determine the sentiment polarity towards a specific aspect in online reviews. Most recent efforts adopt attention-based neural network models to implicitly connect aspects with opinion words. However, due to the complexity of language and the existence of multiple aspects in a single sentence, these models often confuse the connections. In this paper, we address this problem by means of effective encoding of syntax information. Firstly, we define a unified aspect-oriented dependency tree structure rooted at a target aspect by reshaping and pruning an ordinary dependency parse tree. Then, we propose a relational graph attention network (R-GAT) to encode the new tree structure for sentiment prediction. Extensive experiments are conducted on the SemEval 2014 and Twitter datasets, and the experimental results confirm that the connections between aspects and opinion words can be better established with our approach, and the performance of the graph attention network (GAT) is significantly improved as a consequence.",Aspect-based Analysis|encoding information|sentiment prediction|Relational Network,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.295.pdf -main.294,Parallel Data Augmentation for Formality Style Transfer,Yi Zhang|Tao Ge|Xu SUN,"The main barrier to progress in the task of Formality Style Transfer is the inadequacy of training data. 
In this paper, we study how to augment parallel data and propose novel and simple data augmentation methods for this task to obtain useful sentence pairs with easily accessible models and systems. Experiments demonstrate that our augmented parallel data largely helps improve formality style transfer when it is used to pre-train the model, leading to the state-of-the-art results in the GYAFC benchmark dataset.",Parallel Augmentation|Formality Transfer|data methods|parallel data,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Short,https://www.aclweb.org/anthology/2020.acl-main.294.pdf -main.280,Distinguish Confusing Law Articles for Legal Judgment Prediction,Nuo Xu|Pinghui Wang|Long Chen|Li Pan|Xiaoyan Wang|Junzhou Zhao,"Legal Judgement Prediction (LJP) is the task of automatically predicting a law case’s judgment results given a text describing the case’s facts, which has great prospects in judicial assistance systems and handy services for the public. In practice, confusing charges are often presented, because law cases applicable to similar law articles are easily misjudged. To address this issue, existing work relies heavily on domain experts, which hinders its application in different law systems. In this paper, we present an end-to-end model, LADAN, to solve the task of LJP. To distinguish confusing charges, we propose a novel graph neural network, GDL, to automatically learn subtle differences between confusing law articles, and also design a novel attention mechanism that fully exploits the learned differences to attentively extract effective discriminative features from fact descriptions. Experiments conducted on real-world datasets demonstrate the superiority of our LADAN.",Legal Prediction|judicial systems|handy services|LJP,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.280.pdf -main.519,A Unified MRC Framework for Named Entity Recognition,Xiaoya Li|Jingrong Feng|Yuxian Meng|Qinghong Han|Fei Wu|Jiwei Li,"The task of named entity recognition (NER) is normally divided into nested NER and flat NER depending on whether named entities are nested or not. Models are usually separately developed for the two tasks, since sequence labeling models, the most widely used backbone for flat NER, are only able to assign a single label to a particular token, which is unsuitable for nested NER where a token may be assigned several labels. In this paper, we propose a unified framework that is capable of handling both flat and nested NER tasks. Instead of treating the task of NER as a sequence labeling problem, we propose to formulate it as a machine reading comprehension (MRC) task. For example, extracting entities with the per label is formalized as extracting answer spans to the question ``which person is mentioned in the text"". This formulation naturally tackles the entity overlapping issue in nested NER: the extraction of two overlapping entities with different categories requires answering two independent questions. Additionally, since the query encodes informative prior knowledge, this strategy facilitates the process of entity extraction, leading to better performances for not only nested NER, but flat NER. We conduct experiments on both nested and flat NER datasets. Experiment results demonstrate the effectiveness of the proposed formulation.
We are able to achieve a vast amount of performance boost over current SOTA models on nested NER datasets, i.e., +1.28, +2.55, +5.44, +6.37, respectively on ACE04, ACE05, GENIA and KBP17, along with SOTA results on flat NER datasets, i.e., +0.24, +1.95, +0.21, +1.49 respectively on English CoNLL 2003, English OntoNotes 5.0, Chinese MSRA and Chinese OntoNotes 4.0.",Named Recognition|NER|flat NER|flat tasks,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.519.pdf -main.257,Multidirectional Associative Optimization of Function-Specific Word Representations,Daniela Gerz|Ivan Vulić|Marek Rei|Roi Reichart|Anna Korhonen,"We present a neural framework for learning associations between interrelated groups of words such as the ones found in Subject-Verb-Object (SVO) structures. Our model induces a joint function-specific word vector space, where vectors of e.g. plausible SVO compositions lie close together. The model retains information about word group membership even in the joint space, and can thereby effectively be applied to a number of tasks reasoning over the SVO structure. We show the robustness and versatility of the proposed framework by reporting state-of-the-art results on the tasks of estimating selectional preference and event similarity. The results indicate that the combinations of representations learned with our task-independent model outperform task-specific architectures from prior work, while reducing the number of parameters by up to 95%.",estimating preference|Multidirectional Representations|neural framework|task-independent model,Semantics: Lexical,Long,https://www.aclweb.org/anthology/2020.acl-main.257.pdf -main.531,Improving Neural Machine Translation with Soft Template Prediction,Jian Yang|Shuming Ma|Dongdong Zhang|Zhoujun Li|Ming Zhou,"Although neural machine translation (NMT) has achieved significant progress in recent years, most previous NMT models only depend on the source text to generate translation. Inspired by the success of template-based and syntax-based approaches in other fields, we propose to use extracted templates from tree structures as soft target templates to guide the translation procedure. In order to learn the syntactic structure of the target sentences, we adopt constituency-based parse tree to generate candidate templates. We incorporate the template information into the encoder-decoder framework to jointly utilize the templates and source text. Experiments show that our model significantly outperforms the baseline models on four benchmarks and demonstrates the effectiveness of soft target templates.",Neural Translation|translation|translation procedure|Soft Prediction,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.531.pdf -main.525,Pyramid: A Layered Model for Nested Named Entity Recognition,Jue Wang|Lidan Shou|Ke Chen|Gang Chen,"This paper presents Pyramid, a novel layered model for Nested Named Entity Recognition (nested NER). In our approach, token or text region embeddings are recursively inputted into L flat NER layers, from bottom to top, stacked in a pyramid shape. Each time an embedding passes through a layer of the pyramid, its length is reduced by one. Its hidden state at layer l represents an l-gram in the input text, which is labeled only if its corresponding text region represents a complete entity mention. We also design an inverse pyramid to allow bidirectional interaction between layers.
The proposed method achieves state-of-the-art F1 scores in nested NER on ACE-2004, ACE-2005, GENIA, and NNE, which are 80.27, 79.42, 77.78, and 93.70 with conventional embeddings, and 87.74, 86.34, 79.31, and 94.68 with pre-trained contextualized embeddings. In addition, our model can be used for the more general task of Overlapping Named Entity Recognition. A preliminary experiment confirms the effectiveness of our method in overlapping NER.",Nested Recognition|nested NER|Overlapping Recognition|Named Recognition,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.525.pdf -main.243,Posterior Control of Blackbox Generation,Xiang Lisa Li|Alexander Rush,"Text generation often requires high-precision output that obeys task-specific rules. This fine-grained control is difficult to enforce with off-the-shelf deep learning models. In this work, we consider augmenting neural generation models with discrete control states learned through a structured latent-variable approach. Under this formulation, task-specific knowledge can be encoded through a range of rich, posterior constraints that are effectively trained into the model. This approach allows users to ground internal model decisions based on prior knowledge, without sacrificing the representational power of neural generative models. Experiments consider applications of this approach for text generation. We find that this method improves over standard benchmarks, while also providing fine-grained control.",Posterior Generation|Text generation|deep models|neural models,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.243.pdf -main.733,Frugal Paradigm Completion,Alexander Erdmann|Tom Kenter|Markus Becker|Christian Schallhart,"Lexica distinguishing all morphologically related forms of each lexeme are crucial to many language technologies, yet building them is expensive. We propose a frugal paradigm completion approach that predicts all related forms in a morphological paradigm from as few manually provided forms as possible. It induces typological information during training which it uses to determine the best sources at test time. We evaluate our language-agnostic approach on 7 diverse languages. Compared to popular alternative approaches, ours reduces manual labor by 16-63% and is the most robust to typological variation.",Frugal Completion|language technologies|frugal approach|morphological paradigm,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.acl-main.733.pdf -main.727,A negative case analysis of visual grounding methods for VQA,Robik Shrestha|Kushal Kafle|Christopher Kanan,"Existing Visual Question Answering (VQA) methods tend to exploit dataset biases and spurious statistical correlations, instead of producing right answers for the right reasons. To address this issue, recent bias mitigation methods for VQA propose to incorporate visual cues (e.g., human attention maps) to better ground the VQA models, showcasing impressive gains. However, we show that the performance improvements are not a result of improved visual grounding, but a regularization effect which prevents over-fitting to linguistic priors. For instance, we find that it is not actually necessary to provide proper, human-based cues; random, insensible cues also result in similar improvements. 
Based on this observation, we propose a simpler regularization scheme that does not require any external annotations and yet achieves near state-of-the-art performance on VQA-CPv2.",VQA|visual grounding|VQA-CPv2|negative methods,"Language Grounding to Vision, Robotics and Beyond",Short,https://www.aclweb.org/anthology/2020.acl-main.727.pdf -main.684,Learning Web-based Procedures by Reasoning over Explanations and Demonstrations in Context,Shashank Srivastava|Oleksandr Polozov|Nebojsa Jojic|Christopher Meek,"We explore learning web-based tasks from a human teacher through natural language explanations and a single demonstration. Our approach investigates a new direction for semantic parsing that models explaining a demonstration in a context, rather than mapping explanations to demonstrations. By leveraging the idea of inverse semantics from program synthesis to reason backwards from observed demonstrations, we ensure that all considered interpretations are consistent with executable actions in any context, thus simplifying the problem of search over logical forms. We present a dataset of explanations paired with demonstrations for web-based tasks. Our methods show better task completion rates than a supervised semantic parsing baseline (40% relative improvement on average), and are competitive with simple exploration-and-demonstration based methods, while requiring no exploration of the environment. In learning to align explanations with demonstrations, basic properties of natural language syntax emerge as learned behavior. This is an interesting example of pragmatic language acquisition without any linguistic annotation.",learning tasks|semantic parsing|mapping explanations|web-based tasks,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.684.pdf -main.690,Reducing Gender Bias in Neural Machine Translation as a Domain Adaptation Problem,Danielle Saunders|Bill Byrne,"Training data for NLP tasks often exhibits gender bias in that fewer sentences refer to women than to men. In Neural Machine Translation (NMT) gender bias has been shown to reduce translation quality, particularly when the target language has grammatical gender. The recent WinoMT challenge set allows us to measure this effect directly (Stanovsky et al., 2019). Ideally we would reduce system bias by simply debiasing all data prior to training, but achieving this effectively is itself a challenge. Rather than attempt to create a `balanced' dataset, we use transfer learning on a small set of trusted, gender-balanced examples. This approach gives strong and consistent improvements in gender debiasing with much less computational cost than training from scratch. A known pitfall of transfer learning on new domains is `catastrophic forgetting', which we address at adaptation and inference time. During adaptation we show that Elastic Weight Consolidation allows a performance trade-off between general translation quality and bias reduction. At inference time we propose a lattice-rescoring scheme which outperforms all systems evaluated in Stanovsky et al., 2019 on WinoMT with no degradation of general test set BLEU.
We demonstrate our approach translating from English into three languages with varied linguistic properties and data availability.",Reducing Bias|Neural Translation|Domain Problem|NLP tasks,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.690.pdf -main.109,Fine-Grained Analysis of Cross-Linguistic Syntactic Divergences,Dmitry Nikolaev|Ofir Arviv|Taelin Karidi|Neta Kenneth|Veronika Mitnik|Lilja Maria Saeboe|Omri Abend,"The patterns in which the syntax of different languages converges and diverges are often used to inform work on cross-lingual transfer. Nevertheless, little empirical work has been done on quantifying the prevalence of different syntactic divergences across language pairs. We propose a framework for extracting divergence patterns for any language pair from a parallel corpus, building on Universal Dependencies. We show that our framework provides a detailed picture of cross-language divergences, generalizes previous approaches, and lends itself to full automation. We further present a novel dataset, a manually word-aligned subset of the Parallel UD corpus in five languages, and use it to perform a detailed corpus study. We demonstrate the usefulness of the resulting analysis by showing that it can help account for performance patterns of a cross-lingual parser.",Fine-Grained Divergences|cross-lingual transfer|full automation|cross-lingual parser,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.109.pdf -main.647,Null It Out: Guarding Protected Attributes by Iterative Nullspace Projection,Shauli Ravfogel|Yanai Elazar|Hila Gonen|Michael Twiton|Yoav Goldberg,"The ability to control for the kinds of information encoded in neural representation has a variety of use cases, especially in light of the challenge of interpreting these models. We present Iterative Null-space Projection (INLP), a novel method for removing information from neural representations. Our method is based on repeated training of linear classifiers that predict a certain property we aim to remove, followed by projection of the representations on their null-space. By doing so, the classifiers become oblivious to that target property, making it hard to linearly separate the data according to it. While applicable for multiple uses, we evaluate our method on bias and fairness use-cases, and show that our method is able to mitigate bias in word embeddings, as well as to increase fairness in a setting of multi-class classification.",multi-class classification|Iterative Projection|Iterative |neural representation,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.647.pdf -main.121,"Attend, Translate and Summarize: An Efficient Method for Neural Cross-Lingual Summarization",Junnan Zhu|Yu Zhou|Jiajun Zhang|Chengqing Zong,"Cross-lingual summarization aims at summarizing a document in one language (e.g., Chinese) into another language (e.g., English). In this paper, we propose a novel method inspired by the translation pattern in the process of obtaining a cross-lingual summary. We first attend to some words in the source text, then translate them into the target language, and summarize to get the final summary. Specifically, we first employ the encoder-decoder attention distribution to attend to the source words. Second, we present three strategies to acquire the translation probability, which helps obtain the translation candidates for each source word. 
Finally, each summary word is generated either from the neural distribution or from the translation candidates of source words. Experimental results on Chinese-to-English and English-to-Chinese summarization tasks have shown that our proposed method can significantly outperform the baselines, achieving comparable performance with the state-of-the-art.",Translate Summarize|Neural Summarization|Cross-lingual summarization|Chinese-to-English tasks,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.121.pdf -main.135,Semantic Graphs for Generating Deep Questions,Liangming Pan|Yuxi Xie|Yansong Feng|Tat-Seng Chua|Min-Yen Kan,"This paper proposes the problem of Deep Question Generation (DQG), which aims to generate complex questions that require reasoning over multiple pieces of information about the input passage. In order to capture the global structure of the document and facilitate reasoning, we propose a novel framework that first constructs a semantic-level graph for the input document and then encodes the semantic graph by introducing an attention-based GGNN (Att-GGNN). Afterward, we fuse the document-level and graph-level representations to perform joint training of content selection and question decoding. On the HotpotQA deep-question centric dataset, our model greatly improves performance over questions requiring reasoning over multiple facts, leading to state-of-the-art performance. The code is publicly available at https://github.com/WING-NUS/SG-Deep-Question-Generation.",Generating Questions|Deep Generation|Deep DQG|reasoning,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.135.pdf -main.653,MLQA: Evaluating Cross-lingual Extractive Question Answering,Patrick Lewis|Barlas Oguz|Ruty Rinott|Sebastian Riedel|Holger Schwenk,"Question answering (QA) models have shown rapid progress enabled by the availability of large, high-quality benchmark datasets. Such annotated datasets are difficult and costly to collect, and rarely exist in languages other than English, making building QA systems that work well in other languages challenging. In order to develop such systems, it is crucial to invest in high quality multilingual evaluation benchmarks to measure progress. We present MLQA, a multi-way aligned extractive QA evaluation benchmark intended to spur research in this area. MLQA contains QA instances in 7 languages, English, Arabic, German, Spanish, Hindi, Vietnamese and Simplified Chinese. MLQA has over 12K instances in English and 5K in each other language, with each instance parallel between 4 languages on average. We evaluate state-of-the-art cross-lingual models and machine-translation-based baselines on MLQA. In all cases, transfer results are shown to be significantly behind training-language performance.",Evaluating Answering|Cross-lingual Answering|Question models|MLQA,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.653.pdf -main.486,Social Bias Frames: Reasoning about Social and Power Implications of Language,Maarten Sap|Saadia Gabriel|Lianhui Qin|Dan Jurafsky|Noah A. Smith|Yejin Choi,"Warning: this paper contains content that may be offensive or upsetting. Language has the power to reinforce stereotypes and project social biases onto others. At the core of the challenge is that it is rarely what is stated explicitly, but rather the implied meanings, that frame people's judgments about others. 
For example, given a statement that ""we shouldn't lower our standards to hire more women,"" most listeners will infer the implicature intended by the speaker - that ""women (candidates) are less qualified."" Most semantic formalisms, to date, do not capture such pragmatic implications in which people express social biases and power differentials in language. We introduce Social Bias Frames, a new conceptual formalism that aims to model the pragmatic frames in which people project social biases and stereotypes onto others. In addition, we introduce the Social Bias Inference Corpus to support large-scale modelling and evaluation with 150k structured annotations of social media posts, covering over 34k implications about a thousand demographic groups. We then establish baseline approaches that learn to recover Social Bias Frames from unstructured text. We find that while state-of-the-art neural models are effective at high-level categorization of whether a given statement projects unwanted social bias (80% F1), they are not effective at spelling out more detailed explanations in terms of Social Bias Frames. Our study motivates future work that combines structured pragmatic inference with commonsense reasoning on social implications.",Warning|large-scale evaluation|high-level categorization|Social Frames,Ethics and NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.486.pdf -main.492,Explaining Black Box Predictions and Unveiling Data Artifacts through Influence Functions,Xiaochuang Han|Byron C. Wallace|Yulia Tsvetkov,"Modern deep learning models for NLP are notoriously opaque. This has motivated the development of methods for interpreting such models, e.g., via gradient-based saliency maps or the visualization of attention weights. Such approaches aim to provide explanations for a particular model prediction by highlighting important words in the corresponding input text. While this might be useful for tasks where decisions are explicitly influenced by individual tokens in the input, we suspect that such highlighting is not suitable for tasks where model decisions should be driven by more complex reasoning. In this work, we investigate the use of influence functions for NLP, providing an alternative approach to interpreting neural text classifiers. Influence functions explain the decisions of a model by identifying influential training examples. Despite the promise of this approach, influence functions have not yet been extensively evaluated in the context of NLP, a gap addressed by this work. We conduct a comparison between influence functions and common word-saliency methods on representative tasks. As suspected, we find that influence functions are particularly useful for natural language inference, a task in which `saliency maps' may not have clear interpretation. Furthermore, we develop a new quantitative measure based on influence functions that can reveal artifacts in training data.",NLP|model prediction|model decisions|natural inference,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.492.pdf -main.479,Harnessing the linguistic signal to predict scalar inferences,Sebastian Schuster|Yuxing Chen|Judith Degen,"Pragmatic inferences often subtly depend on the presence or absence of linguistic features. 
For example, the presence of a partitive construction (of the) increases the strength of a so-called scalar inference: listeners perceive the inference that Chris did not eat all of the cookies to be stronger after hearing ""Chris ate some of the cookies"" than after hearing the same utterance without a partitive, ""Chris ate some cookies"". In this work, we explore to what extent neural network sentence encoders can learn to predict the strength of scalar inferences. We first show that an LSTM-based sentence encoder trained on an English dataset of human inference strength ratings is able to predict ratings with high accuracy (r = 0.78). We then probe the model's behavior using manually constructed minimal sentence pairs and corpus data. We find that the model inferred previously established associations between linguistic features and inference strength, suggesting that the model learns to use linguistic features to predict pragmatic inferences.",scalar inferences|Pragmatic inferences|partitive construction|neural encoders,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.acl-main.479.pdf -main.14,TransS-Driven Joint Learning Architecture for Implicit Discourse Relation Recognition,Ruifang He|Jian Wang|Fengyu Guo|Yugui Han,"Implicit discourse relation recognition is a challenging task due to the lack of connectives as strong linguistic clues. Previous methods primarily encode two arguments separately or extract the specific interaction patterns for the task, which have not fully exploited the annotated relation signal. Therefore, we propose a novel TransS-driven joint learning architecture to address the issues. Specifically, based on the multi-level encoder, we 1) translate discourse relations in low-dimensional embedding space (called TransS), which could mine the latent geometric structure information of argument-relation instances; 2) further exploit the semantic features of arguments to assist discourse understanding; 3) jointly learn 1) and 2) to mutually reinforce each other to obtain the better argument representations, so as to improve the performance of the task. Extensive experimental results on the Penn Discourse TreeBank (PDTB) show that our model achieves competitive results against several state-of-the-art systems.",Implicit Recognition|discourse understanding|TransS-Driven Architecture|multi-level encoder,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.acl-main.14.pdf -main.323,Dynamically Adjusting Transformer Batch Size by Monitoring Gradient Direction Change,Hongfei Xu|Josef van Genabith|Deyi Xiong|Qiuhui Liu,"The choice of hyper-parameters affects the performance of neural models. While much previous research (Sutskever et al., 2013; Duchi et al., 2011; Kingma and Ba, 2015) focuses on accelerating convergence and reducing the effects of the learning rate, comparatively few papers concentrate on the effect of batch size. In this paper, we analyze how increasing batch size affects gradient direction, and propose to evaluate the stability of gradients with their angle change. Based on our observations, the angle change of gradient direction first tends to stabilize (i.e. gradually decrease) while accumulating mini-batches, and then starts to fluctuate. We propose to automatically and dynamically determine batch sizes by accumulating gradients of mini-batches and performing an optimization step at just the time when the direction of gradients starts to fluctuate.
To improve the efficiency of our approach for large models, we propose a sampling approach to select gradients of parameters sensitive to the batch size. Our approach dynamically determines proper and efficient batch sizes during training. In our experiments on the WMT 14 English to German and English to French tasks, our approach improves the Transformer with a fixed 25k batch size by +0.73 and +0.82 BLEU respectively.",Dynamically Size|Monitoring Change|accelerating convergence|training,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.323.pdf -main.445,Facet-Aware Evaluation for Extractive Summarization,Yuning Mao|Liyuan Liu|Qi Zhu|Xiang Ren|Jiawei Han,"Commonly adopted metrics for extractive summarization focus on lexical overlap at the token level. In this paper, we present a facet-aware evaluation setup for better assessment of the information coverage in extracted summaries. Specifically, we treat each sentence in the reference summary as a facet, identify the sentences in the document that express the semantics of each facet as support sentences of the facet, and automatically evaluate extractive summarization methods by comparing the indices of extracted sentences and support sentences of all the facets in the reference summary. To facilitate this new evaluation setup, we construct an extractive version of the CNN/Daily Mail dataset and perform a thorough quantitative investigation, through which we demonstrate that facet-aware evaluation manifests better correlation with human judgment than ROUGE, enables fine-grained evaluation as well as comparative analysis, and reveals valuable insights of state-of-the-art summarization methods. Data can be found at https://github.com/morningmoni/FAR.",Facet-Aware Evaluation|Extractive Summarization|fine-grained evaluation|comparative analysis,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.445.pdf -main.451,Discourse-Aware Neural Extractive Text Summarization,Jiacheng Xu|Zhe Gan|Yu Cheng|Jingjing Liu,"Recently BERT has been adopted for document encoding in state-of-the-art text summarization models. However, sentence-based extractive models often result in redundant or uninformative phrases in the extracted summaries. Also, long-range dependencies throughout a document are not well captured by BERT, which is pre-trained on sentence pairs instead of documents. To address these issues, we present a discourse-aware neural summarization model - DiscoBert. DiscoBert extracts sub-sentential discourse units (instead of sentences) as candidates for extractive selection on a finer granularity. To capture the long-range dependencies among discourse units, structural discourse graphs are constructed based on RST trees and coreference mentions, encoded with Graph Convolutional Networks. Experiments show that the proposed model outperforms state-of-the-art methods by a significant margin on popular summarization benchmarks compared to other BERT-base models.",Discourse-Aware Summarization|document encoding|extractive selection|text models,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.451.pdf -main.337,Investigating Word-Class Distributions in Word Vector Spaces,Ryohei Sasano|Anna Korhonen,"This paper presents an investigation on the distribution of word vectors belonging to a certain word class in a pre-trained word vector space. 
To this end, we made several assumptions about the distribution, modeled the distribution accordingly, and validated each assumption by comparing the goodness of each model. Specifically, we considered two types of word classes – the semantic class of direct objects of a verb and the semantic class in a thesaurus – and tried to build models that properly estimate how likely it is that a word in the vector space is a member of a given word class. Our results on selectional preference and WordNet datasets show that the centroid-based model will fail to achieve good enough performance, the geometry of the distribution and the existence of subgroups will have limited impact, and also the negative instances need to be considered for adequate modeling of the distribution. We further investigated the relationship between the scores calculated by each model and the degree of membership and found that discriminative learning-based models are best in finding the boundaries of a class, while models based on the offset between positive and negative instances perform best in determining the degree of membership.",modeling distribution|centroid-based model|discriminative models|Word-Class Distributions,Semantics: Lexical,Long,https://www.aclweb.org/anthology/2020.acl-main.337.pdf -main.28,Unsupervised Paraphrasing by Simulated Annealing,Xianggen Liu|Lili Mou|Fandong Meng|Hao Zhou|Jie Zhou|Sen Song,"We propose UPSA, a novel approach that accomplishes Unsupervised Paraphrasing by Simulated Annealing. We model paraphrase generation as an optimization problem and propose a sophisticated objective function, involving semantic similarity, expression diversity, and language fluency of paraphrases. UPSA searches the sentence space towards this objective by performing a sequence of local editing. We evaluate our approach on various datasets, namely, Quora, Wikianswers, MSCOCO, and Twitter. Extensive results show that UPSA achieves the state-of-the-art performance compared with previous unsupervised methods in terms of both automatic and human evaluations. Further, our approach outperforms most existing domain-adapted supervised models, showing the generalizability of UPSA.",Unsupervised Paraphrasing|paraphrase generation|optimization problem|Unsupervised Paraphrasing,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.28.pdf -main.73,Tree-Structured Neural Topic Model,Masaru Isonuma|Junichiro Mori|Danushka Bollegala|Ichiro Sakata,"This paper presents a tree-structured neural topic model, which has a topic distribution over a tree with an infinite number of branches. Our model parameterizes an unbounded ancestral and fraternal topic distribution by applying doubly-recurrent neural networks. With the help of autoencoding variational Bayes, our model improves data scalability and achieves competitive performance when inducing latent topics and tree structures, as compared to a prior tree-structured topic model (Blei et al., 2010). This work extends the tree-structured topic model such that it can be incorporated with neural models for downstream tasks.",inducing structures|downstream tasks|Tree-Structured Model|doubly-recurrent networks,Information Retrieval and Text Mining,Short,https://www.aclweb.org/anthology/2020.acl-main.73.pdf -main.67,Line Graph Enhanced AMR-to-Text Generation with Mix-Order Graph Attention Networks,Yanbin Zhao|Lu Chen|Zhi Chen|Ruisheng Cao|Su Zhu|Kai Yu,"Efficient structure encoding for graphs with labeled edges is an important yet challenging point in many graph-based models. 
This work focuses on AMR-to-text generation -- A graph-to-sequence task aiming to recover natural language from Abstract Meaning Representations (AMR). Existing graph-to-sequence approaches generally utilize graph neural networks as their encoders, which have two limitations: 1) The message propagation process in AMR graphs is only guided by the first-order adjacency information. 2) The relationships between labeled edges are not fully considered. In this work, we propose a novel graph encoding framework which can effectively explore the edge relations. We also adopt graph attention networks with higher-order neighborhood information to encode the rich structure in AMR graphs. Experiment results show that our approach obtains new state-of-the-art performance on English AMR benchmark datasets. The ablation analyses also demonstrate that both edge relations and higher-order information are beneficial to graph-to-sequence modeling.",Line Generation|AMR-to-text generation|graph-to-sequence task|graph-to-sequence modeling,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.67.pdf -main.378,Max-Margin Incremental CCG Parsing,Miloš Stanojević|Mark Steedman,"Incremental syntactic parsing has been an active research area both for cognitive scientists trying to model human sentence processing and for NLP researchers attempting to combine incremental parsing with language modelling for ASR and MT. Most effort has been directed at designing the right transition mechanism, but less has been done to answer the question of what a probabilistic model for those transition parsers should look like. A very incremental transition mechanism of a recently proposed CCG parser when trained in straightforward locally normalised discriminative fashion produces very bad results on English CCGbank. We identify three biases as the causes of this problem: label bias, exposure bias and imbalanced probabilities bias. While known techniques for tackling these biases improve results, they still do not make the parser state of the art. Instead, we tackle all of these three biases at the same time using an improved version of beam search optimisation that minimises all beam search violations instead of minimising only the biggest violation. The new incremental parser gives better results than all previously published incremental CCG parsers, and outperforms even some widely used non-incremental CCG parsers.",Incremental parsing|human processing|ASR|MT,"Syntax: Tagging, Chunking and Parsing",Long,https://www.aclweb.org/anthology/2020.acl-main.378.pdf -main.436,Shaping Visual Representations with Language for Few-Shot Classification,Jesse Mu|Percy Liang|Noah Goodman,"By describing the features and abstractions of our world, language is a crucial tool for human learning and a promising source of supervision for machine learning models. We use language to improve few-shot visual classification in the underexplored scenario where natural language task descriptions are available during training, but unavailable for novel tasks at test time. Existing models for this setting sample new descriptions at test time and use those to classify images. Instead, we propose language-shaped learning (LSL), an end-to-end model that regularizes visual representations to predict language. 
LSL is conceptually simpler, more data efficient, and outperforms baselines in two challenging few-shot domains.",Few-Shot Classification|human learning|supervision|machine models,"Language Grounding to Vision, Robotics and Beyond",Short,https://www.aclweb.org/anthology/2020.acl-main.436.pdf -main.350,SimulSpeech: End-to-End Simultaneous Speech to Text Translation,Yi Ren|Jinglin Liu|Xu Tan|Chen Zhang|Tao Qin|Zhou Zhao|Tie-Yan Liu,"In this work, we develop SimulSpeech, an end-to-end simultaneous speech to text translation system which translates speech in source language to text in target language concurrently. SimulSpeech consists of a speech encoder, a speech segmenter and a text decoder, where 1) the segmenter builds upon the encoder and leverages a connectionist temporal classification (CTC) loss to split the input streaming speech in real time, 2) the encoder-decoder attention adopts a wait-k strategy for simultaneous translation. SimulSpeech is more challenging than previous cascaded systems (with simultaneous automatic speech recognition (ASR) and simultaneous neural machine translation (NMT)). We introduce two novel knowledge distillation methods to ensure the performance: 1) Attention-level knowledge distillation transfers the knowledge from the multiplication of the attention matrices of simultaneous NMT and ASR models to help the training of the attention mechanism in SimulSpeech; 2) Data-level knowledge distillation transfers the knowledge from the full-sentence NMT model and also reduces the complexity of data distribution to help on the optimization of SimulSpeech. Experiments on MuST-C English-Spanish and English-German spoken language translation datasets show that SimulSpeech achieves reasonable BLEU scores and lower delay compared to full-sentence end-to-end speech to text translation (without simultaneous translation), and better performance than the two-stage cascaded simultaneous translation model in terms of BLEU scores and translation delay.",simultaneous translation|simultaneous recognition|ASR|NMT,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.acl-main.350.pdf -main.344,Curriculum Pre-training for End-to-End Speech Translation,Chengyi Wang|Yu Wu|Shujie Liu|Ming Zhou|Zhenglu Yang,"End-to-end speech translation poses a heavy burden on the encoder because it has to transcribe, understand, and learn cross-lingual semantics simultaneously. To obtain a powerful encoder, traditional methods pre-train it on ASR data to capture speech features. However, we argue that pre-training the encoder only through simple speech recognition is not enough, and high-level linguistic knowledge should be considered. Inspired by this, we propose a curriculum pre-training method that includes an elementary course for transcription learning and two advanced courses for understanding the utterance and mapping words in two languages. The difficulty of these courses is gradually increasing. Experiments show that our curriculum pre-training method leads to significant improvements on En-De and En-Fr speech translation benchmarks.",Curriculum Pre-training|End-to-End Translation|speech recognition|transcription learning,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.acl-main.344.pdf -main.422,Similarity Analysis of Contextual Word Representation Models,John Wu|Yonatan Belinkov|Hassan Sajjad|Nadir Durrani|Fahim Dalvi|James Glass,"This paper investigates contextual word representation models from the lens of similarity analysis. 
Given a collection of trained models, we measure the similarity of their internal representations and attention. Critically, these models come from vastly different architectures. We use existing and novel similarity measures that aim to gauge the level of localization of information in the deep models, and facilitate the investigation of which design factors affect model similarity, without requiring any external linguistic annotation. The analysis reveals that models within the same family are more similar to one another, as may be expected. Surprisingly, different architectures have rather similar representations, but different individual neurons. We also observed differences in information localization in lower and higher layers and found that higher layers are more affected by fine-tuning on downstream tasks.",information localization|downstream tasks|Similarity Models|contextual models,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.422.pdf -main.1,Learning to Understand Child-directed and Adult-directed Speech,Lieke Gelderloos|Grzegorz Chrupała|Afra Alishahi,"Speech directed to children differs from adult-directed speech in linguistic aspects such as repetition, word choice, and sentence length, as well as in aspects of the speech signal itself, such as prosodic and phonemic variation. Human language acquisition research indicates that child-directed speech helps language learners. This study explores the effect of child-directed speech when learning to extract semantic information from speech directly. We compare the task performance of models trained on adult-directed speech (ADS) and child-directed speech (CDS). We find indications that CDS helps in the initial stages of learning, but eventually, models trained on ADS reach comparable task performance, and generalize better. The results suggest that this is at least partially due to linguistic rather than acoustic properties of the two registers, as we see the same pattern when looking at models trained on acoustically comparable synthetic speech.",Human acquisition|Human research|CDS|ADS,Cognitive Modeling and Psycholinguistics,Short,https://www.aclweb.org/anthology/2020.acl-main.1.pdf -main.393,Identifying Principals and Accessories in a Complex Case based on the Comprehension of Fact Description,Yakun Hu|Zhunchen Luo|Wenhan Chao,"In this paper, we study the problem of identifying the principals and accessories from the fact description with multiple defendants in a criminal case. We treat the fact descriptions as narrative texts and the defendants as roles over the narrative story. We propose to model the defendants with behavioral semantic information and statistical characteristics, then learning the importances of defendants within a learning-to-rank framework. Experimental results on a real-world dataset demonstrate the behavior analysis can effectively model the defendants' impacts in a complex case.",Comprehension Description|fact description|learning-to-rank framework|behavior analysis,NLP Applications,Short,https://www.aclweb.org/anthology/2020.acl-main.393.pdf -main.387,Towards Transparent and Explainable Attention Models,Akash Kumar Mohankumar|Preksha Nema|Sharan Narasimhan|Mitesh M. Khapra|Balaji Vasan Srinivasan|Balaraman Ravindran,"Recent studies on interpretability of attention distributions have led to notions of faithful and plausible explanations for a model's predictions. 
Attention distributions can be considered a faithful explanation if a higher attention weight implies a greater impact on the model's prediction. They can be considered a plausible explanation if they provide a human-understandable justification for the model's predictions. In this work, we first explain why current attention mechanisms in LSTM based encoders can neither provide a faithful nor a plausible explanation of the model's predictions. We observe that in LSTM based encoders the hidden representations at different time-steps are very similar to each other (high conicity) and attention weights in these situations do not carry much meaning because even a random permutation of the attention weights does not affect the model's predictions. Based on experiments on a wide variety of tasks and datasets, we observe attention distributions often attribute the model's predictions to unimportant words such as punctuation and fail to offer a plausible explanation for the predictions. To make attention mechanisms more faithful and plausible, we propose a modified LSTM cell with a diversity-driven training objective that ensures that the hidden representations learned at different time steps are diverse. We show that the resulting attention distributions offer more transparency as they (i) provide a more precise importance ranking of the hidden states (ii) are better indicative of words important for the model's predictions (iii) correlate better with gradient-based attribution methods. Human evaluations indicate that the attention distributions learned by our model offer a plausible explanation of the model's predictions. Our code has been made publicly available at https://github.com/akashkm99/Interpretable-Attention",interpretability distributions|attention mechanisms|Human evaluations|Transparent Models,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.387.pdf -main.98,Towards Conversational Recommendation over Multi-Type Dialogs,Zeming Liu|Haifeng Wang|Zheng-Yu Niu|Hua Wu|Wanxiang Che|Ting Liu,"We focus on the study of conversational recommendation in the context of multi-type dialogs, where the bots can proactively and naturally lead a conversation from a non-recommendation dialog (e.g., QA) to a recommendation dialog, taking into account user's interests and feedback. To facilitate the study of this task, we create a human-to-human Chinese dialog dataset DuRecDial (about 10k dialogs, 156k utterances), where there are multiple sequential dialogs for a pair of a recommendation seeker (user) and a recommender (bot). In each dialog, the recommender proactively leads a multi-type dialog to approach recommendation targets and then makes multiple recommendations with rich interaction behavior. This dataset allows us to systematically investigate different parts of the overall problem, e.g., how to naturally lead a dialog, how to interact with users for recommendation. Finally we establish baseline results on DuRecDial for future studies.",Conversational Recommendation|multi-type dialogs|recommender|non-recommendation dialog,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.98.pdf -main.608,Unsupervised Dual Paraphrasing for Two-stage Semantic Parsing,Ruisheng Cao|Su Zhu|Chenyu Yang|Chen Liu|Rao Ma|Yanbin Zhao|Lu Chen|Kai Yu,"One daunting problem for semantic parsing is the scarcity of annotation. 
Aiming to reduce nontrivial human labor, we propose a two-stage semantic parsing framework, where the first stage utilizes an unsupervised paraphrase model to convert an unlabeled natural language utterance into the canonical utterance. The downstream naive semantic parser accepts the intermediate output and returns the target logical form. Furthermore, the entire training process is split into two phases: pre-training and cycle learning. Three tailored self-supervised tasks are introduced throughout training to activate the unsupervised paraphrase model. Experimental results on benchmarks Overnight and GeoGranno demonstrate that our framework is effective and compatible with supervised training.",Two-stage Parsing|semantic parsing|annotation|self-supervised tasks,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.608.pdf -main.152,Parallel Sentence Mining by Constrained Decoding,Pinzhen Chen|Nikolay Bogoychev|Kenneth Heafield|Faheem Kirefu,"We present a novel method to extract parallel sentences from two monolingual corpora, using neural machine translation. Our method relies on translating sentences in one corpus, but constraining the decoding by a prefix tree built on the other corpus. We argue that a neural machine translation system by itself can be a sentence similarity scorer and it efficiently approximates pairwise comparison with a modified beam search. When benchmarked on the BUCC shared task, our method achieves results comparable to other submissions.",Parallel Mining|decoding|Constrained Decoding|neural translation,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.152.pdf -main.634,Diversifying Dialogue Generation with Non-Conversational Text,Hui Su|Xiaoyu Shen|Sanqiang Zhao|Zhou Xiao|Pengwei Hu|Randy Zhong|Cheng Niu|Jie Zhou,"Neural network-based sequence-to-sequence (seq2seq) models strongly suffer from the low-diversity problem when it comes to open-domain dialogue generation. As bland and generic utterances usually dominate the frequency distribution in our daily chitchat, avoiding them to generate more interesting responses requires complex data filtering, sampling techniques or modifying the training objective. In this paper, we propose a new perspective to diversify dialogue generation by leveraging non-conversational text. Compared with bilateral conversations, non-conversational text are easier to obtain, more diverse and cover a much broader range of topics. We collect a large-scale non-conversational corpus from multi sources including forum comments, idioms and book snippets. We further present a training paradigm to effectively incorporate these text via iterative back translation. The resulting model is tested on two conversational datasets from different domains and is shown to produce significantly more diverse responses without sacrificing the relevance with context.",Diversifying Generation|low-diversity problem|open-domain generation|dialogue generation,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.634.pdf -main.620,Uncertainty-Aware Curriculum Learning for Neural Machine Translation,Yikai Zhou|Baosong Yang|Derek F. Wong|Yu Wan|Lidia S. Chao,"Neural machine translation (NMT) has proven to be facilitated by curriculum learning which presents examples in an easy-to-hard order at different training stages. The keys lie in the assessment of data difficulty and model competence. 
We propose uncertainty-aware curriculum learning, which is motivated by the intuition that: 1) the higher the uncertainty in a translation pair, the more complex and rarer the information it contains; and 2) the end of the decline in model uncertainty indicates the completeness of current training stage. Specifically, we serve cross-entropy of an example as its data difficulty and exploit the variance of distributions over the weights of the network to present the model uncertainty. Extensive experiments on various translation tasks reveal that our approach outperforms the strong baseline and related methods on both translation quality and convergence speed. Quantitative analyses reveal that the proposed strategy offers NMT the ability to automatically govern its learning schedule.",Neural Translation|assessment difficulty|translation tasks|Uncertainty-Aware Learning,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.620.pdf -main.146,End-to-End Neural Word Alignment Outperforms GIZA++,Thomas Zenkel|Joern Wuebker|John DeNero,"Word alignment was once a core unsupervised learning task in natural language processing because of its essential role in training statistical machine translation (MT) models. Although unnecessary for training neural MT models, word alignment still plays an important role in interactive applications of neural machine translation, such as annotation transfer and lexicon injection. While statistical MT methods have been replaced by neural approaches with superior performance, the twenty-year-old GIZA++ toolkit remains a key component of state-of-the-art word alignment systems. Prior work on neural word alignment has only been able to outperform GIZA++ by using its output during training. We present the first end-to-end neural word alignment method that consistently outperforms GIZA++ on three data sets. Our approach repurposes a Transformer model trained for supervised translation to also serve as an unsupervised word alignment model in a manner that is tightly integrated and does not affect translation quality.",Word alignment|unsupervised task|natural processing|neural translation,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.146.pdf -main.191,GAN-BERT: Generative Adversarial Learning for Robust Text Classification with a Bunch of Labeled Examples,Danilo Croce|Giuseppe Castellucci|Roberto Basili,"Recent Transformer-based architectures, e.g., BERT, provide impressive results in many Natural Language Processing tasks. However, most of the adopted benchmarks are made of (sometimes hundreds of) thousands of examples. In many real scenarios, obtaining high-quality annotated data is expensive and time consuming; in contrast, unlabeled examples characterizing the target task can be, in general, easily collected. One promising method to enable semi-supervised learning has been proposed in image processing, based on Semi-Supervised Generative Adversarial Networks. In this paper, we propose GAN-BERT that extends the fine-tuning of BERT-like architectures with unlabeled data in a generative adversarial setting.
Experimental results show that the requirement for annotated examples can be drastically reduced (up to only 50-100 annotated examples), still obtaining good performances in several sentence classification tasks.",Robust Classification|Natural tasks|image processing|generative setting,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.191.pdf -main.185,Negative Training for Neural Dialogue Response Generation,Tianxing He|James Glass,"Although deep learning models have brought tremendous advancements to the field of open-domain dialogue response generation, recent research results have revealed that the trained models have undesirable generation behaviors, such as malicious responses and generic (boring) responses. In this work, we propose a framework named ``Negative Training"" to minimize such behaviors. Given a trained model, the framework will first find generated samples that exhibit the undesirable behavior, and then use them to feed negative training signals for fine-tuning the model. Our experiments show that negative training can significantly reduce the hit rate of malicious responses, or discourage frequent responses and improve response diversity.",Neural Generation|open-domain generation|Negative Training|deep models,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.185.pdf -main.768,Are Natural Language Inference Models IMPPRESsive? Learning IMPlicature and PRESupposition,Paloma Jeretic|Alex Warstadt|Suvrat Bhooshan|Adina Williams,"Natural language inference (NLI) is an increasingly important task for natural language understanding, which requires one to infer whether a sentence entails another. However, the ability of NLI models to make pragmatic inferences remains understudied. We create an IMPlicature and PRESupposition diagnostic dataset (IMPPRES), consisting of 32K semi-automatically generated sentence pairs illustrating well-studied pragmatic inference types. We use IMPPRES to evaluate whether BERT, InferSent, and BOW NLI models trained on MultiNLI (Williams et al., 2018) learn to make pragmatic inferences. Although MultiNLI appears to contain very few pairs illustrating these inference types, we find that BERT learns to draw pragmatic inferences. It reliably treats scalar implicatures triggered by ""some"" as entailments. For some presupposition triggers like ""only"", BERT reliably recognizes the presupposition as an entailment, even when the trigger is embedded under an entailment canceling operator like negation. BOW and InferSent show weaker evidence of pragmatic reasoning. We conclude that NLI training encourages models to learn some, but not all, pragmatic inferences.",Natural inference|NLI|natural understanding|pragmatic inferences,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.768.pdf -main.740,Don’t Stop Pretraining: Adapt Language Models to Domains and Tasks,Suchin Gururangan|Ana Marasović|Swabha Swayamdipta|Kyle Lo|Iz Beltagy|Doug Downey|Noah A. Smith,"Language models pretrained on text from a wide variety of sources form the foundation of today's NLP. In light of the success of these broad-coverage models, we investigate whether it is still helpful to tailor a pretrained model to the domain of a target task. 
We present a study across four domains (biomedical and computer science publications, news, and reviews) and eight classification tasks, showing that a second phase of pretraining in-domain (domain-adaptive pretraining) leads to performance gains, under both high- and low-resource settings. Moreover, adapting to the task's unlabeled data (task-adaptive pretraining) improves performance even after domain-adaptive pretraining. Finally, we show that adapting to a task corpus augmented using simple data selection strategies is an effective alternative, especially when resources for domain-adaptive pretraining might be unavailable. Overall, we consistently find that multi-phase adaptive pretraining offers large gains in task performance.",NLP|classification tasks|pretraining|domain-adaptive pretraining,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.740.pdf -main.754,Balancing Training for Multilingual Neural Machine Translation,Xinyi Wang|Yulia Tsvetkov|Graham Neubig,"When training multilingual machine translation (MT) models that can translate to/from multiple languages, we are faced with imbalanced training sets: some languages have much more training data than others. Standard practice is to up-sample less resourced languages to increase representation, and the degree of up-sampling has a large effect on the overall performance. In this paper, we propose a method that instead automatically learns how to weight training data through a data scorer that is optimized to maximize performance on all test languages. Experiments on two sets of languages under both one-to-many and many-to-one MT settings show our method not only consistently outperforms heuristic baselines in terms of average performance, but also offers flexible control over the performance of which languages are optimized.",Multilingual Translation|Balancing Training|multilingual models|heuristic baselines,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.754.pdf -main.218,Grounding Conversations with Improvised Dialogues,Hyundong Cho|Jonathan May,"Effective dialogue involves grounding, the process of establishing mutual knowledge that is essential for communication between people. Modern dialogue systems are not explicitly trained to build common ground, and therefore overlook this important aspect of communication. Improvisational theater (improv) intrinsically contains a high proportion of dialogue focused on building common ground, and makes use of the yes-and principle, a strong grounding speech act, to establish coherence and an actionable objective reality. We collect a corpus of more than 26,000 yes-and turns, transcribing them from improv dialogues and extracting them from larger, but more sparsely populated movie script dialogue corpora, via a bootstrapped classifier. We fine-tune chit-chat dialogue systems with our corpus to encourage more grounded, relevant conversation and confirm these findings with human evaluations.",Grounding Conversations|dialogue systems|bootstrapped classifier|chit-chat systems,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.218.pdf -main.542,Curriculum Learning for Natural Language Understanding,Benfeng Xu|Licheng Zhang|Zhendong Mao|Quan Wang|Hongtao Xie|Yongdong Zhang,"With the great success of pre-trained language models, the pretrain-finetune paradigm now becomes the undoubtedly dominant solution for natural language understanding (NLU) tasks. 
At the fine-tune stage, target task data is usually introduced in a completely random order and treated equally. However, examples in NLU tasks can vary greatly in difficulty, and similar to human learning procedure, language models can benefit from an easy-to-difficult curriculum. Based on this idea, we propose our Curriculum Learning approach. By reviewing the trainset in a crossed way, we are able to distinguish easy examples from difficult ones, and arrange a curriculum for language models. Without any manual model architecture design or use of external data, our Curriculum Learning approach obtains significant and universal performance improvements on a wide range of NLU tasks.",Curriculum Learning|Natural Understanding|natural tasks|NLU tasks,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.542.pdf -main.224,Bridging the Structural Gap Between Encoding and Decoding for Data-To-Text Generation,Chao Zhao|Marilyn Walker|Snigdha Chaturvedi,"Generating sequential natural language descriptions from graph-structured data (e.g., knowledge graph) is challenging, partly because of the structural differences between the input graph and the output text. Hence, popular sequence-to-sequence models, which require serialized input, are not a natural fit for this task. Graph neural networks, on the other hand, can better encode the input graph but broaden the structural gap between the encoder and decoder, making faithful generation difficult. To narrow this gap, we propose DualEnc, a dual encoding model that can not only incorporate the graph structure, but can also cater to the linear structure of the output text. Empirical comparisons with strong single-encoder baselines demonstrate that dual encoding can significantly improve the quality of the generated text.",Data-To-Text Generation|faithful generation|Encoding|Decoding,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.224.pdf -main.230,Cross-media Structured Common Space for Multimedia Event Extraction,Manling Li|Alireza Zareian|Qi Zeng|Spencer Whitehead|Di Lu|Heng Ji|Shih-Fu Chang,"We introduce a new task, MultiMedia Event Extraction, which aims to extract events and their arguments from multimedia documents. We develop the first benchmark and collect a dataset of 245 multimedia news articles with extensively annotated events and arguments. We propose a novel method, Weakly Aligned Structured Embedding (WASE), that encodes structured representations of semantic information from textual and visual data into a common embedding space. The structures are aligned across modalities by employing a weakly supervised training strategy, which enables exploiting available resources without explicit cross-media annotation. Compared to uni-modal state-of-the-art methods, our approach achieves 4.0% and 9.8% absolute F-score gains on text event argument role labeling and visual event extraction. Compared to state-of-the-art multimedia unstructured representations, we achieve 8.3% and 5.0% absolute F-score gains on multimedia event extraction and argument role labeling, respectively. 
By utilizing images, we extract 21.4% more event mentions than traditional text-only methods.",Multimedia Extraction|text labeling|visual extraction|argument labeling,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.230.pdf -main.556,Multi-Granularity Interaction Network for Extractive and Abstractive Multi-Document Summarization,Hanqi Jin|Tianming Wang|Xiaojun Wan,"In this paper, we propose a multi-granularity interaction network for extractive and abstractive multi-document summarization, which jointly learn semantic representations for words, sentences, and documents. The word representations are used to generate an abstractive summary while the sentence representations are used to produce an extractive summary. We employ attention mechanisms to interact between different granularity of semantic representations, which helps to capture multi-granularity key information and improves the performance of both abstractive and extractive summarization. Experiment results show that our proposed model substantially outperforms all strong baseline methods and achieves the best results on the Multi-News dataset.",Extractive Summarization|Extractive |abstractive summarization|Multi-Granularity Network,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.556.pdf -main.581,Single-/Multi-Source Cross-Lingual NER via Teacher-Student Learning on Unlabeled Data in Target Language,Qianhui Wu|Zijia Lin|Börje Karlsson|Jian-Guang Lou|Biqing Huang,"To better tackle the named entity recognition (NER) problem on languages with little/no labeled data, cross-lingual NER must effectively leverage knowledge learned from source languages with rich labeled data. Previous works on cross-lingual NER are mostly based on label projection with pairwise texts or direct model transfer. However, such methods either are not applicable if the labeled data in the source languages is unavailable, or do not leverage information contained in unlabeled data in the target language. In this paper, we propose a teacher-student learning method to address such limitations, where NER models in the source languages are used as teachers to train a student model on unlabeled data in the target language. The proposed method works for both single-source and multi-source cross-lingual NER. For the latter, we further propose a similarity measuring method to better weight the supervision from different teacher models. Extensive experiments for 3 target languages on benchmark datasets well demonstrate that our method outperforms existing state-of-the-art methods for both single-source and multi-source cross-lingual NER.",Single-/Multi-Source NER|named problem|cross-lingual NER|single-source NER,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.581.pdf -main.595,Coupling Distant Annotation and Adversarial Training for Cross-Domain Chinese Word Segmentation,Ning Ding|Dingkun Long|Guangwei Xu|Muhua Zhu|Pengjun Xie|Xiaobin Wang|Haitao Zheng,"Fully supervised neural approaches have achieved significant progress in the task of Chinese word segmentation (CWS). Nevertheless, the performance of supervised models always drops gravely if the domain shifts due to the distribution gap across domains and the out of vocabulary (OOV) problem. In order to simultaneously alleviate the issues, this paper intuitively couples distant annotation and adversarial training for cross-domain CWS. 
1) We rethink the essence of ``Chinese words'' and design an automatic distant annotation mechanism, which does not need any supervision or pre-defined dictionaries on the target domain. The method could effectively explore domain-specific words and distantly annotate the raw texts for the target domain. 2) We further develop a sentence-level adversarial training procedure to perform noise reduction and maximum utilization of the source domain information. Experiments on multiple real-world datasets across various domains show the superiority and robustness of our model, significantly outperforming previous state-of-the-art cross-domain CWS methods.",Coupling Annotation|Cross-Domain Segmentation|Chinese segmentation|Chinese CWS,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.acl-main.595.pdf -main.594,Bootstrapping Techniques for Polysynthetic Morphological Analysis,William Lane|Steven Bird,"Polysynthetic languages have exceptionally large and sparse vocabularies, thanks to the number of morpheme slots and combinations in a word. This complexity, together with a general scarcity of written data, poses a challenge to the development of natural language technologies. To address this challenge, we offer linguistically-informed approaches for bootstrapping a neural morphological analyzer, and demonstrate its application to Kunwinjku, a polysynthetic Australian language. We generate data from a finite state transducer to train an encoder-decoder model. We improve the model by ""hallucinating"" missing linguistic structure into the training data, and by resampling from a Zipf distribution to simulate a more natural distribution of morphemes. The best model accounts for all instances of reduplication in the test set and achieves an accuracy of 94.7% overall, a 10 percentage point improvement over the FST baseline. This process demonstrates the feasibility of bootstrapping a neural morph analyzer from minimal resources.",Polysynthetic Analysis|Bootstrapping Techniques|natural technologies|linguistically-informed approaches,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.acl-main.594.pdf -main.580,Representation Learning for Information Extraction from Form-like Documents,Bodhisattwa Prasad Majumder|Navneet Potti|Sandeep Tata|James Bradley Wendt|Qi Zhao|Marc Najork,"We propose a novel approach using representation learning for tackling the problem of extracting structured information from form-like document images. We propose an extraction system that uses knowledge of the types of the target fields to generate extraction candidates and a neural network architecture that learns a dense representation of each candidate based on neighboring words in the document. These learned representations are not only useful in solving the extraction task for unseen document templates from two different domains but are also interpretable, as we show using loss cases.",Information Extraction|extraction task|Representation Learning|extraction system,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.580.pdf -main.231,Learning to Segment Actions from Observation and Narration,Daniel Fried|Jean-Baptiste Alayrac|Phil Blunsom|Chris Dyer|Stephen Clark|Aida Nematzadeh,"We apply a generative segmental model of task structure, guided by narration, to action segmentation in video. We focus on unsupervised and weakly-supervised settings where no action labels are known during training.
Despite its simplicity, our model performs competitively with previous work on a dataset of naturalistic instructional videos. Our model allows us to vary the sources of supervision used in training, and we find that both task structure and narrative language provide large benefits in segmentation quality.",action video|unsupervised settings|generative structure|Observation,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.231.pdf -main.557,Tetra-Tagging: Word-Synchronous Parsing with Linear-Time Inference,Nikita Kitaev|Dan Klein,"We present a constituency parsing algorithm that, like a supertagger, works by assigning labels to each word in a sentence. In order to maximally leverage current neural architectures, the model scores each word's tags in parallel, with minimal task-specific structure. After scoring, a left-to-right reconciliation phase extracts a tree in (empirically) linear time. Our parser achieves 95.4 F1 on the WSJ test set while also achieving substantial speedups compared to current state-of-the-art parsers with comparable accuracies.",Tetra-Tagging|Word-Synchronous Parsing|Linear-Time Inference|constituency algorithm,"Syntax: Tagging, Chunking and Parsing",Short,https://www.aclweb.org/anthology/2020.acl-main.557.pdf -main.543,Do Neural Models Learn Systematicity of Monotonicity Inference in Natural Language?,Hitomi Yanaka|Koji Mineshima|Daisuke Bekki|Kentaro Inui,"Despite the success of language models using neural networks, it remains unclear to what extent neural models have the generalization ability to perform inferences. In this paper, we introduce a method for evaluating whether neural models can learn systematicity of monotonicity inference in natural language, namely, the regularity for performing arbitrary inferences with generalization on composition. We consider four aspects of monotonicity inferences and test whether the models can systematically interpret lexical and logical phenomena on different training/test splits. A series of experiments show that three neural models systematically draw inferences on unseen combinations of lexical and logical phenomena when the syntactic structures of the sentences are similar between the training and test sets. However, the performance of the models significantly decreases when the structures are slightly changed in the test set while retaining all vocabularies and constituents already appearing in the training set. This indicates that the generalization ability of neural models is limited to cases where the syntactic structures are nearly the same as those in the training set.",Systematicity Inference|inferences|generalization composition|monotonicity inferences,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.543.pdf -main.225,Enabling Language Models to Fill in the Blanks,Chris Donahue|Mina Lee|Percy Liang,"We present a simple approach for text infilling, the task of predicting missing spans of text at any position in a document. While infilling could enable rich functionality especially for writing assistance tools, more attention has been devoted to language modeling---a special case of infilling where text is predicted at the end of a document. In this paper, we aim to extend the capabilities of language models (LMs) to the more general task of infilling. 
To this end, we train (or fine tune) off-the-shelf LMs on sequences containing the concatenation of artificially-masked text and the text which was masked. We show that this approach, which we call infilling by language modeling, can enable LMs to infill entire sentences effectively on three different domains: short stories, scientific abstracts, and lyrics. Furthermore, we show that humans have difficulty identifying sentences infilled by our approach as machine-generated in the domain of short stories.",text infilling|predicting text|writing tools|language modeling,Generation,Short,https://www.aclweb.org/anthology/2020.acl-main.225.pdf -main.219,Image-Chat: Engaging Grounded Conversations,Kurt Shuster|Samuel Humeau|Antoine Bordes|Jason Weston,"To achieve the long-term goal of machines being able to engage humans in conversation, our models should captivate the interest of their speaking partners. Communication grounded in images, whereby a dialogue is conducted based on a given photo, is a setup naturally appealing to humans (Hu et al., 2014). In this work we study large-scale architectures and datasets for this goal. We test a set of neural architectures using state-of-the-art image and text representations, considering various ways to fuse the components. To test such models, we collect a dataset of grounded human-human conversations, where speakers are asked to play roles given a provided emotional mood or style, as the use of such traits is also a key factor in engagingness (Guo et al., 2019). Our dataset, Image-Chat, consists of 202k dialogues over 202k images using 215 possible style traits. Automatic metrics and human evaluations of engagingness show the efficacy of our approach; in particular, we obtain state-of-the-art performance on the existing IGC task, and our best performing model is almost on par with humans on the Image-Chat test set (preferred 47.7% of the time).",large-scale architectures|IGC task|neural architectures|image representations,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.219.pdf -main.755,Evaluating Robustness to Input Perturbations for Neural Machine Translation,Xing Niu|Prashant Mathur|Georgiana Dinu|Yaser Al-Onaizan,Neural Machine Translation (NMT) models are sensitive to small perturbations in the input. Robustness to such perturbations is typically measured using translation quality metrics such as BLEU on the noisy input. This paper proposes additional metrics which measure the relative degradation and changes in translation when small perturbations are added to the input. We focus on a class of models employing subword regularization to address robustness and perform extensive evaluations of these models using the robustness measures proposed. Results show that our proposed metrics reveal a clear trend of improved robustness to perturbations when subword regularization methods are used.,Neural Translation|Neural models|subword methods|relative degradation,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.755.pdf -main.741,Estimating Mutual Information Between Dense Word Embeddings,Vitalii Zhelezniak|Aleksandar Savkov|Nils Hammerla,"Word embedding-based similarity measures are currently among the top-performing methods on unsupervised semantic textual similarity (STS) tasks. Recent work has increasingly adopted a statistical view on these embeddings, with some of the top approaches being essentially various correlations (which include the famous cosine similarity). 
Another excellent candidate for a similarity measure is mutual information (MI), which can capture arbitrary dependencies between the variables and has a simple and intuitive expression. Unfortunately, its use in the context of dense word embeddings has so far been avoided due to difficulties with estimating MI for continuous data. In this work we go through a vast literature on estimating MI in such cases and single out the most promising methods, yielding a simple and elegant similarity measure for word embeddings. We show that mutual information is a viable alternative to correlations, gives an excellent signal that correlates well with human judgements of similarity and rivals existing state-of-the-art unsupervised methods.",Estimating Information|unsupervised tasks|Dense Embeddings|statistical view,Semantics: Sentence Level,Short,https://www.aclweb.org/anthology/2020.acl-main.741.pdf -main.769,End-to-End Bias Mitigation by Modelling Biases in Corpora,Rabeeh Karimi Mahabadi|Yonatan Belinkov|James Henderson,"Several recent studies have shown that strong natural language understanding (NLU) models are prone to relying on unwanted dataset biases without learning the underlying task, resulting in models that fail to generalize to out-of-domain datasets and are likely to perform poorly in real-world scenarios. We propose two learning strategies to train neural models, which are more robust to such biases and transfer better to out-of-domain datasets. The biases are specified in terms of one or more bias-only models, which learn to leverage the dataset biases. During training, the bias-only models' predictions are used to adjust the loss of the base model to reduce its reliance on biases by down-weighting the biased examples and focusing the training on the hard examples. We experiment on large-scale natural language inference and fact verification benchmarks, evaluating on out-of-domain datasets that are specifically designed to assess the robustness of models against known biases in the training data. Results show that our debiasing methods greatly improve robustness in all settings and better transfer to other textual entailment datasets. Our code and data are publicly available in https://github.com/rabeehk/robust-nli.",End-to-End Mitigation|real-world scenarios|training|large-scale benchmarks,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.769.pdf -main.184,Grounded Conversation Generation as Guided Traverses in Commonsense Knowledge Graphs,Houyu Zhang|Zhenghao Liu|Chenyan Xiong|Zhiyuan Liu,"Human conversations naturally evolve around related concepts and hop to distant concepts. This paper presents a new conversation generation model, ConceptFlow, which leverages commonsense knowledge graphs to explicitly model conversation flows. By grounding conversations to the concept space, ConceptFlow represents the potential conversation flow as traverses in the concept space along commonsense relations. The traverse is guided by graph attentions in the concept graph, moving towards more meaningful directions in the concept space, in order to generate more semantic and informative responses. Experiments on Reddit conversations demonstrate ConceptFlow's effectiveness over previous knowledge-aware conversation models and GPT-2 based models while using 70% fewer parameters, confirming the advantage of explicit modeling conversation structures. 
All source codes of this work are available at https://github.com/thunlp/ConceptFlow.",Grounded Generation|conversation model|ConceptFlow|knowledge-aware models,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.184.pdf -main.190,ExpBERT: Representation Engineering with Natural Language Explanations,Shikhar Murty|Pang Wei Koh|Percy Liang,"Suppose we want to specify the inductive bias that married couples typically go on honeymoons for the task of extracting pairs of spouses from text. In this paper, we allow model developers to specify these types of inductive biases as natural language explanations. We use BERT fine-tuned on MultiNLI to ""interpret"" these explanations with respect to the input sentence, producing explanation-guided representations of the input. Across three relation extraction tasks, our method, ExpBERT, matches a BERT baseline but with 3--20x less labeled data and improves on the baseline by 3--10 F1 points with the same amount of labeled data.",relation tasks|ExpBERT|Natural Explanations|explanation-guided representations,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.190.pdf -main.621,Closing the Gap: Joint De-Identification and Concept Extraction in the Clinical Domain,Lukas Lange|Heike Adel|Jannik Strötgen,"Exploiting natural language processing in the clinical domain requires de-identification, i.e., anonymization of personal information in texts. However, current research considers de-identification and downstream tasks, such as concept extraction, only in isolation and does not study the effects of de-identification on other tasks. In this paper, we close this gap by reporting concept extraction performance on automatically anonymized data and investigating joint models for de-identification and concept extraction. In particular, we propose a stacked model with restricted access to privacy sensitive information and a multitask model. We set the new state of the art on benchmark datasets in English (96.1% F1 for de-identification and 88.9% F1 for concept extraction) and Spanish (91.4% F1 for concept extraction).",Concept Extraction|natural processing|anonymization information|de-identification,NLP Applications,Short,https://www.aclweb.org/anthology/2020.acl-main.621.pdf -main.147,Enhancing Machine Translation with Dependency-Aware Self-Attention,Emanuele Bugliarello|Naoaki Okazaki,"Most neural machine translation models only rely on pairs of parallel sentences, assuming syntactic information is automatically learned by an attention mechanism. In this work, we investigate different approaches to incorporate syntactic knowledge in the Transformer model and also propose a novel, parameter-free, dependency-aware self-attention mechanism that improves its translation quality, especially for long sentences and in low-resource scenarios. We show the efficacy of each approach on WMT English-German and English-Turkish, and WAT English-Japanese translation tasks.",Machine Translation|neural models|attention mechanism|Transformer model,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.147.pdf -main.153,Self-Attention with Cross-Lingual Position Representation,Liang Ding|Longyue Wang|Dacheng Tao,"Position encoding (PE), an essential part of self-attention networks (SANs), is used to preserve the word order information for natural language processing tasks, generating fixed position indices for input sequences. 
However, in cross-lingual scenarios, e.g., machine translation, the PEs of source and target sentences are modeled independently. Due to word order divergences in different languages, modeling the cross-lingual positional relationships might help SANs tackle this problem. In this paper, we augment SANs with cross-lingual position representations to model the bilingually aware latent structure for the input sentence. Specifically, we utilize bracketing transduction grammar (BTG)-based reordering information to encourage SANs to learn bilingual diagonal alignments. Experimental results on WMT'14 English⇒German, WAT'17 Japanese⇒English, and WMT'17 Chinese⇔English translation tasks demonstrate that our approach significantly and consistently improves translation quality over strong baselines. Extensive analyses confirm that the performance gains come from the cross-lingual information.",natural tasks|WMT'17 tasks|Cross-Lingual Representation|Position encoding,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.153.pdf -main.635,KdConv: A Chinese Multi-domain Dialogue Dataset Towards Multi-turn Knowledge-driven Conversation,Hao Zhou|Chujie Zheng|Kaili Huang|Minlie Huang|Xiaoyan Zhu,"The research of knowledge-driven conversational systems is largely limited due to the lack of dialog data which consists of multi-turn conversations on multiple topics and with knowledge annotations. In this paper, we propose a Chinese multi-domain knowledge-driven conversation dataset, KdConv, which grounds the topics in multi-turn conversations to knowledge graphs. Our corpus contains 4.5K conversations from three domains (film, music, and travel), and 86K utterances with an average turn number of 19.0. These conversations contain in-depth discussions on related topics and natural transition between multiple topics. To facilitate the following research on this corpus, we provide several benchmark models. Comparative results show that the models can be enhanced by introducing background knowledge, yet there is still a large space for leveraging knowledge to model multi-turn conversations for further research. Results also show that there are obvious performance differences between different domains, indicating that it is worth further explore transfer learning and domain adaptation. The corpus and benchmark models are publicly available.",Multi-turn Conversation|Multi-turn |knowledge-driven systems|transfer learning,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.635.pdf -main.609,DRTS Parsing with Structure-Aware Encoding and Decoding,Qiankun Fu|Yue Zhang|Jiangming Liu|Meishan Zhang,"Discourse representation tree structure (DRTS) parsing is a novel semantic parsing task which has been concerned most recently. State-of-the-art performance can be achieved by a neural sequence-to-sequence model, treating the tree construction as an incremental sequence generation problem. Structural information such as input syntax and the intermediate skeleton of the partial output has been ignored in the model, which could be potentially useful for the DRTS parsing. In this work, we propose a structural-aware model at both the encoder and decoder phase to integrate the structural information, where graph attention network (GAT) is exploited for effectively modeling.
Experimental results on a benchmark dataset show that our proposed model is effective and can obtain the best performance in the literature.",Discourse parsing|semantic task|tree construction|incremental problem,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.acl-main.609.pdf -main.99,Unknown Intent Detection Using Gaussian Mixture Model with an Application to Zero-shot Intent Classification,Guangfeng Yan|Lu Fan|Qimai Li|Han Liu|Xiaotong Zhang|Xiao-Ming Wu|Albert Y.S. Lam,"User intent classification plays a vital role in dialogue systems. Since user intent may frequently change over time in many realistic scenarios, unknown (new) intent detection has become an essential problem, where the study has just begun. This paper proposes a semantic-enhanced Gaussian mixture model (SEG) for unknown intent detection. In particular, we model utterance embeddings with a Gaussian mixture distribution and inject dynamic class semantic information into Gaussian means, which enables learning more class-concentrated embeddings that help to facilitate downstream outlier detection. Coupled with a density-based outlier detection algorithm, SEG achieves competitive results on three real task-oriented dialogue datasets in two languages for unknown intent detection. On top of that, we propose to integrate SEG as an unknown intent identifier into existing generalized zero-shot intent classification models to improve their performance. A case study on a state-of-the-art method, ReCapsNet, shows that SEG can push the classification performance to a significantly higher level.",Unknown Detection|Zero-shot Classification|User classification|dialogue systems,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.99.pdf -main.386,Towards Faithfully Interpretable NLP Systems: How Should We Define and Evaluate Faithfulness?,Alon Jacovi|Yoav Goldberg,"With the growing popularity of deep-learning based NLP models, comes a need for interpretable systems. But what is interpretability, and what constitutes a high-quality interpretation? In this opinion piece we reflect on the current state of interpretability evaluation research. We call for more clearly differentiating between different desired criteria an interpretation should satisfy, and focus on the faithfulness criteria. We survey the literature with respect to faithfulness evaluation, and arrange the current approaches around three assumptions, providing an explicit form to how faithfulness is ""defined"" by the community. We provide concrete guidelines on how evaluation of interpretation methods should and should not be conducted. Finally, we claim that the current binary definition for faithfulness sets a potentially unrealistic bar for being considered faithful. We call for discarding the binary notion of faithfulness in favor of a more graded one, which we believe will be of greater practical utility.",Faithfully Systems|interpretable systems|interpretability research|faithfulness evaluation,Interpretability and Analysis of Models for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.386.pdf -main.392,Graph Neural News Recommendation with Unsupervised Preference Disentanglement,Linmei Hu|Siyong Xu|Chen Li|Cheng Yang|Chuan Shi|Nan Duan|Xing Xie|Ming Zhou,"With the explosion of news information, personalized news recommendation has become very important for users to quickly find their interested contents. 
Most existing methods usually learn the representations of users and news from news contents for recommendation. However, they seldom consider high-order connectivity underlying the user-news interactions. Moreover, existing methods failed to disentangle a user's latent preference factors which cause her clicks on different news. In this paper, we model the user-news interactions as a bipartite graph and propose a novel Graph Neural News Recommendation model with Unsupervised Preference Disentanglement, named GNUD. Our model can encode high-order relationships into user and news representations by information propagation along the graph. Furthermore, the learned representations are disentangled with latent preference factors by a neighborhood routing algorithm, which can enhance expressiveness and interpretability. A preference regularizer is also designed to force each disentangled subspace to independently reflect an isolated preference, improving the quality of the disentangled representations. Experimental results on real-world news datasets demonstrate that our proposed model can effectively improve the performance of news recommendation and outperform state-of-the-art news recommendation methods.",Graph Recommendation|Unsupervised Disentanglement|personalized recommendation|recommendation,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.392.pdf -main.345,How Accents Confound: Probing for Accent Information in End-to-End Speech Recognition Systems,Archiki Prasad|Preethi Jyothi,"In this work, we present a detailed analysis of how accent information is reflected in the internal representation of speech in an end-to-end automatic speech recognition (ASR) system. We use a state-of-the-art end-to-end ASR system, comprising convolutional and recurrent layers, that is trained on a large amount of US-accented English speech and evaluate the model on speech samples from seven different English accents. We examine the effects of accent on the internal representation using three main probing techniques: a) Gradient-based explanation methods, b) Information-theoretic measures, and c) Outputs of accent and phone classifiers. We find different accents exhibiting similar trends irrespective of the probing technique used. We also find that most accent information is encoded within the first recurrent layer, which is suggestive of how one could adapt such an end-to-end model to learn representations that are invariant to accents.",Probing|Accent Information|End-to-End Systems|end-to-end system,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.acl-main.345.pdf -main.423,SenseBERT: Driving Some Sense into BERT,Yoav Levine|Barak Lenz|Or Dagan|Ori Ram|Dan Padnos|Or Sharir|Shai Shalev-Shwartz|Amnon Shashua|Yoav Shoham,"The ability to learn from large unlabeled corpora has allowed neural language models to advance the frontier in natural language understanding. However, existing self-supervision techniques operate at the word form level, which serves as a surrogate for the underlying semantic content. This paper proposes a method to employ weak-supervision directly at the word sense level. Our model, named SenseBERT, is pre-trained to predict not only the masked words but also their WordNet supersenses. Accordingly, we attain a lexical-semantic level language model, without the use of human annotation. 
SenseBERT achieves significantly improved lexical understanding, as we demonstrate by experimenting on SemEval Word Sense Disambiguation, and by attaining a state of the art result on the `Word in Context' task.",natural understanding|lexical understanding|SemEval Disambiguation|task,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.423.pdf -main.437,Discrete Latent Variable Representations for Low-Resource Text Classification,Shuning Jin|Sam Wiseman|Karl Stratos|Karen Livescu,"While much work on deep latent variable models of text uses continuous latent variables, discrete latent variables are interesting because they are more interpretable and typically more space efficient. We consider several approaches to learning discrete latent variable models for text in the case where exact marginalization over these variables is intractable. We compare the performance of the learned representations as features for low-resource document and sentence classification. Our best models outperform the previous best reported results with continuous representations in these low-resource settings, while learning significantly more compressed representations. Interestingly, we find that an amortized variant of Hard EM performs particularly well in the lowest-resource regimes.",Low-Resource Classification|Discrete Representations|discrete models|continuous representations,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.437.pdf -main.351,Towards end-2-end learning for predicting behavior codes from spoken utterances in psychotherapy conversations,Karan Singla|Zhuohao Chen|David Atkins|Shrikanth Narayanan,"Spoken language understanding tasks usually rely on pipelines involving complex processing blocks such as voice activity detection, speaker diarization and Automatic speech recognition (ASR). We propose a novel framework for predicting utterance level labels directly from speech features, thus removing the dependency on first generating transcripts, and transcription free behavioral coding. Our classifier uses a pretrained Speech-2-Vector encoder as bottleneck to generate word-level representations from speech features. This pretrained encoder learns to encode speech features for a word using an objective similar to Word2Vec. Our proposed approach just uses speech features and word segmentation information for predicting spoken utterance-level target labels. We show that our model achieves competitive results to other state-of-the-art approaches which use transcribed text for the task of predicting psychotherapy-relevant behavior codes.",predicting codes|Spoken tasks|voice detection|speaker diarization,Speech and Multimodality,Short,https://www.aclweb.org/anthology/2020.acl-main.351.pdf -main.379,Neural Reranking for Dependency Parsing: An Evaluation,Bich-Ngoc Do|Ines Rehbein,"Recent work has shown that neural rerankers can improve results for dependency parsing over the top k trees produced by a base parser. However, all neural rerankers so far have been evaluated on English and Chinese only, both languages with a configurational word order and poor morphology. In the paper, we re-assess the potential of successful neural reranking models from the literature on English and on two morphologically rich(er) languages, German and Czech. In addition, we introduce a new variation of a discriminative reranker based on graph convolutional networks (GCNs). 
We show that the GCN not only outperforms previous models on English but is the only model that is able to improve results over the baselines on German and Czech. We explain the differences in reranking performance based on an analysis of a) the gold tree ratio and b) the variety in the k-best lists.",Neural Reranking|Dependency Parsing|reranking|neural rerankers,"Syntax: Tagging, Chunking and Parsing",Long,https://www.aclweb.org/anthology/2020.acl-main.379.pdf -main.66,Improved Natural Language Generation via Loss Truncation,Daniel Kang|Tatsunori Hashimoto,"Neural language models are usually trained to match the distributional properties of large-scale corpora by minimizing the log loss. While straightforward to optimize, this approach forces the model to reproduce all variations in the dataset, including noisy and invalid references (e.g., misannotations and hallucinated facts). Even a small fraction of noisy data can degrade the performance of log loss. As an alternative, prior work has shown that minimizing the distinguishability of generated samples is a principled and robust loss that can handle invalid references. However, distinguishability has not been used in practice due to challenges in optimization and estimation. We propose loss truncation: a simple and scalable procedure which adaptively removes high log loss examples as a way to optimize for distinguishability. Empirically, we demonstrate that loss truncation outperforms existing baselines on distinguishability on a summarization task. Furthermore, we show that samples generated by the loss truncation model have factual accuracy ratings that exceed those of baselines and match human references.",Natural Generation|optimization|estimation|distinguishability,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.66.pdf -main.72,Interactive Construction of User-Centric Dictionary for Text Analytics,Ryosuke Kohita|Issei Yoshida|Hiroshi Kanayama|Tetsuya Nasukawa,"We propose a methodology to construct a term dictionary for text analytics through an interactive process between a human and a machine, which helps the creation of flexible dictionaries with precise granularity required in typical text analysis. This paper introduces the first formulation of interactive dictionary construction to address this issue. To optimize the interaction, we propose a new algorithm that effectively captures an analyst's intention starting from only a small number of sample terms. Along with the algorithm, we also design an automatic evaluation framework that provides a systematic assessment of any interactive method for the dictionary creation task. Experiments using real scenario based corpora and dictionaries show that our algorithm outperforms baseline methods, and works even with a small number of interactions.",Interactive Dictionary|Text Analytics|text analysis|interactive construction,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.acl-main.72.pdf -main.64,USR: An Unsupervised and Reference Free Evaluation Metric for Dialog Generation,Shikib Mehri|Maxine Eskenazi,"The lack of meaningful automatic evaluation metrics for dialog has impeded open-domain dialog research. Standard language generation metrics have been shown to be ineffective for evaluating dialog models. To this end, this paper presents USR, an UnSupervised and Reference-free evaluation metric for dialog. USR is a reference-free metric that trains unsupervised models to measure several desirable qualities of dialog. 
USR is shown to strongly correlate with human judgment on both Topical-Chat (turn-level: 0.42, system-level: 1.0) and PersonaChat (turn-level: 0.48 and system-level: 1.0). USR additionally produces interpretable measures for several desirable properties of dialog.",Dialog Generation|dialog|open-domain research|USR,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.64.pdf -main.70,An Online Semantic-enhanced Dirichlet Model for Short Text Stream Clustering,Jay Kumar|Junming Shao|Salah Uddin|Wazir Ali,"Clustering short text streams is a challenging task due to its unique properties: infinite length, sparse data representation and cluster evolution. Existing approaches often exploit short text streams in a batch way. However, determining the optimal batch size is usually a difficult task since we have no a priori knowledge when the topics evolve. In addition, traditional independent word representation in graphical model tends to cause ``term ambiguity"" problem in short text clustering. Therefore, in this paper, we propose an Online Semantic-enhanced Dirichlet Model for short text stream clustering, called OSDM, which integrates the word-occurrence semantic information (i.e., context) into a new graphical model and clusters each arriving short text automatically in an online way. Extensive results have demonstrated that OSDM has better performance compared to many state-of-the-art algorithms on both synthetic and real-world data sets.",Short Clustering|Clustering streams|Online Model|sparse representation,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.acl-main.70.pdf -main.409,Learning to Faithfully Rationalize by Construction,Sarthak Jain|Sarah Wiegreffe|Yuval Pinter|Byron C. Wallace,"In many settings it is important for one to be able to understand why a model made a particular prediction. In NLP this often entails extracting snippets of an input text 'responsible for' corresponding model output; when such a snippet comprises tokens that indeed informed the model's prediction, it is a faithful explanation. In some settings, faithfulness may be critical to ensure transparency. Lei et al. (2016) proposed a model to produce faithful rationales for neural text classification by defining independent snippet extraction and prediction modules. However, the discrete selection over input tokens performed by this method complicates training, leading to high variance and requiring careful hyperparameter tuning. We propose a simpler variant of this approach that provides faithful explanations by construction. In our scheme, named FRESH, arbitrary feature importance scores (e.g., gradients from a trained model) are used to induce binary labels over token inputs, which an extractor can be trained to predict. An independent classifier module is then trained exclusively on snippets provided by the extractor; these snippets thus constitute faithful explanations, even if the classifier is arbitrarily complex. In both automatic and manual evaluations we find that variants of this simple framework yield predictive performance superior to 'end-to-end' approaches, while being more general and easier to train. 
Code is available at https://github.com/successar/FRESH.",NLP|neural classification|training|automatic evaluations,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.409.pdf -main.421,On the Cross-lingual Transferability of Monolingual Representations,Mikel Artetxe|Sebastian Ruder|Dani Yogatama,"State-of-the-art unsupervised multilingual models (e.g., multilingual BERT) have been shown to generalize in a zero-shot cross-lingual setting. This generalization ability has been attributed to the use of a shared subword vocabulary and joint training across multiple languages giving rise to deep multilingual abstractions. We evaluate this hypothesis by designing an alternative approach that transfers a monolingual model to new languages at the lexical level. More concretely, we first train a transformer-based masked language model on one language, and transfer it to a new language by learning a new embedding matrix with the same masked language modeling objective, freezing parameters of all other layers. This approach does not rely on a shared vocabulary or joint training. However, we show that it is competitive with multilingual BERT on standard cross-lingual classification benchmarks and on a new Cross-lingual Question Answering Dataset (XQuAD). Our results contradict common beliefs of the basis of the generalization ability of multilingual models and suggest that deep monolingual models learn some abstractions that generalize across languages. We also release XQuAD as a more comprehensive cross-lingual benchmark, which comprises 240 paragraphs and 1190 question-answer pairs from SQuAD v1.1 translated into ten languages by professional translators.",zero-shot setting|Cross-lingual Representations|unsupervised models|joint training,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.421.pdf -main.2,Predicting Depression in Screening Interviews from Latent Categorization of Interview Prompts,Alex Rinaldi|Jean Fox Tree|Snigdha Chaturvedi,"Accurately diagnosing depression is difficult-- requiring time-intensive interviews, assessments, and analysis. Hence, automated methods that can assess linguistic patterns in these interviews could help psychiatric professionals make faster, more informed decisions about diagnosis. We propose JLPC, a model that analyzes interview transcripts to identify depression while jointly categorizing interview prompts into latent categories. This latent categorization allows the model to define high-level conversational contexts that influence patterns of language in depressed individuals. We show that the proposed model not only outperforms competitive baselines, but that its latent prompt categories provide psycholinguistic insights about depression.",Depression Interviews|depression|assessments|analysis,Cognitive Modeling and Psycholinguistics,Long,https://www.aclweb.org/anthology/2020.acl-main.2.pdf -main.58,Learning to Tag OOV Tokens by Integrating Contextual Representation and Background Knowledge,Keqing He|Yuanmeng Yan|Weiran XU,"Neural-based context-aware models for slot tagging have achieved state-of-the-art performance. However, the presence of OOV(out-of-vocab) words significantly degrades the performance of neural-based models, especially in a few-shot scenario. In this paper, we propose a novel knowledge-enhanced slot tagging model to integrate contextual representation of input text and the large-scale lexical background knowledge. 
Besides, we use multi-level graph attention to explicitly model lexical relations. The experiments show that our proposed knowledge integration mechanism achieves consistent improvements across settings with different sizes of training data on two public benchmark datasets.",slot tagging|Contextual Representation|Neural-based models|knowledge-enhanced model,Dialogue and Interactive Systems,Short,https://www.aclweb.org/anthology/2020.acl-main.58.pdf -main.347,Learning Spoken Language Representations with Neural Lattice Language Modeling,Chao-Wei Huang|Yun-Nung Chen,"Pre-trained language models have achieved huge improvement on many NLP tasks. However, these methods are usually designed for written text, so they do not consider the properties of spoken language. Therefore, this paper aims at generalizing the idea of language model pre-training to lattices generated by recognition systems. We propose a framework that trains neural lattice language models to provide contextualized representations for spoken language understanding tasks. The proposed two-stage pre-training approach reduces the demands of speech data and has better efficiency. Experiments on intent detection and dialogue act recognition datasets demonstrate that our proposed method consistently outperforms strong baselines when evaluated on spoken inputs. The code is available at https://github.com/MiuLab/Lattice-ELMo.",NLP tasks|spoken tasks|intent detection|Spoken Representations,Speech and Multimodality,Short,https://www.aclweb.org/anthology/2020.acl-main.347.pdf -main.353,"It Takes Two to Lie: One to Lie, and One to Listen",Denis Peskov|Benny Cheng|Ahmed Elgohary|Joe Barrow|Cristian Danescu-Niculescu-Mizil|Jordan Boyd-Graber,"Trust is implicit in many online text conversations---striking up new friendships, or asking for tech support. But trust can be betrayed through deception. We study the language and dynamics of deception in the negotiation-based game Diplomacy, where seven players compete for world domination by forging and breaking alliances with each other. Our study with players from the Diplomacy community gathers 17,289 messages annotated by the sender for their intended truthfulness and by the receiver for their perceived truthfulness. Unlike existing datasets, this captures deception in long-lasting relationships, where the interlocutors strategically combine truth with lies to advance objectives. A model that uses power dynamics and conversational contexts can predict when a lie occurs nearly as well as human players.",language deception|deception|Trust|tech support,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.353.pdf -main.435,Dense-Caption Matching and Frame-Selection Gating for Temporal Localization in VideoQA,Hyounghun Kim|Zineng Tang|Mohit Bansal,"Videos convey rich information. Dynamic spatio-temporal relationships between people/objects, and diverse multimodal events are present in a video clip. Hence, it is important to develop automated models that can accurately extract such information from videos. Answering questions on videos is one of the tasks which can evaluate such AI abilities. In this paper, we propose a video question answering model which effectively integrates multi-modal input sources and finds the temporally relevant information to answer questions. 
Specifically, we first employ dense image captions to help identify objects and their detailed salient regions and actions, and hence give the model useful extra information (in explicit textual format to allow easier matching) for answering questions. Moreover, our model is also comprised of dual-level attention (word/object and frame level), multi-head self/cross-integration for different sources (video and dense captions), and gates which pass more relevant information to the classifier. Finally, we also cast the frame selection problem as a multi-label classification task and introduce two loss functions, In-andOut Frame Score Margin (IOFSM) and Balanced Binary Cross-Entropy (BBCE), to better supervise the model with human importance annotations. We evaluate our model on the challenging TVQA dataset, where each of our model components provides significant gains, and our overall model outperforms the state-of-the-art by a large margin (74.09% versus 70.52%). We also present several word, object, and frame level visualization studies.",Dense-Caption Matching|Temporal VideoQA|answering questions|frame problem,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.435.pdf -main.384,Probing for Referential Information in Language Models,Ionut-Teodor Sorodoc|Kristina Gulordava|Gemma Boleda,"Language models keep track of complex information about the preceding context -- including, e.g., syntactic relations in a sentence. We investigate whether they also capture information beneficial for resolving pronominal anaphora in English. We analyze two state of the art models with LSTM and Transformer architectures, via probe tasks and analysis on a coreference annotated corpus. The Transformer outperforms the LSTM in all analyses. Our results suggest that language models are more successful at learning grammatical constraints than they are at learning truly referential information, in the sense of capturing the fact that we use language to refer to entities in the world. However, we find traces of the latter aspect, too.",Probing|probe tasks|Language Models|LSTM architectures,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.384.pdf -main.390,Empowering Active Learning to Jointly Optimize System and User Demands,Ji-Ung Lee|Christian M. Meyer|Iryna Gurevych,"Existing approaches to active learning maximize the system performance by sampling unlabeled instances for annotation that yield the most efficient training. However, when active learning is integrated with an end-user application, this can lead to frustration for participating users, as they spend time labeling instances that they would not otherwise be interested in reading. In this paper, we propose a new active learning approach that jointly optimizes the seemingly counteracting objectives of the active learning system (training efficiently) and the user (receiving useful instances). We study our approach in an educational application, which particularly benefits from this technique as the system needs to rapidly learn to predict the appropriateness of an exercise to a particular user, while the users should receive only exercises that match their skills. 
We evaluate multiple learning strategies and user types with data from real users and find that our joint approach better satisfies both objectives when alternative methods lead to many unsuitable exercises for end users.",educational application|Active Learning|end-user application|active approach,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.390.pdf -main.179,Recurrent Neural Network Language Models Always Learn English-Like Relative Clause Attachment,Forrest Davis|Marten van Schijndel,"A standard approach to evaluating language models analyzes how models assign probabilities to valid versus invalid syntactic constructions (i.e. is a grammatical sentence more probable than an ungrammatical sentence). Our work uses ambiguous relative clause attachment to extend such evaluations to cases of multiple simultaneous valid interpretations, where stark grammaticality differences are absent. We compare model performance in English and Spanish to show that non-linguistic biases in RNN LMs advantageously overlap with syntactic structure in English but not Spanish. Thus, English models may appear to acquire human-like syntactic preferences, while models trained on Spanish fail to acquire comparable human-like preferences. We conclude by relating these results to broader concerns about the relationship between comprehension (i.e. typical language model use cases) and production (which generates the training data for language models), suggesting that necessary linguistic biases are not present in the training signal at all.",production|Recurrent Always|language models|RNN LMs,Cognitive Modeling and Psycholinguistics,Long,https://www.aclweb.org/anthology/2020.acl-main.179.pdf -main.145,Character-Level Translation with Self-attention,Yingqiang Gao|Nikola I. Nikolov|Yuhuang Hu|Richard H.R. Hahnloser,"We explore the suitability of self-attention models for character-level neural machine translation. We test the standard transformer model, as well as a novel variant in which the encoder block combines information from nearby characters using convolutions. We perform extensive experiments on WMT and UN datasets, testing both bilingual and multilingual translation to English using up to three input languages (French, Spanish, and Chinese). Our transformer variant consistently outperforms the standard transformer at the character-level and converges faster while learning more robust character-level alignments.",Character-Level Translation|bilingual translation|self-attention models|transformer model,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.145.pdf -main.623,Estimating predictive uncertainty for rumour verification models,Elena Kochkina|Maria Liakata,"The inability to correctly resolve rumours circulating online can have harmful real-world consequences. We present a method for incorporating model and data uncertainty estimates into natural language processing models for automatic rumour verification. We show that these estimates can be used to filter out model predictions likely to be erroneous so that these difficult instances can be prioritised by a human fact-checker. We propose two methods for uncertainty-based instance rejection, supervised and unsupervised. 
We also show how uncertainty estimates can be used to interpret model performance as a rumour unfolds.",automatic verification|uncertainty-based rejection|rumour models|natural models,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.623.pdf -main.637,Modeling Long Context for Task-Oriented Dialogue State Generation,Jun Quan|Deyi Xiong,"Based on the recently proposed transferable dialogue state generator (TRADE) that predicts dialogue states from utterance-concatenated dialogue context, we propose a multi-task learning model with a simple yet effective utterance tagging technique and a bidirectional language model as an auxiliary task for task-oriented dialogue state generation. By enabling the model to learn a better representation of the long dialogue context, our approaches attempt to solve the problem that the performance of the baseline significantly drops when the input dialogue context sequence is long. In our experiments, our proposed model achieves a 7.03% relative improvement over the baseline, establishing a new state-of-the-art joint goal accuracy of 52.04% on the MultiWOZ 2.0 dataset.",Task-Oriented Generation|auxiliary task|transferable generator|TRADE,Dialogue and Interactive Systems,Short,https://www.aclweb.org/anthology/2020.acl-main.637.pdf -main.151,On the Limitations of Cross-lingual Encoders as Exposed by Reference-Free Machine Translation Evaluation,Wei Zhao|Goran Glavaš|Maxime Peyrard|Yang Gao|Robert West|Steffen Eger,"Evaluation of cross-lingual encoders is usually performed either via zero-shot cross-lingual transfer in supervised downstream tasks or via unsupervised cross-lingual textual similarity. In this paper, we concern ourselves with reference-free machine translation (MT) evaluation where we directly compare source texts to (sometimes low-quality) system translations, which represents a natural adversarial setup for multilingual encoders. Reference-free evaluation holds the promise of web-scale comparison of MT systems. We systematically investigate a range of metrics based on state-of-the-art cross-lingual semantic representations obtained with pretrained M-BERT and LASER. We find that they perform poorly as semantic encoders for reference-free MT evaluation and identify their two key limitations, namely, (a) a semantic mismatch between representations of mutual translations and, more prominently, (b) the inability to punish ""translationese"", i.e., low-quality literal translations. We propose two partial remedies: (1) post-hoc re-alignment of the vector spaces and (2) coupling of semantic-similarity based metrics with target-side language modeling. In segment-level MT evaluation, our best metric surpasses reference-based BLEU by 5.7 correlation points.",Evaluation encoders|zero-shot transfer|supervised tasks|web-scale systems,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.151.pdf -main.186,Recursive Template-based Frame Generation for Task Oriented Dialog,Rashmi Gangadharaiah|Balakrishnan Narayanaswamy,"The Natural Language Understanding (NLU) component in task oriented dialog systems processes a user's request and converts it into structured information that can be consumed by downstream components such as the Dialog State Tracker (DST). This information is typically represented as a semantic frame that captures the intent and slot-labels provided by the user. 
We first show that such a shallow representation is insufficient for complex dialog scenarios, because it does not capture the recursive nature inherent in many domains. We propose a recursive, hierarchical frame-based representation and show how to learn it from data. We formulate the frame generation task as a template-based tree decoding task, where the decoder recursively generates a template and then fills slot values into the template. We extend local tree-based loss functions with terms that provide global supervision and show how to optimize them end-to-end. We achieve a small improvement on the widely used ATIS dataset and a much larger improvement on a more complex dataset we describe here.",Task Dialog|Natural component|task systems|complex scenarios,Dialogue and Interactive Systems,Short,https://www.aclweb.org/anthology/2020.acl-main.186.pdf -main.192,Generalizing Natural Language Analysis through Span-relation Representations,Zhengbao Jiang|Wei Xu|Jun Araki|Graham Neubig,"Natural language processing covers a wide variety of tasks predicting syntax, semantics, and information content, and usually each type of output is generated with specially designed architectures. In this paper, we provide the simple insight that a great variety of tasks can be represented in a single unified format consisting of labeling spans and relations between spans, thus a single task-independent model can be used across different tasks. We perform extensive experiments to test this insight on 10 disparate tasks spanning dependency parsing (syntax), semantic role labeling (semantics), relation extraction (information content), aspect based sentiment analysis (sentiment), and many others, achieving performance comparable to state-of-the-art specialized models. We further demonstrate benefits of multi-task learning, and also show that the proposed method makes it easy to analyze differences and similarities in how the model handles different tasks. Finally, we convert these datasets into a unified format to build a benchmark, which provides a holistic testbed for evaluating future models for generalized natural language analysis.",Natural Analysis|Natural processing|dependency parsing|semantic labeling,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.192.pdf -main.757,Regularized Context Gates on Transformer for Machine Translation,Xintong Li|Lemao Liu|Rui Wang|Guoping Huang|Max Meng,"Context gates are effective to control the contributions from the source and target contexts in the recurrent neural network (RNN) based neural machine translation (NMT). However, it is challenging to extend them into the advanced Transformer architecture, which is more complicated than RNN. This paper first provides a method to identify source and target contexts and then introduce a gate mechanism to control the source and target contributions in Transformer. In addition, to further reduce the bias problem in the gate mechanism, this paper proposes a regularization method to guide the learning of the gates with supervision automatically generated using pointwise mutual information. 
Extensive experiments on 4 translation datasets demonstrate that the proposed model obtains an averaged gain of 1.0 BLEU score over a strong Transformer baseline.",Machine Translation|learning gates|Regularized Gates|Transformer,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.757.pdf -main.743,Predicting the Focus of Negation: Model and Error Analysis,Md Mosharaf Hossain|Kathleen Hamilton|Alexis Palmer|Eduardo Blanco,"The focus of a negation is the set of tokens intended to be negated, and a key component for revealing affirmative alternatives to negated utterances. In this paper, we experiment with neural networks to predict the focus of negation. Our main novelty is leveraging a scope detector to introduce the scope of negation as an additional input to the network. Experimental results show that doing so obtains the best results to date. Additionally, we perform a detailed error analysis providing insights into the main error categories, and analyze errors depending on whether the model takes into account scope and context information.",Negation|error analysis|Model Analysis|neural networks,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.743.pdf -main.569,A Top-down Neural Architecture towards Text-level Parsing of Discourse Rhetorical Structure,Longyin Zhang|Yuqing Xing|Fang Kong|Peifeng Li|Guodong Zhou,"Due to its great importance in deep natural language understanding and various down-stream applications, text-level parsing of discourse rhetorical structure (DRS) has been drawing more and more attention in recent years. However, all the previous studies on text-level discourse parsing adopt bottom-up approaches, which much limit the DRS determination on local information and fail to well benefit from global information of the overall discourse. In this paper, we justify from both computational and perceptive points-of-view that the top-down architecture is more suitable for text-level DRS parsing. On the basis, we propose a top-down neural architecture toward text-level DRS parsing. In particular, we cast discourse parsing as a recursive split point ranking task, where a split point is classified to different levels according to its rank and the elementary discourse units (EDUs) associated with it are arranged accordingly. In this way, we can determine the complete DRS as a hierarchical tree structure via an encoder-decoder with an internal stack. Experimentation on both the English RST-DT corpus and the Chinese CDTB corpus shows the great effectiveness of our proposed top-down approach towards text-level DRS parsing.",Text-level Structure|natural understanding|down-stream applications|text-level parsing,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.acl-main.569.pdf -main.555,Leveraging Graph to Improve Abstractive Multi-Document Summarization,Wei Li|Xinyan Xiao|Jiachen Liu|Hua Wu|Haifeng Wang|Junping Du,"Graphs that capture relations between textual units have great benefits for detecting salient information from multiple documents and generating overall coherent summaries. In this paper, we develop a neural abstractive multi-document summarization (MDS) model which can leverage well-known graph representations of documents such as similarity graph and discourse graph, to more effectively process multiple input documents and produce abstractive summaries. Our model utilizes graphs to encode documents in order to capture cross-document relations, which is crucial to summarizing long documents. 
Our model can also take advantage of graphs to guide the summary generation process, which is beneficial for generating coherent and concise summaries. Furthermore, pre-trained language models can be easily combined with our model, which further improve the summarization performance significantly. Empirical results on the WikiSum and MultiNews dataset show that the proposed architecture brings substantial improvements over several strong baselines.",Abstractive Summarization|detecting information|generating summaries|graph documents,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.555.pdf -main.233,MART: Memory-Augmented Recurrent Transformer for Coherent Video Paragraph Captioning,Jie Lei|Liwei Wang|Yelong Shen|Dong Yu|Tamara Berg|Mohit Bansal,"Generating multi-sentence descriptions for videos is one of the most challenging captioning tasks due to its high requirements for not only visual relevance but also discourse-based coherence across the sentences in the paragraph. Towards this goal, we propose a new approach called Memory-Augmented Recurrent Transformer (MART), which uses a memory module to augment the transformer architecture. The memory module generates a highly summarized memory state from the video segments and the sentence history so as to help better prediction of the next sentence (w.r.t. coreference and repetition aspects), thus encouraging coherent paragraph generation. Extensive experiments, human evaluations, and qualitative analyses on two popular datasets ActivityNet Captions and YouCookII show that MART generates more coherent and less repetitive paragraph captions than baseline methods, while maintaining relevance to the input video events.",Coherent Captioning|Generating descriptions|captioning tasks|coherent generation,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.233.pdf -main.227,Improving Adversarial Text Generation by Modeling the Distant Future,Ruiyi Zhang|Changyou Chen|Zhe Gan|Wenlin Wang|Dinghan Shen|Guoyin Wang|Zheng Wen|Lawrence Carin,"Auto-regressive text generation models usually focus on local fluency, and may cause inconsistent semantic meaning in long text generation. Further, automatically generating words with similar semantics is challenging, and hand-crafted linguistic rules are difficult to apply. We consider a text planning scheme and present a model-based imitation-learning approach to alleviate the aforementioned issues. Specifically, we propose a novel guider network to focus on the generative process over a longer horizon, which can assist next-word prediction and provide intermediate rewards for generator optimization. Extensive experiments demonstrate that the proposed method leads to improved performance.",Adversarial Generation|long generation|next-word prediction|generator optimization,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.227.pdf -main.541,Benchmarking Multimodal Regex Synthesis with Complex Structures,Xi Ye|Qiaochu Chen|Isil Dillig|Greg Durrett,"Existing datasets for regular expression (regex) generation from natural language are limited in complexity; compared to regex tasks that users post on StackOverflow, the regexes in these datasets are simple, and the language used to describe them is not diverse. We introduce StructuredRegex, a new regex synthesis dataset differing from prior ones in three aspects. 
First, to obtain structurally complex and realistic regexes, we generate the regexes using a probabilistic grammar with pre-defined macros observed from real-world StackOverflow posts. Second, to obtain linguistically diverse natural language descriptions, we show crowdworkers abstract depictions of the underlying regex and ask them to describe the pattern they see, rather than having them paraphrase synthetic language. Third, we augment each regex example with a collection of strings that are and are not matched by the ground truth regex, similar to how real users give examples. Our quantitative and qualitative analysis demonstrates the advantages of StructuredRegex over prior datasets. Further experimental results using various multimodal synthesis techniques highlight the challenge presented by our dataset, including non-local constraints and multi-modal inputs.",Multimodal Synthesis|regular generation|regex tasks|StackOverflow,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.541.pdf -main.596,Modeling Morphological Typology for Unsupervised Learning of Language Morphology,Hongzhi Xu|Jordan Kodner|Mitchell Marcus|Charles Yang,"This paper describes a language-independent model for fully unsupervised morphological analysis that exploits a universal framework leveraging morphological typology. By modeling morphological processes including suffixation, prefixation, infixation, and full and partial reduplication with constrained stem change rules, our system effectively constrains the search space and offers a wide coverage in terms of morphological typology. The system is tested on nine typologically and genetically diverse languages, and shows superior performance over leading systems. We also investigate the effect of an oracle that provides only a handful of bits per language to signal morphological type.",Unsupervised Morphology|fully analysis|language-independent model|universal framework,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.acl-main.596.pdf -main.582,Synchronous Double-channel Recurrent Network for Aspect-Opinion Pair Extraction,Shaowei Chen|Jie Liu|Yu Wang|Wenzheng Zhang|Ziming Chi,"Opinion entity extraction is a fundamental task in fine-grained opinion mining. Related studies generally extract aspects and/or opinion expressions without recognizing the relations between them. However, the relations are crucial for downstream tasks, including sentiment classification, opinion summarization, etc. In this paper, we explore Aspect-Opinion Pair Extraction (AOPE) task, which aims at extracting aspects and opinion expressions in pairs. To deal with this task, we propose Synchronous Double-channel Recurrent Network (SDRN) mainly consisting of an opinion entity extraction unit, a relation detection unit, and a synchronization unit. The opinion entity extraction unit and the relation detection unit are developed as two channels to extract opinion entities and relations simultaneously. Furthermore, within the synchronization unit, we design Entity Synchronization Mechanism (ESM) and Relation Synchronization Mechanism (RSM) to enhance the mutual benefit on the above two channels. To verify the performance of SDRN, we manually build three datasets based on SemEval 2014 and 2015 benchmarks. 
Extensive experiments demonstrate that SDRN achieves state-of-the-art performances.",Aspect-Opinion Extraction|Opinion extraction|fine-grained mining|sentiment classification,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.582.pdf -main.583,Cross-modal Coherence Modeling for Caption Generation,Malihe Alikhani|Piyush Sharma|Shengjie Li|Radu Soricut|Matthew Stone,"We use coherence relations inspired by computational models of discourse to study the information needs and goals of image captioning. Using an annotation protocol specifically devised for capturing image--caption coherence relations, we annotate 10,000 instances from publicly-available image--caption pairs. We introduce a new task for learning inferences in imagery and text, coherence relation prediction, and show that these coherence annotations can be exploited to learn relation classifiers as an intermediary step, and also train coherence-aware, controllable image captioning models. The results show a dramatic improvement in the consistency and quality of the generated captions with respect to information needs specified via coherence relations.",Caption Generation|image captioning|coherence prediction|Cross-modal Modeling,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.583.pdf -main.597,Predicting Declension Class from Form and Meaning,Adina Williams|Tiago Pimentel|Hagen Blix|Arya D. McCarthy|Eleanor Chodroff|Ryan Cotterell,"The noun lexica of many natural languages are divided into several declension classes with characteristic morphological properties. Class membership is far from deterministic, but the phonological form of a noun and/or its meaning can often provide imperfect clues. Here, we investigate the strength of those clues. More specifically, we operationalize this by measuring how much information, in bits, we can glean about declension class from knowing the form and/or meaning of nouns. We know that form and meaning are often also indicative of grammatical gender—which, as we quantitatively verify, can itself share information with declension class—so we also control for gender. We find for two Indo-European languages (Czech and German) that form and meaning respectively share significant amounts of information with class (and contribute additional information above and beyond gender). The three-way interaction between class, form, and meaning (given gender) is also significant. Our study is important for two reasons: First, we introduce a new method that provides additional quantitative support for a classic linguistic finding that form and meaning are relevant for the classification of nouns into declensions. Secondly, we show not only that individual declensions classes vary in the strength of their clues within a language, but also that these variations themselves vary across languages.",classification nouns|Declension Class|noun lexica|declension classes,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.acl-main.597.pdf -main.226,INSET: Sentence Infilling with INter-SEntential Transformer,Yichen Huang|Yizhe Zhang|Oussama Elachqar|Yu Cheng,"Missing sentence generation (or sentence in-filling) fosters a wide range of applications in natural language generation, such as document auto-completion and meeting note expansion. This task asks the model to generate intermediate missing sentences that can syntactically and semantically bridge the surrounding context. 
Solving the sentence infilling task requires techniques in natural language processing ranging from understanding to discourse-level planning to generation. In this paper, we propose a framework to decouple the challenge and address these three aspects respectively, leveraging the power of existing large-scale pre-trained models such as BERT and GPT-2. We empirically demonstrate the effectiveness of our model in learning a sentence representation for generation and further generating a missing sentence that fits the context.",Sentence Infilling|Missing generation|sentence in-filling|natural generation,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.226.pdf -main.540,Word-level Textual Adversarial Attacking as Combinatorial Optimization,Yuan Zang|Fanchao Qi|Chenghao Yang|Zhiyuan Liu|Meng Zhang|Qun Liu|Maosong Sun,"Adversarial attacks are carried out to reveal the vulnerability of deep neural networks. Textual adversarial attacking is challenging because text is discrete and a small perturbation can bring significant change to the original input. Word-level attacking, which can be regarded as a combinatorial optimization problem, is a well-studied class of textual attack methods. However, existing word-level attack models are far from perfect, largely because unsuitable search space reduction methods and inefficient optimization algorithms are employed. In this paper, we propose a novel attack model, which incorporates the sememe-based word substitution method and particle swarm optimization-based search algorithm to solve the two problems separately. We conduct exhaustive experiments to evaluate our attack model by attacking BiLSTM and BERT on three benchmark datasets. Experimental results demonstrate that our model consistently achieves much higher attack success rates and crafts more high-quality adversarial examples as compared to baseline methods. Also, further experiments show our model has higher transferability and can bring more robustness enhancement to victim models by adversarial training. All the code and data of this paper can be obtained on https://github.com/thunlp/SememePSO-Attack.",Textual attacking|Word-level attacking|combinatorial problem|Word-level Attacking,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.540.pdf -main.554,Jointly Learning to Align and Summarize for Neural Cross-Lingual Summarization,Yue Cao|Hui Liu|Xiaojun Wan,"Cross-lingual summarization is the task of generating a summary in one language given a text in a different language. Previous works on cross-lingual summarization mainly focus on using pipeline methods or training an end-to-end model using the translated parallel data. However, it is a big challenge for the model to directly learn cross-lingual summarization as it requires learning to understand different languages and learning how to summarize at the same time. In this paper, we propose to ease the cross-lingual summarization training by jointly learning to align and summarize. We design relevant loss functions to train this framework and propose several methods to enhance the isomorphism and cross-lingual transfer between languages. Experimental results show that our model can outperform competitive models in most cases. 
In addition, we show that our model even has the ability to generate cross-lingual summaries without access to any cross-lingual corpus.",Neural Summarization|Cross-lingual summarization|cross-lingual training|pipeline methods,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.554.pdf -main.232,Learning to execute instructions in a Minecraft dialogue,Prashant Jayannavar|Anjali Narayan-Chen|Julia Hockenmaier,"The Minecraft Collaborative Building Task is a two-player game in which an Architect (A) instructs a Builder (B) to construct a target structure in a simulated Blocks World Environment. We define the subtask of predicting correct action sequences (block placements and removals) in a given game context, and show that capturing B's past actions as well as B's perspective leads to a significant improvement in performance on this challenging language understanding problem.",Minecraft Task|predicting sequences|language problem|Builder,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.232.pdf -main.568,Speaker Sensitive Response Evaluation Model,JinYeong Bak|Alice Oh,"Automatic evaluation of open-domain dialogue response generation is very challenging because there are many appropriate responses for a given context. Existing evaluation models merely compare the generated response with the ground truth response and rate many of the appropriate responses as inappropriate if they deviate from the ground truth. One approach to resolve this problem is to consider the similarity of the generated response with the conversational context. In this paper, we propose an automatic evaluation model based on that idea and learn the model parameters from an unlabeled conversation corpus. Our approach considers the speakers in defining the different levels of similar context. We use a Twitter conversation corpus that contains many speakers and conversations to test our evaluation model. Experiments show that our model outperforms the other existing evaluation metrics in terms of high correlation with human annotation scores. We also show that our model trained on Twitter can be applied to movie dialogues without any additional training. We provide our code and the learned parameters so that they can be used for automatic evaluation of dialogue response generation models.",Speaker Model|Automatic generation|open-domain generation|automatic models,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.568.pdf -main.742,Exploring Unexplored Generalization Challenges for Cross-Database Semantic Parsing,Alane Suhr|Ming-Wei Chang|Peter Shaw|Kenton Lee,"We study the task of cross-database semantic parsing (XSP), where a system that maps natural language utterances to executable SQL queries is evaluated on databases unseen during training. Recently, several datasets, including Spider, were proposed to support development of XSP systems. We propose a challenging evaluation setup for cross-database semantic parsing, focusing on variation across database schemas and in-domain language use. We re-purpose eight semantic parsing datasets that have been well-studied in the setting where in-domain training data is available, and instead use them as additional evaluation data for XSP systems instead. We build a system that performs well on Spider, and find that it struggles to generalize to our re-purposed set. 
Our setup uncovers several generalization challenges for cross-database semantic parsing, demonstrating the need to use and develop diverse training and evaluation datasets.",Exploring Challenges|Cross-Database Parsing|cross-database XSP|cross-database,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.742.pdf -main.756,Parallel Corpus Filtering via Pre-trained Language Models,Boliang Zhang|Ajay Nagesh|Kevin Knight,"Web-crawled data provides a good source of parallel corpora for training machine translation models. It is automatically obtained, but extremely noisy, and recent work shows that neural machine translation systems are more sensitive to noise than traditional statistical machine translation methods. In this paper, we propose a novel approach to filter out noisy sentence pairs from web-crawled corpora via pre-trained language models. We measure sentence parallelism by leveraging the multilingual capability of BERT and use the Generative Pre-training (GPT) language model as a domain filter to balance data domains. We evaluate the proposed method on the WMT 2018 Parallel Corpus Filtering shared task, and on our own web-crawled Japanese-Chinese parallel corpus. Our method significantly outperforms baselines and achieves a new state-of-the-art. In an unsupervised setting, our method achieves comparable performance to the top-1 supervised method. We also evaluate on a web-crawled Japanese-Chinese parallel corpus that we make publicly available.",machine models|WMT task|Parallel Filtering|Pre-trained Models,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.756.pdf -main.193,Learning to Contextually Aggregate Multi-Source Supervision for Sequence Labeling,Ouyu Lan|Xiao Huang|Bill Yuchen Lin|He Jiang|Liyuan Liu|Xiang Ren,"Sequence labeling is a fundamental task for a range of natural language processing problems. When used in practice, its performance is largely influenced by the annotation quality and quantity, and meanwhile, obtaining ground truth labels is often costly. In many cases, ground truth labels do not exist, but noisy annotations or annotations from different domains are accessible. In this paper, we propose a novel framework Consensus Network (ConNet) that can be trained on annotations from multiple sources (e.g., crowd annotation, cross-domain data). It learns individual representation for every source and dynamically aggregates source-specific knowledge by a context-aware attention module. Finally, it leads to a model reflecting the agreement (consensus) among multiple sources. We evaluate the proposed framework in two practical settings of multi-source learning: learning with crowd annotations and unsupervised cross-domain model adaptation. Extensive experimental results show that our model achieves significant improvements over existing methods in both settings. We also demonstrate that the method can apply to various tasks and cope with different encoders.",Sequence Labeling|natural problems|crowd annotation|multi-source learning,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.193.pdf -main.187,Speak to your Parser: Interactive Text-to-SQL with Natural Language Feedback,Ahmed Elgohary|Saghar Hosseini|Ahmed Hassan Awadallah,"We study the task of semantic parse correction with natural language feedback. Given a natural language utterance, most semantic parsing systems pose the problem as one-shot translation where the utterance is mapped to a corresponding logical form. 
In this paper, we investigate a more interactive scenario where humans can further interact with the system by providing free-form natural language feedback to correct the system when it generates an inaccurate interpretation of an initial utterance. We focus on natural language to SQL systems and construct, SPLASH, a dataset of utterances, incorrect SQL interpretations and the corresponding natural language feedback. We compare various reference models for the correction task and show that incorporating such a rich form of feedback can significantly improve the overall semantic parsing accuracy while retaining the flexibility of natural language interaction. While we estimated human correction accuracy is 81.5%, our best model achieves only 25.1%, which leaves a large gap for improvement in future research. SPLASH is publicly available at https://aka.ms/Splash_dataset.",semantic correction|one-shot translation|correction task|Parser,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.187.pdf -main.636,Meta-Reinforced Multi-Domain State Generator for Dialogue Systems,Yi Huang|Junlan Feng|Min Hu|Xiaoting Wu|Xiaoyu Du|Shuo Ma,"A Dialogue State Tracker (DST) is a core component of a modular task-oriented dialogue system. Tremendous progress has been made in recent years. However, the major challenges remain. The state-of-the-art accuracy for DST is below 50% for a multi-domain dialogue task. A learnable DST for any new domain requires a large amount of labeled in-domain data and training from scratch. In this paper, we propose a Meta-Reinforced Multi-Domain State Generator (MERET). Our first contribution is to improve the DST accuracy. We enhance a neural model based DST generator with a reward manager, which is built on policy gradient reinforcement learning (RL) to fine-tune the generator. With this change, we are able to improve the joint accuracy of DST from 48.79% to 50.91% on the MultiWOZ corpus. Second, we explore to train a DST meta-learning model with a few domains as source domains and a new domain as target domain. We apply the model-agnostic meta-learning algorithm (MAML) to DST and the obtained meta-learning model is used for new domain adaptation. Our experimental results show this solution is able to outperform the traditional training approach with extremely less training data in target domain.",Dialogue Systems|multi-domain task|new adaptation|domain adaptation,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.636.pdf -main.150,Language-aware Interlingua for Multilingual Neural Machine Translation,Changfeng Zhu|Heng Yu|Shanbo Cheng|Weihua Luo,"Multilingual neural machine translation (NMT) has led to impressive accuracy improvements in low-resource scenarios by sharing common linguistic information across languages. However, the traditional multilingual model fails to capture the diversity and specificity of different languages, resulting in inferior performance compared with individual models that are sufficiently trained. In this paper, we incorporate a language-aware interlingua into the Encoder-Decoder architecture. The interlingual network enables the model to learn a language-independent representation from the semantic spaces of different languages, while still allowing for language-specific specialization of a particular language-pair. 
Experiments show that our proposed method achieves remarkable improvements over state-of-the-art multilingual NMT baselines and produces comparable performance with strong individual models.",Multilingual Translation|low-resource scenarios|Language-aware Interlingua|NMT,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.150.pdf -main.144,Boosting Neural Machine Translation with Similar Translations,Jitao XU|Josep Crego|Jean Senellart,"This paper explores data augmentation methods for training Neural Machine Translation to make use of similar translations, in a comparable way a human translator employs fuzzy matches. In particular, we show how we can simply present the neural model with information of both source and target sides of the fuzzy matches, we also extend the similarity to include semantically related translations retrieved using sentence distributed representations. We show that translations based on fuzzy matching provide the model with ``copy'' information while translations based on embedding similarities tend to extend the translation ``context''. Results indicate that the effect from both similar sentences are adding up to further boost accuracy, combine naturally with model fine-tuning and are providing dynamic adaptation for unseen translation pairs. Tests on multiple data sets and domains show consistent accuracy improvements. To foster research around these techniques, we also release an Open-Source toolkit with efficient and flexible fuzzy-match implementation.",Boosting Translation|Neural Translation|data methods|human translator,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.144.pdf -main.622,CorefQA: Coreference Resolution as Query-based Span Prediction,Wei Wu|Fei Wang|Arianna Yuan|Fei Wu|Jiwei Li,"In this paper, we present CorefQA, an accurate and extensible approach for the coreference resolution task. We formulate the problem as a span prediction task, like in question answering: A query is generated for each candidate mention using its surrounding context, and a span prediction module is employed to extract the text spans of the coreferences within the document using the generated query. This formulation comes with the following key advantages: (1) The span prediction strategy provides the flexibility of retrieving mentions left out at the mention proposal stage; (2) In the question answering framework, encoding the mention and its context explicitly in a query makes it possible to have a deep and thorough examination of cues embedded in the context of coreferent mentions; and (3) A plethora of existing question answering datasets can be used for data augmentation to improve the model's generalization capability. Experiments demonstrate significant performance boost over previous models, with 83.1 (+3.5) F1 score on the CoNLL-2012 benchmark and 87.5 (+2.5) F1 score on the GAP benchmark.",Coreference Resolution|coreference task|span task|in answering,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.622.pdf -main.178,Recollection versus Imagination: Exploring Human Memory and Cognition via Neural Language Models,Maarten Sap|Eric Horvitz|Yejin Choi|Noah A. Smith|James Pennebaker,"We investigate the use of NLP as a measure of the cognitive processes involved in storytelling, contrasting imagination and recollection of events. To facilitate this, we collect and release Hippocorpus, a dataset of 7,000 stories about imagined and recalled events. 
We introduce a measure of narrative flow and use this to examine the narratives for imagined and recalled events. Additionally, we measure the differential recruitment of knowledge attributed to semantic memory versus episodic memory (Tulving, 1972) for imagined and recalled storytelling by comparing the frequency of descriptions of general commonsense events with more specific realis events. Our analyses show that imagined stories have a substantially more linear narrative flow, compared to recalled stories in which adjacent sentences are more disconnected. In addition, while recalled stories rely more on autobiographical events based on episodic memory, imagined stories express more commonsense knowledge based on semantic memory. Finally, our measures reveal the effect of narrativization of memories in stories (e.g., stories about frequently recalled memories flow more linearly; Bartlett, 1932). Our findings highlight the potential of using NLP tools to study the traces of human cognition in language.",Imagination|storytelling|narrativization memories|Recollection,Cognitive Modeling and Psycholinguistics,Short,https://www.aclweb.org/anthology/2020.acl-main.178.pdf -main.391,Encoder-Decoder Models Can Benefit from Pre-trained Masked Language Models in Grammatical Error Correction,Masahiro Kaneko|Masato Mita|Shun Kiyono|Jun Suzuki|Kentaro Inui,"This paper investigates how to effectively incorporate a pre-trained masked language model (MLM), such as BERT, into an encoder-decoder (EncDec) model for grammatical error correction (GEC). The answer to this question is not as straightforward as one might expect because the previous common methods for incorporating a MLM into an EncDec model have potential drawbacks when applied to GEC. For example, the distribution of the inputs to a GEC model can be considerably different (erroneous, clumsy, etc.) from that of the corpora used for pre-training MLMs; however, this issue is not addressed in the previous methods. Our experiments show that our proposed method, where we first fine-tune a MLM with a given GEC corpus and then use the output of the fine-tuned MLM as additional features in the GEC model, maximizes the benefit of the MLM. The best-performing model achieves state-of-the-art performances on the BEA-2019 and CoNLL-2014 benchmarks. Our code is publicly available at: https://github.com/kanekomasahiro/bert-gec.",Grammatical Correction|GEC|Encoder-Decoder Models|Pre-trained Models,NLP Applications,Short,https://www.aclweb.org/anthology/2020.acl-main.391.pdf -main.385,Quantifying Attention Flow in Transformers,Samira Abnar|Willem Zuidema,"In the Transformer model, “self-attention” combines information from attended embeddings into the representation of the focal embedding in the next layer. Thus, across layers of the Transformer, information originating from different tokens gets increasingly mixed. This makes attention weights unreliable as explanations probes. In this paper, we consider the problem of quantifying this flow of information through self-attention. We propose two methods for approximating the attention to input tokens given attention weights, attention rollout and attention flow, as post hoc methods when we use attention weights as the relative relevance of the input tokens. 
We show that these methods give complementary views on the flow of information, and compared to raw attention, both yield higher correlations with importance scores of input tokens obtained using an ablation method and input gradients.",Quantifying Transformers|quantifying information|Attention Transformers|Transformer model,Interpretability and Analysis of Models for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.385.pdf -main.352,Neural Temporal Opinion Modelling for Opinion Prediction on Twitter,Lixing Zhu|Yulan He|Deyu Zhou,"Opinion prediction on Twitter is challenging due to the transient nature of tweet content and neighbourhood context. In this paper, we model users' tweet posting behaviour as a temporal point process to jointly predict the posting time and the stance label of the next tweet given a user's historical tweet sequence and tweets posted by their neighbours. We design a topic-driven attention mechanism to capture the dynamic topic shifts in the neighbourhood context. Experimental results show that the proposed model predicts both the posting time and the stance labels of future tweets more accurately compared to a number of competitive baselines.",Opinion Prediction|Neural Modelling|temporal process|topic-driven mechanism,Computational Social Science and Social Media,Short,https://www.aclweb.org/anthology/2020.acl-main.352.pdf -main.434,Spying on Your Neighbors: Fine-grained Probing of Contextual Embeddings for Information about Surrounding Words,Josef Klafka|Allyson Ettinger,"Although models using contextual word embeddings have achieved state-of-the-art results on a host of NLP tasks, little is known about exactly what information these embeddings encode about the context words that they are understood to reflect. To address this question, we introduce a suite of probing tasks that enable fine-grained testing of contextual embeddings for encoding of information about surrounding words. We apply these tasks to examine the popular BERT, ELMo and GPT contextual encoders, and find that each of our tested information types is indeed encoded as contextual information across tokens, often with near-perfect recoverability---but the encoders vary in which features they distribute to which tokens, how nuanced their distributions are, and how robust the encoding of each feature is to distance. We discuss implications of these results for how different types of models break down and prioritize word-level context information when constructing token embeddings.",Fine-grained Embeddings|NLP tasks|probing tasks|encoding information,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.434.pdf -main.3,Coach: A Coarse-to-Fine Approach for Cross-domain Slot Filling,Zihan Liu|Genta Indra Winata|Peng Xu|Pascale Fung,"As an essential task in task-oriented dialog systems, slot filling requires extensive training data in a certain domain. However, such data are not always available. Hence, cross-domain slot filling has naturally arisen to cope with this data scarcity problem. In this paper, we propose a Coarse-to-fine approach (Coach) for cross-domain slot filling. Our model first learns the general pattern of slot entities by detecting whether the tokens are slot entities or not. It then predicts the specific types for the slot entities. In addition, we propose a template regularization approach to improve the adaptation robustness by regularizing the representation of utterances based on utterance templates. 
Experimental results show that our model significantly outperforms state-of-the-art approaches in slot filling. Furthermore, our model can also be applied to the cross-domain named entity recognition task, and it achieves better adaptation performance than other existing baselines. The code is available at https://github.com/zliucr/coach.",Cross-domain Filling|task-oriented systems|slot filling|data problem,Dialogue and Interactive Systems,Short,https://www.aclweb.org/anthology/2020.acl-main.3.pdf -main.420,Information-Theoretic Probing for Linguistic Structure,Tiago Pimentel|Josef Valvoda|Rowan Hall Maudslay|Ran Zmigrod|Adina Williams|Ryan Cotterell,"The success of neural networks on a diverse set of NLP tasks has led researchers to question how much these networks actually ``know'' about natural language. Probes are a natural way of assessing this. When probing, a researcher chooses a linguistic task and trains a supervised model to predict annotations in that linguistic task from the network's learned representations. If the probe does well, the researcher may conclude that the representations encode knowledge related to the task. A commonly held belief is that using simpler models as probes is better; the logic is that simpler models will identify linguistic structure, but not learn the task itself. We propose an information-theoretic operationalization of probing as estimating mutual information that contradicts this received wisdom: one should always select the highest performing probe one can, even if it is more complex, since it will result in a tighter estimate, and thus reveal more of the linguistic information inherent in the representation. The experimental portion of our paper focuses on empirically estimating the mutual information between a linguistic property and BERT, comparing these estimates to several baselines. We evaluate on a set of ten typologically diverse languages often underrepresented in NLP research---plus English---totalling eleven languages. Our implementation is available in https://github.com/rycolab/info-theoretic-probing.",Information-Theoretic Probing|NLP tasks|linguistic task|probing,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.420.pdf -main.346,Improving Disfluency Detection by Self-Training a Self-Attentive Model,Paria Jamshid Lou|Mark Johnson,"Self-attentive neural syntactic parsers using contextualized word embeddings (e.g. ELMo or BERT) currently produce state-of-the-art results in joint parsing and disfluency detection in speech transcripts. Since the contextualized word embeddings are pre-trained on a large amount of unlabeled data, using additional unlabeled data to train a neural model might seem redundant. However, we show that self-training --- a semi-supervised technique for incorporating unlabeled data --- sets a new state-of-the-art for the self-attentive parser on disfluency detection, demonstrating that self-training provides benefits orthogonal to the pre-trained contextualized word representations. 
We also show that ensembling self-trained parsers provides further gains for disfluency detection.",Disfluency Detection|joint parsing|Self-Attentive Model|Self-attentive parsers,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.acl-main.346.pdf -main.59,Multi-Agent Task-Oriented Dialog Policy Learning with Role-Aware Reward Decomposition,Ryuichi Takanobu|Runze Liang|Minlie Huang,"Many studies have applied reinforcement learning to train a dialog policy and have shown great promise in recent years. One common approach is to employ a user simulator to obtain a large number of simulated user experiences for reinforcement learning algorithms. However, modeling a realistic user simulator is challenging. A rule-based simulator requires heavy domain expertise for complex tasks, and a data-driven simulator requires considerable data, and it is even unclear how to evaluate a simulator. To avoid explicitly building a user simulator beforehand, we propose Multi-Agent Dialog Policy Learning, which regards both the system and the user as the dialog agents. Two agents interact with each other and are jointly learned simultaneously. The method uses the actor-critic framework to facilitate pretraining and improve scalability. We also propose Hybrid Value Network for the role-aware reward decomposition to integrate role-specific domain knowledge of each agent in the task-oriented dialog. Results show that our method can successfully build a system policy and a user policy simultaneously, and two agents can achieve a high task success rate through conversational interaction.",pretraining|Multi-Agent Learning|Role-Aware Decomposition|reinforcement learning,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.59.pdf -main.71,Generative Semantic Hashing Enhanced via Boltzmann Machines,Lin Zheng|Qinliang Su|Dinghan Shen|Changyou Chen,"Generative semantic hashing is a promising technique for large-scale information retrieval thanks to its fast retrieval speed and small memory footprint. For the tractability of training, existing generative-hashing methods mostly assume a factorized form for the posterior distribution, enforcing independence among the bits of hash codes. From the perspectives of both model representation and code space size, independence is not always the best assumption. In this paper, to introduce correlations among the bits of hash codes, we propose to employ the distribution of a Boltzmann machine as the variational posterior. To address the intractability issue of training, we first develop an approximate method to reparameterize the distribution of a Boltzmann machine by augmenting it as a hierarchical concatenation of a Gaussian-like distribution and a Bernoulli distribution. Based on that, an asymptotically-exact lower bound is further derived for the evidence lower bound (ELBO). With these novel techniques, the entire model can be optimized efficiently. Extensive experimental results demonstrate that by effectively modeling correlations among different bits within a hash code, our model can achieve significant performance gains.",Generative Hashing|large-scale retrieval|training|Boltzmann Machines,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.acl-main.71.pdf -main.408,ERASER: A Benchmark to Evaluate Rationalized NLP Models,Jay DeYoung|Sarthak Jain|Nazneen Fatema Rajani|Eric Lehman|Caiming Xiong|Richard Socher|Byron C.
Wallace,"State-of-the-art models in NLP are now predominantly based on deep neural networks that are opaque in terms of how they come to make predictions. This limitation has increased interest in designing more interpretable deep models for NLP that reveal the `reasoning' behind model outputs. But work in this direction has been conducted on different datasets and tasks with correspondingly unique aims and metrics; this makes it difficult to track progress. We propose the Evaluating Rationales And Simple English Reasoning (ERASER), a benchmark to advance research on interpretable models in NLP. This benchmark comprises multiple datasets and tasks for which human annotations of ``rationales'' (supporting evidence) have been collected. We propose several metrics that aim to capture how well the rationales provided by models align with human rationales, and also how faithful these rationales are (i.e., the degree to which provided rationales influenced the corresponding predictions). Our hope is that releasing this benchmark facilitates progress on designing more interpretable NLP systems. The benchmark, code, and documentation are available at https://www.eraserbenchmark.com/",NLP|Evaluating Reasoning|ERASER|Rationalized Models,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.408.pdf -main.65,Explicit Semantic Decomposition for Definition Generation,Jiahuan Li|Yu Bao|Shujian Huang|Xinyu Dai|Jiajun Chen,"Definition generation, which aims to automatically generate dictionary definitions for words, has recently been proposed to assist the construction of dictionaries and help people understand unfamiliar texts. However, previous works hardly consider explicitly modeling the ``components'' of definitions, leading to under-specific generation results. In this paper, we propose ESD, namely Explicit Semantic Decomposition for definition generation, which explicitly decomposes the meaning of words into semantic components, and models them with discrete latent variables for definition generation. Experimental results show that ESD achieves top results on WordNet and Oxford benchmarks, outperforming strong previous baselines.",Definition Generation|construction dictionaries|under-specific generation|Explicit Decomposition,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.65.pdf -main.342,Transition-based Directed Graph Construction for Emotion-Cause Pair Extraction,Chuang Fan|Chaofa Yuan|Jiachen Du|Lin Gui|Min Yang|Ruifeng Xu,"Emotion-cause pair extraction aims to extract all potential pairs of emotions and corresponding causes from unannotated emotion text. Most existing methods adopt a pipelined framework, which identifies emotions and extracts causes separately, leading to a drawback of error propagation. To address this issue, we propose a transition-based model to transform the task into a procedure of parsing-like directed graph construction. The proposed model incrementally generates the directed graph with labeled edges based on a sequence of actions, from which we can recognize emotions with the corresponding causes simultaneously, thereby optimizing separate subtasks jointly and maximizing mutual benefits of tasks interdependently.
Experimental results show that our approach achieves the best performance, outperforming the state-of-the-art methods by 6.71% (p<0.01) in F1 measure.",Emotion-Cause Extraction|error propagation|parsing-like construction|Transition-based Construction,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.342.pdf -main.424,ASSET: A Dataset for Tuning and Evaluation of Sentence Simplification Models with Multiple Rewriting Transformations,Fernando Alva-Manchego|Louis Martin|Antoine Bordes|Carolina Scarton|Benoît Sagot|Lucia Specia,"In order to simplify a sentence, human editors perform multiple rewriting transformations: they split it into several shorter sentences, paraphrase words (i.e. replacing complex words or phrases by simpler synonyms), reorder components, and/or delete information deemed unnecessary. Despite this varied range of possible text alterations, current models for automatic sentence simplification are evaluated using datasets that are focused on a single transformation, such as lexical paraphrasing or splitting. This makes it impossible to understand the ability of simplification models in more realistic settings. To alleviate this limitation, this paper introduces ASSET, a new dataset for assessing sentence simplification in English. ASSET is a crowdsourced multi-reference corpus where each simplification was produced by executing several rewriting transformations. Through quantitative and qualitative experiments, we show that simplifications in ASSET are better at capturing characteristics of simplicity when compared to other standard evaluation datasets for the task. Furthermore, we motivate the need for developing better methods for automatic evaluation using ASSET, since we show that current popular metrics may not be suitable when multiple simplification transformations are performed.",Tuning Models|rewriting transformations|automatic simplification|splitting,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.424.pdf -main.7,Guiding Variational Response Generator to Exploit Persona,Bowen Wu|Mengyuan Li|Zongsheng Wang|Yifu Chen|Derek F. Wong|Qihang Feng|Junhong Huang|Baoxun Wang,"Leveraging persona information of users in Neural Response Generators (NRG) to perform personalized conversations has been considered an attractive and important topic in the research of conversational agents over the past few years. Despite the promising progress achieved by recent studies in this field, persona information tends to be incorporated into neural networks in the form of user embeddings, with the expectation that the persona can be involved via End-to-End learning. This paper proposes to adopt the personality-related characteristics of human conversations into variational response generators, by designing a specific conditional variational autoencoder based deep model with two new regularization terms added to the loss function, so as to guide the optimization towards the direction of generating both persona-aware and relevant responses. Besides, to reasonably evaluate the performances of various persona modeling approaches, this paper further presents three direct persona-oriented metrics from different perspectives.
The experimental results have shown that our proposed methodology can notably improve the performance of persona-aware response generation, and the metrics are reasonable to evaluate the results.",conversational agents|optimization|persona-aware generation|Variational Generator,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.7.pdf -main.430,Influence Paths for Characterizing Subject-Verb Number Agreement in LSTM Language Models,Kaiji Lu|Piotr Mardziel|Klas Leino|Matt Fredrikson|Anupam Datta,"LSTM-based recurrent neural networks are the state-of-the-art for many natural language processing (NLP) tasks. Despite their performance, it is unclear whether, or how, LSTMs learn structural features of natural languages such as subject-verb number agreement in English. Lacking this understanding, the generality of LSTM performance on this task and their suitability for related tasks remains uncertain. Further, errors cannot be properly attributed to a lack of structural capability, training data omissions, or other exceptional faults. We introduce *influence paths*, a causal account of structural properties as carried by paths across gates and neurons of a recurrent neural network. The approach refines the notion of influence (the subject’s grammatical number has influence on the grammatical number of the subsequent verb) into a set of gate or neuron-level paths. The set localizes and segments the concept (e.g., subject-verb agreement), its constituent elements (e.g., the subject), and related or interfering elements (e.g., attractors). We exemplify the methodology on a widely-studied multi-layer LSTM language model, demonstrating its accounting for subject-verb number agreement. The results offer both a finer and a more complete view of an LSTM’s handling of this structural aspect of the English language than prior results based on diagnostic classifiers and ablation.",natural tasks|LSTM Models|LSTM-based networks|LSTMs,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.430.pdf -main.49,Integrating Semantic and Structural Information with Graph Convolutional Network for Controversy Detection,Lei Zhong|Juan Cao|Qiang Sheng|Junbo Guo|Ziang Wang,"Identifying controversial posts on social media is a fundamental task for mining public sentiment, assessing the influence of events, and alleviating the polarized views. However, existing methods fail to 1) effectively incorporate the semantic information from content-related posts; 2) preserve the structural information for reply relationship modeling; 3) properly handle posts from topics dissimilar to those in the training set. To overcome the first two limitations, we propose Topic-Post-Comment Graph Convolutional Network (TPC-GCN), which integrates the information from the graph structure and content of topics, posts, and comments for post-level controversy detection. As to the third limitation, we extend our model to Disentangled TPC-GCN (DTPC-GCN), to disentangle topic-related and topic-unrelated features and then fuse dynamically. Extensive experiments on two real-world datasets demonstrate that our models outperform existing methods. 
Analysis of the results and cases proves that our models can integrate both semantic and structural information with significant generalizability.",Controversy Detection|Identifying posts|mining sentiment|assessing events,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.49.pdf -main.356,Bayesian Hierarchical Words Representation Learning,Oren Barkan|Idan Rejwan|Avi Caciularu|Noam Koenigstein,"This paper presents the Bayesian Hierarchical Words Representation (BHWR) learning algorithm. BHWR facilitates Variational Bayes word representation learning combined with semantic taxonomy modeling via hierarchical priors. By propagating relevant information between related words, BHWR utilizes the taxonomy to improve the quality of such representations. Evaluation of several linguistic datasets demonstrates the advantages of BHWR over suitable alternatives that facilitate Bayesian modeling with or without semantic priors. Finally, we further show that BHWR produces better representations for rare words.",Bayesian modeling|Bayesian Learning|BHWR|Variational learning,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.356.pdf -main.61,Response-Anticipated Memory for On-Demand Knowledge Integration in Response Generation,Zhiliang Tian|Wei Bi|Dongkyu Lee|Lanqing Xue|Yiping Song|Xiaojiang Liu|Nevin L. Zhang,"Neural conversation models are known to generate appropriate but non-informative responses in general. A scenario where informativeness can be significantly enhanced is Conversing by Reading (CbR), where conversations take place with respect to a given external document. In previous work, the external document is utilized by (1) creating a context-aware document memory that integrates information from the document and the conversational context, and then (2) generating responses referring to the memory. In this paper, we propose to create the document memory with some anticipated responses in mind. This is achieved using a teacher-student framework. The teacher is given the external document, the context, and the ground-truth response, and learns how to build a response-aware document memory from three sources of information. The student learns to construct a response-anticipated document memory from the first two sources, and teacher’s insight on memory creation. Empirical results show that our model outperforms the previous state-of-the-art for the CbR task.",On-Demand Integration|Response Generation|Conversing Reading|Conversing,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.61.pdf -main.418,Toward Gender-Inclusive Coreference Resolution,Yang Trista Cao|Hal Daumé III,"Correctly resolving textual mentions of people fundamentally entails making inferences about those people. Such inferences raise the risk of systemic biases in coreference resolution systems, including biases that can harm binary and non-binary trans and cis stakeholders. To better understand such biases, we foreground nuanced conceptualizations of gender from sociology and sociolinguistics, and develop two new datasets for interrogating bias in crowd annotations and in existing coreference resolution systems. 
Through these studies, conducted on English text, we confirm that without acknowledging and building systems that recognize the complexity of gender, we build systems that lead to many potential harms.",Gender-Inclusive Resolution|interrogating annotations|coreference systems|systemic biases,Ethics and NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.418.pdf -main.75,"""The Boating Store Had Its Best Sail Ever"": Pronunciation-attentive Contextualized Pun Recognition",Yichao Zhou|Jyun-Yu Jiang|Jieyu Zhao|Kai-Wei Chang|Wei Wang,"Humor plays an important role in human languages and it is essential to model humor when building intelligence systems. Among different forms of humor, puns perform wordplay for humorous effects by employing words with double entendre and high phonetic similarity. However, identifying and modeling puns are challenging as puns usually involved implicit semantic or phonological tricks. In this paper, we propose Pronunciation-attentive Contextualized Pun Recognition (PCPR) to perceive human humor, detect if a sentence contains puns and locate them in the sentence. PCPR derives contextualized representation for each word in a sentence by capturing the association between the surrounding context and its corresponding phonetic symbols. Extensive experiments are conducted on two benchmark datasets. Results demonstrate that the proposed approach significantly outperforms the state-of-the-art methods in pun detection and location tasks. In-depth analyses verify the effectiveness and robustness of PCPR.",Pronunciation-attentive Recognition|human languages|intelligence systems|pun tasks,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.75.pdf -main.381,Analyzing analytical methods: The case of phonology in neural models of spoken language,Grzegorz Chrupała|Bertrand Higy|Afra Alishahi,"Given the fast development of analysis techniques for NLP and speech processing systems, few systematic studies have been conducted to compare the strengths and weaknesses of each method. As a step in this direction we study the case of representations of phonology in neural network models of spoken language. We use two commonly applied analytical techniques, diagnostic classifiers and representational similarity analysis, to quantify to what extent neural activation patterns encode phonemes and phoneme sequences. We manipulate two factors that can affect the outcome of analysis. First, we investigate the role of learning by comparing neural activations extracted from trained versus randomly-initialized models. Second, we examine the temporal scope of the activations by probing both local activations corresponding to a few milliseconds of the speech signal, and global activations pooled over the whole utterance. We conclude that reporting analysis results with randomly initialized models is crucial, and that global-scope methods tend to yield more consistent and interpretable results and we recommend their use as a complement to local-scope diagnostic methods.",NLP systems|learning|reporting analysis|neural language,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.381.pdf -main.395,Programming in Natural Language with fuSE: Synthesizing Methods from Spoken Utterances Using Deep Natural Language Understanding,Sebastian Weigelt|Vanessa Steurer|Tobias Hey|Walter F. Tichy,"The key to effortless end-user programming is natural language. We examine how to teach intelligent systems new functions, expressed in natural language. 
As a first step, we collected 3168 samples of teaching efforts in plain English. Then we built fuSE, a novel system that translates English function descriptions into code. Our approach is three-tiered and each task is evaluated separately. We first classify whether an intent to teach new functionality is present in the utterance (accuracy: 97.7% using BERT). Then we analyze the linguistic structure and construct a semantic model (accuracy: 97.6% using a BiLSTM). Finally, we synthesize the signature of the method, map the intermediate steps (instructions in the method body) to API calls and inject control structures (F₁: 67.0% with information retrieval and knowledge-based methods). In an end-to-end evaluation on an unseen dataset fuSE synthesized 84.6% of the method signatures and 79.2% of the API calls correctly.",intelligent systems|information retrieval|Deep Understanding|end-user programming,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.395.pdf -main.626,Controlled Crowdsourcing for High-Quality QA-SRL Annotation,Paul Roit|Ayal Klein|Daniela Stepanov|Jonathan Mamou|Julian Michael|Gabriel Stanovsky|Luke Zettlemoyer|Ido Dagan,"Question-answer driven Semantic Role Labeling (QA-SRL) was proposed as an attractive open and natural flavour of SRL, potentially attainable from laymen. Recently, a large-scale crowdsourced QA-SRL corpus and a trained parser were released. Trying to replicate the QA-SRL annotation for new texts, we found that the resulting annotations were lacking in quality, particularly in coverage, making them insufficient for further research and evaluation. In this paper, we present an improved crowdsourcing protocol for complex semantic annotation, involving worker selection and training, and a data consolidation phase. Applying this protocol to QA-SRL yielded high-quality annotation with drastically higher coverage, producing a new gold evaluation dataset. We believe that our annotation protocol and gold standard will facilitate future replicable research of natural semantic annotations.",High-Quality Annotation|Question-answer Labeling|complex annotation|training,Semantics: Sentence Level,Short,https://www.aclweb.org/anthology/2020.acl-main.626.pdf -main.140,Probing Linguistic Features of Sentence-Level Representations in Relation Extraction,Christoph Alt|Aleksandra Gabryszak|Leonhard Hennig,"Despite the recent progress, little is known about the features captured by state-of-the-art neural relation extraction (RE) models. Common methods encode the source sentence, conditioned on the entity mentions, before classifying the relation. However, the complexity of the task makes it difficult to understand how encoder architecture and supporting linguistic knowledge affect the features learned by the encoder. We introduce 14 probing tasks targeting linguistic properties relevant to RE, and we use them to study representations learned by more than 40 different encoder architecture and linguistic feature combinations trained on two datasets, TACRED and SemEval 2010 Task 8. We find that the bias induced by the architecture and the inclusion of linguistic features are clearly expressed in the probing task performance. For example, adding contextualized word representations greatly increases performance on probing tasks with a focus on named entity and part-of-speech information, and yields better results in RE. 
In contrast, entity masking improves RE, but considerably lowers performance on entity type related probing tasks.",Relation Extraction|probing tasks|RE|probing task,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.140.pdf -main.154,``You Sound Just Like Your Father'' Commercial Machine Translation Systems Include Stylistic Biases,Dirk Hovy|Federico Bianchi|Tommaso Fornaciari,"The main goal of machine translation has been to convey the correct content. Stylistic considerations have been at best secondary. We show that as a consequence, the output of three commercial machine translation systems (Bing, DeepL, Google) make demographically diverse samples from five languages ``sound'' older and more male than the original. Our findings suggest that translation models reflect demographic bias in the training data. This opens up interesting new research avenues in machine translation to take stylistic considerations into account.",machine translation|commercial systems|machine systems|translation models,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.154.pdf -main.632,Exploiting Personal Characteristics of Debaters for Predicting Persuasiveness,Khalid Al Khatib|Michael Völske|Shahbaz Syed|Nikolay Kolyada|Benno Stein,"Predicting the persuasiveness of arguments has applications as diverse as writing assistance, essay scoring, and advertising. While clearly relevant to the task, the personal characteristics of an argument's source and audience have not yet been fully exploited toward automated persuasiveness prediction. In this paper, we model debaters' prior beliefs, interests, and personality traits based on their previous activity, without dependence on explicit user profiles or questionnaires. Using a dataset of over 60,000 argumentative discussions, comprising more than three million individual posts collected from the subreddit r/ChangeMyView, we demonstrate that our modeling of debater's characteristics enhances the prediction of argument persuasiveness as well as of debaters' resistance to persuasion.",Predicting Persuasiveness|Predicting arguments|writing assistance|essay scoring,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Short,https://www.aclweb.org/anthology/2020.acl-main.632.pdf -main.168,Learning to Update Natural Language Comments Based on Code Changes,Sheena Panthaplackel|Pengyu Nie|Milos Gligoric|Junyi Jessy Li|Raymond Mooney,"We formulate the novel task of automatically updating an existing natural language comment based on changes in the body of code it accompanies. We propose an approach that learns to correlate changes across two distinct language representations, to generate a sequence of edits that are applied to the existing comment to reflect the source code modifications. We train and evaluate our model using a dataset that we collected from commit histories of open-source software projects, with each example consisting of a concurrent update to a method and its corresponding comment. We compare our approach against multiple baselines using both automatic metrics and human evaluation. 
Results reflect the challenge of this task and that our model outperforms baselines with respect to making edits.",automatically comment|making edits|language representations|edits,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.168.pdf -main.183,Can You Put it All Together: Evaluating Conversational Agents' Ability to Blend Skills,Eric Michael Smith|Mary Williamson|Kurt Shuster|Jason Weston|Y-Lan Boureau,"Being engaging, knowledgeable, and empathetic are all desirable general qualities in a conversational agent. Previous work has introduced tasks and datasets that aim to help agents to learn those qualities in isolation and gauge how well they can express them. But rather than being specialized in one single quality, a good open-domain conversational agent should be able to seamlessly blend them all into one cohesive conversational flow. In this work, we investigate several ways to combine models trained towards isolated capabilities, ranging from simple model aggregation schemes that require minimal additional training, to various forms of multi-task training that encompass several skills at all training stages. We further propose a new dataset, BlendedSkillTalk, to analyze how these capabilities would mesh together in a natural conversation, and compare the performance of different architectures and training schemes. Our experiments show that multi-tasking over several tasks that focus on particular capabilities results in better blended conversation performance compared to models trained on a single skill, and that both unified or two-stage approaches perform well if they are constructed to avoid unwanted bias in skill selection or are fine-tuned on our new task.",conversational agent|open-domain agent|model schemes|multi-task training,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.183.pdf -main.197,SMART: Robust and Efficient Fine-Tuning for Pre-trained Natural Language Models through Principled Regularized Optimization,Haoming Jiang|Pengcheng He|Weizhu Chen|Xiaodong Liu|Jianfeng Gao|Tuo Zhao,"Transfer learning has fundamentally changed the landscape of natural language processing (NLP). Many state-of-the-art models are first pre-trained on a large text corpus and then fine-tuned on downstream tasks. However, due to limited data resources from downstream tasks and the extremely high complexity of pre-trained models, aggressive fine-tuning often causes the fine-tuned model to overfit the training data of downstream tasks and fail to generalize to unseen data. To address such an issue in a principled manner, we propose a new learning framework for robust and efficient fine-tuning for pre-trained models to attain better generalization performance. The proposed framework contains two important ingredients: 1. Smoothness-inducing regularization, which effectively manages the complexity of the model; 2. Bregman proximal point optimization, which is an instance of trust-region methods and can prevent aggressive updating. Our experiments show that the proposed framework achieves new state-of-the-art performance on a number of NLP tasks including GLUE, SNLI, SciTail and ANLI. 
Moreover, it also outperforms the state-of-the-art T5 model, which is the largest pre-trained model containing 11 billion parameters, on GLUE.",NLP|generalization|NLP tasks|SMART,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.197.pdf -main.752,TriggerNER: Learning with Entity Triggers as Explanations for Named Entity Recognition,Bill Yuchen Lin|Dong-Ho Lee|Ming Shen|Ryan Moreno|Xiao Huang|Prashant Shiralkar|Xiang Ren,"Training neural models for named entity recognition (NER) in a new domain often requires additional human annotations (e.g., tens of thousands of labeled instances) that are usually expensive and time-consuming to collect. Thus, a crucial research question is how to obtain supervision in a cost-effective way. In this paper, we introduce ""entity triggers,"" an effective proxy of human explanations for facilitating label-efficient learning of NER models. An entity trigger is defined as a group of words in a sentence that helps to explain why humans would recognize an entity in the sentence. We crowd-sourced 14k entity triggers for two well-studied NER datasets. Our proposed model, Trigger Matching Network, jointly learns trigger representations and soft matching module with self-attention such that can generalize to unseen sentences easily for tagging. Our framework is significantly more cost-effective than the traditional neural NER frameworks. Experiments show that using only 20% of the trigger-annotated sentences results in a comparable performance as using 70% of conventional annotated sentences.",Named Recognition|NER|supervision|label-efficient models,Information Extraction,Short,https://www.aclweb.org/anthology/2020.acl-main.752.pdf -main.746,Universal Decompositional Semantic Parsing,Elias Stengel-Eskin|Aaron Steven White|Sheng Zhang|Benjamin Van Durme,"We introduce a transductive model for parsing into Universal Decompositional Semantics (UDS) representations, which jointly learns to map natural language utterances into UDS graph structures and annotate the graph with decompositional semantic attribute scores. We also introduce a strong pipeline model for parsing into the UDS graph structure, and show that our transductive parser performs comparably while additionally performing attribute prediction. By analyzing the attribute prediction errors, we find the model captures natural relationships between attribute groups.",parsing|Universal Parsing|transductive model|Universal representations,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.746.pdf -main.236,Contextual Embeddings: When Are They Worth It?,Simran Arora|Avner May|Jian Zhang|Christopher Ré,"We study the settings for which deep contextual embeddings (e.g., BERT) give large improvements in performance relative to classic pretrained embeddings (e.g., GloVe), and an even simpler baseline---random word embeddings---focusing on the impact of the training set size and the linguistic properties of the task. Surprisingly, we find that both of these simpler baselines can match contextual embeddings on industry-scale data, and often perform within 5 to 10% accuracy (absolute) on benchmark tasks. 
Furthermore, we identify properties of data for which contextual embeddings give particularly large gains: language containing complex structure, ambiguous word usage, and words unseen in training.",Contextual Embeddings|deep embeddings|pretrained embeddings|GloVe,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.236.pdf -main.550,Automatic Generation of Citation Texts in Scholarly Papers: A Pilot Study,Xinyu Xing|Xiaosheng Fan|Xiaojun Wan,"In this paper, we study the challenging problem of automatic generation of citation texts in scholarly papers. Given the context of a citing paper A and a cited paper B, the task aims to generate a short text to describe B in the given context of A. One big challenge for addressing this task is the lack of training data. Usually, explicit citation texts are easy to extract, but it is not easy to extract implicit citation texts from scholarly papers. We thus first train an implicit citation extraction model based on BERT and leverage the model to construct a large training dataset for the citation text generation task. Then we propose and train a multi-source pointer-generator network with a cross attention mechanism for citation text generation. Empirical evaluation results on a manually labeled test dataset verify the efficacy of our model. This pilot study confirms the feasibility of automatically generating citation texts in scholarly papers and the technique has great potential to help researchers prepare their scientific papers.",Automatic Texts|citation task|citation generation|automatically texts,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.550.pdf -main.544,Evidence-Aware Inferential Text Generation with Vector Quantised Variational AutoEncoder,Daya Guo|Duyu Tang|Nan Duan|Jian Yin|Daxin Jiang|Ming Zhou,"Generating inferential texts about an event from different perspectives requires reasoning over the different contexts in which the event occurs. Existing works usually ignore the context that is not explicitly provided, resulting in a context-independent semantic representation that struggles to support the generation. To address this, we propose an approach that automatically finds evidence for an event from a large text corpus, and leverages the evidence to guide the generation of inferential texts. Our approach works in an encoder-decoder manner and is equipped with a Vector Quantised-Variational Autoencoder, where the encoder outputs representations from a distribution over discrete variables. Such discrete representations enable automatically selecting relevant evidence, which not only facilitates evidence-aware generation, but also provides a natural way to uncover rationales behind the generation. Our approach provides state-of-the-art performance on both Event2mind and Atomic datasets.
More importantly, we find that with discrete representations, our model selectively uses evidence to generate different inferential texts.",Evidence-Aware Generation|Generating texts|generation|generation texts,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.544.pdf -main.222,The Dialogue Dodecathlon: Open-Domain Knowledge and Image Grounded Conversational Agents,Kurt Shuster|Da JU|Stephen Roller|Emily Dinan|Y-Lan Boureau|Jason Weston,"We introduce dodecaDialogue: a set of 12 tasks that measures if a conversational agent can communicate engagingly with personality and empathy, ask questions, answer questions by utilizing knowledge resources, discuss topics and situations, and perceive and converse about images. By multi-tasking on such a broad large-scale set of data, we hope to both move towards and measure progress in producing a single unified agent that can perceive, reason and converse with humans in an open-domain setting. We show that such multi-tasking improves over a BERT pre-trained baseline, largely due to multi-tasking with very large dialogue datasets in a similar domain, and that the multi-tasking in general provides gains to both text and image-based tasks using several metrics in both the fine-tune and task transfer settings. We obtain state-of-the-art results on many of the tasks, providing a strong baseline for this challenge.",text tasks|Dialogue Dodecathlon|Image Agents|dodecaDialogue,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.222.pdf -main.578,Neighborhood Matching Network for Entity Alignment,Yuting Wu|Xiao Liu|Yansong Feng|Zheng Wang|Dongyan Zhao,"Structural heterogeneity between knowledge graphs is an outstanding challenge for entity alignment. This paper presents Neighborhood Matching Network (NMN), a novel entity alignment framework for tackling the structural heterogeneity challenge. NMN estimates the similarities between entities to capture both the topological structure and the neighborhood difference. It provides two innovative components for better learning representations for entity alignment. It first uses a novel graph sampling method to distill a discriminative neighborhood for each entity. It then adopts a cross-graph neighborhood matching module to jointly encode the neighborhood difference for a given entity pair. Such strategies allow NMN to effectively construct matching-oriented entity representations while ignoring noisy neighbors that have a negative impact on the alignment task. Extensive experiments performed on three entity alignment datasets show that NMN can well estimate the neighborhood similarity in more tough cases and significantly outperforms 12 previous state-of-the-art methods.",Entity Alignment|structural challenge|matching-oriented representations|alignment task,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.578.pdf -main.593,The Right Tool for the Job: Matching Model and Instance Complexities,Roy Schwartz|Gabriel Stanovsky|Swabha Swayamdipta|Jesse Dodge|Noah A. Smith,"As NLP models become larger, executing a trained model requires significant computational resources incurring monetary and environmental costs. To better respect a given inference budget, we propose a modification to contextual representation fine-tuning which, during inference, allows for an early (and fast) “exit” from neural network calculations for simple instances, and late (and accurate) exit for hard instances. 
To achieve this, we add classifiers to different layers of BERT and use their calibrated confidence scores to make early exit decisions. We test our proposed modification on five different datasets in two tasks: three text classification datasets and two natural language inference benchmarks. Our method presents a favorable speed/accuracy tradeoff in almost all cases, producing models which are up to five times faster than the state of the art, while preserving their accuracy. Our method also requires almost no additional training resources (in either time or parameters) compared to the baseline BERT model. Finally, our method alleviates the need for costly retraining of multiple models at different levels of efficiency; we allow users to control the inference speed/accuracy tradeoff using a single trained model, by setting a single variable at inference time. We publicly release our code.",inference|early decisions|costly retraining|Job Model,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.593.pdf -main.587,A Mixture of h - 1 Heads is Better than h Heads,Hao Peng|Roy Schwartz|Dianqi Li|Noah A. Smith,"Multi-head attentive neural architectures have achieved state-of-the-art results on a variety of natural language processing tasks. Evidence has shown that they are overparameterized; attention heads can be pruned without significant performance loss. In this work, we instead “reallocate” them—the model learns to activate different heads on different inputs. Drawing connections between multi-head attention and mixture of experts, we propose the mixture of attentive experts model (MAE). MAE is trained using a block coordinate descent algorithm that alternates between updating (1) the responsibilities of the experts and (2) their parameters. Experiments on machine translation and language modeling show that MAE outperforms strong baselines on both tasks. Particularly, on the WMT14 English to German translation dataset, MAE improves over “transformer-base” by 0.8 BLEU, with a comparable number of parameters. Our analysis shows that our model learns to specialize different experts to different inputs.",natural tasks|machine translation|language modeling|Multi-head architectures,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.587.pdf -main.586,"Words Aren't Enough, Their Order Matters: On the Robustness of Grounding Visual Referring Expressions",Arjun Akula|Spandana Gella|Yaser Al-Onaizan|Song-Chun Zhu|Siva Reddy,"Visual referring expression recognition is a challenging task that requires natural language understanding in the context of an image. We critically examine RefCOCOg, a standard benchmark for this task, using a human study and show that 83.7% of test instances do not require reasoning on linguistic structure, i.e., words are enough to identify the target object, the word order doesn't matter. To measure the true progress of existing models, we split the test set into two sets, one which requires reasoning on linguistic structure and the other which doesn’t. Additionally, we create an out-of-distribution dataset Ref-Adv by asking crowdworkers to perturb in-domain examples such that the target object changes. Using these datasets, we empirically show that existing methods fail to exploit linguistic structure and are 12% to 23% lower in performance than the established progress for this task. 
We also propose two methods, one based on contrastive learning and the other based on multi-task learning, to increase the robustness of ViLBERT, the current state-of-the-art model for this task. Our datasets are publicly available at https://github.com/aws/aws-refcocog-adv.",Robustness Expressions|Grounding Expressions|Visual recognition|natural understanding,"Language Grounding to Vision, Robotics and Beyond",Short,https://www.aclweb.org/anthology/2020.acl-main.586.pdf -main.592,Learning Architectures from an Extended Search Space for Language Modeling,Yinqiao Li|Chi Hu|Yuhao Zhang|Nuo Xu|Yufan Jiang|Tong Xiao|Jingbo Zhu|Tongran Liu|Changliang Li,"Neural architecture search (NAS) has advanced significantly in recent years but most NAS systems restrict search to learning architectures of a recurrent or convolutional cell. In this paper, we extend the search space of NAS. In particular, we present a general approach to learn both intra-cell and inter-cell architectures (call it ESS). For a better search result, we design a joint learning method to perform intra-cell and inter-cell NAS simultaneously. We implement our model in a differentiable architecture search system. For recurrent neural language modeling, it outperforms a strong baseline significantly on the PTB and WikiText data, with a new state-of-the-art on PTB. Moreover, the learned architectures show good transferability to other systems. E.g., they improve state-of-the-art systems on the CoNLL and WNUT named entity recognition (NER) tasks and CoNLL chunking task, indicating a promising line of research on large-scale pre-learned architectures.",Language Modeling|intra-cell NAS|recurrent modeling|CoNLL task,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.592.pdf -main.579,Relation Extraction with Explanation,Hamed Shahbazi|Xiaoli Fern|Reza Ghaeini|Prasad Tadepalli,Recent neural models for relation extraction with distant supervision alleviate the impact of irrelevant sentences in a bag by learning importance weights for the sentences. Efforts thus far have focused on improving extraction accuracy but little is known about their explanability. In this work we annotate a test set with ground-truth sentence-level explanations to evaluate the quality of explanations afforded by the relation extraction models. We demonstrate that replacing the entity mentions in the sentences with their fine-grained entity types not only enhances extraction accuracy but also improves explanation. We also propose to automatically generate ``distractor'' sentences to augment the bags and train the model to ignore the distractors. Evaluations on the widely used FB-NYT dataset show that our methods achieve new state-of-the-art accuracy while improving model explanability.,relation extraction|Explanation|neural models|relation models,Information Extraction,Short,https://www.aclweb.org/anthology/2020.acl-main.579.pdf -main.545,How to Ask Good Questions? Try to Leverage Paraphrases,Xin Jia|Wenjie Zhou|Xu Sun|Yunfang Wu,"Given a sentence and its relevant answer, how to ask good questions is a challenging task, which has many real applications. Inspired by human's paraphrasing capability to ask questions of the same meaning but with diverse expressions, we propose to incorporate paraphrase knowledge into question generation(QG) to generate human-like questions. Specifically, we present a two-hand hybrid model leveraging a self-built paraphrase resource, which is automatically conducted by a simple back-translation method. 
On the one hand, we conduct multi-task learning with sentence-level paraphrase generation (PG) as an auxiliary task to supplement paraphrase knowledge to the task-share encoder. On the other hand, we adopt a new loss function for diversity training to introduce more question patterns to QG. Extensive experimental results show that our proposed model obtains obvious performance gain over several strong baselines, and further human evaluation validates that our model can ask questions of high quality by leveraging paraphrase knowledge.",question generation(QG|sentence-level generation|diversity training|Paraphrases,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.545.pdf -main.223,Automatic Poetry Generation from Prosaic Text,Tim Van de Cruys,"In the last few years, a number of successful approaches have emerged that are able to adequately model various aspects of natural language. In particular, language models based on neural networks have improved the state of the art with regard to predictive language modeling, while topic models are successful at capturing clear-cut, semantic dimensions. In this paper, we will explore how these approaches can be adapted and combined to model the linguistic and literary aspects needed for poetry generation. The system is exclusively trained on standard, non-poetic text, and its output is constrained in order to confer a poetic character to the generated verse. The framework is applied to the generation of poems in both English and French, and is equally evaluated for both languages. Even though it only uses standard, non-poetic text as input, the system yields state of the art results for poetry generation.",Automatic Generation|predictive modeling|poetry generation|generation poems,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.223.pdf -main.237,Interactive Classification by Asking Informative Questions,Lili Yu|Howard Chen|Sida I. Wang|Tao Lei|Yoav Artzi,"We study the potential for interaction in natural language classification. We add a limited form of interaction for intent classification, where users provide an initial query using natural language, and the system asks for additional information using binary or multi-choice questions. At each turn, our system decides between asking the most informative question or making the final classification pre-diction. The simplicity of the model allows for bootstrapping of the system without interaction data, instead relying on simple crowd-sourcing tasks. We evaluate our approach on two domains, showing the benefit of interaction and the advantage of learning to balance between asking additional questions and making the final prediction.",Interactive Classification|natural classification|intent classification|classification pre-diction,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.237.pdf -main.551,Composing Elementary Discourse Units in Abstractive Summarization,Zhenwen Li|Wenhao Wu|Sujian Li,"In this paper, we argue that elementary discourse unit (EDU) is a more appropriate textual unit of content selection than the sentence unit in abstractive summarization. To well handle the problem of composing EDUs into an informative and fluent summary, we propose a novel summarization method that first designs an EDU selection model to extract and group informative EDUs and then an EDU fusion model to fuse the EDUs in each group into one sentence. 
We also design the reinforcement learning mechanism to use EDU fusion results to reward the EDU selection action, boosting the final summarization performance. Experiments on CNN/Daily Mail have demonstrated the effectiveness of our model.",Abstractive Summarization|content selection|summarization|summarization method,Summarization,Short,https://www.aclweb.org/anthology/2020.acl-main.551.pdf -main.747,Unsupervised Cross-lingual Representation Learning at Scale,Alexis Conneau|Kartikay Khandelwal|Naman Goyal|Vishrav Chaudhary|Guillaume Wenzek|Francisco Guzmán|Edouard Grave|Myle Ott|Luke Zettlemoyer|Veselin Stoyanov,"This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +14.6% average accuracy on XNLI, +13% average F1 score on MLQA, and +2.4% F1 score on NER. XLM-R performs particularly well on low-resource languages, improving 15.7% in XNLI accuracy for Swahili and 11.4% for Urdu over previous XLM models. We also present a detailed empirical analysis of the key factors that are required to achieve these gains, including the trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We will make our code and models publicly available.",cross-lingual tasks|XNLI|MLQA|NER,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.747.pdf -main.753,Addressing Posterior Collapse with Mutual Information for Improved Variational Neural Machine Translation,Arya D. McCarthy|Xian Li|Jiatao Gu|Ning Dong,"This paper proposes a simple and effective approach to address the problem of posterior collapse in conditional variational autoencoders (CVAEs). It thus improves performance of machine translation models that use noisy or monolingual data, as well as in conventional settings. Extending Transformer and conditional VAEs, our proposed latent variable model measurably prevents posterior collapse by (1) using a modified evidence lower bound (ELBO) objective which promotes mutual information between the latent variable and the target, and (2) guiding the latent variable with an auxiliary bag-of-words prediction task. As a result, the proposed model yields improved translation quality compared to existing variational NMT models on WMT Ro↔En and De↔En. 
With latent variables being effectively utilized, our model demonstrates improved robustness over non-latent Transformer in handling uncertainty: exploiting noisy source-side monolingual data (up to +3.2 BLEU), and training with weakly aligned web-mined parallel data (up to +4.7 BLEU).",Variational Translation|posterior collapse|auxiliary task|uncertainty,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.753.pdf -main.196,On Importance Sampling-Based Evaluation of Latent Language Models,Robert L Logan IV|Matt Gardner|Sameer Singh,"Language models that use additional latent structures (e.g., syntax trees, coreference chains, knowledge graph links) provide several advantages over traditional language models. However, likelihood-based evaluation of these models is often intractable as it requires marginalizing over the latent space. Existing works avoid this issue by using importance sampling. Although this approach has asymptotic guarantees, analysis is rarely conducted on the effect of decisions such as sample size and choice of proposal distribution on the reported estimates. In this paper, we carry out this analysis for three models: RNNG, EntityNLM, and KGLM. In addition, we elucidate subtle differences in how importance sampling is applied in these works that can have substantial effects on the final estimates, as well as provide theoretical results which reinforce the validity of this technique.",Importance Models|likelihood-based evaluation|Language models|importance sampling,Machine Learning for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.196.pdf -main.182,"""None of the Above"": Measure Uncertainty in Dialog Response Retrieval",Yulan Feng|Shikib Mehri|Maxine Eskenazi|Tiancheng Zhao,"This paper discusses the importance of uncovering uncertainty in end-to-end dialog tasks and presents our experimental results on uncertainty classification on the processed Ubuntu Dialog Corpus. We show that instead of retraining models for this specific purpose, we can capture the original retrieval model's underlying confidence concerning the best prediction using trivial additional computation.",Dialog Retrieval|end-to-end tasks|uncertainty classification|retraining models,Dialogue and Interactive Systems,Short,https://www.aclweb.org/anthology/2020.acl-main.182.pdf -main.169,Politeness Transfer: A Tag and Generate Approach,Aman Madaan|Amrith Setlur|Tanmay Parekh|Barnabas Poczos|Graham Neubig|Yiming Yang|Ruslan Salakhutdinov|Alan W Black|Shrimai Prabhumoye,"This paper introduces a new task of politeness transfer which involves converting non-polite sentences to polite sentences while preserving the meaning. We also provide a dataset of more than 1.39 million instances automatically labeled for politeness to encourage benchmark evaluations on this new task. We design a tag and generate pipeline that identifies stylistic attributes and subsequently generates a sentence in the target style while preserving most of the source content. For politeness as well as five other transfer tasks, our model outperforms the state-of-the-art methods on automatic metrics for content preservation, with a comparable or better performance on style transfer accuracy. Additionally, our model surpasses existing methods on human evaluations for grammaticality, meaning preservation and transfer accuracy across all the six style transfer tasks.
The data and code is located at https://github.com/tag-and-generate.",Politeness Transfer|politeness|transfer tasks|content preservation,Generation,Long,https://www.aclweb.org/anthology/2020.acl-main.169.pdf -main.155,MMPE: A Multi-Modal Interface for Post-Editing Machine Translation,Nico Herbig|Tim Düwel|Santanu Pal|Kalliopi Meladaki|Mahsa Monshizadeh|Antonio Krüger|Josef van Genabith,"Current advances in machine translation (MT) increase the need for translators to switch from traditional translation to post-editing (PE) of machine-translated text, a process that saves time and reduces errors. This affects the design of translation interfaces, as the task changes from mainly generating text to correcting errors within otherwise helpful translation proposals. Since this paradigm shift offers potential for modalities other than mouse and keyboard, we present MMPE, the first prototype to combine traditional input modes with pen, touch, and speech modalities for PE of MT. The results of an evaluation with professional translators suggest that pen and touch interaction are suitable for deletion and reordering tasks, while they are of limited use for longer insertions. On the other hand, speech and multi-modal combinations of select & speech are considered suitable for replacements and insertions but offer less potential for deletion and reordering. Overall, participants were enthusiastic about the new modalities and saw them as good extensions to mouse & keyboard, but not as a complete substitute.",Post-Editing Translation|machine translation|MT|translators,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.155.pdf -main.633,Out of the Echo Chamber: Detecting Countering Debate Speeches,Matan Orbach|Yonatan Bilu|Assaf Toledo|Dan Lahav|Michal Jacovi|Ranit Aharonov|Noam Slonim,"An educated and informed consumption of media content has become a challenge in modern times. With the shift from traditional news outlets to social media and similar venues, a major concern is that readers are becoming encapsulated in ""echo chambers"" and may fall prey to fake news and disinformation, lacking easy access to dissenting views. We suggest a novel task aiming to alleviate some of these concerns -- that of detecting articles that most effectively counter the arguments -- and not just the stance -- made in a given text. We study this problem in the context of debate speeches. Given such a speech, we aim to identify, from among a set of speeches on the same topic and with an opposing stance, the ones that directly counter it. We provide a large dataset of 3,685 such speeches (in English), annotated for this relation, which hopefully would be of general interest to the NLP community. We explore several algorithms addressing this task, and while some are successful, all fall short of expert human performance, suggesting room for further research. All data collected during this work is freely available for research.",Detecting Speeches|Echo Chamber|echo chambers|opposing stance,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.633.pdf -main.627,Cross-Lingual Semantic Role Labeling with High-Quality Translated Training Corpus,Hao Fei|Meishan Zhang|Donghong Ji,"Many efforts of research are devoted to semantic role labeling (SRL) which is crucial for natural language understanding. Supervised approaches have achieved impressing performances when large-scale corpora are available for resource-rich languages such as English. 
While for the low-resource languages with no annotated SRL dataset, it is still challenging to obtain competitive performances. Cross-lingual SRL is one promising way to address the problem, which has achieved great advances with the help of model transferring and annotation projection. In this paper, we propose a novel alternative based on corpus translation, constructing high-quality training datasets for the target languages from the source gold-standard SRL annotations. Experimental results on Universal Proposition Bank show that the translation-based method is highly effective, and the automatic pseudo datasets can improve the target-language SRL performances significantly.",Cross-Lingual Labeling|semantic labeling|natural understanding|model transferring,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.627.pdf -main.141,Reasoning with Latent Structure Refinement for Document-Level Relation Extraction,Guoshun Nan|Zhijiang Guo|Ivan Sekulic|Wei Lu,"Document-level relation extraction requires integrating information within and across multiple sentences of a document and capturing complex interactions between inter-sentence entities. However, effective aggregation of relevant information in the document remains a challenging research question. Existing approaches construct static document-level graphs based on syntactic trees, co-references or heuristics from the unstructured text to model the dependencies. Unlike previous methods that may not be able to capture rich non-local interactions for inference, we propose a novel model that empowers the relational reasoning across sentences by automatically inducing the latent document-level graph. We further develop a refinement strategy, which enables the model to incrementally aggregate relevant information for multi-hop reasoning. Specifically, our model achieves an F1 score of 59.05 on a large-scale document-level dataset (DocRED), significantly improving over the previous results, and also yields new state-of-the-art results on the CDR and GDA dataset. Furthermore, extensive analyses show that the model is able to discover more accurate inter-sentence relations.",Reasoning|Document-Level Extraction|aggregation information|inference,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.141.pdf -main.394,Joint Modelling of Emotion and Abusive Language Detection,Santhosh Rajamanickam|Pushkar Mishra|Helen Yannakoudakis|Ekaterina Shutova,"The rise of online communication platforms has been accompanied by some undesirable effects, such as the proliferation of aggressive and abusive behaviour online. Aiming to tackle this problem, the natural language processing (NLP) community has experimented with a range of techniques for abuse detection. While achieving substantial success, these methods have so far only focused on modelling the linguistic properties of the comments and the online communities of users, disregarding the emotional state of the users and how this might affect their language. The latter is, however, inextricably linked to abusive behaviour. In this paper, we present the first joint model of emotion and abusive language detection, experimenting in a multi-task learning framework that allows one task to inform the other. 
Our results demonstrate that incorporating affective features leads to significant improvements in abuse detection performance across datasets.",Joint Detection|abuse detection|abusive detection|multi-task framework,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.394.pdf -main.380,Demographics Should Not Be the Reason of Toxicity: Mitigating Discrimination in Text Classifications with Instance Weighting,Guanhua Zhang|Bing Bai|Junqi Zhang|Kun Bai|Conghui Zhu|Tiejun Zhao,"With the recent proliferation of the use of text classifications, researchers have found that there are certain unintended biases in text classification datasets. For example, texts containing some demographic identity-terms (e.g., ""gay"", ""black"") are more likely to be abusive in existing abusive language detection datasets. As a result, models trained with these datasets may consider sentences like ""She makes me happy to be gay"" as abusive simply because of the word ""gay."" In this paper, we formalize the unintended biases in text classification datasets as a kind of selection bias from the non-discrimination distribution to the discrimination distribution. Based on this formalization, we further propose a model-agnostic debiasing training framework by recovering the non-discrimination distribution using instance weighting, which does not require any extra resources or annotations apart from a pre-defined set of demographic identity-terms. Experiments demonstrate that our method can effectively alleviate the impacts of the unintended biases without significantly hurting models' generalization ability.",Mitigating Discrimination|Text Classifications|Discrimination|Instance Weighting,Ethics and NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.380.pdf -main.74,Unsupervised FAQ Retrieval with Question Generation and BERT,Yosi Mass|Boaz Carmeli|Haggai Roitman|David Konopnicki,"We focus on the task of Frequently Asked Questions (FAQ) retrieval. A given user query can be matched against the questions and/or the answers in the FAQ. We present a fully unsupervised method that exploits the FAQ pairs to train two BERT models. The two models match user queries to FAQ answers and questions, respectively. We alleviate the missing labeled data of the latter by automatically generating high-quality question paraphrases. We show that our model is on par and even outperforms supervised models on existing datasets.",Unsupervised Retrieval|Question Generation|Frequently retrieval|fully method,Information Retrieval and Text Mining,Short,https://www.aclweb.org/anthology/2020.acl-main.74.pdf -main.60,Paraphrase Augmented Task-Oriented Dialog Generation,Silin Gao|Yichi Zhang|Zhijian Ou|Zhou Yu,"Neural generative models have achieved promising performance on dialog generation tasks if given a huge data set. However, the lack of high-quality dialog data and the expensive data annotation process greatly limit their application in real world settings. We propose a paraphrase augmented response generation (PARG) framework that jointly trains a paraphrase model and a response generation model to improve the dialog generation performance. We also design a method to automatically construct paraphrase training data set based on dialog state and dialog act labels. PARG is applicable to various dialog generation models, such as TSCP (Lei et al., 2018) and DAMD (Zhang et al., 2019). Experimental results show that the proposed framework improves these state-of-the-art dialog models further on CamRest676 and MultiWOZ. 
PARG also outperforms other data augmentation methods significantly in dialog generation tasks, especially under low resource settings.",Paraphrase Generation|dialog tasks|data process|dialog generation,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.60.pdf -main.419,Human Attention Maps for Text Classification: Do Humans and Neural Networks Focus on the Same Words?,Cansu Sen|Thomas Hartvigsen|Biao Yin|Xiangnan Kong|Elke Rundensteiner,"Motivated by human attention, computational attention mechanisms have been designed to help neural networks adjust their focus on specific parts of the input data. While attention mechanisms are claimed to achieve interpretability, little is known about the actual relationships between machine and human attention. In this work, we conduct the first quantitative assessment of human versus computational attention mechanisms for the text classification task. To achieve this, we design and conduct a large-scale crowd-sourcing study to collect human attention maps that encode the parts of a text that humans focus on when conducting text classification. Based on this new resource of human attention dataset for text classification, YELP-HAT, collected on the publicly available YELP dataset, we perform a quantitative comparative analysis of machine attention maps created by deep learning models and human attention maps. Our analysis offers insights into the relationships between human versus machine attention maps along three dimensions: overlap in word selections, distribution over lexical categories, and context-dependency of sentiment polarity. Our findings open promising future research opportunities ranging from supervised attention to the design of human-centric attention-based explanations.",Text Classification|quantitative mechanisms|text task|large-scale study,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.419.pdf -main.431,Interpreting Pretrained Contextualized Representations via Reductions to Static Embeddings,Rishi Bommasani|Kelly Davis|Claire Cardie,"Contextualized representations (e.g. ELMo, BERT) have become the default pretrained representations for downstream NLP applications. In some settings, this transition has rendered their static embedding predecessors (e.g. Word2Vec, GloVe) obsolete. As a side-effect, we observe that older interpretability methods for static embeddings --- while more diverse and mature than those available for their dynamic counterparts --- are underutilized in studying newer contextualized representations. Consequently, we introduce simple and fully general methods for converting from contextualized representations to static lookup-table embeddings which we apply to 5 popular pretrained models and 9 sets of pretrained weights. Our analysis of the resulting static embeddings notably reveals that pooling over many contexts significantly improves representational quality under intrinsic evaluation. Complementary to analyzing representational quality, we consider social biases encoded in pretrained representations with respect to gender, race/ethnicity, and religion and find that bias is encoded disparately across pretrained models and internal layers even for models with the same training data. 
Concerningly, we find dramatic inconsistencies between social bias estimators for word embeddings.",Interpreting Representations|downstream applications|static embeddings|Pretrained Representations,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.431.pdf -main.357,Pre-training Is (Almost) All You Need: An Application to Commonsense Reasoning,Alexandre Tamborrino|Nicola Pellicanò|Baptiste Pannier|Pascal Voitot|Louise Naudin,"Fine-tuning of pre-trained transformer models has become the standard approach for solving common NLP tasks. Most of the existing approaches rely on a randomly initialized classifier on top of such networks. We argue that this fine-tuning procedure is sub-optimal as the pre-trained model has no prior on the specific classifier labels, while it might have already learned an intrinsic textual representation of the task. In this paper, we introduce a new scoring method that casts a plausibility ranking task in a full-text format and leverages the masked language modeling head tuned during the pre-training phase. We study commonsense reasoning tasks where the model must rank a set of hypotheses given a premise, focusing on the COPA, Swag, HellaSwag and CommonsenseQA datasets. By exploiting our scoring method without fine-tuning, we are able to produce strong baselines (e.g. 80% test accuracy on COPA) that are comparable to supervised approaches. Moreover, when fine-tuning directly on the proposed scoring function, we show that our method provides a much more stable training phase across random restarts (e.g x10 standard deviation reduction on COPA test accuracy) and requires less annotated data than the standard classifier approach to reach equivalent performances.",Commonsense Reasoning|common tasks|plausibility task|pre-training phase,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.357.pdf -main.48,GCAN: Graph-aware Co-Attention Networks for Explainable Fake News Detection on Social Media,Yi-Ju Lu|Cheng-Te Li,"This paper solves the fake news detection problem under a more realistic scenario on social media. Given the source short-text tweet and the corresponding sequence of retweet users without text comments, we aim at predicting whether the source tweet is fake or not, and generating explanation by highlighting the evidences on suspicious retweeters and the words they concern. We develop a novel neural network-based model, Graph-aware Co-Attention Networks (GCAN), to achieve the goal. Extensive experiments conducted on real tweet datasets exhibit that GCAN can significantly outperform state-of-the-art methods by 16% in accuracy on average. In addition, the case studies also show that GCAN can produce reasonable explanations.",Explainable Detection|fake problem|GCAN|Graph-aware Networks,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.acl-main.48.pdf -main.343,CH-SIMS: A Chinese Multimodal Sentiment Analysis Dataset with Fine-grained Annotation of Modality,Wenmeng Yu|Hua Xu|Fanyang Meng|Yilin Zhu|Yixiao Ma|Jiele Wu|Jiyun Zou|Kaicheng Yang,"Previous studies in multimodal sentiment analysis have used limited datasets, which only contain unified multimodal annotations. However, the unified annotations do not always reflect the independent sentiment of single modalities and limit the model to capture the difference between modalities. 
In this paper, we introduce a Chinese single- and multi-modal sentiment analysis dataset, CH-SIMS, which contains 2,281 refined video segments in the wild with both multimodal and independent unimodal annotations. It allows researchers to study the interaction between modalities or use independent unimodal annotations for unimodal sentiment analysis. Furthermore, we propose a multi-task learning framework based on late fusion as the baseline. Extensive experiments on the CH-SIMS show that our methods achieve state-of-the-art performance and learn more distinctive unimodal representations. The full dataset and codes are available for use at https://github.com/thuiar/MMSA.",multimodal analysis|unimodal analysis|CH-SIMS|multimodal annotations,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.acl-main.343.pdf -main.6,Generating Informative Conversational Response using Recurrent Knowledge-Interaction and Knowledge-Copy,Xiexiong Lin|Weiyu Jian|Jianshan He|Taifeng Wang|Wei Chu,"Knowledge-driven conversation approaches have achieved remarkable research attention recently. However, generating an informative response with multiple relevant knowledge without losing fluency and coherence is still one of the main challenges. To address this issue, this paper proposes a method that uses recurrent knowledge interaction among response decoding steps to incorporate appropriate knowledge. Furthermore, we introduce a knowledge copy mechanism using a knowledge-aware pointer network to copy words from external knowledge according to knowledge attention distribution. Our joint neural conversation model which integrates recurrent Knowledge-Interaction and knowledge Copy (KIC) performs well on generating informative responses. Experiments demonstrate that our model with fewer parameters yields significant improvements over competitive baselines on two datasets Wizard-of-Wikipedia(average Bleu +87%; abs.: 0.034) and DuConv(average Bleu +20%; abs.: 0.047) with different knowledge formats (textual & structured) and different languages (English & Chinese).",Generating Response|Knowledge-driven approaches|response steps|knowledge mechanism,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.6.pdf -main.425,"Fatality Killed the Cat or: BabelPic, a Multimodal Dataset for Non-Concrete Concepts",Agostina Calabrese|Michele Bevilacqua|Roberto Navigli,"Thanks to the wealth of high-quality annotated images available in popular repositories such as ImageNet, multimodal language-vision research is in full bloom. However, events, feelings and many other kinds of concepts which can be visually grounded are not well represented in current datasets. Nevertheless, we would expect a wide-coverage language understanding system to be able to classify images depicting recess and remorse, not just cats, dogs and bridges. We fill this gap by presenting BabelPic, a hand-labeled dataset built by cleaning the image-synset association found within the BabelNet Lexical Knowledge Base (LKB). BabelPic explicitly targets non-concrete concepts, thus providing refreshing new data for the community. We also show that pre-trained language-vision systems can be used to further expand the resource by exploiting natural language knowledge available in the LKB.
BabelPic is available for download at http://babelpic.org.",multimodal research|BabelPic|language system|pre-trained systems,Resources and Evaluation,Short,https://www.aclweb.org/anthology/2020.acl-main.425.pdf -main.355,"Two Birds, One Stone: A Simple, Unified Model for Text Generation from Structured and Unstructured Data",Hamidreza Shahidi|Ming Li|Jimmy Lin,"A number of researchers have recently questioned the necessity of increasingly complex neural network (NN) architectures. In particular, several recent papers have shown that simpler, properly tuned models are at least competitive across several NLP tasks. In this work, we show that this is also the case for text generation from structured and unstructured data. We consider neural table-to-text generation and neural question generation (NQG) tasks for text generation from structured and unstructured data, respectively. Table-to-text generation aims to generate a description based on a given table, and NQG is the task of generating a question from a given passage where the generated question can be answered by a certain sub-span of the passage using NN models. Experimental results demonstrate that a basic attention-based seq2seq model trained with the exponential moving average technique achieves the state of the art in both tasks. Code is available at https://github.com/h-shahidi/2birds-gen.",Text Generation|NLP tasks|neural generation|Table-to-text generation,Generation,Short,https://www.aclweb.org/anthology/2020.acl-main.355.pdf -main.433,On the Spontaneous Emergence of Discrete and Compositional Signals,Nur Geffen Lan|Emmanuel Chemla|Shane Steinert-Threlkeld,"We propose a general framework to study language emergence through signaling games with neural agents. Using a continuous latent space, we are able to (i) train using backpropagation, (ii) show that discrete messages nonetheless naturally emerge. We explore whether categorical perception effects follow and show that the messages are not compositional.",language emergence|signaling games|neural agents|backpropagation,Interpretability and Analysis of Models for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.433.pdf -main.427,CraftAssist Instruction Parsing: Semantic Parsing for a Voxel-World Assistant,Kavya Srinet|Yacine Jernite|Jonathan Gray|Arthur Szlam,"We propose a semantic parsing dataset focused on instruction-driven communication with an agent in the game Minecraft. The dataset consists of 7K human utterances and their corresponding parses. Given proper world state, the parses can be interpreted and executed in game. We report the performance of baseline models, and analyze their successes and failures.",Voxel-World Assistant|instruction-driven communication|CraftAssist Parsing|Semantic Parsing,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.427.pdf -main.4,Designing Precise and Robust Dialogue Response Evaluators,Tianyu Zhao|Divesh Lala|Tatsuya Kawahara,"Automatic dialogue response evaluator has been proposed as an alternative to automated metrics and human evaluation. However, existing automatic evaluators achieve only moderate correlation with human judgement and they are not robust. In this work, we propose to build a reference-free evaluator and exploit the power of semi-supervised training and pretrained (masked) language models. Experimental results demonstrate that the proposed evaluator achieves a strong correlation (> 0.6) with human judgement and generalizes robustly to diverse responses and corpora. 
We open-source the code and data in https://github.com/ZHAOTING/dialog-processing.",human evaluation|Precise Evaluators|Automatic evaluator|reference-free evaluator,Dialogue and Interactive Systems,Short,https://www.aclweb.org/anthology/2020.acl-main.4.pdf -main.341,SentiBERT: A Transferable Transformer-Based Architecture for Compositional Sentiment Semantics,Da Yin|Tao Meng|Kai-Wei Chang,"We propose SentiBERT, a variant of BERT that effectively captures compositional sentiment semantics. The model incorporates contextualized representation with binary constituency parse tree to capture semantic composition. Comprehensive experiments demonstrate that SentiBERT achieves competitive performance on phrase-level sentiment classification. We further demonstrate that the sentiment composition learned from the phrase-level annotations on SST can be transferred to other sentiment analysis tasks as well as related tasks, such as emotion classification tasks. Moreover, we conduct ablation studies and design visualization methods to understand SentiBERT. We show that SentiBERT is better than baseline approaches in capturing negation and the contrastive relation and model the compositional sentiment semantics.",Compositional Semantics|phrase-level classification|sentiment tasks|emotion tasks,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.341.pdf -main.76,Fast and Accurate Deep Bidirectional Language Representations for Unsupervised Learning,Joongbo Shin|Yoonhyung Lee|Seunghyun Yoon|Kyomin Jung,"Even though BERT has achieved successful performance improvements in various supervised learning tasks, BERT is still limited by repetitive inferences on unsupervised tasks for the computation of contextual language representations. To resolve this limitation, we propose a novel deep bidirectional language model called a Transformer-based Text Autoencoder (T-TA). The T-TA computes contextual language representations without repetition and displays the benefits of a deep bidirectional architecture, such as that of BERT. In computation time experiments in a CPU environment, the proposed T-TA performs over six times faster than the BERT-like model on a reranking task and twelve times faster on a semantic similarity task. Furthermore, the T-TA shows competitive or even better accuracies than those of BERT on the above tasks. Code is available at https://github.com/joongbo/tta.",Unsupervised Learning|supervised tasks|unsupervised tasks|computation representations,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.76.pdf -main.369,CluBERT: A Cluster-Based Approach for Learning Sense Distributions in Multiple Languages,Tommaso Pasini|Federico Scozzafava|Bianca Scarlini,"Knowing the Most Frequent Sense (MFS) of a word has been proved to help Word Sense Disambiguation (WSD) models significantly. However, the scarcity of sense-annotated data makes it difficult to induce a reliable and high-coverage distribution of the meanings in a language vocabulary. To address this issue, in this paper we present CluBERT, an automatic and multilingual approach for inducing the distributions of word senses from a corpus of raw sentences. Our experiments show that CluBERT learns distributions over English senses that are of higher quality than those extracted by alternative approaches. 
When used to induce the MFS of a lemma, CluBERT attains state-of-the-art results on the English Word Sense Disambiguation tasks and helps to improve the disambiguation performance of two off-the-shelf WSD models. Moreover, our distributions also prove to be effective in other languages, beating all their alternatives for computing the MFS on the multilingual WSD tasks. We release our sense distributions in five different languages at https://github.com/SapienzaNLP/clubert.",English tasks|disambiguation|multilingual tasks|CluBERT,Semantics: Lexical,Long,https://www.aclweb.org/anthology/2020.acl-main.369.pdf -main.62,Semi-Supervised Dialogue Policy Learning via Stochastic Reward Estimation,Xinting Huang|Jianzhong Qi|Yu Sun|Rui Zhang,"Dialogue policy optimization often obtains feedback until task completion in task-oriented dialogue systems. This is insufficient for training intermediate dialogue turns since supervision signals (or rewards) are only provided at the end of dialogues. To address this issue, reward learning has been introduced to learn from state-action pairs of an optimal policy to provide turn-by-turn rewards. This approach requires complete state-action annotations of human-to-human dialogues (i.e., expert demonstrations), which is labor intensive. To overcome this limitation, we propose a novel reward learning approach for semi-supervised policy learning. The proposed approach learns a dynamics model as the reward function which models dialogue progress (i.e., state-action sequences) based on expert demonstrations, either with or without annotations. The dynamics model computes rewards by predicting whether the dialogue progress is consistent with expert demonstrations. We further propose to learn action embeddings for a better generalization of the reward function. The proposed approach outperforms competitive policy learning baselines on MultiWOZ, a benchmark multi-domain dataset.",Semi-Supervised Learning|generalization function|Stochastic Estimation|Dialogue optimization,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.62.pdf -main.396,Toxicity Detection: Does Context Really Matter?,John Pavlopoulos|Jeffrey Sorensen|Lucas Dixon|Nithum Thain|Ion Androutsopoulos,"Moderation is crucial to promoting healthy online discussions. Although several ‘toxicity’ detection datasets and models have been published, most of them ignore the context of the posts, implicitly assuming that comments may be judged independently. We investigate this assumption by focusing on two questions: (a) does context affect the human judgement, and (b) does conditioning on context improve performance of toxicity detection systems? We experiment with Wikipedia conversations, limiting the notion of context to the previous post in the thread and the discussion title. We find that context can both amplify or mitigate the perceived toxicity of posts. Moreover, a small but significant subset of manually labeled posts (5% in one of our experiments) end up having the opposite toxicity labels if the annotators are not provided with context. Surprisingly, we also find no evidence that context actually improves the performance of toxicity classifiers, having tried a range of classifiers and mechanisms to make them context aware. This points to the need for larger datasets of comments annotated in context. 
We make our code and data publicly available.",Toxicity Detection|healthy discussions|toxicity systems|toxicity classifiers,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.396.pdf -main.89,Injecting Numerical Reasoning Skills into Language Models,Mor Geva|Ankit Gupta|Jonathan Berant,"Large pre-trained language models (LMs) are known to encode substantial amounts of linguistic information. However, high-level reasoning skills, such as numerical reasoning, are difficult to learn from a language-modeling objective only. Consequently, existing models for numerical reasoning have used specialized architectures with limited flexibility. In this work, we show that numerical reasoning is amenable to automatic data generation, and thus one can inject this skill into pre-trained LMs, by generating large amounts of data, and training in a multi-task setup. We show that pre-training our model, GenBERT, on this data, dramatically improves performance on DROP (49.3 --> 72.3 F1), reaching performance that matches state-of-the-art models of comparable size, while using a simple and general-purpose encoder-decoder architecture. Moreover, GenBERT generalizes well to math word problem datasets, while maintaining high performance on standard RC tasks. Our approach provides a general recipe for injecting skills into large pre-trained LMs, whenever the skill is amenable to automatic data augmentation.",numerical reasoning|automatic generation|RC tasks|automatic augmentation,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.89.pdf -main.382,Make Up Your Mind! Adversarial Generation of Inconsistent Natural Language Explanations,Oana-Maria Camburu|Brendan Shillingford|Pasquale Minervini|Thomas Lukasiewicz|Phil Blunsom,"To increase trust in artificial intelligence systems, a promising research direction consists of designing neural models capable of generating natural language explanations for their predictions. In this work, we show that such models are nonetheless prone to generating mutually inconsistent explanations, such as ''Because there is a dog in the image.'' and ''Because there is no dog in the [same] image.'', exposing flaws in either the decision-making process of the model or in the generation of the explanations. We introduce a simple yet effective adversarial framework for sanity checking models against the generation of inconsistent natural language explanations. Moreover, as part of the framework, we address the problem of adversarial attacks with full target sequences, a scenario that was not previously addressed in sequence-to-sequence attacks. Finally, we apply our framework on a state-of-the-art neural natural language inference model that provides natural language explanations for its predictions. Our framework shows that this model is capable of generating a significant number of inconsistent explanations.",Adversarial Explanations|artificial systems|generation explanations|sanity models,Interpretability and Analysis of Models for NLP,Short,https://www.aclweb.org/anthology/2020.acl-main.382.pdf -main.631,Conditional Augmentation for Aspect Term Extraction via Masked Sequence-to-Sequence Generation,Kun Li|Chengbo Chen|Xiaojun Quan|Qing Ling|Yan Song,"Aspect term extraction aims to extract aspect terms from review texts as opinion targets for sentiment analysis. One of the big challenges with this task is the lack of sufficient annotated data. 
While data augmentation is potentially an effective technique to address the above issue, it is uncontrollable as it may change aspect words and aspect labels unexpectedly. In this paper, we formulate the data augmentation as a conditional generation task: generating a new sentence while preserving the original opinion targets and labels. We propose a masked sequence-to-sequence method for conditional augmentation of aspect term extraction. Unlike existing augmentation approaches, ours is controllable and allows to generate more diversified sentences. Experimental results confirm that our method alleviates the data scarcity problem significantly. It also effectively boosts the performances of several current models for aspect term extraction.",Conditional Augmentation|Aspect Extraction|sentiment analysis|data augmentation,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.631.pdf -main.157,Will-They-Won't-They: A Very Large Dataset for Stance Detection on Twitter,Costanza Conforti|Jakob Berndt|Mohammad Taher Pilehvar|Chryssi Giannitsarou|Flavio Toxvaerd|Nigel Collier,"We present a new challenging stance detection dataset, called Will-They-Won’t-They (WT–WT), which contains 51,284 tweets in English, making it by far the largest available dataset of the type. All the annotations are carried out by experts; therefore, the dataset constitutes a high-quality and reliable benchmark for future research in stance detection. Our experiments with a wide range of recent state-of-the-art stance detection systems show that the dataset poses a strong challenge to existing models in this domain.",Stance Detection|stance systems|Will-They-Won’t-They WT|Will-They-Won’t-They,Resources and Evaluation,Short,https://www.aclweb.org/anthology/2020.acl-main.157.pdf -main.143,Bilingual Dictionary Based Neural Machine Translation without Using Parallel Sentences,Xiangyu Duan|Baijun Ji|Hao Jia|Min Tan|Min Zhang|Boxing Chen|Weihua Luo|Yue Zhang,"In this paper, we propose a new task of machine translation (MT), which is based on no parallel sentences but can refer to a ground-truth bilingual dictionary. Motivated by the ability of a monolingual speaker learning to translate via looking up the bilingual dictionary, we propose the task to see how much potential an MT system can attain using the bilingual dictionary and large scale monolingual corpora, while is independent on parallel sentences. We propose anchored training (AT) to tackle the task. AT uses the bilingual dictionary to establish anchoring points for closing the gap between source language and target language. Experiments on various language pairs show that our approaches are significantly better than various baselines, including dictionary-based word-by-word translation, dictionary-supervised cross-lingual word embedding transformation, and unsupervised MT. 
On distant language pairs that are hard for unsupervised MT to perform well, AT performs remarkably better, achieving performances comparable to supervised SMT trained on more than 4M parallel sentences.",Bilingual Translation|machine MT|MT|dictionary-based translation,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.143.pdf -main.625,Language to Network: Conditional Parameter Adaptation with Natural Language Descriptions,Tian Jin|Zhun Liu|Shengjia Yan|Alexandre Eichenberger|Louis-Philippe Morency,"Transfer learning using ImageNet pre-trained models has been the de facto approach in a wide range of computer vision tasks. However, fine-tuning still requires task-specific training data. In this paper, we propose N³ (Neural Networks from Natural Language) - a new paradigm of synthesizing task-specific neural networks from language descriptions and a generic pre-trained model. N³ leverages language descriptions to generate parameter adaptations as well as a new task-specific classification layer for a pre-trained neural network, effectively ``fine-tuning'' the network for a new task using only language descriptions as input. To the best of our knowledge, N³ is the first method to synthesize entire neural networks from natural language. Experimental results show that N³ can out-perform previous natural-language based zero-shot learning methods across 4 different zero-shot image classification benchmarks. We also demonstrate a simple method to help identify keywords in language descriptions leveraged by N³ when synthesizing model parameters.",Transfer learning|computer tasks|fine-tuning|Conditional Adaptation,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.625.pdf -main.619,Gender in Danger? Evaluating Speech Translation Technology on the MuST-SHE Corpus,Luisa Bentivogli|Beatrice Savoldi|Matteo Negri|Mattia A. Di Gangi|Roldano Cattoni|Marco Turchi,"Translating from languages without productive grammatical gender like English into gender-marked languages is a well-known difficulty for machines. This difficulty is also due to the fact that the training data on which models are built typically reflect the asymmetries of natural languages, gender bias included. Exclusively fed with textual data, machine translation is intrinsically constrained by the fact that the input sentence does not always contain clues about the gender identity of the referred human entities. But what happens with speech translation, where the input is an audio signal? Can audio provide additional information to reduce gender bias? We present the first thorough investigation of gender bias in speech translation, contributing with: i) the release of a benchmark useful for future studies, and ii) the comparison of different technologies (cascade and end-to-end) on two language directions (English-Italian/French).",Speech Technology|Translating|machines|machine translation,Machine Translation,Long,https://www.aclweb.org/anthology/2020.acl-main.619.pdf -main.194,MixText: Linguistically-Informed Interpolation of Hidden Space for Semi-Supervised Text Classification,Jiaao Chen|Zichao Yang|Diyi Yang,"This paper presents MixText, a semi-supervised learning method for text classification, which uses our newly designed data augmentation method called TMix. TMix creates a large amount of augmented training samples by interpolating text in hidden space. 
Moreover, we leverage recent advances in data augmentation to guess low-entropy labels for unlabeled data, hence making them as easy to use as labeled data. By mixing labeled, unlabeled and augmented data, MixText significantly outperformed current pre-trained and fined-tuned models and other state-of-the-art semi-supervised learning methods on several text classification benchmarks. The improvement is especially prominent when supervision is extremely limited. We have publicly released our code at https://github.com/GT-SALT/MixText.",Semi-Supervised Classification|text classification|data augmentation|supervision,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.194.pdf -main.180,Speakers enhance contextually confusable words,Eric Meinhardt|Eric Bakovic|Leon Bergen,"Recent work has found evidence that natural languages are shaped by pressures for efficient communication — e.g. the more contextually predictable a word is, the fewer speech sounds or syllables it has (Piantadosi et al. 2011). Research on the degree to which speech and language are shaped by pressures for effective communication — robustness in the face of noise and uncertainty — has been more equivocal. We develop a measure of contextual confusability during word recognition based on psychoacoustic data. Applying this measure to naturalistic speech corpora, we find evidence suggesting that speakers alter their productions to make contextually more confusable words easier to understand.",word recognition|contextually words|speech sounds|noise,Cognitive Modeling and Psycholinguistics,Long,https://www.aclweb.org/anthology/2020.acl-main.180.pdf -main.745,TaBERT: Pretraining for Joint Understanding of Textual and Tabular Data,Pengcheng Yin|Graham Neubig|Wen-tau Yih|Sebastian Riedel,"Recent years have witnessed the burgeoning of pretrained language models (LMs) for text-based natural language (NL) understanding tasks. Such models are typically trained on free-form NL text, hence may not be suitable for tasks like semantic parsing over structured data, which require reasoning over both free-form NL questions and structured tabular data (e.g., database tables). In this paper we present TaBERT, a pretrained LM that jointly learns representations for NL sentences and (semi-)structured tables. TaBERT is trained on a large corpus of 26 million tables and their English contexts. In experiments, neural semantic parsers using TaBERT as feature representation layers achieve new best results on the challenging weakly-supervised semantic parsing benchmark WikiTableQuestions, while performing competitively on the text-to-SQL dataset Spider.",Joint Data|text-based tasks|semantic parsing|TaBERT,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.745.pdf -main.751,TXtract: Taxonomy-Aware Knowledge Extraction for Thousands of Product Categories,Giannis Karamanolakis|Jun Ma|Xin Luna Dong,"Extracting structured knowledge from product profiles is crucial for various applications in e-Commerce. State-of-the-art approaches for knowledge extraction were each designed for a single category of product, and thus do not apply to real-life e-Commerce scenarios, which often contain thousands of diverse categories. This paper proposes TXtract, a taxonomy-aware knowledge extraction model that applies to thousands of product categories organized in a hierarchical taxonomy. 
Through category conditional self-attention and multi-task learning, our approach is both scalable, as it trains a single model for thousands of categories, and effective, as it extracts category-specific attribute values. Experiments on products from a taxonomy with 4,000 categories show that TXtract outperforms state-of-the-art approaches by up to 10% in F1 and 15% in coverage across all categories.",Taxonomy-Aware Extraction|e-Commerce|knowledge extraction|TXtract,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.751.pdf -main.221,Neural Generation of Dialogue Response Timings,Matthew Roddy|Naomi Harte,"The timings of spoken response offsets in human dialogue have been shown to vary based on contextual elements of the dialogue. We propose neural models that simulate the distributions of these response offsets, taking into account the response turn as well as the preceding turn. The models are designed to be integrated into the pipeline of an incremental spoken dialogue system (SDS). We evaluate our models using offline experiments as well as human listening tests. We show that human listeners consider certain response timings to be more natural based on the dialogue context. The introduction of these models into SDS pipelines could increase the perceived naturalness of interactions.",Neural Timings|neural models|incremental system|SDS,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.221.pdf -main.547,Neural Graph Matching Networks for Chinese Short Text Matching,Lu Chen|Yanbin Zhao|Boer Lv|Lesheng Jin|Zhi Chen|Su Zhu|Kai Yu,"Chinese short text matching usually employs word sequences rather than character sequences to get better performance. However, Chinese word segmentation can be erroneous, ambiguous or inconsistent, which consequently hurts the final matching performance. To address this problem, we propose neural graph matching networks, a novel sentence matching framework capable of dealing with multi-granular input information. Instead of a character sequence or a single word sequence, paired word lattices formed from multiple word segmentation hypotheses are used as input and the model learns a graph representation according to an attentive graph matching mechanism. Experiments on two Chinese datasets show that our models outperform the state-of-the-art short text matching models.",Chinese Matching|Chinese segmentation|matching|Neural Networks,Semantics: Textual Inference and Other Areas of Semantics,Short,https://www.aclweb.org/anthology/2020.acl-main.547.pdf -main.553,Heterogeneous Graph Neural Networks for Extractive Document Summarization,Danqing Wang|Pengfei Liu|Yining Zheng|Xipeng Qiu|Xuanjing Huang,"As a crucial step in extractive document summarization, learning cross-sentence relations has been explored by a plethora of approaches. An intuitive way is to put them in the graph-based neural network, which has a more complex structure for capturing inter-sentence relationships. In this paper, we present a heterogeneous graph-based neural network for extractive summarization (HETERSUMGRAPH), which contains semantic nodes of different granularity levels apart from sentences. These additional nodes act as the intermediary between sentences and enrich the cross-sentence relations. Besides, our graph structure is flexible in natural extension from a single-document setting to multi-document via introducing document nodes. 
To our knowledge, we are the first one to introduce different types of nodes into graph-based neural networks for extractive document summarization and perform a comprehensive qualitative analysis to investigate their benefits. The code will be released on Github.",Extractive Summarization|learning relations|heterogeneous summarization|Heterogeneous Networks,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.553.pdf
-main.235,A Batch Normalized Inference Network Keeps the KL Vanishing Away,Qile Zhu|Wei Bi|Xiaojiang Liu|Xiyao Ma|Xiaolin Li|Dapeng Wu,"Variational Autoencoder (VAE) is widely used as a generative model to approximate a model's posterior on latent variables by combining the amortized variational inference and deep neural networks. However, when paired with strong autoregressive decoders, VAE often converges to a degenerated local optimum known as ``posterior collapse''. Previous approaches consider the Kullback–Leibler divergence (KL) individual for each datapoint. We propose to let the KL follow a distribution across the whole dataset, and analyze that it is sufficient to prevent posterior collapse by keeping the expectation of the KL's distribution positive. Then we propose Batch Normalized-VAE (BN-VAE), a simple but effective approach to set a lower bound of the expectation by regularizing the distribution of the approximate posterior's parameters. Without introducing any new model component or modifying the objective, our approach can avoid the posterior collapse effectively and efficiently. We further show that the proposed BN-VAE can be extended to conditional VAE (CVAE). Empirically, our approach surpasses strong autoregressive baselines on language modeling, text classification and dialogue generation, and rivals more complex approaches while keeping almost the same training time as VAE.",amortized inference|language modeling|text classification|dialogue generation,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.235.pdf
-main.209,Can We Predict New Facts with Open Knowledge Graph Embeddings? A Benchmark for Open Link Prediction,Samuel Broscheit|Kiril Gashteovski|Yanjie Wang|Rainer Gemulla,"Open Information Extraction systems extract (“subject text”, “relation text”, “object text”) triples from raw text. Some triples are textual versions of facts, i.e., non-canonicalized mentions of entities and relations. In this paper, we investigate whether it is possible to infer new facts directly from the open knowledge graph without any canonicalization or any supervision from curated knowledge. For this purpose, we propose the open link prediction task,i.e., predicting test facts by completing (“subject text”, “relation text”, ?) questions. An evaluation in such a setup raises the question if a correct prediction is actually a new fact that was induced by reasoning over the open knowledge graph or if it can be trivially explained. For example, facts can appear in different paraphrased textual variants, which can lead to test leakage. To this end, we propose an evaluation protocol and a methodology for creating the open link prediction benchmark OlpBench. We performed experiments with a prototypical knowledge graph embedding model for openlink prediction. While the task is very challenging, our results suggests that it is possible to predict genuinely new facts, which can not be trivially explained.",Open Prediction|open task|predicting facts|openlink prediction,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.209.pdf
-main.584,Knowledge Supports Visual Language Grounding: A Case Study on Colour Terms,Simeon Schüz|Sina Zarrieß,"In human cognition, world knowledge supports the perception of object colours: knowing that trees are typically green helps to perceive their colour in certain contexts. We go beyond previous studies on colour terms using isolated colour swatches and study visual grounding of colour terms in realistic objects. Our models integrate processing of visual information and object-specific knowledge via hard-coded (late) or learned (early) fusion. We find that both models consistently outperform a bottom-up baseline that predicts colour terms solely from visual inputs, but show interesting differences when predicting atypical colours of so-called colour diagnostic objects. Our models also achieve promising results when tested on new object categories not seen during training.",Knowledge Grounding|human cognition|visual terms|bottom-up baseline,"Language Grounding to Vision, Robotics and Beyond",Short,https://www.aclweb.org/anthology/2020.acl-main.584.pdf
-main.590,Evaluating and Enhancing the Robustness of Neural Network-based Dependency Parsing Models with Adversarial Examples,Xiaoqing Zheng|Jiehang Zeng|Yi Zhou|Cho-Jui Hsieh|Minhao Cheng|Xuanjing Huang,"Despite achieving prominent performance on many important tasks, it has been reported that neural networks are vulnerable to adversarial examples. Previously studies along this line mainly focused on semantic tasks such as sentiment analysis, question answering and reading comprehension. In this study, we show that adversarial examples also exist in dependency parsing: we propose two approaches to study where and how parsers make mistakes by searching over perturbations to existing texts at sentence and phrase levels, and design algorithms to construct such examples in both of the black-box and white-box settings. Our experiments with one of state-of-the-art parsers on the English Penn Treebank (PTB) show that up to 77% of input examples admit adversarial perturbations, and we also show that the robustness of parsing models can be improved by crafting high-quality adversaries and including them in the training stage, while suffering little to no performance drop on the clean input data.",semantic tasks|sentiment analysis|question answering|reading comprehension,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.590.pdf
-main.591,Exploiting Syntactic Structure for Better Language Modeling: A Syntactic Distance Approach,Wenyu Du|Zhouhan Lin|Yikang Shen|Timothy J. O'Donnell|Yoshua Bengio|Yue Zhang,"It is commonly believed that knowledge of syntactic structure should improve language modeling. However, effectively and computationally efficiently incorporating syntactic structure into neural language models has been a challenging topic. In this paper, we make use of a multi-task objective, i.e., the models simultaneously predict words as well as ground truth parse trees in a form called “syntactic distances”, where information between these two separate objectives shares the same intermediate representation. Experimental results on the Penn Treebank and Chinese Treebank datasets show that when ground truth parse trees are provided as additional training signals, the model is able to achieve lower perplexity and induce trees with better quality.",Language Modeling|Syntactic Approach|neural models|intermediate representation,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.591.pdf
-main.585,Span-based Localizing Network for Natural Language Video Localization,Hao Zhang|Aixin Sun|Wei Jing|Joey Tianyi Zhou,"Given an untrimmed video and a text query, natural language video localization (NLVL) is to locate a matching span from the video that semantically corresponds to the query. Existing solutions formulate NLVL either as a ranking task and apply multimodal matching architecture, or as a regression task to directly regress the target video span. In this work, we address NLVL task with a span-based QA approach by treating the input video as text passage. We propose a video span localizing network (VSLNet), on top of the standard span-based QA framework, to address NLVL. The proposed VSLNet tackles the differences between NLVL and span-based QA through a simple and yet effective query-guided highlighting (QGH) strategy. The QGH guides VSLNet to search for matching video span within a highlighted region. Through extensive experiments on three benchmark datasets, we show that the proposed VSLNet outperforms the state-of-the-art methods; and adopting span-based QA framework is a promising direction to solve NLVL.",Natural Localization|NLVL|ranking task|regression task,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.acl-main.585.pdf
-main.208,Semantic Scaffolds for Pseudocode-to-Code Generation,Ruiqi Zhong|Mitchell Stern|Dan Klein,"We propose a method for program generation based on semantic scaffolds, lightweight structures representing the high-level semantic and syntactic composition of a program. By first searching over plausible scaffolds then using these as constraints for a beam search over programs, we achieve better coverage of the search space when compared with existing techniques. We apply our hierarchical search method to the SPoC dataset for pseudocode-to-code generation, in which we are given line-level natural language pseudocode annotations and aim to produce a program satisfying execution-based test cases. By using semantic scaffolds during inference, we achieve a 10% absolute improvement in top-100 accuracy over the previous state-of-the-art. Additionally, we require only 11 candidates to reach the top-3000 performance of the previous best approach when tested against unseen problems, demonstrating a substantial improvement in efficiency.",Pseudocode-to-Code Generation|program generation|inference|Semantic Scaffolds,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.208.pdf
-main.552,Extractive Summarization as Text Matching,Ming Zhong|Pengfei Liu|Yiran Chen|Danqing Wang|Xipeng Qiu|Xuanjing Huang,"This paper creates a paradigm shift with regard to the way we build neural extractive summarization systems. Instead of following the commonly used framework of extracting sentences individually and modeling the relationship between sentences, we formulate the extractive summarization task as a semantic text matching problem, in which a source document and candidate summaries will be (extracted from the original text) matched in a semantic space. Notably, this paradigm shift to semantic matching framework is well-grounded in our comprehensive analysis of the inherent gap between sentence-level and summary-level extractors based on the property of the dataset. Besides, even instantiating the framework with a simple form of a matching model, we have driven the state-of-the-art extractive result on CNN/DailyMail to a new level (44.41 in ROUGE-1). Experiments on the other five datasets also show the effectiveness of the matching framework. We believe the power of this matching-based summarization framework has not been fully exploited. To encourage more instantiations in the future, we have released our codes, processed dataset, as well as generated summaries in {https://github.com/maszhongming/MatchSum}.",Extractive Summarization|Text Matching|extractive task|semantic problem,Summarization,Long,https://www.aclweb.org/anthology/2020.acl-main.552.pdf
-main.234,What is Learned in Visually Grounded Neural Syntax Acquisition,Noriyuki Kojima|Hadar Averbuch-Elor|Alexander Rush|Yoav Artzi,"Visual features are a promising signal for learning bootstrap textual models. However, blackbox learning models make it difficult to isolate the specific contribution of visual components. In this analysis, we consider the case study of the Visually Grounded Neural Syntax Learner (Shi et al., 2019), a recent approach for learning syntax from a visual training signal. By constructing simplified versions of the model, we isolate the core factors that yield the model's strong performance. Contrary to what the model might be capable of learning, we find significantly less expressive versions produce similar predictions and perform just as well, or even better. We also find that a simple lexical signal of noun concreteness plays the main role in the model's predictions as opposed to more complex syntactic reasoning.",Visually Acquisition|bootstrap models|blackbox models|visual components,"Language Grounding to Vision, Robotics and Beyond",Short,https://www.aclweb.org/anthology/2020.acl-main.234.pdf
-main.220,Learning an Unreferenced Metric for Online Dialogue Evaluation,Koustuv Sinha|Prasanna Parthasarathi|Jasmine Wang|Ryan Lowe|William L. Hamilton|Joelle Pineau,"Evaluating the quality of a dialogue interaction between two agents is a difficult task, especially in open-domain chit-chat style dialogue. There have been recent efforts to develop automatic dialogue evaluation metrics, but most of them do not generalize to unseen datasets and/or need a human-generated reference response during inference, making it infeasible for online evaluation. Here, we propose an unreferenced automated evaluation metric that uses large pre-trained language models to extract latent representations of utterances, and leverages the temporal transitions that exist between them. We show that our model achieves higher correlation with human annotations in an online setting, while not requiring true responses for comparison during inference.",Online Evaluation|inference|online setting|Unreferenced Metric,Dialogue and Interactive Systems,Short,https://www.aclweb.org/anthology/2020.acl-main.220.pdf
-main.546,NeuInfer: Knowledge Inference on N-ary Facts,Saiping Guan|Xiaolong Jin|Jiafeng Guo|Yuanzhuo Wang|Xueqi Cheng,"Knowledge inference on knowledge graph has attracted extensive attention, which aims to find out connotative valid facts in knowledge graph and is very helpful for improving the performance of many downstream applications. However, researchers have mainly poured attention to knowledge inference on binary facts. The studies on n-ary facts are relatively scarcer, although they are also ubiquitous in the real world. Therefore, this paper addresses knowledge inference on n-ary facts. We represent each n-ary fact as a primary triple coupled with a set of its auxiliary descriptive attribute-value pair(s). We further propose a neural network model, NeuInfer, for knowledge inference on n-ary facts. Besides handling the common task to infer an unknown element in a whole fact, NeuInfer can cope with a new type of task, flexible knowledge inference. It aims to infer an unknown element in a partial fact consisting of the primary triple coupled with any number of its auxiliary description(s). Experimental results demonstrate the remarkable superiority of NeuInfer.",Knowledge Inference|NeuInfer|neural model|N-ary Facts,Semantics: Textual Inference and Other Areas of Semantics,Long,https://www.aclweb.org/anthology/2020.acl-main.546.pdf
-main.778,Treebank Embedding Vectors for Out-of-domain Dependency Parsing,Joachim Wagner|James Barry|Jennifer Foster,"A recent advance in monolingual dependency parsing is the idea of a treebank embedding vector, which allows all treebanks for a particular language to be used as training data while at the same time allowing the model to prefer training data from one treebank over others and to select the preferred treebank at test time. We build on this idea by 1) introducing a method to predict a treebank vector for sentences that do not come from a treebank used in training, and 2) exploring what happens when we move away from predefined treebank embedding vectors during test time and instead devise tailored interpolations. We show that 1) there are interpolated vectors that are superior to the predefined ones, and 2) treebank vectors can be predicted with sufficient accuracy, for nine out of ten test languages, to match the performance of an oracle approach that knows the most suitable predefined treebank embedding for the test set.",Out-of-domain Parsing|monolingual parsing|Treebank Vectors|treebank vector,"Syntax: Tagging, Chunking and Parsing",Short,https://www.aclweb.org/anthology/2020.acl-main.778.pdf
-main.750,Multi-Domain Named Entity Recognition with Genre-Aware and Agnostic Inference,Jing Wang|Mayank Kulkarni|Daniel Preotiuc-Pietro,"Named entity recognition is a key component of many text processing pipelines and it is thus essential for this component to be robust to different types of input. However, domain transfer of NER models with data from multiple genres has not been widely studied. To this end, we conduct NER experiments in three predictive setups on data from: a) multiple domains; b) multiple domains where the genre label is unknown at inference time; c) domains not encountered in training. We introduce a new architecture tailored to this task by using shared and private domain parameters and multi-task learning. This consistently outperforms all other baseline and competitive methods on all three experimental setups, with differences ranging between +1.95 to +3.11 average F1 across multiple genres when compared to standard approaches. These results illustrate the challenges that need to be taken into account when building real-world NLP applications that are robust to various types of text and the methods that can help, at least partially, alleviate these issues.",Multi-Domain Recognition|Named recognition|domain models|NER,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.750.pdf
-main.744,Structured Tuning for Semantic Role Labeling,Tao Li|Parth Anand Jawale|Martha Palmer|Vivek Srikumar,"Recent neural network-driven semantic role labeling (SRL) systems have shown impressive improvements in F1 scores. These improvements are due to expressive input representations, which, at least at the surface, are orthogonal to knowledge-rich constrained decoding mechanisms that helped linear SRL models. Introducing the benefits of structure to inform neural models presents a methodological challenge. In this paper, we present a structured tuning framework to improve models using softened constraints only at training time. Our framework leverages the expressiveness of neural networks and provides supervision with structured loss components. We start with a strong baseline (RoBERTa) to validate the impact of our approach, and show that our framework outperforms the baseline by learning to comply with declarative constraints. Additionally, our experiments with smaller training sizes show that we can achieve consistent improvements under low-resource scenarios.",Semantic Labeling|Structured Tuning|expressive representations|knowledge-rich mechanisms,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.744.pdf
-main.181,What determines the order of adjectives in English? Comparing efficiency-based theories using dependency treebanks,Richard Futrell|William Dyer|Greg Scontras,"We take up the scientific question of what determines the preferred order of adjectives in English, in phrases such as big blue box where multiple adjectives modify a following noun. We implement and test four quantitative theories, all of which are theoretically motivated in terms of efficiency in human language production and comprehension. The four theories we test are subjectivity (Scontras et al., 2017), information locality (Futrell, 2019), integration cost (Dyer, 2017), and information gain, which we introduce. We evaluate theories based on their ability to predict orders of unseen adjectives in hand-parsed and automatically-parsed dependency treebanks. We find that subjectivity, information locality, and information gain are all strong predictors, with some evidence for a two-factor account, where subjectivity and information gain reflect a factor involving semantics, and information locality reflects collocational preferences.",efficiency-based theories|order adjectives|information locality|integration cost,Cognitive Modeling and Psycholinguistics,Long,https://www.aclweb.org/anthology/2020.acl-main.181.pdf
-main.195,MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices,Zhiqing Sun|Hongkun Yu|Xiaodan Song|Renjie Liu|Yiming Yang|Denny Zhou,"Natural Language Processing (NLP) has recently achieved great success by using huge pre-trained models with hundreds of millions of parameters. However, these models suffer from heavy model sizes and high latency such that they cannot be deployed to resource-limited mobile devices. In this paper, we propose MobileBERT for compressing and accelerating the popular BERT model. Like the original BERT, MobileBERT is task-agnostic, that is, it can be generically applied to various downstream NLP tasks via simple fine-tuning. Basically, MobileBERT is a thin version of BERT_LARGE, while equipped with bottleneck structures and a carefully designed balance between self-attentions and feed-forward networks. To train MobileBERT, we first train a specially designed teacher model, an inverted-bottleneck incorporated BERT_LARGE model. Then, we conduct knowledge transfer from this teacher to MobileBERT. Empirical studies show that MobileBERT is 4.3x smaller and 5.5x faster than BERT_BASE while achieving competitive results on well-known benchmarks. On the natural language inference tasks of GLUE, MobileBERT achieves a GLUE score of 77.7 (0.6 lower than BERT_BASE), and 62 ms latency on a Pixel 4 phone. On the SQuAD v1.1/v2.0 question answering task, MobileBERT achieves a dev F1 score of 90.0/79.2 (1.5/2.1 higher than BERT_BASE).",Natural NLP|NLP tasks|knowledge transfer|natural tasks,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.195.pdf
-main.618,Classification-Based Self-Learning for Weakly Supervised Bilingual Lexicon Induction,Mladen Karan|Ivan Vulić|Anna Korhonen|Goran Glavaš,"Effective projection-based cross-lingual word embedding (CLWE) induction critically relies on the iterative self-learning procedure. It gradually expands the initial small seed dictionary to learn improved cross-lingual mappings. In this work, we present ClassyMap, a classification-based approach to self-learning, yielding a more robust and a more effective induction of projection-based CLWEs. Unlike prior self-learning methods, our approach allows for integration of diverse features into the iterative process. We show the benefits of ClassyMap for bilingual lexicon induction: we report consistent improvements in a weakly supervised setup (500 seed translation pairs) on a benchmark with 28 language pairs.",Weakly Induction|self-learning|induction CLWEs|bilingual induction,Machine Translation,Short,https://www.aclweb.org/anthology/2020.acl-main.618.pdf
-main.142,TACRED Revisited: A Thorough Evaluation of the TACRED Relation Extraction Task,Christoph Alt|Aleksandra Gabryszak|Leonhard Hennig,"TACRED is one of the largest, most widely used crowdsourced datasets in Relation Extraction (RE). But, even with recent advances in unsupervised pre-training and knowledge enhanced neural RE, models still show a high error rate. In this paper, we investigate the questions: Have we reached a performance ceiling or is there still room for improvement? And how do crowd annotations, dataset, and models contribute to this error rate? To answer these questions, we first validate the most challenging 5K examples in the development and test sets using trained annotators. We find that label errors account for 8% absolute F1 test error, and that more than 50% of the examples need to be relabeled. On the relabeled test set the average F1 score of a large baseline model set improves from 62.1 to 70.1. After validation, we analyze misclassifications on the challenging instances, categorize them into linguistically motivated error groups, and verify the resulting error hypotheses on three state-of-the-art RE models. We show that two groups of ambiguous relations are responsible for most of the remaining errors and that models may adopt shallow heuristics on the dataset when entities are not masked.",TACRED Task|Relation Extraction|Relation RE|unsupervised RE,Information Extraction,Long,https://www.aclweb.org/anthology/2020.acl-main.142.pdf
-main.624,From Zero to Hero: Human-In-The-Loop Entity Linking in Low Resource Domains,Jan-Christoph Klie|Richard Eckart de Castilho|Iryna Gurevych,"Entity linking (EL) is concerned with disambiguating entity mentions in a text against knowledge bases (KB). It is crucial in a considerable number of fields like humanities, technical writing and biomedical sciences to enrich texts with semantics and discover more knowledge. The use of EL in such domains requires handling noisy texts, low resource settings and domain-specific KBs. Existing approaches are mostly inappropriate for this, as they depend on training data. However, in the above scenario, there exists hardly annotated data, and it needs to be created from scratch. We therefore present a novel domain-agnostic Human-In-The-Loop annotation approach: we use recommenders that suggest potential concepts and adaptive candidate ranking, thereby speeding up the overall annotation process and making it less tedious for users. We evaluate our ranking approach in a simulation on difficult texts and show that it greatly outperforms a strong baseline in ranking accuracy. In a user study, the annotation speed improves by 35% compared to annotating without interactive support; users report that they strongly prefer our system. An open-source and ready-to-use implementation based on the text annotation platform INCEpTION (https://inception-project.github.io) is made available.",Human-In-The-Loop Linking|Entity linking|disambiguating mentions|annotation process,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.624.pdf
-main.630,tBERT: Topic Models and BERT Joining Forces for Semantic Similarity Detection,Nicole Peinelt|Dong Nguyen|Maria Liakata,Semantic similarity detection is a fundamental task in natural language understanding. Adding topic information has been useful for previous feature-engineered semantic similarity models as well as neural models for other tasks. There is currently no standard way of combining topics with pretrained contextual representations such as BERT. We propose a novel topic-informed BERT-based architecture for pairwise semantic similarity detection and show that our model improves performance over strong neural baselines across a variety of English language datasets. We find that the addition of topics to BERT helps particularly with resolving domain-specific cases.,Semantic Detection|natural understanding|pairwise detection|tBERT,Semantics: Sentence Level,Short,https://www.aclweb.org/anthology/2020.acl-main.630.pdf
-main.156,A Monolingual Approach to Contextualized Word Embeddings for Mid-Resource Languages,Pedro Javier Ortiz Suárez|Laurent Romary|Benoît Sagot,"We use the multilingual OSCAR corpus, extracted from Common Crawl via language classification, filtering and cleaning, to train monolingual contextualized word embeddings (ELMo) for several mid-resource languages. We then compare the performance of OSCAR-based and Wikipedia-based ELMo embeddings for these languages on the part-of-speech tagging and parsing tasks. We show that, despite the noise in the Common-Crawl-based OSCAR data, embeddings trained on OSCAR perform much better than monolingual embeddings trained on Wikipedia. They actually equal or improve the current state of the art in tagging and parsing for all five languages. In particular, they also improve over multilingual Wikipedia-based contextual embeddings (multilingual BERT), which almost always constitutes the previous state of the art, thereby showing that the benefit of a larger, more diverse corpus surpasses the cross-lingual benefit of multilingual embedding architectures.",Contextualized Embeddings|cleaning|part-of-speech tasks|parsing tasks,Resources and Evaluation,Long,https://www.aclweb.org/anthology/2020.acl-main.156.pdf
-main.383,Perturbed Masking: Parameter-free Probing for Analyzing and Interpreting BERT,Zhiyong Wu|Yun Chen|Ben Kao|Qun Liu,"By introducing a small set of additional parameters, a probe learns to solve specific linguistic tasks (e.g., dependency parsing) in a supervised manner using feature representations (e.g., contextualized embeddings). The effectiveness of such probing tasks is taken as evidence that the pre-trained model encodes linguistic knowledge. However, this approach of evaluating a language model is undermined by the uncertainty of the amount of knowledge that is learned by the probe itself. Complementary to those works, we propose a parameter-free probing technique for analyzing pre-trained language models (e.g., BERT). Our method does not require direct supervision from the probing tasks, nor do we introduce additional parameters to the probing process. Our experiments on BERT show that syntactic trees recovered from BERT using our method are significantly better than linguistically-uninformed baselines. We further feed the empirically induced dependency structures into a downstream sentiment classification task and find its improvement compatible with or even superior to a human-designed dependency schema.",Analyzing BERT|linguistic tasks|dependency parsing|probing tasks,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.383.pdf
-main.88,Explicit Memory Tracker with Coarse-to-Fine Reasoning for Conversational Machine Reading,Yifan Gao|Chien-Sheng Wu|Shafiq Joty|Caiming Xiong|Richard Socher|Irwin King|Michael Lyu|Steven C.H. Hoi,"The goal of conversational machine reading is to answer user questions given a knowledge base text which may require asking clarification questions. Existing approaches are limited in their decision making due to struggles in extracting question-related rules and reasoning about them. In this paper, we present a new framework of conversational machine reading that comprises a novel Explicit Memory Tracker (EMT) to track whether conditions listed in the rule text have already been satisfied to make a decision. Moreover, our framework generates clarification questions by adopting a coarse-to-fine reasoning strategy, utilizing sentence-level entailment scores to weight token-level distributions. On the ShARC benchmark (blind, held-out) testset, EMT achieves new state-of-the-art results of 74.6% micro-averaged decision accuracy and 49.5 BLEU4. We also show that EMT is more interpretable by visualizing the entailment-oriented reasoning process as the conversation flows. Code and models are released at https://github.com/Yifan-Gao/explicit_memory_tracker.",Conversational Reading|decision making|Explicit Tracker|Coarse-to-Fine Reasoning,Question Answering,Long,https://www.aclweb.org/anthology/2020.acl-main.88.pdf
-main.397,AMR Parsing with Latent Structural Information,Qiji Zhou|Yue Zhang|Donghong Ji|Hao Tang,"Abstract Meaning Representations (AMRs) capture sentence-level semantics structural representations to broad-coverage natural sentences. We investigate parsing AMR with explicit dependency structures and interpretable latent structures. We generate the latent soft structure without additional annotations, and fuse both dependency and latent structure via an extended graph neural networks. The fused structural information helps our experiments results to achieve the best reported results on both AMR 2.0 (77.5% Smatch F1 on LDC2017T10) and AMR 1.0 ((71.8% Smatch F1 on LDC2014T12).",parsing AMR|AMR Parsing|Abstract Representations|AMRs,Semantics: Sentence Level,Long,https://www.aclweb.org/anthology/2020.acl-main.397.pdf
-main.63,Towards Unsupervised Language Understanding and Generation by Joint Dual Learning,Shang-Yu Su|Chao-Wei Huang|Yun-Nung Chen,"In modular dialogue systems, natural language understanding (NLU) and natural language generation (NLG) are two critical components, where NLU extracts the semantics from the given texts and NLG is to construct corresponding natural language sentences based on the input semantic representations. However, the dual property between understanding and generation has been rarely explored. The prior work is the first attempt that utilized the duality between NLU and NLG to improve the performance via a dual supervised learning framework. However, the prior work still learned both components in a supervised manner; instead, this paper introduces a general learning framework to effectively exploit such duality, providing flexibility of incorporating both supervised and unsupervised learning algorithms to train language understanding and generation models in a joint fashion. The benchmark experiments demonstrate that the proposed approach is capable of boosting the performance of both NLU and NLG. The source code is available at: https://github.com/MiuLab/DuaLUG.",Unsupervised Understanding|Unsupervised Generation|natural understanding|natural generation,Dialogue and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.acl-main.63.pdf
-main.368,BERTRAM: Improved Word Embeddings Have Big Impact on Contextualized Model Performance,Timo Schick|Hinrich Schütze,"Pretraining deep language models has led to large performance gains in NLP. Despite this success, Schick and Schütze (2020) recently showed that these models struggle to understand rare words. For static word embeddings, this problem has been addressed by separately learning representations for rare words. In this work, we transfer this idea to pretrained language models: We introduce BERTRAM, a powerful architecture based on BERT that is capable of inferring high-quality embeddings for rare words that are suitable as input representations for deep language models. This is achieved by enabling the surface form and contexts of a word to interact with each other in a deep architecture. Integrating BERTRAM into BERT leads to large performance increases due to improved representations of rare and medium frequency words on both a rare word probing task and three downstream tasks.",NLP|rare task|BERTRAM|Word Embeddings,Semantics: Lexical,Long,https://www.aclweb.org/anthology/2020.acl-main.368.pdf
-main.77,Fine-grained Interest Matching for Neural News Recommendation,Heyuan Wang|Fangzhao Wu|Zheng Liu|Xing Xie,"Personalized news recommendation is a critical technology to improve users’ online news reading experience. The core of news recommendation is accurate matching between user's interests and candidate news. The same user usually has diverse interests that are reflected in different news she has browsed. Meanwhile, important semantic features of news are implied in text segments of different granularities. Existing studies generally represent each user as a single vector and then match the candidate news vector, which may lose fine-grained information for recommendation. In this paper, we propose FIM, a Fine-grained Interest Matching method for neural news recommendation. Instead of aggregating user's all historical browsed news into a unified vector, we hierarchically construct multi-level representations for each news via stacked dilated convolutions. Then we perform fine-grained matching between segment pairs of each browsed news and the candidate news at each semantic level. High-order salient signals are then identified by resembling the hierarchy of image recognition for final click prediction. Extensive experiments on a real-world dataset from MSN news validate the effectiveness of our model on news recommendation.",Fine-grained Matching|Neural Recommendation|Personalized recommendation|news recommendation,NLP Applications,Long,https://www.aclweb.org/anthology/2020.acl-main.77.pdf
-main.5,Dialogue State Tracking with Explicit Slot Connection Modeling,Yawen Ouyang|Moxin Chen|Xinyu Dai|Yinggong Zhao|Shujian Huang|Jiajun Chen,"Recent proposed approaches have made promising progress in dialogue state tracking (DST). However, in multi-domain scenarios, ellipsis and reference are frequently adopted by users to express values that have been mentioned by slots from other domains. To handle these phenomena, we propose a Dialogue State Tracking with Slot Connections (DST-SC) model to explicitly consider slot correlations across different domains. Given a target slot, the slot connecting mechanism in DST-SC can infer its source slot and copy the source slot value directly, thus significantly reducing the difficulty of learning and reasoning. Experimental results verify the benefits of explicit slot connection modeling, and our model achieves state-of-the-art performance on MultiWOZ 2.0 and MultiWOZ 2.1 datasets.",Dialogue Tracking|dialogue DST|learning reasoning|Explicit Modeling,Dialogue and Interactive Systems,Short,https://www.aclweb.org/anthology/2020.acl-main.5.pdf
-main.426,Modeling Label Semantics for Predicting Emotional Reactions,Radhika Gaonkar|Heeyoung Kwon|Mohaddeseh Bastan|Niranjan Balasubramanian|Nathanael Chambers,"Predicting how events induce emotions in the characters of a story is typically seen as a standard multi-label classification task, which usually treats labels as anonymous classes to predict. They ignore information that may be conveyed by the emotion labels themselves. We propose that the semantics of emotion labels can guide a model’s attention when representing the input story. Further, we observe that the emotions evoked by an event are often related: an event that evokes joy is unlikely to also evoke sadness. In this work, we explicitly model label classes via label embeddings, and add mechanisms that track label-label correlations both during training and inference. We also introduce a new semi-supervision strategy that regularizes for the correlations on unlabeled data. Our empirical evaluations show that modeling label semantics yields consistent benefits, and we advance the state-of-the-art on an emotion inference task.",Label Semantics|Predicting Reactions|multi-label task|training,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Short,https://www.aclweb.org/anthology/2020.acl-main.426.pdf
-main.340,Relation-Aware Collaborative Learning for Unified Aspect-Based Sentiment Analysis,Zhuang Chen|Tieyun Qian,"Aspect-based sentiment analysis (ABSA) involves three subtasks, i.e., aspect term extraction, opinion term extraction, and aspect-level sentiment classification. Most existing studies focused on one of these subtasks only. Several recent researches made successful attempts to solve the complete ABSA problem with a unified framework. However, the interactive relations among three subtasks are still under-exploited. We argue that such relations encode collaborative signals between different subtasks. For example, when the opinion term is ``delicious'', the aspect term must be ``food'' rather than ``place''. In order to fully exploit these relations, we propose a Relation-Aware Collaborative Learning (REMNLP) framework which allows the subtasks to work coordinately via the multi-task learning and relation propagation mechanisms in a stacked multi-layer network. Extensive experiments on three real-world datasets demonstrate that REMNLP significantly outperforms the state-of-the-art methods for the complete ABSA task.",Unified Analysis|Aspect-based analysis|aspect extraction|opinion extraction,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.acl-main.340.pdf
-main.354,Learning Implicit Text Generation via Feature Matching,Inkit Padhi|Pierre Dognin|Ke Bai|Cícero Nogueira dos Santos|Vijil Chenthamarakshan|Youssef Mroueh|Payel Das,"Generative feature matching network (GFMN) is an approach for training state-of-the-art implicit generative models for images by performing moment matching on features from pre-trained neural networks. In this paper, we present new GFMN formulations that are effective for sequential data. Our experimental results show the effectiveness of the proposed method, SeqGFMN, for three distinct generation tasks in English: unconditional text generation, class-conditional text generation, and unsupervised text style transfer. SeqGFMN is stable to train and outperforms various adversarial approaches for text generation and text style transfer.",Implicit Generation|generation tasks|unconditional generation|class-conditional generation,Generation,Short,https://www.aclweb.org/anthology/2020.acl-main.354.pdf
-main.432,Learning to Deceive with Attention-Based Explanations,Danish Pruthi|Mansi Gupta|Bhuwan Dhingra|Graham Neubig|Zachary C. Lipton,"Attention mechanisms are ubiquitous components in neural architectures applied to natural language processing. In addition to yielding gains in predictive accuracy, attention weights are often claimed to confer interpretability, purportedly useful both for providing insights to practitioners and for explaining why a model makes its decisions to stakeholders. We call the latter use of attention mechanisms into question by demonstrating a simple method for training models to produce deceptive attention masks. Our method diminishes the total weight assigned to designated impermissible tokens, even when the models can be shown to nevertheless rely on these features to drive predictions. Across multiple models and tasks, our approach manipulates attention weights while paying surprisingly little cost in accuracy. Through a human study, we show that our manipulated attention-based explanations deceive people into thinking that predictions from a model biased against gender minorities do not rely on the gender. Consequently, our results cast doubt on attention’s reliability as a tool for auditing algorithms in the context of fairness and accountability.",natural processing|Attention mechanisms|neural architectures|human study,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.acl-main.432.pdf
+UID,title,authors,abstract,keywords,track,paper_type,pdf_url,presentation_id
+main.1004,AnswerFact: Fact Checking in Product Question Answering,Wenxuan Zhang|Yang Deng|Jing Ma|Wai Lam,"Product-related question answering platforms nowadays are widely employed in many E-commerce sites, providing a convenient way for potential customers to address their concerns during online shopping. However, the misinformation in the answers on those platforms poses unprecedented challenges for users to obtain reliable and truthful product information, which may even cause a commercial loss in E-commerce business. To tackle this issue, we investigate to predict the veracity of answers in this paper and introduce AnswerFact, a large scale fact checking dataset from product question answering forums. Each answer is accompanied by its veracity label and associated evidence sentences, providing a valuable testbed for evidence-based fact checking tasks in QA settings. We further propose a novel neural model with tailored evidence ranking components to handle the concerned answer veracity prediction problem. Extensive experiments are conducted with our proposed model and various existing fact checking methods, showing that our method outperforms all baselines on this task.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.188,38938822
+main.1006,Knowledge-Grounded Dialogue Generation with Pre-trained Language Models,Xueliang Zhao|wei wu|Can Xu|Chongyang Tao|Dongyan Zhao|Rui Yan,"We study knowledge-grounded dialogue generation with pre-trained language models. To leverage the redundant external knowledge under capacity constraint, we propose equipping response generation defined by a pre-trained language model with a knowledge selection module, and an unsupervised approach to jointly optimizing knowledge selection and response generation with unlabeled dialogues. Empirical results on two benchmarks indicate that our model can significantly outperform state-of-the-art methods in both automatic evaluation and human judgment.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.272,38938823
+main.1009,BiST: Bi-directional Spatio-Temporal Reasoning for Video-Grounded Dialogues,Hung Le|Doyen Sahoo|Nancy Chen|Steven C.H. Hoi,"Video-grounded dialogues are very challenging due to (i) the complexity of videos which contain both spatial and temporal variations, and (ii) the complexity of user utterances which query different segments and/or different objects in videos over multiple dialogue turns. However, existing approaches to video-grounded dialogues often focus on superficial temporal-level visual cues, but neglect more fine-grained spatial signals from videos. To address this drawback, we proposed Bi-directional Spatio-Temporal Learning (BiST), a vision-language neural framework for high-resolution queries in videos based on textual cues. Specifically, our approach not only exploits both spatial and temporal-level information, but also learns dynamic information diffusion between the two feature spaces through spatial-to-temporal and temporal-to-spatial reasoning. The bidirectional strategy aims to tackle the evolving semantics of user queries in the dialogue setting. The retrieved visual cues are used as contextual information to construct relevant responses to the users. Our empirical results and comprehensive qualitative analysis show that BiST achieves competitive performance and generates reasonable responses on a large-scale AVSD benchmark. We also adapt our BiST models to the Video QA setting, and substantially outperform prior approaches on the TGIF-QA benchmark.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.145,38938824
+main.1010,A Knowledge-Aware Sequence-to-Tree Network for Math Word Problem Solving,Qinzhuo Wu|Qi Zhang|Jinlan Fu|Xuanjing Huang,"With the advancements in natural language processing tasks, math word problem solving has received increasing attention. Previous methods have achieved promising results but ignore background common-sense knowledge not directly provided by the problem. In addition, during generation, they focus on local features while neglecting global information. To incorporate external knowledge and global expression information, we propose a novel knowledge-aware sequence-to-tree (KA-S2T) network in which the entities in the problem sequences and their categories are modeled as an entity graph. Based on this entity graph, a graph attention network is used to capture knowledge-aware problem representations. Further, we use a tree-structured decoder with a state aggregation mechanism to capture the long-distance dependency and global expression information. Experimental results on the Math23K dataset revealed that the KA-S2T model can achieve better performance than previously reported best results.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.579,38938825
+main.1011,Knowledge Association with Hyperbolic Knowledge Graph Embeddings,Zequn Sun|Muhao Chen|Wei Hu|Chengming Wang|Jian Dai|Wei Zhang,"Capturing associations for knowledge graphs (KGs) through entity alignment, entity type inference and other related tasks benefits NLP applications with comprehensive knowledge representations. Recent related methods built on Euclidean embeddings are challenged by the hierarchical structures and different scales of KGs. They also depend on high embedding dimensions to realize enough expressiveness. Differently, we explore with low-dimensional hyperbolic embeddings for knowledge association. We propose a hyperbolic relational graph neural network for KG embedding and capture knowledge associations with a hyperbolic transformation. Extensive experiments on entity alignment and type inference demonstrate the effectiveness and efficiency of our method.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.460,38938826
+main.1012,UniConv: A Unified Conversational Neural Architecture for Multi-domain Task-oriented Dialogues,Hung Le|Doyen Sahoo|Chenghao Liu|Nancy Chen|Steven C.H. Hoi,"Building an end-to-end conversational agent for multi-domain task-oriented dialogues has been an open challenge for two main reasons. First, tracking dialogue states of multiple domains is non-trivial as the dialogue agent must obtain complete states from all relevant domains, some of which might have shared slots among domains as well as unique slots specifically for one domain only. Second, the dialogue agent must also process various types of information across domains, including dialogue context, dialogue states, and database, to generate natural responses to users. Unlike the existing approaches that are often designed to train each module separately, we propose ""UniConv"" - a novel unified neural architecture for end-to-end conversational systems in multi-domain task-oriented dialogues, which is designed to jointly train (i) a Bi-level State Tracker which tracks dialogue states by learning signals at both slot and domain level independently, and (ii) a Joint Dialogue Act and Response Generator which incorporates information from various input components and models dialogue acts and target responses simultaneously. We conduct comprehensive experiments in dialogue state tracking, context-to-text, and end-to-end settings on the MultiWOZ2.1 benchmark, achieving superior performance over competitive baselines.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.146,38938827
+main.1018,Exploring the Linear Subspace Hypothesis in Gender Bias Mitigation,Francisco Vargas|Ryan Cotterell,"Bolukbasi et al. (2016) presents one of the first gender bias mitigation techniques for word embeddings. Their method takes pre-trained word embeddings as input and attempts to isolate a linear subspace that captures most of the gender bias in the embeddings. As judged by an analogical evaluation task, their method virtually eliminates gender bias in the embeddings. However, an implicit and untested assumption of their method is that the bias subspace is actually linear. In this work, we generalize their method to a kernelized, non-linear version. We take inspiration from kernel principal component analysis and derive a non-linear bias isolation technique. We discuss and overcome some of the practical drawbacks of our method for non-linear gender bias mitigation in word embeddings and analyze empirically whether the bias subspace is actually linear. Our analysis shows that gender bias is in fact well captured by a linear subspace, justifying the assumption of Bolukbasi et al. (2016).",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.232,38938828
+main.1022,Context-Aware Answer Extraction in Question Answering,Yeon Seonwoo|Ji-Hoon Kim|Jung-Woo Ha|Alice Oh,"Extractive QA models have shown very promising performance in predicting the correct answer to a question for a given passage. However, they sometimes result in predicting the correct answer text but in a context irrelevant to the given question. This discrepancy becomes especially important as the number of occurrences of the answer text in a passage increases. To resolve this issue, we propose BLANC (BLock AttentioN for Context prediction) based on two main ideas: context prediction as an auxiliary task in multi-task learning manner, and a block attention method that learns the context prediction task. With experiments on reading comprehension, we show that BLANC outperforms the state-of-the-art QA models, and the performance gap increases as the number of answer text occurrences increases. We also conduct an experiment of training the models using SQuAD and predicting the supporting facts on HotpotQA and show that BLANC outperforms all baseline models in this zero-shot setting.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.189,38938829
+main.1023,Few-Shot Learning for Opinion Summarization,Arthur Bražinskas|Mirella Lapata|Ivan Titov,"Opinion summarization is the automatic creation of text reflecting subjective information expressed in multiple documents, such as user reviews of a product. The task is practically important and has attracted a lot of attention. However, due to the high cost of summary production, datasets large enough for training supervised models are lacking. Instead, the task has been traditionally approached with extractive methods that learn to select text fragments in an unsupervised or weakly-supervised way. Recently, it has been shown that abstractive summaries, potentially more fluent and better at reflecting conflicting information, can also be produced in an unsupervised fashion. However, these models, not being exposed to actual summaries, fail to capture their essential properties. In this work, we show that even a handful of summaries is sufficient to bootstrap generation of the summary text with all expected properties, such as writing style, informativeness, fluency, and sentiment preservation. We start by training a conditional Transformer language model to generate a new product review given other available reviews of the product. The model is also conditioned on review properties that are directly related to summaries; the properties are derived from reviews with no manual effort. In the second stage, we fine-tune a plug-in module that learns to predict property values on a handful of summaries. This lets us switch the generator to the summarization mode. We show on Amazon and Yelp datasets that our approach substantially outperforms previous extractive and abstractive methods in automatic and human evaluation.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.337,38938830
+main.1024,The Importance of Fillers for Text Representations of Speech Transcripts,Tanvi Dinkar|Pierre Colombo|Matthieu Labeau|Chloé Clavel,"While being an essential component of spoken language, fillers (e.g. ""um"" or ""uh"") often remain overlooked in Spoken Language Understanding (SLU) tasks. We explore the possibility of representing them with deep contextualised embeddings, showing improvements on modelling spoken language and two downstream tasks --- predicting a speaker's stance and expressed confidence.",,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.641,38938831
+main.1030,STL-CQA: Structure-based Transformers with Localization and Encoding for Chart Question Answering,Hrituraj Singh|Sumit Shekhar,"Chart Question Answering (CQA) is the task of answering natural language questions about visualisations in the chart image. Recent solutions, inspired by VQA approaches, rely on image-based attention for question/answering while ignoring the inherent chart structure. We propose STL-CQA which improves the question/answering through sequential elements localization, question encoding and then, a structural transformer-based learning approach. We conduct extensive experiments while proposing pre-training tasks, methodology and also an improved dataset with more complex and balanced questions of different types. The proposed methodology shows a significant accuracy improvement compared to the state-of-the-art approaches on various chart Q/A datasets, while outperforming even human baseline on the DVQA Dataset. We also demonstrate interpretability while examining different components in the inference pipeline.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.264,38938832
+main.1032,MultiCQA: Zero-Shot Transfer of Self-Supervised Text Matching Models on a Massive Scale,Andreas Rücklé|Jonas Pfeiffer|Iryna Gurevych,"We study the zero-shot transfer capabilities of text matching models on a massive scale, by self-supervised training on 140 source domains from community question answering forums in English. We investigate the model performances on nine benchmarks of answer selection and question similarity tasks, and show that all 140 models transfer surprisingly well, where the large majority of models substantially outperforms common IR baselines. We also demonstrate that considering a broad selection of source domains is crucial for obtaining the best zero-shot transfer performances, which contrasts the standard procedure that merely relies on the largest and most similar domains. In addition, we extensively study how to best combine multiple source domains. We propose to incorporate self-supervised with supervised multi-task learning on all available source domains. Our best zero-shot transfer model considerably outperforms in-domain BERT and the previous state of the art on six benchmarks. Fine-tuning of our model with in-domain data results in additional large gains and achieves the new state of the art on all nine benchmarks.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.194,38938833
+main.1046,Local Additivity Based Data Augmentation for Semi-supervised NER,Jiaao Chen|Zhenghui Wang|Ran Tian|Zichao Yang|Diyi Yang,"Named Entity Recognition (NER) is one of the first stages in deep language understanding yet current NER models heavily rely on human-annotated data. In this work, to alleviate the dependence on labeled data, we propose a Local Additivity based Data Augmentation (LADA) method for semi-supervised NER, in which we create virtual samples by interpolating sequences close to each other. Our approach has two variations: Intra-LADA and Inter-LADA, where Intra-LADA performs interpolations among tokens within one sentence, and Inter-LADA samples different sentences to interpolate. Through linear additions between sampled training data, LADA creates an infinite amount of labeled data and improves both entity and context learning. We further extend LADA to the semi-supervised setting by designing a novel consistency loss for unlabeled data. Experiments conducted on two NER benchmarks demonstrate the effectiveness of our methods over several strong baselines. We have publicly released our code at https://github.com/GT-SALT/LADA",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.95,38938834
+main.1049,ToTTo: A Controlled Table-To-Text Generation Dataset,Ankur Parikh|Xuezhi Wang|Sebastian Gehrmann|Manaal Faruqui|Bhuwan Dhingra|Diyi Yang|Dipanjan Das,"We present ToTTo, an open-domain English table-to-text dataset with over 120,000 training examples that proposes a controlled generation task: given a Wikipedia table and a set of highlighted table cells, produce a one-sentence description. To obtain generated targets that are natural but also faithful to the source table, we introduce a dataset construction process where annotators directly revise existing candidate sentences from Wikipedia. We present systematic analyses of our dataset and annotation process as well as results achieved by several state-of-the-art baselines. While usually fluent, existing methods often hallucinate phrases that are not supported by the table, suggesting that this dataset can serve as a useful research benchmark for high-precision conditional text generation.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.89,38938835
+main.1052,Asking without Telling: Exploring Latent Ontologies in Contextual Representations,Julian Michael|Jan A. Botha|Ian Tenney,"The success of pretrained contextual encoders, such as ELMo and BERT, has brought a great deal of interest in what these models learn: do they, without explicit supervision, learn to encode meaningful notions of linguistic structure? If so, how is this structure encoded? To investigate this, we introduce latent subclass learning (LSL): a modification to classifier-based probing that induces a latent categorization (or ontology) of the probe's inputs. Without access to fine-grained gold labels, LSL extracts emergent structure from input representations in an interpretable and quantifiable form. In experiments, we find strong evidence of familiar categories, such as a notion of personhood in ELMo, as well as novel ontological distinctions, such as a preference for fine-grained semantic roles on core arguments. Our results provide unique new evidence of emergent structure in pretrained encoders, including departures from existing annotations which are inaccessible to earlier methods.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.552,38938836
+main.106,The Role of Context in Neural Pitch Accent Detection in English,Elizabeth Nielsen|Mark Steedman|Sharon Goldwater,"Prosody is a rich information source in natural language, serving as a marker for phenomena such as contrast. In order to make this information available to downstream tasks, we need a way to detect prosodic events in speech. We propose a new model for pitch accent detection, inspired by the work of Stehwien et al. (2018), who presented a CNN-based model for this task. Our model makes greater use of context by using full utterances as input and adding an LSTM layer. We find that these innovations lead to an improvement from 87.5% to 88.7% accuracy on pitch accent detection on American English speech in the Boston University Radio News Corpus, a state-of-the-art result. We also find that a simple baseline that just predicts a pitch accent on every content word yields 82.2% accuracy, and we suggest that this is the appropriate baseline for this task. Finally, we conduct ablation tests that show pitch is the most important acoustic feature for this task and this corpus.",,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.642,38938650
+main.1061,Alignment-free Cross-lingual Semantic Role Labeling,Rui Cai|Mirella Lapata,"Cross-lingual semantic role labeling (SRL) aims at leveraging resources in a source language to minimize the effort required to construct annotations or models for a new target language. Recent approaches rely on word alignments, machine translation engines, or preprocessing tools such as parsers or taggers. We propose a cross-lingual SRL model which only requires annotations in a source language and access to raw text in the form of a parallel corpus. The backbone of our model is an LSTM-based semantic role labeler jointly trained with a semantic role compressor and multilingual word embeddings. The compressor collects useful information from the output of the semantic role labeler, filtering noisy and conflicting evidence. It lives in a multilingual embedding space and provides direct supervision for predicting semantic roles in the target language. Results on the Universal Proposition Bank and manually annotated datasets show that our method is highly effective, even against systems utilizing supervised features.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.319,38938837
+main.1070,SynSetExpan: An Iterative Framework for Joint Entity Set Expansion and Synonym Discovery,Jiaming Shen|Wenda Qiu|Jingbo Shang|Michelle Vanni|Xiang Ren|Jiawei Han,"Entity set expansion and synonym discovery are two critical NLP tasks. Previous studies accomplish them separately, without exploring their interdependencies. In this work, we hypothesize that these two tasks are tightly coupled because two synonymous entities tend to have a similar likelihood of belonging to various semantic classes. This motivates us to design SynSetExpan, a novel framework that enables two tasks to mutually enhance each other. SynSetExpan uses a synonym discovery model to include popular entities' infrequent synonyms into the set, which boosts the set expansion recall. Meanwhile, the set expansion model, being able to determine whether an entity belongs to a semantic class, can generate pseudo training data to fine-tune the synonym discovery model towards better accuracy. To facilitate the research on studying the interplays of these two tasks, we create the first large-scale Synonym-Enhanced Set Expansion (SE2) dataset via crowdsourcing. Extensive experiments on the SE2 dataset and previous benchmarks demonstrate the effectiveness of SynSetExpan for both entity set expansion and synonym discovery tasks.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.666,38938838
+main.1071,Learning to Represent Image and Text with Denotation Graphs,Bowen Zhang|Hexiang Hu|Vihan Jain|Eugene Ie|Fei Sha,"Learning to fuse vision and language information and representing them is an important research problem with many applications. Recent progresses have leveraged the ideas of pre-training (from language modeling) and attention layers in Transformers to learn representation from datasets containing images aligned with linguistic expressions that describe the images. In this paper, we propose learning representations from a set of implied, visually grounded expressions between image and text, automatically mined from those datasets. In particular, we use denotation graphs to represent how specific concepts (such as sentences describing images) can be linked to abstract and generic concepts (such as short phrases) that are also visually grounded. This type of generic-to-specific relations can be discovered using linguistic analysis tools. We propose methods to incorporate such relations into learning representation. We show that state-of-the-art multimodal learning models can be further improved by leveraging automatically harvested structural relations. The representations lead to stronger empirical results on downstream tasks of cross-modal image retrieval, referring expression, and compositional attribute-object recognition. Both our codes and the extracted denotation graphs on the Flickr30K and the COCO datasets are publically available on https://sha-lab.github.io/DG.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.60,38938839
+main.108,How Much Knowledge Can You Pack into the Parameters of a Language Model?,Adam Roberts|Colin Raffel|Noam Shazeer,"It has recently been observed that neural language models trained on unstructured text can implicitly store and retrieve knowledge using natural language queries. In this short paper, we measure the practical utility of this approach by fine-tuning pre-trained models to answer questions without access to any external context or knowledge. We show that this approach scales with model size and performs competitively with open-domain systems that explicitly retrieve answers from an external knowledge source when answering questions. To facilitate reproducibility and future work, we release our code and trained models.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.437,38938651
+main.1085,Video2Commonsense: Generating Commonsense Descriptions to Enrich Video Captioning,Zhiyuan Fang|Tejas Gokhale|Pratyay Banerjee|Chitta Baral|Yezhou Yang,"Captioning is a crucial and challenging task for video understanding. In videos that involve active agents such as humans, the agent's actions can bring about myriad changes in the scene. Observable changes such as movements, manipulations, and transformations of the objects in the scene, are reflected in conventional video captioning. Unlike images, actions in videos are also inherently linked to social aspects such as intentions (why the action is taking place), effects (what changes due to the action), and attributes that describe the agent. Thus for video understanding, such as when captioning videos or when answering questions about videos, one must have an understanding of these commonsense aspects. We present the first work on generating \textit{commonsense} captions directly from videos, to describe latent aspects such as intentions, effects, and attributes. We present a new dataset ``Video-to-Commonsense (V2C)"" that contains $\sim9k$ videos of human agents performing various actions, annotated with 3 types of commonsense descriptions. Additionally we explore the use of open-ended video-based commonsense question answering (V2C-QA) as a way to enrich our captions. Both the generation task and the QA task can be used to enrich video captions.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.61,38938840
+main.1086,Table Fact Verification with Structure-Aware Transformer,Hongzhi Zhang|Yingyao Wang|Sirui Wang|Xuezhi Cao|Fuzheng Zhang|Zhongyuan Wang,"Verifying fact on semi-structured evidence like tables requires the ability to encode structural information and perform symbolic reasoning. Pre-trained language models trained on natural language could not be directly applied to encode tables, because simply linearizing tables into sequences will lose the cell alignment information. To better utilize pre-trained transformers for table representation, we propose a Structure-Aware Transformer (SAT), which injects the table structural information into the mask of the self-attention layer. A method to combine symbolic and linguistic reasoning is also explored for this task. Our method outperforms baseline with 4.93% on TabFact, a large scale table verification dataset.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.126,38938841
+main.1091,"Diverse, Controllable, and Keyphrase-Aware: A Corpus and Method for News Multi-Headline Generation",Dayiheng Liu|Yeyun Gong|Yu Yan|Jie Fu|Bo Shao|Daxin Jiang|Jiancheng Lv|Nan Duan,"News headline generation aims to produce a short sentence to attract readers to read the news. One news article often contains multiple keyphrases that are of interest to different users, which can naturally have multiple reasonable headlines. However, most existing methods focus on the single headline generation. In this paper, we propose generating multiple headlines with keyphrases of user interests, whose main idea is to generate multiple keyphrases of interest to users for the news first, and then generate multiple keyphrase-relevant headlines. We propose a multi-source Transformer decoder, which takes three sources as inputs: (a) keyphrase, (b) keyphrase-filtered article, and (c) original article to generate keyphrase-relevant, high-quality, and diverse headlines. Furthermore, we propose a simple and effective method to mine the keyphrases of interest in the news article and build a first large-scale keyphrase-aware news headline corpus, which contains over 180K aligned triples of . Extensive experimental comparisons on the real-world dataset show that the proposed method achieves state-of-the-art results in terms of quality and diversity.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.505,38938842
+main.110,Towards Medical Machine Reading Comprehension with Structural Knowledge and Plain Text,Dongfang Li|Baotian Hu|Qingcai Chen|Weihua Peng|Anqi Wang,"Machine reading comprehension (MRC) has achieved significant progress on the open domain in recent years, mainly due to large-scale pre-trained language models. However, it performs much worse in specific domains such as the medical field due to the lack of extensive training data and professional structural knowledge neglect. As an effort, we first collect a large scale medical multi-choice question dataset (more than 21k instances) for the National Licensed Pharmacist Examination in China. It is a challenging medical examination with a passing rate of less than 14.2% in 2018.
Then we propose a novel reading comprehension model KMQA, which can fully exploit the structural medical knowledge (i.e., medical knowledge graph) and the reference medical plain text (i.e., text snippets retrieved from reference books). The experimental results indicate that the KMQA outperforms existing competitive models with a large margin and passes the exam with 61.8% accuracy rate on the test set.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.111,38938652 +main.1100,Iterative Domain-Repaired Back-Translation,Hao-Ran Wei|Zhirui Zhang|Boxing Chen|Weihua Luo,"In this paper, we focus on the domain-specific translation with low resources, where in-domain parallel corpora are scarce or nonexistent. One common and effective strategy for this case is exploiting in-domain monolingual data with the back-translation method. However, the synthetic parallel data is very noisy because they are generated by imperfect out-of-domain systems, resulting in the poor performance of domain adaptation. To address this issue, we propose a novel iterative domain-repaired back-translation framework, which introduces the Domain-Repair (DR) model to refine translations in synthetic bilingual data. To this end, we construct corresponding data for the DR model training by round-trip translating the monolingual sentences, and then design the unified training framework to optimize paired DR and NMT models jointly. Experiments on adapting NMT models between specific domains and from the general domain to specific domains demonstrate the effectiveness of our proposed approach, achieving 15.79 and 4.47 BLEU improvements on average over unadapted models and back-translation.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.474,38938843 +main.1103,GLUCOSE: GeneraLized and COntextualized Story Explanations,Nasrin Mostafazadeh|Aditya Kalyanpur|Lori Moon|David Buchanan|Lauren Berkowitz|Or Biran|Jennifer Chu-Carroll,"When humans read or listen, they make implicit commonsense inferences that frame their understanding of what happened and why. As a step toward AI systems that can build similar mental models, we introduce GLUCOSE, a large-scale dataset of implicit commonsense causal knowledge, encoded as causal mini-theories about the world, each grounded in a narrative context. To construct GLUCOSE, we drew on cognitive psychology to identify ten dimensions of causal explanation, focusing on events, states, motivations, and emotions. Each GLUCOSE entry includes a story-specific causal statement paired with an inference rule generalized from the statement. This paper details two concrete contributions. First, we present our platform for effectively crowdsourcing GLUCOSE data at scale, which uses semi-structured templates to elicit causal explanations. Using this platform, we collected a total of ~670K specific statements and general rules that capture implicit commonsense knowledge about everyday situations. Second, we show that existing knowledge resources and pretrained language models do not include or readily predict GLUCOSE's rich inferential content. 
However, when state-of-the-art neural models are trained on this knowledge, they can start to make commonsense inferences on unseen stories that match humans' mental models.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.370,38938844 +main.1107,Grounded Adaptation for Zero-shot Executable Semantic Parsing,Victor Zhong|Mike Lewis|Sida I. Wang|Luke Zettlemoyer,"We propose Grounded Adaptation for Zeroshot Executable Semantic Parsing (GAZP) to adapt an existing semantic parser to new environments (e.g. new database schemas). GAZP combines a forward semantic parser with a backward utterance generator to synthesize data (e.g. utterances and SQL queries) in the new environment, then selects cycle-consistent examples to adapt the parser. Unlike data-augmentation, which typically synthesizes unverified examples in the training environment, GAZP synthesizes examples in the new environment whose input-output consistency are verified through execution. On the Spider, Sparc, and CoSQL zero-shot semantic parsing tasks, GAZP improves logical form and execution accuracy of the baseline parser. Our analyses show that GAZP outperforms data-augmentation in the training environment, performance increases with the amount of GAZP-synthesized data, and cycle-consistency is central to successful adaptation.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.558,38938845 +main.1113,Room-Across-Room: Multilingual Vision-and-Language Navigation with Dense Spatiotemporal Grounding,Alexander Ku|Peter Anderson|Roma Patel|Eugene Ie|Jason Baldridge,"We introduce Room-Across-Room (RxR), a new Vision-and-Language Navigation (VLN) dataset. RxR is multilingual (English, Hindi, and Telugu) and larger (more paths and instructions) than other VLN datasets. It emphasizes the role of language in VLN by addressing known biases in paths and eliciting more references to visible entities. Furthermore, each word in an instruction is time-aligned to the virtual poses of instruction creators and validators. We establish baseline scores for monolingual and multilingual settings and multitask learning when including Room-to-Room annotations (Anderson et al., 2018). We also provide results for a model that learns from synchronized pose traces by focusing only on portions of the panorama attended to in human demonstrations. The size, scope and detail of RxR dramatically expands the frontier for research on embodied language agents in photorealistic simulated environments.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.356,38938846 +main.1116,Joint Constrained Learning for Event-Event Relation Extraction,Haoyu Wang|Muhao Chen|Hongming Zhang|Dan Roth,"Understanding natural language involves recognizing how multiple event mentions structurally and temporally interact with each other. In this process, one can induce event complexes that organize multi-granular events with temporal order and membership relations interweaving among them. Due to the lack of jointly labeled data for these relational phenomena and the restriction on the structures they articulate, we propose a joint constrained learning framework for modeling event-event relations. 
Specifically, the framework enforces logical constraints within and across multiple temporal and subevent relations of events by converting these constraints into differentiable learning objectives. We show that our joint constrained learning approach effectively compensates for the lack of jointly labeled data, and outperforms SOTA methods on benchmarks for both temporal relation extraction and event hierarchy construction, replacing a commonly used but more expensive global inference process. We also present a promising case study to show the effectiveness of our approach to inducing event complexes on an external corpus.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.51,38938847 +main.1123,Multi-hop Inference for Question-driven Summarization,Yang Deng|Wenxuan Zhang|Wai Lam,"Question-driven summarization has been recently studied as an effective approach to summarizing the source document to produce concise but informative answers for non-factoid questions. In this work, we propose a novel question-driven abstractive summarization method, Multi-hop Selective Generator (MSG), to incorporate multi-hop reasoning into question-driven summarization and, meanwhile, provide justifications for the generated summaries. Specifically, we jointly model the relevance to the question and the interrelation among different sentences via a human-like multi-hop inference module, which captures important sentences for justifying the summarized answer. A gated selective pointer generator network with a multi-view coverage mechanism is designed to integrate diverse information from different perspectives. Experimental results show that the proposed method consistently outperforms state-of-the-art methods on two non-factoid QA datasets, namely WikiHow and PubMedQA.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.547,38938848 +main.1129,Chapter Captor: Text Segmentation in Novels,Charuta Pethe|Allen Kim|Steve Skiena,"Books are typically segmented into chapters and sections, representing coherent sub-narratives and topics. We investigate the task of predicting chapter boundaries, as a proxy for the general task of segmenting long texts. We build a Project Gutenberg chapter segmentation data set of 9,126 English novels, using a hybrid approach combining neural inference and rule matching to recognize chapter title headers in books, achieving an F1-score of 0.77 on this task. Using this annotated data as ground truth after removing structural cues, we present cut-based and neural methods for chapter segmentation, achieving a F1-score of 0.453 on the challenging task of exact break prediction over book-length documents. Finally, we reveal interesting historical trends in the chapter structure of novels.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.672,38938849 +main.1130,Grounded Compositional Outputs for Adaptive Language Modeling,Nikolaos Pappas|Phoebe Mulcaire|Noah A. Smith,"Language models have emerged as a central component across NLP, and a great deal of progress depends on the ability to cheaply adapt them (e.g., through finetuning) to new domains and tasks. A language model's \emph{vocabulary}---typically selected before training and permanently fixed later---affects its size and is part of what makes it resistant to such adaptation. Prior work has used compositional input embeddings based on surface forms to ameliorate this issue. 
In this work, we go one step beyond and propose a fully compositional output embedding layer for language models, which is further grounded in information from a structured lexicon (WordNet), namely semantically related words and free-text definitions. To our knowledge, the result is the first word-level language model with a size that does not depend on the training vocabulary. We evaluate the model on conventional language modeling as well as challenging cross-domain settings with an open vocabulary, finding that it matches or outperforms previous state-of-the-art output embedding methods and adaptation approaches. Our analysis attributes the improvements to sample efficiency: our model is more accurate for low-frequency words.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.96,38938850 +main.1135,What Time Is It? Temporal Analysis of Novels,Allen Kim|Charuta Pethe|Steve Skiena,"Recognizing the flow of time in a story is a crucial aspect of understanding it. Prior work related to time has primarily focused on identifying temporal expressions or relative sequencing of events, but here we propose computationally annotating each line of a book with wall clock times, even in the absence of explicit time-descriptive phrases. To do so, we construct a data set of hourly time phrases from 52,183 fictional books. We then construct a time-of-day classification model that achieves an average error of 2.27 hours. Furthermore, we show that by analyzing a book in whole using dynamic programming of breakpoints, we can roughly partition a book into segments that each correspond to a particular time-of-day. This approach improves upon baselines by over two hour. Finally, we apply our model to a corpus of literature categorized by different periods in history, to show interesting trends of hourly activity throughout the past. Among several observations we find that the fraction of events taking place past 10 P.M jumps past 1880 - coincident with the advent of the electric light bulb and city lights.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.730,38938851 +main.1140,GraphDialog: Integrating Graph Knowledge into End-to-End Task-Oriented Dialogue Systems,Shiquan Yang|Rui Zhang|Sarah Erfani,"End-to-end task-oriented dialogue systems aim to generate system responses directly from plain text inputs. There are two challenges for such systems: one is how to effectively incorporate external knowledge bases (KBs) into the learning framework; the other is how to accurately capture the semantics of dialogue history. In this paper, we address these two challenges by exploiting the graph structural information in the knowledge base and in the dependency parsing tree of the dialogue. To effectively leverage the structural information in dialogue history, we propose a new recurrent cell architecture which allows representation learning on graphs. To exploit the relations between entities in KBs, the model combines multi-hop reasoning ability based on the graph structure. 
Experimental results show that the proposed model achieves consistent improvement over state-of-the-art models on two different task-oriented dialogue datasets.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.147,38938852 +main.1141,BERT-enhanced Relational Sentence Ordering Network,Baiyun Cui|Yingming Li|Zhongfei Zhang,"In this paper, we introduce a novel BERT-enhanced Relational Sentence Ordering Network (referred to as BRSON) by leveraging BERT for capturing better dependency relationship among sentences to enhance the coherence modeling for the entire paragraph. In particular, we develop a new Relational Pointer Decoder (referred as RPD) by incorporating the relative ordering information into the pointer network with a Deep Relational Module (referred as DRM), which utilizes BERT to exploit the deep semantic connection and relative ordering between sentences.This enables us to strengthen both local and global dependencies among sentences. Extensive evaluations are conducted on six public datasets. The experimental results demonstrate the effectiveness and promise of our BRSON, showing a significant improvement over the state-of-the-art by a wide margin.",,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.511,38938853 +main.1146,Self-Induced Curriculum Learning in Self-Supervised Neural Machine Translation,Dana Ruiter|Josef van Genabith|Cristina España-Bonet,"Self-supervised neural machine translation (SSNMT) jointly learns to identify and select suitable training data from comparable (rather than parallel) corpora and to translate, in a way that the two tasks support each other in a virtuous circle. In this study, we provide an in-depth analysis of the sampling choices the SSNMT model makes during training. We show how, without it having been told to do so, the model self-selects samples of increasing (i) complexity and (ii) task-relevance in combination with (iii) performing a denoising curriculum. We observe that the dynamics of the mutual-supervision signals of both system internal representation types are vital for the extraction and translation performance. We show that in terms of the Gunning-Fog Readability index, SSNMT starts extracting and learning from Wikipedia data suitable for high school students and quickly moves towards content suitable for first year undergraduate students.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.202,38938854 +main.1159,Learning from Context or Names? An Empirical Study on Neural Relation Extraction,Hao Peng|Tianyu Gao|Xu Han|Yankai Lin|Peng Li|Zhiyuan Liu|Maosong Sun|Jie Zhou,"Neural models have achieved remarkable success on relation extraction (RE) benchmarks. However, there is no clear understanding what information in text affects existing RE models to make decisions and how to further improve the performance of these models. To this end, we empirically study the effect of two main information sources in text: textual context and entity mentions (names). We find that (i) while context is the main source to support the predictions, RE models also heavily rely on the information from entity mentions, most of which is type information, and (ii) existing datasets may leak shallow heuristics via entity mentions and thus contribute to the high performance on RE benchmarks. 
Based on the analyses, we propose an entity-masked contrastive pre-training framework for RE to gain a deeper understanding on both textual context and type information while avoiding rote memorization of entities or use of superficial cues in mentions. We carry out extensive experiments to support our views, and show that our framework can improve the effectiveness and robustness of neural models in different RE scenarios. All the code and datasets are released at https://github.com/thunlp/RE-Context-or-Names.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.298,38938855 +main.1179,Conversational Semantic Parsing,Armen Aghajanyan|Jean Maillard|Akshat Shrivastava|Keith Diedrick|Michael Haeger|Haoran Li|Yashar Mehdad|Veselin Stoyanov|Anuj Kumar|Mike Lewis|Sonal Gupta,"The structured representation for semantic parsing in task-oriented assistant systems is geared towards simple understanding of one-turn queries. Due to the limitations of the representation, the session-based properties such as co-reference resolution and context carryover are processed downstream in a pipelined system. In this paper, we propose a semantic representation for such task-oriented conversational systems that can represent concepts such as co-reference and context carryover, enabling comprehensive understanding of queries in a session. We release a new session-based, compositional task-oriented parsing dataset of 20k sessions consisting of 60k utterances. Unlike Dialog State Tracking Challenges, the queries in the dataset have compositional forms. We propose a new family of Seq2Seq models for the session-based parsing above, which also set state-of-the-art in ATIS, SNIPS, TOP and DSTC2. Notably, we improve the best known results on DSTC2 by up to 5 points for slot-carryover.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.408,38938856 +main.1180,Public Sentiment Drift Analysis Based on Hierarchical Variational Auto-encoder,Wenyue Zhang|Xiaoli Li|Yang Li|Suge Wang|Deyu Li|Jian Liao|Jianxing Zheng,"Detecting public sentiment drift is a challenging task due to sentiment change over time. Existing methods first build a classification model using historical data and subsequently detect drift if the model performs much worse on new data. In this paper, we focus on distribution learning by proposing a novel Hierarchical Variational Auto-Encoder (HVAE) model to learn better distribution representation, and design a new drift measure to directly evaluate distribution changes between historical data and new data.Our experimental results demonstrate that our proposed model achieves better results than three existing state-of-the-art methods.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.307,38938857 +main.1187,Neural Deepfake Detection with Factual Structure of Text,Wanjun Zhong|Duyu Tang|Zenan Xu|Ruize Wang|Nan Duan|Ming Zhou|Jiahai Wang|Jian Yin,"Deepfake detection, the task of automatically discriminating machine-generated text, is increasingly critical with recent advances in natural language generative models. Existing approaches to deepfake detection typically represent documents with coarse-grained representations. However, they struggle to capture factual structures of documents, which is a discriminative factor between machine-generated and human-written text according to our statistical analysis. 
To address this, we propose a graph-based model that utilizes the factual structure of a document for deepfake detection of text. Our approach represents the factual structure of a given document as an entity graph, which is further utilized to learn sentence representations with a graph neural network. Sentence representations are then composed to a document representation for making predictions, where consistent relations between neighboring sentences are sequentially modeled. Results of experiments on two public deepfake datasets show that our approach significantly improves strong base models built with RoBERTa. Model analysis further indicates that our model can distinguish the difference in the factual structure between machine-generated text and human-written text.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.193,38938858 +main.1191,Towards Interpretable Reasoning over Paragraph Effects in Situation,Mucheng Ren|Xiubo Geng|Tao QIN|Heyan Huang|Daxin Jiang,"We focus on the task of reasoning over paragraph effects in situation, which requires a model to understand the cause and effect described in a background paragraph, and apply the knowledge to a novel situation. Existing works ignore the complicated reasoning process and solve it with a one-step ""black box"" model. Inspired by human cognitive processes, in this paper we propose a sequential approach for this task which explicitly models each step of the reasoning process with neural network modules. In particular, five reasoning modules are designed and learned in an end-to-end manner, which leads to a more interpretable model. Experimental results on the ROPES dataset demonstrate the effectiveness and explainability of our proposed approach.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.548,38938859 +main.1196,Learning to Contrast the Counterfactual Samples for Robust Visual Question Answering,Zujie Liang|Weitao Jiang|Haifeng Hu|Jiaying Zhu,"In the task of Visual Question Answering (VQA), most state-of-the-art models tend to learn spurious correlations in the training set and achieve poor performance in out-of-distribution test data. Some methods of generating counterfactual samples have been proposed to alleviate this problem. However, the counterfactual samples generated by most previous methods are simply added to the training data for augmentation and are not fully utilized. Therefore, we introduce a novel self-supervised contrastive learning mechanism to learn the relationship between original samples, factual samples and counterfactual samples. With the better cross-modal joint embeddings learned from the auxiliary training objective, the reasoning capability and robustness of the VQA model are boosted significantly. We evaluate the effectiveness of our method by surpassing current state-of-the-art models on the VQA-CP dataset, a diagnostic benchmark for assessing the VQA model's robustness.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.265,38938860 +main.1201,TOD-BERT: Pre-trained Natural Language Understanding for Task-Oriented Dialogue,Chien-Sheng Wu|Steven C.H. Hoi|Richard Socher|Caiming Xiong,"The underlying difference of linguistic patterns between general text and task-oriented dialogue makes existing pre-trained language models less useful in practice. 
In this work, we unify nine human-human and multi-turn task-oriented dialogue datasets for language modeling. To better model dialogue behavior during pre-training, we incorporate user and system tokens into the masked language modeling. We propose a contrastive objective function to simulate the response selection task. Our pre-trained task-oriented dialogue BERT (TOD-BERT) outperforms strong baselines like BERT on four downstream task-oriented dialogue applications, including intention recognition, dialogue state tracking, dialogue act prediction, and response selection. We also show that TOD-BERT has a stronger few-shot ability that can mitigate the data scarcity problem for task-oriented dialogue.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.66,38938861 +main.1205,Top-Rank-Focused Adaptive Vote Collection for the Evaluation of Domain-Specific Semantic Models,Pierangelo Lombardo|Alessio Boiardi|Luca Colombo|Angelo Schiavone|Nicolò Tamagnone,"The growth of domain-specific applications of semantic models, boosted by the recent achievements of unsupervised embedding learning algorithms, demands domain-specific evaluation datasets. In many cases, content-based recommenders being a prime example, these models are required to rank words or texts according to their semantic relatedness to a given concept, with particular focus on top ranks. In this work, we give a threefold contribution to address these requirements: (i) we define a protocol for the construction, based on adaptive pairwise comparisons, of a relatedness-based evaluation dataset tailored on the available resources and optimized to be particularly accurate in top-rank evaluation; (ii) we define appropriate metrics, extensions of well-known ranking correlation coefficients, to evaluate a semantic model via the aforementioned dataset by taking into account the greater significance of top ranks. Finally, (iii) we define a stochastic transitivity model to simulate semantic-driven pairwise comparisons, which confirms the effectiveness of the proposed dataset construction protocol.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.249,38938862 +main.1208,Lifelong Language Knowledge Distillation,Yung-Sung Chuang|Shang-Yu Su|Yun-Nung Chen,"It is challenging to perform lifelong language learning (LLL) on a stream of different tasks without any performance degradation comparing to the multi-task counterparts. To address this issue, we present Lifelong Language Knowledge Distillation (L2KD), a simple but efficient method that can be easily applied to existing LLL architectures in order to mitigate the degradation. Specifically, when the LLL model is trained on a new task, we assign a teacher model to first learn the new task, and pass the knowledge to the LLL model via knowledge distillation. Therefore, the LLL model can better adapt to the new task while keeping the previously learned knowledge. 
Experiments show that the proposed L2KD consistently improves previous state-of-the-art models, and the degradation comparing to multi-task models in LLL tasks is well mitigated for both sequence generation and text classification tasks.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.233,38938863 +main.1210,KERMIT: Complementing Transformer Architectures with Encoders of Explicit Syntactic Interpretations,Fabio Massimo Zanzotto|Andrea Santilli|Leonardo Ranaldi|Dario Onorati|Pierfrancesco Tommasino|Francesca Fallucchi,"Syntactic parsers have dominated natural language understanding for decades. Yet, their syntactic interpretations are losing centrality in downstream tasks due to the success of large-scale textual representation learners. In this paper, we propose KERMIT (Kernel-inspired Encoder with Recursive Mechanism for Interpretable Trees) to embed symbolic syntactic parse trees into artificial neural networks and to visualize how syntax is used in inference. We experimented with KERMIT paired with two state-of-the-art transformer-based universal sentence encoders (BERT and XLNet) and we showed that KERMIT can indeed boost their performance by effectively embedding human-coded universal syntactic representations in neural networks",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.18,38938864 +main.1217,A Multi-Task Incremental Learning Framework with Category Name Embedding for Aspect-Category Sentiment Analysis,Zehui Dai|Cheng Peng|Huajie Chen|Yadong Ding,"(T)ACSA tasks, including aspect-category sentiment analysis (ACSA) and targeted aspect-category sentiment analysis (TACSA), aims at identifying sentiment polarity on predefined categories. Incremental learning on new categories is necessary for (T)ACSA real applications. Though current multi-task learning models achieve good performance in (T)ACSA tasks, they suffer from catastrophic forgetting problems in (T)ACSA incremental learning tasks. In this paper, to make multi-task learning feasible for incremental learning, we proposed Category Name Embedding network (CNE-net). We set both encoder and decoder shared among all categories to weaken the catastrophic forgetting problem. Besides the origin input sentence, we applied another input feature, i.e., category name, for task discrimination. Our model achieved state-of-the-art on two (T)ACSA benchmark datasets. Furthermore, we proposed a dataset for (T)ACSA incremental learning and achieved the best performance compared with other strong baselines.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.565,38938865 +main.1219,Incremental Processing in the Age of Non-Incremental Encoders: An Empirical Assessment of Bidirectional Models for Incremental NLU,Brielen Madureira|David Schlangen,"While humans process language incrementally, the best language encoders currently used in NLP do not. Both bidirectional LSTMs and Transformers assume that the sequence that is to be encoded is available in full, to be processed either forwards and backwards (BiLSTMs) or as a whole (Transformers). We investigate how they behave under incremental interfaces, when partial output must be provided based on partial input seen up to a certain time step, which may happen in interactive systems. We test five models on various NLU datasets and compare their performance using three incremental evaluation metrics. 
The results support the possibility of using bidirectional encoders in incremental mode while retaining most of their non-incremental quality. The ""omni-directional'' BERT model, which achieves better non-incremental performance, is impacted more by the incremental access. This can be alleviated by adapting the training regime (truncated training), or the testing procedure, by delaying the output until some right context is available or by incorporating hypothetical right contexts generated by a language model like GPT-2.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.26,38938866 +main.1220,Masking as an Efficient Alternative to Finetuning for Pretrained Language Models,Mengjie Zhao|Tao Lin|Fei Mi|Martin Jaggi|Hinrich Schütze,"We present an efficient method of utilizing pretrained language models, where we learn selective binary masks for pretrained weights in lieu of modifying them through finetuning. Extensive evaluations of masking BERT, RoBERTa, and DistilBERT on eleven diverse NLP tasks show that our masking scheme yields performance comparable to finetuning, yet has a much smaller memory footprint when several tasks need to be inferred. Intrinsic evaluations show that representations computed by our binary masked language models encode information necessary for solving downstream tasks. Analyzing the loss landscape, we show that masking and finetuning produce models that reside in minima that can be connected by a line segment with nearly constant test accuracy. This confirms that masking can be utilized as an efficient alternative to finetuning.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.174,38938867 +main.1225,Uncertainty-Aware Label Refinement for Sequence Labeling,Tao Gui|Jiacheng Ye|Qi Zhang|Zhengyan Li|Zichu Fei|Yeyun Gong|Xuanjing Huang,"Conditional random fields (CRF) for label decoding has become ubiquitous in sequence labeling tasks. However, the local label dependencies and inefficient Viterbi decoding have always been a problem to be solved. In this work, we introduce a novel two-stage label decoding framework to model long-term label dependencies, while being much more computationally efficient. A base model first predicts draft labels, and then a novel two-stream self-attention model makes refinements on these draft predictions based on long-range label dependencies, which can achieve parallel decoding for a faster prediction. In addition, in order to mitigate the side effects of incorrect draft labels, Bayesian neural networks are used to indicate the labels with a high probability of being wrong, which can greatly assist in preventing error propagation. The experimental results on three sequence labeling benchmarks demonstrated that the proposed method not only outperformed the CRF-based methods but also greatly accelerated the inference process.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.181,38938868 +main.1227,"PatchBERT: Just-in-Time, Out-of-Vocabulary Patching",Sangwhan Moon|Naoaki Okazaki,"Large scale pre-trained language models have shown groundbreaking performance improvements for transfer learning in the domain of natural language processing. In our paper, we study a pre-trained multilingual BERT model and analyze the OOV rate on downstream tasks, how it introduces information loss, and as a side-effect, obstructs the potential of the underlying model. 
We then propose multiple approaches for mitigation and demonstrate that it improves performance with the same parameter count when combined with fine-tuning.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.631,38938869 +main.1231,An Unsupervised Joint System for Text Generation from Knowledge Graphs and Semantic Parsing,Martin Schmitt|Sahand Sharifzadeh|Volker Tresp|Hinrich Schütze,"Knowledge graphs (KGs) can vary greatly from one domain to another. Therefore supervised approaches to both graph-to-text generation and text-to-graph knowledge extraction (semantic parsing) will always suffer from a shortage of domain-specific parallel graph-text data; at the same time, adapting a model trained on a different domain is often impossible due to little or no overlap in entities and relations. This situation calls for an approach that (1) does not need large amounts of annotated data and thus (2) does not need to rely on domain adaptation techniques to work well on different domains. To this end, we present the first approach to unsupervised text generation from KGs and show simultaneously how it can be used for unsupervised semantic parsing. We evaluate our approach on WebNLG v2.1 and a new benchmark leveraging scene graphs from Visual Genome. Our system outperforms strong baselines for both text<->graph conversion tasks without any manual adaptation from one dataset to the other. In additional experiments, we investigate the impact of using different unsupervised objectives.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.577,38938870 +main.1248,Towards Reasonably-Sized Character-Level Transformer NMT by Finetuning Subword Systems,Jindřich Libovický|Alexander Fraser,"Applying the Transformer architecture on the character level usually requires very deep architectures that are difficult and slow to train. These problems can be partially overcome by incorporating a segmentation into tokens in the model. We show that by initially training a subword model and then finetuning it on characters, we can obtain a neural machine translation model that works at the character level without requiring token segmentation. We use only the vanilla 6-layer Transformer Base architecture. Our character-level models better capture morphological phenomena and show more robustness to noise at the expense of somewhat worse overall translation quality. Our study is a significant step towards high-performance and easy to train character-based models that are not extremely large.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.203,38938871 +main.125,Authorship Attribution for Neural Text Generation,Adaku Uchendu|Thai Le|Kai Shu|Dongwon Lee,"In recent years, the task of generating realistic short and long texts have made tremendous advancements. In particular, several recently proposed neural network-based language models have demonstrated their astonishing capabilities to generate texts that are challenging to distinguish from human-written texts with the naked eye. Despite many benefits and utilities of such neural methods, in some applications, being able to tell the “author” of a text in question becomes critically important. In this work, in the context of this Turing Test, we investigate the so-called authorship attribution problem in three versions: (1) given two texts T1 and T2, are both generated by the same method or not? (2) is the given text T written by a human or machine? 
(3) given a text T and k candidate neural methods, can we single out the method (among k alternatives) that generated T? Against one human-written and eight machine-generated texts (i.e., CTRL, GPT, GPT2, GROVER, XLM, XLNET, PPLM, FAIR), we empirically experiment with the performance of various models in three problems. By and large, we find that most generators still generate texts significantly different from human-written ones, thereby making three problems easier to solve. However, the qualities of texts generated by GPT2, GROVER, and FAIR are better, often confusing machine classifiers in solving three problems. All codes and datasets of our experiments are available at: https://bit.ly/302zWdz",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.673,38938653 +main.1250,An Element-aware Multi-representation Model for Law Article Prediction,Huilin Zhong|Junsheng Zhou|Weiguang QU|Yunfei Long|Yanhui Gu,"Existing works have proved that using law articles as external knowledge can improve the performance of the Legal Judgment Prediction. However, they do not fully use law article information and most of the current work is only for single label samples. In this paper, we propose a Law Article Element-aware Multi-representation Model (LEMM), which can make full use of law article information and can be used for multi-label samples. The model uses the labeled elements of law articles to extract fact description features from multiple angles. It generates multiple representations of a fact for classification. Every label has a law-aware fact representation to encode more information. To capture the dependencies between law articles, the model also introduces a self-attention mechanism between multiple representations. Compared with baseline models like TopJudge, this model improves the accuracy of 5.84%, the macro F1 of 6.42%, and the micro F1 of 4.28%.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.540,38938872 +main.1258,Keep It Surprisingly Simple: A Simple First Order Graph Based Parsing Model for Joint Morphosyntactic Parsing in Sanskrit,Amrith Krishna|Ashim Gupta|Deepak Garasangi|Pavankumar Satuluri|Pawan Goyal,"Morphologically rich languages seem to benefit from joint processing of morphology and syntax, as compared to pipeline architectures. We propose a graph-based model for joint morphological parsing and dependency parsing in Sanskrit. Here, we extend the Energy based model framework (Krishna et al., 2020), proposed for several structured prediction tasks in Sanskrit, in 2 simple yet significant ways. First, the framework's default input graph generation method is modified to generate a multigraph, which enables the use of an exact search inference. Second, we prune the input search space using a linguistically motivated approach, rooted in the traditional grammatical analysis of Sanskrit. Our experiments show that the morphological parsing from our joint model outperforms standalone morphological parsers.
We report state of the art results in morphological parsing, and in dependency parsing, both in standalone (with gold morphological tags) and joint morphosyntactic parsing setting.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.388,38938873 +main.1262,Tell Me How to Ask Again: Question Data Augmentation with Controllable Rewriting in Continuous Space,Dayiheng Liu|Yeyun Gong|Jie Fu|Yu Yan|Jiusheng Chen|Jiancheng Lv|Nan Duan|Ming Zhou,"In this paper, we propose a novel data augmentation method, referred to as Controllable Rewriting based Question Data Augmentation (CRQDA), for machine reading comprehension (MRC), question generation, and question-answering natural language inference tasks. We treat the question data augmentation task as a constrained question rewriting problem to generate context-relevant, high-quality, and diverse question data samples. CRQDA utilizes a Transformer Autoencoder to map the original discrete question into a continuous embedding space. It then uses a pre-trained MRC model to revise the question representation iteratively with gradient-based optimization. Finally, the revised question representations are mapped back into the discrete space, which serve as additional question data. Comprehensive experiments on SQuAD 2.0, SQuAD 1.1 question generation, and QNLI tasks demonstrate the effectiveness of CRQDA.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.467,38938874 +main.1263,Transfer Learning and Distant Supervision for Multilingual Transformer Models: A Study on African Languages,Michael A. Hedderich|David Adelani|Dawei Zhu|Jesujoba Alabi|Udia Markus|Dietrich Klakow,"Multilingual transformer models like mBERT and XLM-RoBERTa have obtained great improvements for many NLP tasks on a variety of languages. However, recent works also showed that results from high-resource languages could not be easily transferred to realistic, low-resource scenarios. In this work, we study trends in performance for different amounts of available resources for the three African languages Hausa, isiXhosa and \yoruba on both NER and topic classification. We show that in combination with transfer learning or distant supervision, these models can achieve with as little as 10 or 100 labeled sentences the same performance as baselines with much more supervised training data. However, we also find settings where this does not hold. Our discussions and additional experiments on assumptions such as time and hardware restrictions highlight challenges and opportunities in low-resource learning.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.204,38938875 +main.1267,Leveraging Declarative Knowledge in Text and First-Order Logic for Fine-Grained Propaganda Detection,Ruize Wang|Duyu Tang|Nan Duan|Wanjun Zhong|Zhongyu Wei|Xuanjing Huang|Daxin Jiang|Ming Zhou,"We study the detection of propagandistic text fragments in news articles. Instead of merely learning from input-output datapoints in training data, we introduce an approach to inject declarative knowledge of fine-grained propaganda techniques. Specifically, we leverage the declarative knowledge expressed in both first-order logic and natural language. The former refers to the logical consistency between coarse- and fine-grained predictions, which is used to regularize the training process with propositional Boolean expressions. 
The latter refers to the literal definition of each propaganda technique, which is utilized to get class representations for regularizing the model parameters. We conduct experiments on Propaganda Techniques Corpus, a large manually annotated dataset for fine-grained propaganda detection. Experiments show that our method achieves superior performance, demonstrating that leveraging declarative knowledge can help the model to make more accurate predictions.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.320,38938876 +main.1271,Generating Radiology Reports via Memory-driven Transformer,Zhihong Chen|Yan Song|Tsung-Hui Chang|Xiang Wan,"Medical imaging is frequently used in clinical practice and trials for diagnosis and treatment. Writing imaging reports is time-consuming and can be error-prone for inexperienced radiologists. Therefore, automatically generating radiology reports is highly desired to lighten the workload of radiologists and accordingly promote clinical automation, which is an essential task to apply artificial intelligence to the medical domain. In this paper, we propose to generate radiology reports with memory-driven Transformer, where a relational memory is designed to record key information of the generation process and a memory-driven conditional layer normalization is applied to incorporating the memory into the decoder of Transformer. Experimental results on two prevailing radiology report datasets, IU X-Ray and MIMIC-CXR, show that our proposed approach outperforms previous models with respect to both language generation metrics and clinical evaluations. Particularly, this is the first work reporting the generation results on MIMIC-CXR to the best of our knowledge. Further analyses also demonstrate that our approach is able to generate long reports with necessary medical terms as well as meaningful image-text attention mappings.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.112,38938877 +main.1275,Benchmarking Meaning Representations in Neural Semantic Parsing,Jiaqi Guo|Qian Liu|Jian-Guang LOU|Zhenwen Li|Xueqing Liu|Tao Xie|Ting Liu,"Meaning representation is an important component of semantic parsing. Although researchers have designed a lot of meaning representations, recent work focuses on only a few of them. Thus, the impact of meaning representation on semantic parsing is less understood. Furthermore, existing work's performance is often not comprehensively evaluated due to the lack of readily-available execution engines. Upon identifying these gaps, we propose~\benchmarkname{}, a new unified benchmark on meaning representations, by integrating existing semantic parsing datasets, completing the missing logical forms, and implementing the missing execution engines. The resulting unified benchmark contains the complete enumeration of logical forms and execution engines over three datasets $\times$ four meaning representations. A thorough experimental study on Unimer reveals that neural semantic parsing approaches exhibit notably different performance when they are trained to generate different meaning representations. Also, program alias and grammar rules heavily impact the performance of different meaning representations. 
Our benchmark, execution engines and implementation can be found on: https://github.com/JasperGuo/Unimer.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.118,38938878 +main.128,Structured Attention for Unsupervised Dialogue Structure Induction,Liang Qiu|Yizhou Zhao|Weiyan Shi|Yuan Liang|Feng Shi|Tao Yuan|Zhou Yu|Song-Chun Zhu,"Inducing a meaningful structural representation from one or a set of dialogues is a crucial but challenging task in computational linguistics. Advancement made in this area is critical for dialogue system design and discourse analysis. It can also be extended to solve grammatical inference. In this work, we propose to incorporate structured attention layers into a Variational Recurrent Neural Network (VRNN) model with discrete latent states to learn dialogue structure in an unsupervised fashion. Compared to a vanilla VRNN, structured attention enables a model to focus on different parts of the source sentence embeddings while enforcing a structural inductive bias. Experiments show that on two-party dialogue datasets, VRNN with structured attention learns semantic structures that are similar to templates used to generate this dialogue corpus. While on multi-party dialogue datasets, our model learns an interactive structure demonstrating its capability of distinguishing speakers or addresses, automatically disentangling dialogues without explicit human annotation.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.148,38938654 +main.1280,Pretrained Language Model Embryology: The Birth of ALBERT,Cheng-Han Chiang|Sung-Feng Huang|Hung-yi Lee,"While behaviors of pretrained language models (LMs) have been thoroughly examined, what happened during pretraining is rarely studied. We thus investigate the developmental process from a set of randomly initialized parameters to a totipotent language model, which we refer to as the \textit{embryology} of a pretrained language model. Our results show that ALBERT learns to reconstruct and predict tokens of different parts of speech (POS) in different learning speeds during pretraining. We also find that linguistic knowledge and world knowledge do not generally improve as pretraining proceeds, nor do downstream tasks' performance. These findings suggest that knowledge of a pretrained model varies during pretraining, and having more pretrain steps does not necessarily provide a model with more comprehensive knowledge. We provide source codes and pretrained models to reproduce our results at \url{https://github.com/d223302/albert-embryology}.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.553,38938879 +main.1282,Investigating Cross-Linguistic Adjective Ordering Tendencies with a Latent-Variable Model,Jun Yen Leung|Guy Emerson|Ryan Cotterell,"Across languages, multiple consecutive adjectives modifying a noun (e.g.~``the big red dog'') follow certain unmarked ordering rules. While explanatory accounts have been put forward, much of the work done in this area has relied primarily on the intuitive judgment of native speakers, rather than on corpus data. We present the first purely corpus-driven model of multi-lingual adjective ordering in the form of a latent-variable model that can accurately order adjectives across 24 different languages, even when the training and testing languages are different. 
We utilize this novel statistical model to provide strong converging evidence for the existence of universal, cross-linguistic, hierarchical adjective ordering tendencies.",,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Long,https://www.aclweb.org/anthology/2020.emnlp-main.329,38938880 +main.1287,"Hashtags, Emotions, and Comments: A Large-Scale Dataset to Understand Fine-Grained Social Emotions to Online Topics",Keyang Ding|Jing Li|Yuji Zhang,"This paper studies social emotions to online discussion topics. While most prior work focus on emotions from writers, we investigate readers’ responses and explore the public feelings to an online topic. A large-scale dataset is collected from Chinese microblog Sina Weibo with over 13 thousand trending topics, emotion votes in 24 fine-grained types from massive participants, and user comments to allow context understanding. In experiments, we examine baseline performance to predict a topic’s possible social emotions in a multilabel classification setting. The results show that a seq2seq model with user comment modeling performs the best, even surpassing human prediction. More analyses shed light on the effects of emotion types, topic description lengths, contexts from user comments, and the limited capacity of the existing models.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.106,38938881 +main.1289,Inducing Target-specific Latent Structures for Aspect Sentiment Classification,Chenhua Chen|Zhiyang Teng|Yue Zhang,"Aspect-level sentiment analysis aims to recognize the sentiment polarity of an aspect or a target in a comment. Recently, graph convolutional networks based on linguistic dependency trees have been studied for this task. However, the dependency parsing accuracy of commercial product comments or tweets might be unsatisfactory. To tackle this problem, we associate linguistic dependency trees with automatically induced aspect-specific graphs. We propose gating mechanisms to dynamically combine information from word dependency graphs and latent graphs which are learned by self-attention networks. Our model can complement supervised syntactic features with latent semantic dependencies. Experimental results on five benchmarks show the effectiveness of our proposed latent models, giving significantly better results than models without using latent graphs.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.451,38938882 +main.1298,CLIRMatrix: A Massively Large Collection of Bilingual and Multilingual Datasets for Cross-Lingual Information Retrieval,Shuo Sun|Kevin Duh,"We present CLIRMatrix, a massively large collection of bilingual and multilingual datasets for Cross-Lingual Information Retrieval extracted automatically from Wikipedia. CLIRMatrix comprises (1) BI-139, a bilingual dataset of queries in one language matched with relevant documents in another language for 139x138=19,182 language pairs, and (2) MULTI-8, a multilingual dataset of queries and documents jointly aligned in 8 different languages. In total, we mined 49 million unique queries and 34 billion (query, document, label) triplets, making it the largest and most comprehensive CLIR dataset to date. This collection is intended to support research in end-to-end neural information retrieval and is publicly available at [url].
We provide baseline neural model results on BI-139, and evaluate MULTI-8 in both single-language retrieval and mix-language retrieval settings.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.340,38938883 +main.1299,Train No Evil: Selective Masking for Task-guided Pre-training,Yuxian Gu|Zhengyan Zhang|Xiaozhi Wang|Zhiyuan Liu|Maosong Sun,"Recently, pre-trained language models mostly follow the pre-train-then-fine-tuning paradigm and have achieved great performance on various downstream tasks. However, since the pre-training stage is typically task-agnostic and the fine-tuning stage usually suffers from insufficient supervised data, the models cannot always well capture the domain-specific and task-specific patterns. In this paper, we propose a three-stage framework by adding a task-guided pre-training stage with selective masking between general pre-training and fine-tuning. In this stage, the model is trained by masked language modeling on in-domain unsupervised data to learn domain-specific patterns and we propose a novel selective masking strategy to learn task-specific patterns. Specifically, we design a method to measure the importance of each token in sequences and selectively mask the important tokens. Experimental results on two sentiment analysis tasks show that our method can achieve comparable or even better performance with less than 50\% of computation cost, which indicates our method is both effective and efficient. The source code of this paper can be obtained from \url{https://github.com/thunlp/SelectiveMasking}.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.566,38938884 +main.130,Shallow-to-Deep Training for Neural Machine Translation,Bei Li|Ziyang Wang|Hui Liu|Yufan Jiang|Quan Du|Tong Xiao|Huizhen Wang|Jingbo Zhu,"Deep encoders have been proven to be effective in improving neural machine translation (NMT) systems, but training an extremely deep encoder is time consuming. Moreover, why deep models help NMT is an open question. In this paper, we investigate the behavior of a well-tuned deep Transformer system. We find that stacking layers is helpful in improving the representation ability of NMT models and adjacent layers perform similarly. This inspires us to develop a shallow-to-deep training method that learns deep models by stacking shallow models. In this way, we successfully train a Transformer system with a 54-layer encoder. Experimental results on WMT’16 English-German and WMT’14 English-French translation tasks show that it is 1.4 times faster than training from scratch, and achieves a BLEU score of 30.33 and 43.29 on two tasks. The code is publicly available at https://github.com/libeineu/SDT-Training.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.72,38938655 +main.1305,Deconstructing Word Embedding Algorithms,Kian Kenyon-Dean|Edward Newell|Jackie Chi Kit Cheung,"Word embeddings are reliable feature representations of words used to obtain high quality results for various NLP applications. Uncontextualized word embeddings are used in many NLP tasks today, especially in resource-limited settings where high memory capacity and GPUs are not available. Given the historical success of word embeddings in NLP, we propose a retrospective on some of the most well-known word embedding algorithms.
In this work, we deconstruct Word2vec, GloVe, and others, into a common form, unveiling some of the common conditions that seem to be required for making performant word embeddings. We believe that the theoretical findings in this paper can provide a basis for more informed development of future models.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.681,38938885 +main.1320,Mind Your Inflections! Improving NLP for Non-Standard Englishes with Base-Inflection Encoding,Samson Tan|Shafiq Joty|Lav Varshney|Min-Yen Kan,"Inflectional variation is a common feature of World Englishes such as Colloquial Singapore English and African American Vernacular English. Although comprehension by human readers is usually unimpaired by non-standard inflections, current NLP systems are not yet robust. We propose Base-Inflection Encoding (BITE), a method to tokenize English text by reducing inflected words to their base forms before reinjecting the grammatical information as special symbols. Fine-tuning pretrained NLP models for downstream tasks using our encoding defends against inflectional adversaries while maintaining performance on clean data. Models using BITE generalize better to dialects with non-standard inflections without explicit training and translation models converge faster when trained with BITE. Finally, we show that our encoding improves the vocabulary efficiency of popular data-driven subword tokenizers. Since there has been no prior work on quantitatively evaluating vocabulary efficiency, we propose metrics to do so.",,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.emnlp-main.455,38938886 +main.1322,NwQM: A Neural Quality Assessment Framework for Wikipedia,Bhanu Prakash Reddy Guda|Sasi Bhushan Seelaboyina|Soumya Sarkar|Animesh Mukherjee,"Millions of people irrespective of socioeconomic and demographic backgrounds, depend on Wikipedia articles everyday for keeping themselves informed regarding popular as well as obscure topics. Articles have been categorized by editors into several quality classes, which indicate their reliability as encyclopedic content. This manual designation is an onerous task because it necessitates profound knowledge about encyclopedic language, as well as navigating a circuitous set of wiki guidelines. In this paper we propose Neural wikipedia Quality Monitor (NwQM), a novel deep learning model which accumulates signals from several key information sources such as article text, meta data and images to obtain improved Wikipedia article representation. We present a comparison of our approach against a plethora of available solutions and show 8% improvement over state-of-the-art approaches with detailed ablation studies.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.674,38938887 +main.1339,Zero-Shot Crosslingual Sentence Simplification,Jonathan Mallinson|Rico Sennrich|Mirella Lapata,"Sentence simplification aims to make sentences easier to read and understand. Recent approaches have shown promising results with encoder-decoder models trained on large amounts of parallel data which often only exists in English. We propose a zero-shot modeling framework which transfers simplification knowledge from English to another language (for which no parallel simplification corpus exists) while generalizing across languages and tasks.
A shared transformer encoder constructs language-agnostic representations, with a combination of task-specific encoder layers added on top (e.g., for translation and simplification). Empirical results using both human and automatic metrics show that our approach produces better simplifications than unsupervised and pivot-based methods.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.415,38938888 +main.1351,On the Importance of Pre-training Data Volume for Compact Language Models,Vincent Micheli|Martin d'Hoffschmidt|François Fleuret,"Recent advances in language modeling have led to computationally intensive and resource-demanding state-of-the-art models. In an effort towards sustainable practices, we study the impact of pre-training data volume on compact language models. Multiple BERT-based models are trained on gradually increasing amounts of French text. Through fine-tuning on the French Question Answering Dataset (FQuAD), we observe that well-performing models are obtained with as little as 100 MB of text. In addition, we show that past critically low amounts of pre-training data, an intermediate pre-training step on the task-specific corpus does not yield substantial improvements.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.632,38938889 +main.1356,Sequence-level Mixed Sample Data Augmentation,Demi Guo|Yoon Kim|Alexander Rush,"Despite their empirical success, neural networks still have difficulty capturing compositional aspects of natural language. This work proposes a simple data augmentation approach to encourage compositional behavior in neural models for sequence-to-sequence problems. Our approach, SeqMix, creates new synthetic examples by softly combining input/output sequences from the training set. We connect this approach to existing techniques such as SwitchOut and word dropout, and show that these techniques are all essentially approximating variants of a single objective. SeqMix consistently yields approximately 1.0 BLEU improvement on five different translation datasets over strong Transformer baselines. On tasks that require strong compositional generalization such as SCAN and semantic parsing, SeqMix also offers further improvements.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.447,38938890 +main.1377,"If Beam Search Is the Answer, What Was the Question?",Clara Meister|Ryan Cotterell|Tim Vieira,"Quite surprisingly, exact maximum a posteriori (MAP) decoding of neural language generators frequently leads to low-quality results. Rather, most state-of-the-art results on language generation tasks are attained using beam search despite its overwhelmingly high search error rate. This implies that the MAP objective alone does not express the properties we desire in text, which merits the question: if beam search is the answer, what was the question? We frame beam search as the exact solution to a different decoding objective in order to gain insights into why high probability under a model alone may not indicate adequacy. We find that beam search enforces uniform information density in text, a property motivated by cognitive science. We suggest a set of decoding objectives that explicitly enforce this property and find that exact decoding with these objectives alleviates the problems encountered when decoding poorly calibrated language generation models. 
Additionally, we analyze the text produced using various decoding strategies and see that, in our neural machine translation experiments, the extent to which this property is adhered to strongly correlates with BLEU.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.170,38938891 +main.1379,X-SRL: A Parallel Cross-Lingual Semantic Role Labeling Dataset,Angel Daza|Anette Frank,"Even though SRL is researched for many languages, major improvements have mostly been obtained for English, for which more resources are available. In fact, existing multilingual SRL datasets contain disparate annotation styles or come from different domains, hampering generalization in multilingual learning. In this work we propose a method to automatically construct an SRL corpus that is parallel in four languages: English, French, German, Spanish, with unified predicate and role annotations that are fully comparable across languages. We apply high-quality machine translation to the English CoNLL-09 dataset and use multilingual BERT to project its high-quality annotations to the target languages. We include human-validated test sets that we use to measure the projection quality, and show that projection is denser and more precise than a strong baseline. Finally, we train different SOTA models on our novel corpus for mono- and multilingual SRL, showing that the multilingual annotations improve performance especially for the weaker languages.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.321,38938892 +main.1383,Identifying Elements Essential for BERT's Multilinguality,Philipp Dufter|Hinrich Schütze,"It has been shown that multilingual BERT (mBERT) yields high quality multilingual representations and enables effective zero-shot transfer. This is surprising given that mBERT does not use any crosslingual signal during training. While recent literature has studied this phenomenon, the reasons for the multilinguality are still somewhat obscure. We aim to identify architectural properties of BERT and linguistic properties of languages that are necessary for BERT to become multilingual. To allow for fast experimentation we propose an efficient setup with small BERT models trained on a mix of synthetic and natural data. Overall, we identify four architectural and two linguistic elements that influence multilinguality. Based on our insights, we experiment with a multilingual pretraining setup that modifies the masking strategy using VecMap, i.e., unsupervised embedding alignment. Experiments on XNLI with three languages indicate that our findings transfer from our small setup to larger scale settings.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.358,38938893 +main.1388,Visually Grounded Continual Learning of Compositional Phrases,Xisen Jin|Junyi Du|Arka Sadhu|Ram Nevatia|Xiang Ren,"Humans acquire language continually with much more limited access to data samples at a time, as compared to contemporary NLP systems. To study this human-like language acquisition ability, we present VisCOLL, a visually grounded language learning task, which simulates the continual acquisition of compositional phrases from streaming visual scenes. In the task, models are trained on a paired image-caption stream which has shifting object distribution; while being constantly evaluated by a visually-grounded masked language prediction task on held-out test sets. 
VisCOLL compounds the challenges of continual learning (i.e., learning from continuously shifting data distribution) and compositional generalization (i.e., generalizing to novel compositions). To facilitate research on VisCOLL, we construct two datasets, COCO-shift and Flickr-shift, and benchmark them using different continual learning methods. Results reveal that SoTA continual learning approaches provide little to no improvements on VisCOLL, since storing examples of all possible compositions is infeasible. We conduct further ablations and analysis to guide future work.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.158,38938894 +main.1389,Iterative Feature Mining for Constraint-Based Data Collection to Increase Data Diversity and Model Robustness,Stefan Larson|Anthony Zheng|Anish Mahendran|Rishi Tekriwal|Adrian Cheung|Eric Guldan|Kevin Leach|Jonathan K. Kummerfeld,"Diverse data is crucial for training robust models, but crowdsourced text often lacks diversity as workers tend to write simple variations from prompts. We propose a general approach for guiding workers to write more diverse text by iteratively constraining their writing. We show how prior workflows are special cases of our approach, and present a way to apply the approach to dialog tasks such as intent classification and slot-filling. Using our method, we create more challenging versions of test sets from prior dialog datasets and find dramatic performance drops for standard models. Finally, we show that our approach is complementary to recent work on improving data diversity, and training on data collected with our approach leads to more robust models.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.650,38938895 +main.1390,Conversational Document Prediction to Assist Customer Care Agents,Jatin Ganhotra|Haggai Roitman|Doron Cohen|Nathaniel Mills|Chulaka Gunasekara|Yosi Mass|Sachindra Joshi|Luis Lastras|David Konopnicki,"A frequent pattern in customer care conversations is the agents responding with appropriate webpage URLs that address users' needs. We study the task of predicting the documents that customer care agents can use to facilitate users' needs. We also introduce a new public dataset which supports the aforementioned problem. Using this dataset and two others, we investigate state-of-the-art deep learning (DL) and information retrieval (IR) models for the task. Additionally, we analyze the practicality of such systems in terms of inference time complexity. Our results show that a hybrid IR+DL approach provides the best of both worlds.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.25,38938896 +main.1393,Sequential Modelling of the Evolution of Word Representations for Semantic Change Detection,Adam Tsakalidis|Maria Liakata,"Semantic change detection concerns the task of identifying words whose meaning has changed over time. Current state-of-the-art approaches operating on neural embeddings detect the level of semantic change in a word by comparing its vector representation in two distinct time periods, without considering its evolution through time. In this work, we propose three variants of sequential models for detecting semantically shifted words, effectively accounting for the changes in the word representations over time.
Through extensive experimentation under various settings with synthetic and real data we showcase the importance of sequential modelling of word vectors through time for semantic change detection. Finally, we compare different approaches in a quantitative manner, demonstrating that temporal modelling of word representations yields a clear-cut advantage in performance.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.682,38938897 +main.1395,Towards Better Context-aware Lexical Semantics: Adjusting Contextualized Representations through Static Anchors,Qianchu Liu|Diana McCarthy|Anna Korhonen,"One of the most powerful features of contextualized models is their dynamic embeddings for words in context, leading to state-of-the-art representations for context-aware lexical semantics. In this paper, we present a post-processing technique that enhances these representations by learning a transformation through static anchors. Our method requires only another pre-trained model and no labeled data is needed. We show consistent improvement in a range of benchmark tasks that test contextual variations of meaning both across different usages of a word and across different words as they are used in context. We demonstrate that while the original contextual representations can be improved by another embedding space from both contextualized and static models, the static embeddings, which have lower computational requirements, provide the most gains.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.333,38938898 +main.1399,Debiasing Knowledge Graph Embeddings,Joseph Fisher|Arpit Mittal|Dave Palfrey|Christos Christodoulopoulos,"It has been shown that knowledge graph embeddings encode potentially harmful social biases, such as the information that women are more likely to be nurses, and men more likely to be bankers. As graph embeddings begin to be used more widely in NLP pipelines, there is a need to develop training methods which remove such biases. Previous approaches to this problem both significantly increase the training time, by a factor of eight or more, and decrease the accuracy of the model substantially. We present a novel approach, in which all embeddings are trained to be neutral to sensitive attributes such as gender by default using an adversarial loss. We then add sensitive attributes back on in whitelisted cases. Training time only marginally increases over a baseline model, and the debiased embeddings perform almost as accurately in the triple prediction task as their non-debiased counterparts.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.595,38938899 +main.1402,Simultaneous Machine Translation with Visual Context,Ozan Caglayan|Julia Ive|Veneta Haralampieva|Pranava Madhyastha|Loïc Barrault|Lucia Specia,"Simultaneous machine translation (SiMT) aims to translate a continuous input text stream into another language with the lowest latency and highest quality possible. The translation thus has to start with an incomplete source text, which is read progressively, creating the need for anticipation. In this paper, we seek to understand whether the addition of visual information can compensate for the missing source context. To this end, we analyse the impact of different multimodal approaches and visual features on state-of-the-art SiMT frameworks. 
Our results show that visual context is helpful and that visually-grounded models based on explicit object region information are much better than commonly used global features, reaching up to 3 BLEU points improvement under low latency scenarios. Our qualitative analysis illustrates cases where only the multimodal systems are able to translate correctly from English into gender-marked languages, as well as deal with differences in word order, such as adjective-noun placement between English and French.",,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.184,38938900 +main.1408,Towards Debiasing NLU Models from Unknown Biases,Prasetya Ajie Utama|Nafise Sadat Moosavi|Iryna Gurevych,"NLU models often exploit biases to achieve high dataset-specific performance without properly learning the intended task. Recently proposed debiasing methods are shown to be effective in mitigating this tendency. However, these methods rely on a major assumption that the types of bias should be known a-priori, which limits their application to many NLU tasks and datasets. In this work, we present the first step to bridge this gap by introducing a self-debiasing framework that prevents models from mainly utilizing biases without knowing them in advance. The proposed framework is general and complementary to the existing debiasing methods. We show that it allows these existing methods to retain the improvement on the challenge datasets (i.e., sets of examples designed to expose models' reliance on biases) without specifically targeting certain biases. Furthermore, the evaluation suggests that applying the framework results in improved overall robustness.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.613,38938901 +main.1421,Analogous Process Structure Induction for Sub-event Sequence Prediction,Hongming Zhang|Muhao Chen|Haoyu Wang|Yangqiu Song|Dan Roth,"Computational and cognitive studies of event understanding suggest that identifying, comprehending, and predicting events depend on having structured representations of a sequence of events and on conceptualizing (abstracting) its components into (soft) event categories. Thus, knowledge about a known process such as ``buying a car'' can be used in the context of a new but analogous process such as ``buying a house''. Nevertheless, most event understanding work in NLP is still at the ground level and does not consider abstraction. In this paper, we propose an Analogous Process Structure Induction (APSI) framework, which leverages analogies among processes and conceptualization of sub-event instances to predict the whole sub-event sequence of previously unseen open-domain processes. As our experiments and analysis indicate, APSI supports the generation of meaningful sub-event sequences for unseen processes and can help predict missing events.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.119,38938902 +main.1428,Improving Low Compute Language Modeling with In-Domain Embedding Initialisation,Charles Welch|Rada Mihalcea|Jonathan K. Kummerfeld,"Many NLP applications, such as biomedical data and technical support, have 10-100 million tokens of in-domain data and limited computational resources for learning from it. How should we train a language model in this scenario? 
Most language modeling research considers either a small dataset with a closed vocabulary (like the standard 1 million token Penn Treebank), or the whole web with byte-pair encoding. We show that for our target setting in English, initialising and freezing input embeddings using in-domain data can improve language model performance by providing a useful representation of rare words, and this pattern holds across several different domains. In the process, we show that the standard convention of tying input and output embeddings does not improve perplexity when initializing with embeddings trained on in-domain data.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.696,38938903 +main.143,XL-WiC: A Multilingual Benchmark for Evaluating Semantic Contextualization,Alessandro Raganato|Tommaso Pasini|Jose Camacho-Collados|Mohammad Taher Pilehvar,"The ability to correctly model distinct meanings of a word is crucial for the effectiveness of semantic representation techniques. However, most existing evaluation benchmarks for assessing this criterion are tied to sense inventories (usually WordNet), restricting their usage to a small subset of knowledge-based representation techniques. The Word-in-Context dataset (WiC) addresses the dependence on sense inventories by reformulating the standard disambiguation task as a binary classification problem; but, it is limited to the English language. We put forward a large multilingual benchmark, XL-WiC, featuring gold standards in 12 new languages from varied language families and with different degrees of resource availability, opening room for evaluation scenarios such as zero-shot cross-lingual transfer. We perform a series of experiments to determine the reliability of the datasets and to set performance baselines for several recent contextualized multilingual models. Experimental results show that even when no tagged instances are available for a target language, models trained solely on the English data can attain competitive performance in the task of distinguishing different meanings of a word, even for distant languages. XL-WiC is available at https://pilehvar.github.io/xlwic/.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.584,38938656 +main.1432,Iterative Refinement in the Continuous Space for Non-Autoregressive Neural Machine Translation,Jason Lee|Raphael Shu|Kyunghyun Cho,"We propose an efficient inference procedure for non-autoregressive machine translation that iteratively refines translation purely in the continuous space. Given a continuous latent variable model for machine translation (Shu et al., 2020), we train an inference network to approximate the gradient of the marginal log probability of the target sentence, using the latent variable instead. This allows us to use gradient-based optimization to find the target sentence at inference time that approximately maximizes its marginal probability. As each refinement step only involves computation in the latent space of low dimensionality (we use 8 in our experiments), we avoid computational overhead incurred by existing non-autoregressive inference procedures that often refine in token space. We compare our approach to a recently proposed EM-like inference procedure (Shu et al., 2020) that optimizes in a hybrid space, consisting of both discrete and continuous variables. 
We evaluate our approach on WMT’14 En→De, WMT’16 Ro→En and IWSLT’16 De→En, and observe two advantages over the EM-like inference: (1) it is computationally efficient, i.e. each refinement step is twice as fast, and (2) it is more effective, resulting in higher marginal probabilities and BLEU scores with the same number of refinement steps. On WMT’14 En→De, for instance, our approach is able to decode 6.2 times faster than the autoregressive model with minimal degradation to translation quality (0.9 BLEU).",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.73,38938904 +main.1445,On Negative Interference in Multilingual Language Models,Zirui Wang|Zachary C. Lipton|Yulia Tsvetkov,"Modern multilingual models are trained on concatenated text from multiple languages in hopes of conferring benefits to each (positive transfer), with the most pronounced benefits accruing to low-resource languages. However, recent work has shown that this approach can degrade performance on high-resource languages, a phenomenon known as negative interference. In this paper, we present the first systematic study of negative interference. We show that, contrary to previous belief, negative interference also impacts low-resource languages. While parameters are maximally shared to learn language-universal structures, we demonstrate that language-specific parameters do exist in multilingual models and they are a potential cause of negative interference. Motivated by these observations, we also present a meta-learning algorithm that obtains better cross-lingual transferability and alleviates negative interference, by adding language-specific layers as meta-parameters and training them in a manner that explicitly improves shared layers' generalization on all languages. Overall, our results show that negative interference is more common than previously known, suggesting new directions for improving multilingual representations.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.359,38938905 +main.1446,Optimus: Organizing Sentences via Pre-trained Modeling of a Latent Space,Chunyuan Li|Xiang Gao|Yuan Li|Baolin Peng|Xiujun Li|Yizhe Zhang|Jianfeng Gao,"When trained effectively, the Variational Autoencoder (VAE) can be both a powerful generative model and an effective representation learning framework for natural language. In this paper, we propose the first large-scale language VAE model Optimus (Organizing sentences via Pre-Trained Modeling of a Universal Space). A universal latent embedding space for sentences is first pre-trained on large text corpus, and then fine-tuned for various language generation and understanding tasks. Compared with GPT-2, Optimus enables guided language generation from an abstract level using the latent vectors. Compared with BERT, Optimus can generalize better on low-resource language understanding tasks due to the smooth latent space structure. Extensive experimental results on a wide range of language tasks demonstrate the effectiveness of Optimus. 
It achieves new state-of-the-art on VAE language modeling benchmarks.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.378,38938906 +main.1455,Experience Grounds Language,Yonatan Bisk|Ari Holtzman|Jesse Thomason|Jacob Andreas|Yoshua Bengio|Joyce Chai|Mirella Lapata|Angeliki Lazaridou|Jonathan May|Aleksandr Nisnevich|Nicolas Pinto|Joseph Turian,"Language understanding research is held back by a failure to relate language to the physical world it describes and to the social interactions it facilitates. Despite the incredible effectiveness of language processing models to tackle tasks after being trained on text alone, successful linguistic communication relies on a shared experience of the world. It is this shared experience that makes utterances meaningful. Natural language processing is a diverse field, and progress throughout its development has come from new representational theories, modeling techniques, data collection paradigms, and tasks. We posit that the present success of representation learning approaches trained on large, text-only corpora requires the parallel tradition of research on the broader physical and social context of language to address the deeper questions of communication.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.703,38938907 +main.1456,Pre-tokenization of Multi-word Expressions in Cross-lingual Word Embeddings,Naoki Otani|Satoru Ozaki|Xingyuan Zhao|Yucen Li|Micaelah St Johns|Lori Levin,"Cross-lingual word embedding (CWE) algorithms represent words in multiple languages in a unified vector space. Multi-Word Expressions (MWE) are common in every language. When training word embeddings, each component word of an MWE gets its own separate embedding, and thus, MWEs are not translated by CWEs. We propose a simple method for word translation of MWEs to and from English in ten languages: we first compile lists of MWEs in each language and then tokenize the MWEs as single tokens before training word embeddings. CWEs are trained on a word-translation task using the dictionaries that only contain single words. In order to evaluate MWE translation, we created bilingual word lists from multilingual WordNet that include single-token words and MWEs, and most importantly, include MWEs that correspond to single words in another language. We release these dictionaries to the research community. We show that the pre-tokenization of MWEs as single tokens performs better than averaging the embeddings of the individual tokens of the MWE. We can translate MWEs at a top-10 precision of 30-60%. The tokenization of MWEs makes the occurrences of single words in a training corpus more sparse, but we show that it does not pose negative impacts on single-word translations.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.360,38938908 +main.1458,SSMBA: Self-Supervised Manifold Based Data Augmentation for Improving Out-of-Domain Robustness,Nathan Ng|Kyunghyun Cho|Marzyeh Ghassemi,"Models that perform well on a training domain often fail to generalize to out-of-domain (OOD) examples. Data augmentation is a common method used to prevent overfitting and improve OOD generalization. However, in natural language, it is difficult to generate new examples that stay on the underlying data manifold. 
We introduce SSMBA, a data augmentation method for generating synthetic training examples by using a pair of corruption and reconstruction functions to move randomly on a data manifold. We investigate the use of SSMBA in the natural language domain, leveraging the manifold assumption to reconstruct corrupted text with masked language models. In experiments on robustness benchmarks across 3 tasks and 9 datasets, SSMBA consistently outperforms existing data augmentation methods and baseline models on both in-domain and OOD data, achieving gains of 0.8% on OOD Amazon reviews, 1.8% accuracy on OOD MNLI, and 1.4 BLEU on in-domain IWSLT14 German-English.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.97,38938909 +main.1460,Evaluating the Calibration of Knowledge Graph Embeddings for Trustworthy Link Prediction,Tara Safavi|Danai Koutra|Edgar Meij,"Little is known about the trustworthiness of predictions made by knowledge graph embedding (KGE) models. In this paper we take initial steps toward this direction by investigating the calibration of KGE models, or the extent to which they output confidence scores that reflect the expected correctness of predicted knowledge graph triples. We first conduct an evaluation under the standard closed-world assumption (CWA), in which predicted triples not already in the knowledge graph are considered false, and show that existing calibration techniques are effective for KGE under this common but narrow assumption. Next, we introduce the more realistic but challenging open-world assumption (OWA), in which unobserved predictions are not considered true or false until ground-truth labels are obtained. Here, we show that existing calibration techniques are much less effective under the OWA than the CWA, and provide explanations for this discrepancy. Finally, to motivate the utility of calibration for KGE from a practitioner's perspective, we conduct a unique case study of human-AI collaboration, showing that calibrated predictions can improve human performance in a knowledge graph completion task.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.667,38938910 +main.1465,Recurrent Event Network: Autoregressive Structure Inference over Temporal Knowledge Graphs,Woojeong Jin|Meng Qu|Xisen Jin|Xiang Ren,"Knowledge graph reasoning is a critical task in natural language processing. The task becomes more challenging on temporal knowledge graphs, where each fact is associated with a timestamp. Most existing methods focus on reasoning at past timestamps and they are not able to predict facts happening in the future. This paper proposes Recurrent Event Network (RE-Net), a novel autoregressive architecture for predicting future interactions. The occurrence of a fact (event) is modeled as a probability distribution conditioned on temporal sequences of past knowledge graphs. Specifically, our RE-Net employs a recurrent event encoder to encode past facts, and uses a neighborhood aggregator to model the connection of facts at the same timestamp. Future facts can then be inferred in a sequential manner based on the two modules. We evaluate our proposed method via link prediction at future times on five public datasets. 
Through extensive experiments, we demonstrate the strength of RE-Net, especially on multi-step inference over future timestamps, and achieve state-of-the-art performance on all five datasets.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.541,38938911 +main.1466,Learning Collaborative Agents with Rule Guidance for Knowledge Graph Reasoning,Deren Lei|Gangrong Jiang|Xiaotao Gu|Kexuan Sun|Yuning Mao|Xiang Ren,"Walk-based models have shown their advantages in knowledge graph (KG) reasoning by achieving decent performance while providing interpretable decisions. However, the sparse reward signals offered by the KG during a traversal are often insufficient to guide a sophisticated walk-based reinforcement learning (RL) model. An alternate approach is to use traditional symbolic methods (e.g., rule induction), which achieve good performance but can be hard to generalize due to the limitation of symbolic representation. In this paper, we propose RuleGuider, which leverages high-quality rules generated by symbolic-based methods to provide reward supervision for walk-based agents. Experiments on benchmark datasets show that RuleGuider clearly improves the performance of walk-based models without losing interpretability.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.688,38938912 +main.148,Augmented Natural Language for Generative Sequence Labeling,Ben Athiwaratkun|Cicero Nogueira dos Santos|Jason Krone|Bing Xiang,"We propose a generative framework for joint sequence labeling and sentence-level classification. Our model performs multiple sequence labeling tasks at once using a single, shared natural language output space. Unlike prior discriminative methods, our model naturally incorporates label semantics and shares knowledge across tasks. Our framework is general purpose, performing well on few-shot learning, low resource, and high resource tasks. We demonstrate these advantages on popular named entity recognition, slot labeling, and intent classification benchmarks. We set a new state-of-the-art for few-shot slot labeling, improving substantially upon the previous 5-shot (75.0% to 90.9%) and 1-shot (70.4% to 81.0%) state-of-the-art results. Furthermore, our model generates large improvements (46.27% to 63.83%) in low resource slot labeling over a BERT baseline by incorporating label semantics. We also maintain competitive results on high resource tasks, performing within two points of the state-of-the-art on all tasks and setting a new state-of-the-art on the SNIPS dataset.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.27,38938657 +main.1482,KGLM: Pretrained Knowledge-Grounded Language Model for Data-to-Text Generation,Wenhu Chen|Yu Su|Xifeng Yan|William Yang Wang,"Data-to-text generation has recently attracted substantial interests due to its wide applications. Existing methods have shown impressive performance on an array of tasks. However, they rely on a significant amount of labeled data for each task, which is costly to acquire and thus limits their application to new tasks and domains. In this paper, we propose to leverage pre-training and transfer learning to address this issue. We propose a knowledge-grounded pre-training (KGPT), which consists of two parts, 1) a general knowledge-grounded generation model to generate knowledge-enriched text. 2) a pre-training paradigm on a massive knowledge-grounded text corpus crawled from the web.
The pre-trained model can be fine-tuned on various data-to-text generation tasks to generate task-specific text. We adopt three settings, namely fully-supervised, zero-shot, few-shot to evaluate its effectiveness. Under the fully-supervised setting, our model can achieve remarkable gains over the known baselines. Under zero-shot setting, our model without seeing any examples achieves over 30 ROUGE-L on WebNLG while all other baselines fail. Under the few-shot setting, our model only needs about one-fifteenth as many labeled examples to achieve the same level of performance as baseline models. These experiments consistently prove the strong generalization ability of our proposed framework\footnote{\url{https://github.com/wenhuchen/KGPT}}.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.697,38938913 +main.1484,Utility Is in the Eye of the User: A Critique of NLP Leaderboard Design,Kawin Ethayarajh|Dan Jurafsky,"Benchmarks such as GLUE have helped drive advances in NLP by incentivizing the creation of more accurate models. While this leaderboard paradigm has been remarkably successful, a historical focus on performance-based evaluation has been at the expense of other qualities that the NLP community values in models, such as compactness, fairness, and energy efficiency. In this opinion paper, we study the divergence between what is incentivized by leaderboards and what is useful in practice through the lens of microeconomic theory. We frame both the leaderboard and NLP practitioners as consumers and the benefit they get from a model as its utility to them. With this framing, we formalize how leaderboards -- in their current form -- can be poor proxies for the NLP community at large. For example, a highly inefficient model would provide less utility to practitioners but not to a leaderboard, since it is a cost that only the former must bear. To allow practitioners to better estimate a model's utility to them, we advocate for more transparency on leaderboards, such as the reporting of statistics that are of practical concern (e.g., model size, energy efficiency, and inference latency).",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.393,38938914 +main.1485,Why Skip If You Can Combine: A Simple Knowledge Distillation Technique for Intermediate Layers,Yimeng Wu|Peyman Passban|Mehdi Rezagholizadeh|Qun Liu,"With the growth of computing power neural machine translation (NMT) models also grow accordingly and become better. However, they also become harder to deploy on edge devices due to memory constraints. To cope with this problem, a common practice is to distill knowledge from a large and accurately-trained teacher network (T) into a compact student network (S). Although knowledge distillation (KD) is useful in most cases, our study shows that existing KD techniques might not be suitable enough for deep NMT engines, so we propose a novel alternative. In our model, besides matching T and S predictions we have a combinatorial mechanism to inject layer-level supervision from T to S. In this paper, we target low-resource settings and evaluate our translation engines for Portuguese→English, Turkish→English, and English→German directions. 
Students trained using our technique have 50% fewer parameters and can still deliver comparable results to those of 12-layer teachers.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.74,38938915 +main.1488,Text Graph Transformer for Document Classification,Haopeng Zhang|Jiawei Zhang,"Text classification is a fundamental problem in natural language processing. Recent studies applied graph neural network (GNN) techniques to capture global word co-occurrence in a corpus. However, previous works are not scalable to large-sized corpus and ignore the heterogeneity of the text graph. To address these problems, we introduce a novel Transformer based heterogeneous graph neural network, namely Text Graph Transformer (TG-Transformer). Our model learns effective node representations by capturing structure and heterogeneity from the text graph. We propose a mini-batch text graph sampling method that significantly reduces computing and memory costs to handle large-sized corpus. Extensive experiments have been conducted on several benchmark datasets, and the results demonstrate that TG-Transformer outperforms state-of-the-art approaches on text classification task.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.668,38938916 +main.1490,Plug and Play Autoencoders for Conditional Text Generation,Florian Mai|Nikolaos Pappas|Ivan Montero|Noah A. Smith|James Henderson,"Text autoencoders are commonly used for conditional generation tasks such as style transfer. We propose methods which are plug and play, where any pretrained autoencoder can be used, and only require learning a mapping within the autoencoder's embedding space, training embedding-to-embedding (Emb2Emb). This reduces the need for labeled training data for the task and makes the training procedure more efficient. Crucial to the success of this method is a loss term for keeping the mapped embedding on the manifold of the autoencoder and a mapping which is trained to navigate the manifold by learning offset vectors. Evaluations on style transfer tasks both with and without sequence-to-sequence supervision show that our method performs better than or comparable to strong baselines while being up to four times faster.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.491,38938917 +main.1492,Surprisal Predicts Code-Switching in Chinese-English Bilingual Text,Jesús Calvillo|Le Fang|Jeremy Cole|David Reitter,"Why do bilinguals switch languages within a sentence? The present observational study asks whether word surprisal and word entropy predict code-switching in bilingual written conversation. We describe and model a new dataset of Chinese-English text with 1476 clean code-switched sentences, translated back into Chinese. The model includes known control variables together with word surprisal and word entropy. We found that word surprisal, but not entropy, is a significant predictor that explains code-switching above and beyond other well-known predictors. We also found sentence length to be a significant predictor, which has been related to sentence complexity. We propose high cognitive effort as a reason for code-switching, as it leaves fewer resources for inhibition of the alternative language. 
We also corroborate previous findings, but this time using a computational model of surprisal, a new language pair, and doing so for written language.",,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Long,https://www.aclweb.org/anthology/2020.emnlp-main.330,38938918 +main.1493,CoDEx: A Comprehensive Knowledge Graph Completion Benchmark,Tara Safavi|Danai Koutra,"We present CoDEx, a set of knowledge graph completion datasets extracted from Wikidata and Wikipedia that improve upon existing knowledge graph completion benchmarks in scope and level of difficulty. In terms of scope, CoDEx comprises three knowledge graphs varying in size and structure, multilingual descriptions of entities and relations, and tens of thousands of hard negative triples that are plausible but verified to be false. To characterize CoDEx, we contribute thorough empirical analyses and benchmarking experiments. First, we analyze each CoDEx dataset in terms of logical relation patterns. Next, we report baseline link prediction and triple classification results on CoDEx for five extensively tuned embedding models. Finally, we differentiate CoDEx from the popular FB15K-237 knowledge graph completion dataset by showing that CoDEx covers more diverse and interpretable content, and is a more difficult link prediction benchmark. Data, code, and pretrained models are available at https://bit.ly/2EPbrJs.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.669,38938919 +main.1494,Unsupervised Parsing via Constituency Tests,Steven Cao|Nikita Kitaev|Dan Klein,"We propose a method for unsupervised parsing based on the linguistic notion of a constituency test. One type of constituency test involves modifying the sentence via some transformation (e.g. replacing the span with a pronoun) and then judging the result (e.g. checking if it is grammatical). Motivated by this idea, we design an unsupervised parser by specifying a set of transformations and using an unsupervised neural acceptability model to make grammaticality decisions. To produce a tree given a sentence, we score each span by aggregating its constituency test judgments, and we choose the binary tree with the highest total score. While this approach already achieves performance in the range of current methods, we further improve accuracy by fine-tuning the grammaticality model through a refinement procedure, where we alternate between improving the estimated trees and improving the grammaticality model. The refined model achieves 62.8 F1 on the Penn Treebank test set, an absolute improvement of 7.6 points over the previously best published result.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.389,38938920 +main.1495,An Imitation Game for Learning Semantic Parsers from User Interaction,Ziyu Yao|Yiqi Tang|Wen-tau Yih|Huan Sun|Yu Su,"Despite the widely successful applications, bootstrapping and fine-tuning semantic parsers are still a tedious process with challenges such as costly data annotation and privacy risks. In this paper, we suggest an alternative, human-in-the-loop methodology for learning semantic parsers directly from users. A semantic parser should be introspective of its uncertainties and prompt for user demonstrations when uncertain. In doing so it also gets to imitate the user behavior and continue improving itself autonomously with the hope that eventually it may become as good as the user in interpreting their questions. 
To combat the sparsity of demonstrations, we propose a novel annotation-efficient imitation learning algorithm, which iteratively collects new datasets by mixing demonstrated states and confident predictions and retrains the semantic parser in a Dataset Aggregation fashion (Ross et al., 2011). We provide a theoretical analysis of its cost bound and also empirically demonstrate its promising performance on the text-to-SQL problem. Code will be available at {https://github.com/sunlab-osu/MISP}.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.559,38938921 +main.1498,Sparse Parallel Training for Hierarchical Dirichlet Process Topic Models,Alexander Terenin|Måns Magnusson|Leif Jonsson,"To scale non-parametric extensions of probabilistic topic models such as Latent Dirichlet allocation to larger data sets, practitioners rely increasingly on parallel and distributed systems. In this work, we study data-parallel training for the hierarchical Dirichlet process (HDP) topic model. Based upon a representation of certain conditional distributions within an HDP, we propose a doubly sparse data-parallel sampler for the HDP topic model. This sampler utilizes all available sources of sparsity found in natural language - an important way to make computation efficient. We benchmark our method on a well-known corpus (PubMed) with 8m documents and 768m tokens, using a single multi-core machine in under four days.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.234,38938922 +main.1503,A Supervised Word Alignment Method Based on Cross-Language Span Prediction Using Multilingual BERT,Masaaki Nagata|Katsuki Chousa|Masaaki Nishino,"We present a novel supervised word alignment method based on cross-language span prediction. We first formalize a word alignment problem as a collection of independent predictions from a token in the source sentence to a span in the target sentence. Since this step is equivalent to a SQuAD v2.0 style question answering task, we solve it using the multilingual BERT, which is fine-tuned on manually created gold word alignment data. It is nontrivial to obtain accurate alignment from a set of independently predicted spans. We greatly improved the word alignment accuracy by adding to the question the source token's context and symmetrizing two directional predictions. In experiments using five word alignment datasets from among Chinese, Japanese, German, Romanian, French, and English, we show that our proposed method significantly outperformed previous supervised and unsupervised word alignment methods without any bitexts for pretraining. For example, we achieved 86.7 F1 score for the Chinese-English data, which is 13.3 points higher than the previous state-of-the-art supervised method.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.41,38938923 +main.1504,Dynamic Context Selection for Document-level Neural Machine Translation via Reinforcement Learning,Xiaomian Kang|Yang Zhao|Jiajun Zhang|Chengqing Zong,"Document-level neural machine translation has yielded attractive improvements. However, majority of existing methods roughly use all context sentences in a fixed scope. They neglect the fact that different source sentences need different sizes of context. 
To address this problem, we propose an effective approach to select dynamic context so that the document-level translation model can utilize the more useful selected context sentences to produce better translations. Specifically, we introduce a selection module that is independent of the translation module to score each candidate context sentence. Then, we propose two strategies to explicitly select a variable number of context sentences and feed them into the translation module. We train the two modules end-to-end via reinforcement learning. A novel reward is proposed to encourage the selection and utilization of dynamic context sentences. Experiments demonstrate that our approach can select adaptive context sentences for different source sentences, and significantly improves the performance of document-level translation methods.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.175,38938924 +main.1508,Structure Aware Negative Sampling in Knowledge Graphs,Kian Ahrabian|Aarash Feizi|Yasmin Salehi|William L. Hamilton|Avishek Joey Bose,"Learning low-dimensional representations for entities and relations in knowledge graphs using contrastive estimation represents a scalable and effective method for inferring connectivity patterns. A crucial aspect of contrastive learning approaches is the choice of corruption distribution that generates hard negative samples, which force the embedding model to learn discriminative representations and find critical characteristics of observed data. While earlier methods either employ too simple corruption distributions, i.e. uniform, yielding easy uninformative negatives or sophisticated adversarial distributions with challenging optimization schemes, they do not explicitly incorporate known graph structure resulting in suboptimal negatives. In this paper, we propose Structure Aware Negative Sampling (SANS), an inexpensive negative sampling strategy that utilizes the rich graph structure by selecting negative samples from a node's $k$-hop neighborhood. Empirically, we demonstrate that SANS finds semantically meaningful negatives and is competitive with SOTA approaches while requiring no additional parameters nor difficult adversarial optimization.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.492,38938925 +main.151,Be More with Less: Hypergraph Attention Networks for Inductive Text Classification,Kaize Ding|Jianling Wang|Jundong Li|Dingcheng Li|Huan Liu,"Text classification is a critical research topic with broad applications in natural language processing. Recently, graph neural networks (GNNs) have received increasing attention in the research community and demonstrated their promising results on this canonical task. Despite the success, their performance could be largely jeopardized in practice since they are: (1) unable to capture high-order interaction between words; (2) inefficient to handle large datasets and new documents. To address those issues, in this paper, we propose a principled model -- hypergraph attention networks (HyperGAT), which can obtain more expressive power with less computational consumption for text representation learning.
Extensive experiments on various benchmark datasets demonstrate the efficacy of the proposed approach on the text classification task.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.399,38938658 +main.1518,Learning to Ignore: Long Document Coreference with Bounded Memory Neural Networks,Shubham Toshniwal|Sam Wiseman|Allyson Ettinger|Karen Livescu|Kevin Gimpel,"Long document coreference resolution remains a challenging task due to the large memory and runtime requirements of current models. Recent work doing incremental coreference resolution using just the global representation of entities shows practical benefits but requires keeping all entities in memory, which can be impractical for long documents. We argue that keeping all entities in memory is unnecessary, and we propose a memory-augmented neural network that tracks only a small bounded number of entities at a time, thus guaranteeing a linear runtime in length of document. We show that (a) the model remains competitive with models with high memory and computational requirements on OntoNotes and LitBank, and (b) the model learns an efficient memory management strategy easily outperforming a rule-based strategy",,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.685,38938926 +main.1522,Cross Copy Network for Dialogue Generation,Changzhen Ji|Xin Zhou|Yating Zhang|Xiaozhong Liu|Changlong Sun|Conghui Zhu|Tiejun Zhao,"In the past few years, audiences from different fields witness the achievements of sequence-to-sequence models (e.g., LSTM+attention, Pointer Generator Networks and Transformer) to enhance dialogue content generation. While content fluency and accuracy often serve as the major indicators for model training, dialogue logics, carrying critical information for some particular domains, are often ignored. Take customer service and court debate dialogue as examples, compatible logics can be observed across different dialogue instances, and this information can provide vital evidence for utterance generation. In this paper, we propose a novel network architecture - Cross Copy Networks (CCN) to explore the current dialog context and similar dialogue instances’ logical structure simultaneously. Experiments with two tasks, court debate and customer service content generation, proved that the proposed algorithm is superior to existing state-of-art content generation models.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.149,38938927 +main.1528,Entities as Experts: Sparse Memory Access with Entity Supervision,Thibault Févry|Livio Baldini Soares|Nicholas FitzGerald|Eunsol Choi|Tom Kwiatkowski,"We focus on the problem of capturing declarative knowledge about entities in the learned parameters of a language model. We introduce a new model---Entities as Experts (EaE)---that can access distinct memories of the entities mentioned in a piece of text. Unlike previous efforts to integrate entity knowledge into sequence models, EaE's entity representations are learned directly from text. We show that EaE's learned representations capture sufficient knowledge to answer TriviaQA questions such as ""Which Dr. Who villain has been played by Roger Delgado, Anthony Ainley, Eric Roberts?'', outperforming an encoder-generator Transformer model with 10x the parameters on this task. 
According to the Lama knowledge probes, EaE contains more factual knowledge than a similar sized Bert, as well as previous approaches that integrate external sources of entity knowledge. Because EaE associates parameters with specific entities, it only needs to access a fraction of its parameters at inference time, and we show that the correct identification and representation of entities is essential to EaE's performance.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.400,38938928 +main.1540,What Can We Learn from Collective Human Opinions on Natural Language Inference Data?,Yixin Nie|Xiang Zhou|Mohit Bansal,"Despite the subjective nature of many NLP tasks, most NLU evaluations have focused on using the majority label with presumably high agreement as the ground truth. Less attention has been paid to the distribution of human opinions. We collect ChaosNLI, a dataset with a total of 464,500 annotations to study Collective HumAn OpinionS in oft-used NLI evaluation sets. This dataset is created by collecting 100 annotations per example for 3,113 examples in SNLI and MNLI and 1,532 examples in αNLI. Analysis reveals that: (1) high human disagreement exists in a noticeable amount of examples in these datasets; (2) the state-of-the-art models lack the ability to recover the distribution over human labels; (3) models achieve near-perfect accuracy on the subset of data with a high level of human agreement, whereas they can barely beat a random guess on the data with low levels of human agreement, which compose most of the common errors made by state-of-the-art models on the evaluation sets. This questions the validity of improving model performance on old metrics for the low-agreement part of evaluation datasets. Hence, we argue for a detailed examination of human agreement in future data collection efforts, and evaluating model outputs against the distribution over collective human opinions.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.734,38938929 +main.1547,SelfORE: Self-supervised Relational Feature Learning for Open Relation Extraction,Xuming Hu|Lijie Wen|Yusong Xu|Chenwei Zhang|Philip Yu,"Open relation extraction is the task of extracting open-domain relation facts from natural language sentences. Existing works either utilize heuristics or distant-supervised annotations to train a supervised classifier over pre-defined relations, or adopt unsupervised methods with additional assumptions that have less discriminative power. In this work, we propose a self-supervised framework named SelfORE, which exploits weak, self-supervised signals by leveraging large pretrained language model for adaptive clustering on contextualized relational features, and bootstraps the self-supervised signals by improving contextualized features in relation classification. Experimental results on three datasets show the effectiveness and robustness of SelfORE on open-domain Relation Extraction when comparing with competitive baselines.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.299,38938930 +main.1550,SentiLARE: Linguistic Knowledge Enhanced Language Representation for Sentiment Analysis,Pei Ke|Haozhe Ji|Siyang Liu|Xiaoyan Zhu|Minlie Huang,"Most of the existing pre-trained language representation models neglect to consider the linguistic knowledge of texts, which can promote language understanding in NLP tasks.
To benefit the downstream tasks in sentiment analysis, we propose a novel language representation model called SentiLARE, which introduces word-level linguistic knowledge including part-of-speech tag and sentiment polarity (inferred from SentiWordNet) into pre-trained models. We first propose a context-aware sentiment attention mechanism to acquire the sentiment polarity of each word with its part-of-speech tag by querying SentiWordNet. Then, we devise a new pre-training task called label-aware masked language model to construct knowledge-aware language representation. Experiments show that SentiLARE obtains new state-of-the-art performance on a variety of sentiment analysis tasks.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.567,38938931 +main.1551,Pareto Probing: Trading-Off Accuracy and Complexity,Tiago Pimentel|Naomi Saphra|Adina Williams|Ryan Cotterell,"The question of how to probe contextual word representations in a way that is principled and useful has seen significant recent attention. In our contribution to this discussion, we argue, first, for a probe metric that reflects the trade-off between probe complexity and performance: the Pareto hypervolume. To measure complexity, we present a number of parametric and non-parametric metrics. Our experiments with such metrics show that probe's performance curves often fail to align with widely accepted rankings between language representations (with, e.g., non-contextual representations outperforming contextual ones). These results lead us to argue, second, that common simplistic probe tasks such as POS labeling and dependency arc labeling, are inadequate to evaluate the properties encoded in contextual word representations. We propose full dependency parsing as an example probe task, and demonstrate it with the Pareto hypervolume. In support of our arguments, the results of this illustrative experiment conform closer to accepted rankings among contextual word representations.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.254,38938932 +main.1552,Understanding the Difficulty of Training Transformers,Liyuan Liu|Xiaodong Liu|Jianfeng Gao|Weizhu Chen|Jiawei Han,"Transformers have proved effective in many NLP tasks. However, their training requires non-trivial efforts regarding carefully designing cutting-edge optimizers and learning rate schedulers (e.g., conventional SGD fails to train Transformers effectively). Our objective here is to understand __what complicates Transformer training__ from both empirical and theoretical perspectives. Our analysis reveals that unbalanced gradients are not the root cause of the instability of training. Instead, we identify an amplification effect that influences training substantially—for each layer in a multi-layer Transformer model, heavy dependency on its residual branch makes training unstable, since it amplifies small parameter perturbations (e.g., parameter updates) and results in significant disturbances in the model output. Yet we observe that a light dependency limits the model potential and leads to inferior trained models. Inspired by our analysis, we propose Admin (Adaptive model initialization) to stabilize the early stage’s training and unleash its full potential in the late stage. 
Extensive experiments show that Admin is more stable, converges faster, and leads to better performance",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.463,38938933 +main.1561,Conversational Semantic Parsing for Dialog State Tracking,Jianpeng Cheng|Devang Agrawal|Héctor Martínez Alonso|Shruti Bhargava|Joris Driesen|Federico Flego|Dain Kaplan|Dimitri Kartsaklis|Lin Li|Dhivya Piraviperumal|Jason D Williams|Hong Yu|Diarmuid Ó Séaghdha|Anders Johannsen,"We consider a new perspective on dialog state tracking (DST), the task of estimating a user's goal through the course of a dialog. By formulating DST as a semantic parsing task over hierarchical representations, we can incorporate semantic compositionality, cross-domain knowledge sharing and co-reference. We present TreeDST, a dataset of 27k conversations annotated with tree-structured dialog states and system acts. We describe an encoder-decoder framework for DST with hierarchical representations, which leads to ~20% improvement over state-of-the-art DST approaches that operate on a flat meaning space of slot-value pairs.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.651,38938934 +main.1566,Denoising Relation Extraction from Document-level Distant Supervision,Chaojun Xiao|Yuan Yao|Ruobing Xie|Xu Han|Zhiyuan Liu|Maosong Sun|Fen Lin|Leyu Lin,"Distant supervision (DS) has been widely adopted to generate auto-labeled data for sentence-level relation extraction (RE) and achieved great results. However, the existing success of DS cannot be directly transferred to more challenging document-level relation extraction (DocRE), as the inevitable noise caused by DS may be even multiplied in documents and significantly harm the performance of RE. To alleviate this issue, we propose a novel pre-trained model for DocRE, which de-emphasize noisy DS data via multiple pre-training tasks. The experimental results on the large-scale DocRE benchmark show that our model can capture useful information from noisy data and achieve promising results.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.300,38938935 +main.1569,Exploring Contextualized Neural Language Models for Temporal Dependency Parsing,Hayley Ross|Jonathon Cai|Bonan Min,"Extracting temporal relations between events and time expressions has many applications such as constructing event timelines and time-related question answering. It is a challenging problem which requires syntactic and semantic information at sentence or discourse levels, which may be captured by deep contextualized language models (LMs) such as BERT (Devlin et al., 2019). In this paper, we develop several variants of BERT-based temporal dependency parser, and show that BERT significantly improves temporal dependency parsing (Zhang and Xue, 2018a). We also present a detailed analysis on why deep contextualized neural LMs help and where they may fall short. Source code and resources are made available at https://github.com/bnmin/tdp_ranking.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.689,38938936 +main.1572,Dynamic Data Selection and Weighting for Iterative Back-Translation,Zi-Yi Dou|Antonios Anastasopoulos|Graham Neubig,"Back-translation has proven to be an effective method to utilize monolingual data in neural machine translation (NMT), and iteratively conducting back-translation can further improve the model performance. 
Selecting which monolingual data to back-translate is crucial, as we require that the resulting synthetic data are of high quality and reflect the target domain. To achieve these two goals, data selection and weighting strategies have been proposed, with a common practice being to select samples close to the target domain but also dissimilar to the average general-domain text. In this paper, we provide insights into this commonly used approach and generalize it to a dynamic curriculum learning strategy, which is applied to iterative back-translation models. In addition, we propose weighting strategies based on both the current quality of the sentence and its improvement over the previous iteration. We evaluate our models on domain adaptation, low-resource, and high-resource MT settings and on two language pairs. Experimental results demonstrate that our methods achieve improvements of up to~1.8 BLEU points over competitive baselines.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.475,38938937 +main.1574,BERT-of-Theseus: Compressing BERT by Progressive Module Replacing,Canwen Xu|Wangchunshu Zhou|Tao Ge|Furu Wei|Ming Zhou,"In this paper, we propose a novel model compression approach to effectively compress BERT by progressive module replacing. Our approach first divides the original BERT into several modules and builds their compact substitutes. Then, we randomly replace the original modules with their substitutes to train the compact modules to mimic the behavior of the original modules. We progressively increase the probability of replacement through the training. In this way, our approach brings a deeper level of interaction between the original and compact models. Compared to the previous knowledge distillation approaches for BERT compression, our approach does not introduce any additional loss function. Our approach outperforms existing knowledge distillation approaches on GLUE benchmark, showing a new perspective of model compression.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.633,38938938 +main.1575,SetConv: A New Approach for Learning from Imbalanced Data,Yang Gao|Yi-Fan Li|Yu Lin|Charu Aggarwal|Latifur Khan,"For many real-world classification problems, e.g., sentiment classification, most existing machine learning methods are biased towards the majority class when the Imbalance Ratio (IR) is high. To address this problem, we propose a set convolution (SetConv) operation and an episodic training strategy to extract a single representative for each class, so that classifiers can later be trained on a balanced class distribution. We prove that our proposed algorithm is permutation-invariant despite the order of inputs, and experiments on multiple large-scale benchmark text datasets show the superiority of our proposed framework when compared to other SOTA methods.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.98,38938939 +main.1578,Keep CALM and Explore: Language Models for Action Generation in Text-based Games,Shunyu Yao|Rohan Rao|Matthew Hausknecht|Karthik Narasimhan,"Text-based games present a unique challenge for autonomous agents to operate in natural language and handle enormous action spaces. In this paper, we propose the Contextual Action Language Model (CALM) to generate a compact set of action candidates at each game state. 
Our key insight is to train language models on human gameplay, where people demonstrate linguistic priors and a general game sense for promising actions conditioned on game history. We combine CALM with a reinforcement learning agent which re-ranks the generated action candidates to maximize in-game rewards. We evaluate our approach using the Jericho benchmark, on games unseen by CALM during training. Our method obtains a 69% relative improvement in average game score over the previous state-of-the-art model. Surprisingly, on half of these games, CALM is competitive with or better than other models that have access to ground truth admissible actions. Code and data are available at https://github.com/princeton-nlp/calm-textgame.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.704,38938940 +main.158,Double Graph Based Reasoning for Document-level Relation Extraction,Shuang Zeng|Runxin Xu|Baobao Chang|Lei Li,"Document-level relation extraction aims to extract relations among entities within a document. Different from sentence-level relation extraction, it requires reasoning over multiple sentences across paragraphs. In this paper, we propose Graph Aggregation-and-Inference Network (GAIN), a method to recognize such relations for long paragraphs. GAIN constructs two graphs, a heterogeneous mention-level graph (MG) and an entity-level graph (EG). The former captures complex interaction among different mentions and the latter aggregates mentions underlying for the same entities. Based on the graphs we propose a novel path reasoning mechanism to infer relations between entities. Experiments on the public dataset, DocRED, show GAIN achieves a significant performance improvement (2.85 on F1) over the previous state-of-the-art. Our code is available at https://github.com/PKUnlp-icler/GAIN.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.127,38938659 +main.1580,AmbigQA: Answering Ambiguous Open-domain Questions,Sewon Min|Julian Michael|Hannaneh Hajishirzi|Luke Zettlemoyer,"Ambiguity is inherent to open-domain question answering; especially when exploring new topics, it can be difficult to ask questions that have a single, unambiguous answer. In this paper, we introduce AmbigQA, a new open-domain question answering task which involves finding every plausible answer, and then rewriting the question for each one to resolve the ambiguity. To study this task, we construct AmbigNQ, a dataset covering 14,042 questions from NQ-open, an existing open-domain QA benchmark. We find that over half of the questions in NQ-open are ambiguous, with diverse sources of ambiguity such as event and entity references. We also present strong baseline models for AmbigQA which we show benefit from weakly supervised learning that incorporates NQ-open, strongly suggesting our new task and data will support significant future research effort. Our data and baselines are available at https://nlp.cs.washington.edu/ambigqa.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.466,38938941 +main.1581,Reformulating Unsupervised Style Transfer as Paraphrase Generation,Kalpesh Krishna|John Wieting|Mohit Iyyer,"Modern NLP defines the task of style transfer as modifying the style of a given sentence without appreciably changing its semantics, which implies that the outputs of style transfer systems should be paraphrases of their inputs. 
However, many existing systems purportedly designed for style transfer inherently warp the input's meaning through attribute transfer, which changes semantic properties such as sentiment. In this paper, we reformulate unsupervised style transfer as a paraphrase generation problem, and present a simple methodology based on fine-tuning pretrained language models on automatically generated paraphrase data. Despite its simplicity, our method significantly outperforms state-of-the-art style transfer systems on both human and automatic evaluations. We also survey 23 style transfer papers and discover that existing automatic metrics can be easily gamed and propose fixed variants. Finally, we pivot to a more real-world style transfer setting by collecting a large dataset of 15M sentences in 11 diverse styles, which we use for an in-depth analysis of our system.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.55,38938942 +main.1594,Bio-Megatron: Larger Biomedical Domain Language Model,Hoo-Chang Shin|Yang Zhang|Evelina Bakhturina|Raul Puri|Mostofa Patwary|Mohammad Shoeybi|Raghav Mani,"There has been an influx of biomedical domain-specific language models, showing language models pre-trained on biomedical text perform better on biomedical domain benchmarks than those trained on general domain text corpora such as Wikipedia and Books. Yet, most works do not study the factors affecting each domain language application deeply. Additionally, the study of model size on domain-specific models has been mostly missing. We empirically study and evaluate several factors that can affect performance on domain language applications, such as the sub-word vocabulary set, model size, pre-training corpus, and domain transfer. We show consistent improvements on benchmarks with our larger BioMegatron model trained on a larger domain corpus, contributing to our understanding of domain language model applications. We demonstrate noticeable improvements over the previous state-of-the-art (SOTA) on standard biomedical NLP benchmarks of question answering, named entity recognition, and relation extraction. Code and checkpoints to reproduce our experiments are available at [github.com/NVIDIA/NeMo].",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.379,38938943 +main.16,Meta Fine-Tuning Neural Language Models for Multi-Domain Text Mining,Chengyu Wang|Minghui Qiu|jun huang|XIAOFENG HE,"Pre-trained neural language models bring significant improvement for various NLP tasks, by fine-tuning the models on task-specific training sets. During fine-tuning, the parameters are initialized from pre-trained models directly, which ignores how the learning process of similar NLP tasks in different domains is correlated and mutually reinforced. In this paper, we propose an effective learning procedure named Meta Fine-Tuning (MFT), serving as a meta-learner to solve a group of similar NLP tasks for neural language models. Instead of simply multi-task training over all the datasets, MFT only learns from typical instances of various domains to acquire highly transferable knowledge. It further encourages the language model to encode domain-invariant representations by optimizing a series of novel domain corruption loss functions. After MFT, the model can be fine-tuned for each domain with better parameter initializations and higher generalization ability. We implement MFT upon BERT to solve several multi-domain text mining tasks. 
Experimental results confirm the effectiveness of MFT and its usefulness for few-shot learning.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.250,38938637 +main.1603,CancerEmo: A Dataset for Fine-Grained Emotion Detection,Tiberiu Sosea|Cornelia Caragea,"Emotions are an important element of human nature, often affecting the overall wellbeing of a person. Therefore, it is no surprise that the health domain is a valuable area of interest for emotion detection, as it can provide medical staff or caregivers with essential information about patients. However, progress on this task has been hampered by the absence of large labeled datasets. To this end, we introduce CancerEmo, an emotion dataset created from an online health community and annotated with eight fine-grained emotions. We perform a comprehensive analysis of these emotions and develop deep learning models on the newly created dataset. Our best BERT model achieves an average F1 of 71%, which we improve further using domain-specific pre-training.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.715,38938944 +main.1606,GRADE: Automatic Graph-Enhanced Coherence Metric for Evaluating Open-Domain Dialogue Systems,Lishan Huang|Zheng Ye|Jinghui Qin|Liang Lin|Xiaodan Liang,"Automatically evaluating dialogue coherence is a challenging but high-demand ability for developing high-quality open-domain dialogue systems. However, current evaluation metrics consider only surface features or utterance-level semantics, without explicitly considering the fine-grained topic transition dynamics of dialogue flows. Here, we first consider that the graph structure constituted with topics in a dialogue can accurately depict the underlying communication logic, which is a more natural way to produce persuasive metrics. Capitalized on the topic-level dialogue graph, we propose a new evaluation metric GRADE, which stands for Graph-enhanced Representations for Automatic Dialogue Evaluation. Specifically, GRADE incorporates both coarse-grained utterance-level contextualized representations and fine-grained topic-level graph representations to evaluate dialogue coherence. The graph representations are obtained by reasoning over topic-level dialogue graphs enhanced with the evidence from a commonsense graph, including k-hop neighboring representations and hop-attention weights. Experimental results show that our GRADE significantly outperforms other state-of-the-art metrics on measuring diverse dialogue models in terms of the Pearson and Spearman correlations with human judgments. Besides, we release a new large-scale human evaluation benchmark to facilitate future research on automatic metrics.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.742,38938945 +main.1611,Weakly-Supervised Text Classification Using Label Names Only,Yu Meng|Yunyi Zhang|Jiaxin Huang|Chenyan Xiong|Heng Ji|Chao Zhang|Jiawei Han,"Current text classification methods typically require a good number of human-labeled documents as training data, which can be costly and difficult to obtain in real applications. Humans can perform classification without seeing any labeled examples but only based on a small set of words describing the categories to be classified. In this paper, we explore the potential of only using the label name of each class to train classification models on unlabeled data, without using any labeled documents. 
We use pre-trained neural language models both as general linguistic knowledge sources for category understanding and as representation learning models for document classification. Our method (1) associates semantically related words with the label names, (2) finds category-indicative words and trains the model to predict their implied categories, and (3) generalizes the model via self-training. We show that our model achieves around 90% accuracy on four benchmark datasets including topic and sentiment classification without using any labeled documents but learning from unlabeled data supervised by at most 3 words (1 in most cases) per class as the label name.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.724,38938946 +main.1612,Interpretation of NLP Models through Input Marginalization,Siwon Kim|Jihun Yi|Eunji Kim|Sungroh Yoon,"To demystify the ``black box"" property of deep neural networks for natural language processing (NLP), several methods have been proposed to interpret their predictions by measuring the change in prediction probability after erasing each token of an input. Since existing methods replace each token with a predefined value (i.e., zero), the resulting sentence lies out of the training data distribution, yielding misleading interpretations. In this study, we raise the out-of-distribution problem induced by the existing interpretation methods and present a remedy; we propose to marginalize each token out. We interpret various NLP models trained for sentiment analysis and natural language inference using the proposed method.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.255,38938947 +main.1613,Learning Music Helps You Read: Using Transfer to Study Linguistic Structure in Language Models,Isabel Papadimitriou|Dan Jurafsky,"We propose transfer learning as a method for analyzing the encoding of grammatical structure in neural language models. We train LSTMs on non-linguistic data and evaluate their performance on natural language to assess which kinds of data induce generalizable structural features that LSTMs can use for natural language. We find that training on non-linguistic data with latent structure (MIDI music or Java code) improves test performance on natural language, despite no overlap in surface form or vocabulary. To pinpoint the kinds of abstract structure that models may be encoding to lead to this improvement, we run similar experiments with two artificial parentheses languages: one which has a hierarchical recursive structure, and a control which has paired tokens but no recursion. Surprisingly, training a model on either of these artificial languages leads the same substantial gains when testing on natural language. Further experiments on transfer between natural languages controlling for vocabulary overlap show that zero-shot performance on a test language is highly correlated with typological syntactic similarity to the training language, suggesting that representations induced by pre-training correspond to the cross-linguistic syntactic properties. 
Our results provide insights into the ways that neural models represent abstract syntactic structure, and also about the kind of structural inductive biases which allow for natural language acquisition.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.554,38938948 +main.1614,Generating Label Cohesive and Well-Formed Adversarial Claims,Pepa Atanasova|Dustin Wright|Isabelle Augenstein,"Adversarial attacks reveal important vulnerabilities and flaws of trained models. One potent type of attack are universal adversarial triggers, which are individual n-grams that, when appended to instances of a class under attack, can trick a model into predicting a target class. However, for inference tasks such as fact checking, these triggers often inadvertently invert the meaning of instances they are inserted in. In addition, such attacks produce semantically nonsensical inputs, as they simply concatenate triggers to existing samples. Here, we investigate how to generate adversarial attacks against fact checking systems that preserve the ground truth meaning and are semantically valid. We extend the HotFlip attack algorithm used for universal trigger generation by jointly minimizing the target class loss of a fact checking model and the entailment class loss of an auxiliary natural language inference model. We then train a conditional language model to generate semantically valid statements, which include the found universal triggers. We find that the generated attacks maintain the directionality and semantic validity of the claim better than previous work.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.256,38938949 +main.1615,AIN: Fast and Accurate Sequence Labeling with Approximate Inference Network,Xinyu Wang|Yong Jiang|Nguyen Bach|Tao Wang|Zhongqiang Huang|Fei Huang|Kewei Tu,"The linear-chain Conditional Random Field (CRF) model is one of the most widely-used neural sequence labeling approaches. Exact probabilistic inference algorithms such as the forward-backward and Viterbi algorithms are typically applied in training and prediction stages of the CRF model. However, these algorithms require sequential computation that makes parallelization impossible. In this paper, we propose to employ a parallelizable approximate variational inference algorithm for the CRF model. Based on this algorithm, we design an approximate inference network that can be connected with the encoder of the neural CRF model to form an end-to-end network, which is amenable to parallelization for faster training and prediction. The empirical results show that our proposed approaches achieve a 12.7-fold improvement in decoding speed with long sentences and a competitive accuracy compared with the traditional CRF approach.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.485,38938950 +main.1618,ETC: Encoding Long and Structured Inputs in Transformers,Joshua Ainslie|Santiago Ontanon|Chris Alberti|Vaclav Cvicek|Zachary Fisher|Philip Pham|Anirudh Ravula|Sumit Sanghai|Qifan Wang|Li Yang,"Transformer models have advanced the state of the art in many Natural Language Processing (NLP) tasks. In this paper, we present a new Transformer architecture, ""Extended Transformer Construction"" (ETC), that addresses two key challenges of standard Transformer architectures, namely scaling input length and encoding structured inputs. 
To scale attention to longer inputs, we introduce a novel global-local attention mechanism between global tokens and regular input tokens. We also show that combining global-local attention with relative position encodings and a ""Contrastive Predictive Coding"" (CPC) pre-training objective allows ETC to encode structured inputs. We achieve state-of-the-art results on four natural language datasets requiring long and/or structured inputs.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.19,38938951 +main.1621,Revealing the Myth of Higher-Order Inference in Coreference Resolution,Liyan Xu|Jinho D. Choi,"This paper analyzes the impact of higher-order inference (HOI) on the task of coreference resolution. HOI has been adapted by almost all recent coreference resolution models without taking much investigation on its true effectiveness over representation learning. To make a comprehensive analysis, we implement an end-to-end coreference system as well as four HOI approaches, attended antecedent, entity equalization, span clustering, and cluster merging, where the latter two are our original methods. We find that given a high-performing encoder such as SpanBERT, the impact of HOI is negative to marginal, providing a new perspective of HOI to this task. Our best model using cluster merging shows the Avg-F1 of 80.2 on the CoNLL 2012 shared task dataset in English.",,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.686,38938952 +main.1622,Discern: Discourse-Aware Entailment Reasoning Network for Conversational Machine Reading,Yifan Gao|Chien-Sheng Wu|Jingjing Li|Shafiq Joty|Steven C.H. Hoi|Caiming Xiong|Irwin King|Michael Lyu,"Document interpretation and dialog understanding are the two major challenges for conversational machine reading. In this work, we propose ""Discern"", a discourse-aware entailment reasoning network to strengthen the connection and enhance the understanding of both document and dialog. Specifically, we split the document into clause-like elementary discourse units (EDU) using a pre-trained discourse segmentation model, and we train our model in a weakly-supervised manner to predict whether each EDU is entailed by the user feedback in a conversation. Based on the learned EDU and entailment representations, we either reply to the user our final decision ""yes/no/irrelevant"" of the initial question, or generate a follow-up question to inquiry more information. Our experiments on the ShARC benchmark (blind, held-out test set) show that Discern achieves state-of-the-art results of 78.3% macro-averaged accuracy on decision making and 64.0 BLEU1 on follow-up question generation. Code and models are released at https://github.com/Yifan-Gao/Discern.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.191,38938953 +main.1625,Parsing Gapping Constructions Based on Grammatical and Semantic Roles,Yoshihide Kato|Shigeki Matsubara,A gapping construction consists of a coordinated structure where redundant elements are elided from all but one conjuncts. This paper proposes a method of parsing sentences with gapping to recover elided elements. The proposed method is based on constituent trees annotated with grammatical and semantic roles that are useful for identifying elided elements. 
Our method outperforms the previous method in terms of F-measure and recall.,,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.218,38938954 +main.1626,Hate-Speech and Offensive Language Detection in Roman Urdu,Hammad Rizwan|Muhammad Haroon Shakeel|Asim Karim,"The task of automatic hate-speech and offensive language detection in social media content is of utmost importance due to its implications in unprejudiced society concerning race, gender, or religion. Existing research in this area, however, is mainly focused on the English language, limiting the applicability to particular demographics. Despite its prevalence, Roman Urdu (RU) lacks language resources, annotated datasets, and language models for this task. In this study, we: (1) Present a lexicon of hateful words in RU, (2) Develop an annotated dataset called RUHSOLD consisting of 10,012 tweets in RU with both coarse-grained and fine-grained labels of hate-speech and offensive language, (3) Explore the feasibility of transfer learning of five existing embedding models to RU, (4) Propose a novel deep learning architecture called CNN-gram for hate-speech and offensive language detection and compare its performance with seven current baseline approaches on RUHSOLD dataset, and (5) Train domain-specific embeddings on more than 4.7 million tweets and make them publicly available. We conclude that transfer learning is more beneficial as compared to training embedding from scratch and that the proposed model exhibits greater robustness as compared to the baselines.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.197,38938955 +main.1631,An Empirical Investigation towards Efficient Multi-Domain Language Model Pre-training,Kristjan Arumae|Qing Sun|Parminder Bhatia,"Pre-training large language models has become a standard in the natural language processing community. Such models are pre-trained on generic data (e.g. BookCorpus and English Wikipedia) and often fine-tuned on tasks in the same domain. However, in order to achieve state-of-the-art performance on out of domain tasks such as clinical named entity recognition and relation extraction, additional in domain pre-training is required. In practice, staged multi-domain pre-training presents performance deterioration in the form of catastrophic forgetting (CF) when evaluated on a generic benchmark such as GLUE. In this paper we conduct an empirical investigation into known methods to mitigate CF. We find that elastic weight consolidation provides best overall scores yielding only a 0.33% drop in performance across seven generic tasks while remaining competitive in bio-medical tasks. Furthermore, we explore gradient and latent clustering based data selection techniques to improve coverage when using elastic weight consolidation and experience replay methods.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.394,38938956 +main.1634,Planning and Generating Natural and Diverse Disfluent Texts as Augmentation for Disfluency Detection,Jingfeng Yang|Diyi Yang|Zhaoran Ma,"Existing approaches to disfluency detection heavily depend on human-annotated data. Numbers of data augmentation methods have been proposed to alleviate the dependence on labeled data. However, current augmentation approaches such as random insertion or repetition fail to resemble training corpus well and usually resulted in unnatural and limited types of disfluencies. 
In this work, we propose a simple Planner-Generator based disfluency generation model to generate natural and diverse disfluent texts as augmented data, where the Planner decides on where to insert disfluent segments and the Generator follows the prediction to generate corresponding disfluent segments. We further utilize this augmented data for pretraining and leverage it for the task of disfluency detection. Experiments demonstrated that our two-stage disfluency generation model outperforms existing baselines; those disfluent sentences generated significantly aided the task of disfluency detection and led to state-of-the-art performance on Switchboard corpus.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.113,38938957 +main.1647,Controllable Story Generation with External Knowledge Using Large-Scale Language Models,Peng Xu|Mostofa Patwary|Mohammad Shoeybi|Raul Puri|Pascale Fung|Anima Anandkumar|Bryan Catanzaro,"Existing pre-trained large language models have shown unparalleled generative capabilities. However, they are not controllable. In this paper, we propose MEGATRON-CNTRL, a novel framework that uses large-scale language models and adds control to text generation by incorporating an external knowledge base. Our framework consists of a keyword predictor, a knowledge retriever, a contextual knowledge ranker, and a conditional text generator. As we do not have access to ground-truth supervision for the knowledge ranker, we make use of weak supervision from sentence embedding. The empirical results show that our model generates more fluent, consistent, and coherent stories with less repetition and higher diversity compared to prior work on the ROC story dataset. We showcase the controllability of our model by replacing the keywords used to generate stories and re-running the generation process. Human evaluation results show that 77.5% of these stories are successfully controlled by the new keywords. Furthermore, by scaling our model from 124 million to 8.3 billion parameters we demonstrate that larger models improve both the quality of generation (from 74.5% to 93.0% for consistency) and controllability (from 77.5% to 91.5%).",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.226,38938958 +main.1648,Scalable Multi-Hop Relational Reasoning for Knowledge-Aware Question Answering,Yanlin Feng|Xinyue Chen|Bill Yuchen Lin|Peifeng Wang|Jun Yan|Xiang Ren,"Existing work on augmenting question answering (QA) models with external knowledge (e.g., knowledge graphs) either struggle to model multi-hop relations efficiently, or lack transparency into the model's prediction rationale. In this paper, we propose a novel knowledge-aware approach that equips pre-trained language models (PTLMs) with a multi-hop relational reasoning module, named multi-hop graph relation network (MHGRN). It performs multi-hop, multi-relational reasoning over subgraphs extracted from external knowledge graphs. The proposed reasoning module unifies path-based reasoning methods and graph neural networks to achieve better interpretability and scalability.
We also empirically show its effectiveness and scalability on CommonsenseQA and OpenbookQA datasets, and interpret its behaviors with case studies, with the code for experiments released.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.99,38938959 +main.1649,Enabling Cross-Lingual AMR Parsing with Transfer Learning Techniques,Rexhina Blloshmi|Rocco Tripodi|Roberto Navigli,"Abstract Meaning Representation (AMR) is a popular formalism of natural language that represents the meaning of a sentence as a semantic graph. It is agnostic about how to derive meanings from strings and for this reason it lends itself well to the encoding of semantics across languages. However, cross-lingual AMR parsing is a hard task, because training data are scarce in languages other than English and the existing English AMR parsers are not directly suited to being used in a cross-lingual setting. In this work we tackle these two problems so as to enable cross-lingual AMR parsing: we explore different transfer learning techniques for producing automatic AMR annotations across languages and develop a cross-lingual AMR parser, XL-AMR. This can be trained on the produced data and does not rely on AMR aligners or source-copy mechanisms as is commonly the case in English AMR parsing. The results of XL-AMR significantly surpass those previously reported in Chinese, German, Italian and Spanish. Finally we provide a qualitative analysis which sheds light on the suitability of AMR across languages. We release XL-AMR at github.com/SapienzaNLP/xl-amr.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.195,38938960 +main.165,Towards Persona-Based Empathetic Conversational Models,Peixiang Zhong|Chen Zhang|Hao Wang|Yong Liu|Chunyan Miao,"Empathetic conversational models have been shown to improve user satisfaction and task outcomes in numerous domains. In Psychology, persona has been shown to be highly correlated to personality, which in turn influences empathy. In addition, our empirical analysis also suggests that persona plays an important role in empathetic conversations. To this end, we propose a new task towards persona-based empathetic conversations and present the first empirical study on the impact of persona on empathetic responding. Specifically, we first present a novel large-scale multi-domain dataset for persona-based empathetic conversations. We then propose CoBERT, an efficient BERT-based response selection model that obtains the state-of-the-art performance on our dataset. Finally, we conduct extensive experiments to investigate the impact of persona on empathetic responding. Notably, our results show that persona improves empathetic responding more when CoBERT is trained on empathetic conversations than non-empathetic ones, establishing an empirical link between persona and empathy in human conversations.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.531,38938660 +main.1654,Probing Task-Oriented Dialogue Representation from Language Models,Chien-Sheng Wu|Caiming Xiong,"This paper investigates pre-trained language models to find out which model intrinsically carries the most informative representation for task-oriented dialogue tasks. We approach the problem from two aspects: supervised classifier probe and unsupervised mutual information probe. 
We fine-tune a feed-forward layer as the classifier probe on top of a fixed pre-trained language model with annotated labels in a supervised way. Meanwhile, we propose an unsupervised mutual information probe to evaluate the mutual dependence between a real clustering and a representation clustering. The goals of this empirical paper are to 1) investigate probing techniques, especially from the unsupervised mutual information aspect, 2) provide guidelines of pre-trained language model selection for the dialogue research community, 3) find insights of pre-training factors for dialogue application that may be the key to success.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.409,38938961 +main.1658,Generating Similes E̶F̶F̶O̶R̶T̶L̶E̶S̶S̶L̶Y̶ 𝘭𝘪𝘬𝘦 𝘢 𝘗𝘳𝘰: A Style Transfer Approach for Simile Generation,Tuhin Chakrabarty|Smaranda Muresan|Nanyun Peng,"Literary tropes, from poetry to stories, are at the crux of human imagination and communication. Figurative language such as a simile go beyond plain expressions to give readers new insights and inspirations. In this paper, we tackle the problem of simile generation. Generating a simile requires proper understanding for effective mapping of properties between two concepts. To this end, we first propose a method to automatically construct a parallel corpus by transforming a large number of similes collected from Reddit to their literal counterpart using structured common sense knowledge. We then propose to fine-tune a pre-trained sequence to sequence model, BART (Lewis et al 2019), on the literal-simile pairs to gain generalizability, so that we can generate novel similes given a literal sentence. Experiments show that our approach generates 88% novel similes that do not share properties with the training data. Human evaluation on an independent set of literal statements shows that our model generates similes better than two literary experts 37% of the time when compared pairwise. We also show how replacing literal sentences with similes from our best model in machine-generated stories improves evocativeness and leads to better acceptance by human judges.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.524,38938962 +main.1669,Systematic Comparison of Neural Architectures and Training Approaches for Open Information Extraction,Patrick Hohenecker|Frank Mtumbuka|Vid Kocijan|Thomas Lukasiewicz,"The goal of open information extraction (OIE) is to extract facts from natural language text, and to represent them as structured triples of the form (subject, predicate, object). For example, given the sentence ""Beethoven composed the Ode to Joy."", we are expected to extract the triple (Beethoven, composed, the Ode to Joy). In this work, we systematically compare different neural network architectures and training approaches, and improve the performance of the currently best models on the OIE16 benchmark (Stanovsky and Dagan, 2016) by 0.421 F1 score and 0.420 AUC-PR, respectively, in our experiments (i.e., by more than 200% in both cases). Furthermore, we show that appropriate problem and loss formulations often affect the performance more than the network architecture.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.690,38938963 +main.1670,Does My Multimodal Model Learn Cross-modal Interactions? It's Harder to Tell than You Might Think!,Jack Hessel|Lillian Lee,"Modeling expressive cross-modal interactions seems crucial in multimodal tasks, such as visual question answering.
However, sometimes high-performing black-box algorithms turn out to be mostly exploiting unimodal signals in the data. We propose a new diagnostic tool, empirical multimodally-additive function projection (EMAP), for isolating whether or not cross-modal interactions improve performance for a given model on a given task. This function projection modifies model predictions so that cross-modal interactions are eliminated, isolating the additive, unimodal structure. For seven image+text classification tasks (on each of which we set new state-of-the-art benchmarks), we find that, in many cases, removing cross-modal interactions results in little to no performance degradation. Surprisingly, this holds even when expressive models, with capacity to consider interactions, otherwise outperform less expressive models; thus, performance improvements, even when present, often cannot be attributed to consideration of cross-modal feature interactions. We hence recommend that researchers in multimodal machine learning report the performance not only of unimodal baselines, but also the EMAP of their best-performing model.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.62,38938964 +main.1675,Aspect-Based Sentiment Analysis by Aspect-Sentiment Joint Embedding,Jiaxin Huang|Yu Meng|Fang Guo|Heng Ji|Jiawei Han,"Aspect-based sentiment analysis of review texts is of great value for understanding user feedback in a fine-grained manner. It has in general two sub-tasks: (i) extracting aspects from each review, and (ii) classifying aspect-based reviews by sentiment polarity. In this paper, we propose a weakly-supervised approach for aspect-based sentiment analysis, which uses only a few keywords describing each aspect/sentiment without using any labeled examples. Existing methods are either designed only for one of the sub-tasks, or are based on topic models that may contain overlapping concepts. We propose to first learn joint topic embeddings in the word embedding space by imposing regularizations to encourage topic distinctiveness, and then use neural models to generalize the word-level discriminative information by pre-training the classifiers with embedding-based predictions and self-training them on unlabeled data. Our comprehensive performance analysis shows that our method generates quality joint topics and outperforms the baselines significantly (7.4% and 5.1% F1-score gain on average for aspect and sentiment classification respectively) on benchmark datasets.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.568,38938965 +main.168,Facilitating the Communication of Politeness through Fine-Grained Paraphrasing,Liye Fu|Susan Fussell|Cristian Danescu-Niculescu-Mizil,"Aided by technology, people are increasingly able to communicate across geographical, cultural, and language barriers. This ability also results in new challenges, as interlocutors need to adapt their communication approaches to increasingly diverse circumstances. In this work, we take the first steps towards automatically assisting people in adjusting their language to a specific communication circumstance. As a case study, we focus on facilitating the accurate transmission of pragmatic intentions and introduce a methodology for suggesting paraphrases that achieve the intended level of politeness under a given communication circumstance. 
We demonstrate the feasibility of this approach by evaluating our method in two realistic communication scenarios and show that it can reduce the potential for misalignment between the speaker's intentions and the listener's perceptions in both cases.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.416,38938661 +main.1680,Multi-task Learning for Multilingual Neural Machine Translation,Yiren Wang|ChengXiang Zhai|Hany Hassan,"While monolingual data has been shown to be useful in improving bilingual neural machine translation (NMT), effectively and efficiently leveraging monolingual data for Multilingual NMT (MNMT) systems is a less explored area. In this work, we propose a multi-task learning (MTL) framework that jointly trains the model with the translation task on bitext data and two denoising tasks on the monolingual data. We conduct extensive empirical studies on MNMT systems with $10$ language pairs from WMT datasets. We show that the proposed approach can effectively improve the translation quality for both high-resource and low-resource languages with large margin, achieving significantly better results than the individual bilingual models. We also demonstrate the efficacy of the proposed approach in the zero-shot setup for language pairs without bitext training data. Furthermore, we show the effectiveness of MTL over pre-training approaches for both NMT and cross-lingual transfer learning NLU tasks; the proposed approach outperforms massive scale models trained on single task.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.75,38938966 +main.1682,Multi-label Few/Zero-shot Learning with Knowledge Aggregated from Multiple Label Graphs,Jueqing Lu|Lan Du|Ming Liu|Joanna Dipnall,"Few/Zero-shot learning is a big challenge of many classifications tasks, where a classifier is required to recognise instances of classes that have very few or even no training samples. It becomes more difficult in multi-label classification, where each instance is labelled with more than one class. In this paper, we present a simple multi-graph aggregation model that fuses knowledge from multiple label graphs encoding different semantic label relationships in order to study how the aggregated knowledge can benefit multi-label zero/few-shot document classification. The model utilises three kinds of semantic information, i.e., the pre-trained word embeddings, label description, and pre-defined label relations. Experimental results derived on two large clinical datasets (i.e., MIMIC-II and MIMIC-III ) and the EU legislation dataset show that methods equipped with the multi-graph knowledge aggregation achieve significant performance improvement across almost all the measures on few/zero-shot labels.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.235,38938967 +main.1687,PlotMachines: Outline-Conditioned Generation with Dynamic Plot State Tracking,Hannah Rashkin|Asli Celikyilmaz|Yejin Choi|Jianfeng Gao,"We propose the task of outline-conditioned story generation: given an outline as a set of phrases that describe key characters and events to appear in a story, the task is to generate a coherent narrative that is consistent with the provided outline. This task is challenging as the input only provides a rough sketch of the plot, and thus, models need to generate a story by interweaving the key points provided in the outline. 
This requires the model to keep track of the dynamic states of the latent plot, conditioning on the input outline while generating the full story. We present PlotMachines, a neural narrative model that learns to transform an outline into a coherent story by tracking the dynamic plot states. In addition, we enrich PlotMachines with high-level discourse structure so that the model can learn different writing styles corresponding to different parts of the narrative. Comprehensive experiments over three fiction and non-fiction datasets demonstrate that large-scale language models, such as GPT-2 and Grover, despite their impressive generation performance, are not sufficient in generating coherent narratives for the given outline, and dynamic plot state tracking is important for composing narratives with tighter, more consistent plots.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.349,38938968 +main.1694,Accurate Word Alignment Induction from Neural Machine Translation,Yun Chen|Yang Liu|Guanhua Chen|Xin Jiang|Qun Liu,"Despite its original goal to jointly learn to align and translate, prior researches suggest that Transformer captures poor word alignments through its attention mechanism. In this paper, we show that attention weights do capture accurate word alignments and propose two novel word alignment induction methods Shift-Att and Shift-AET. The main idea is to induce alignments at the step when the to-be-aligned target token is the decoder input rather than the decoder output as in previous work. Shift-Att is an interpretation method that induces alignments from the attention weights of Transformer and does not require parameter update or architecture change. Shift-AET extracts alignments from an additional alignment module which is tightly integrated into Transformer and trained in isolation with supervision from symmetrized Shift-Att alignments. Experiments on three publicly available datasets demonstrate that both methods perform better than their corresponding neural baselines and Shift-AET significantly outperforms GIZA++ by 1.4-4.8 AER points.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.42,38938969 +main.1700,Dialogue Response Ranking Training with Large-Scale Human Feedback Data,Xiang Gao|Yizhe Zhang|Michel Galley|Chris Brockett|Bill Dolan,"Existing open-domain dialog models are generally trained to minimize the perplexity of target human responses. However, some human replies are more engaging than others, spawning more followup interactions. Current conversational models are increasingly capable of producing turns that are context-relevant, but in order to produce compelling agents, these models need to be able to predict and optimize for turns that are genuinely engaging. We leverage social media feedback data (number of replies and upvotes) to build a large-scale training dataset for feedback prediction. To alleviate possible distortion between the feedback and engagingness, we convert the ranking problem to a comparison of response pairs which involve few confounding factors. We trained DialogRPT, a set of GPT-2 based models on 133M pairs of human feedback data and the resulting ranker outperformed several baselines. Particularly, our ranker outperforms the conventional dialog perplexity baseline with a large margin on predicting Reddit feedback. We finally combine the feedback prediction models and a human-like scoring model to rank the machine-generated dialog responses. 
Crowd-sourced human evaluation shows that our ranking method correlates better with real human preferences than baseline models.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.28,38938970 +main.1702,RiSAWOZ: A Large-Scale Multi-Domain Wizard-of-Oz Dataset with Rich Semantic Annotations for Task-Oriented Dialogue Modeling,Jun Quan|Shian Zhang|Qian Cao|Zizhong Li|Deyi Xiong,"In order to alleviate the shortage of multi-domain data and to capture discourse phenomena for task-oriented dialogue modeling, we propose RiSAWOZ, a large-scale multi-domain Chinese Wizard-of-Oz dataset with Rich Semantic Annotations. RiSAWOZ contains 11.2K human-to-human (H2H) multi-turn semantically annotated dialogues, with more than 150K utterances spanning over 12 domains, which is larger than all previous annotated H2H conversational datasets. Both single- and multi-domain dialogues are constructed, accounting for 65% and 35%, respectively. Each dialogue is labeled with comprehensive dialogue annotations, including dialogue goal in the form of natural language description, domain, dialogue states and acts at both the user and system side. In addition to traditional dialogue annotations, we especially provide linguistic annotations on discourse phenomena, e.g., ellipsis and coreference, in dialogues, which are useful for dialogue coreference and ellipsis resolution tasks. Apart from the fully annotated dataset, we also present a detailed description of the data collection procedure, statistics and analysis of the dataset. A series of benchmark models and results are reported, including natural language understanding (intent detection & slot filling), dialogue state tracking and dialogue context-to-text generation, as well as coreference and ellipsis resolution, which facilitate the baseline comparison for future research on this corpus.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.67,38938971 +main.1706,Knowledge Graph Empowered Entity Description Generation,Liying Cheng|Dekun Wu|Lidong Bing|Yan Zhang|Zhanming Jie|Wei Lu|Luo Si,"Previous works on knowledge-to-text generation take as input a few RDF triples or key-value pairs conveying the knowledge of some entities to generate a natural language description. Existing datasets, such as WIKIBIO, WebNLG, and E2E, basically have a good alignment between an input triple/pair set and its output text. However, in practice, the input knowledge could be more than enough, since the output description may only cover the most significant knowledge. In this paper, we introduce a large-scale and challenging dataset to facilitate the study of such a practical scenario in KG-to-text. Our dataset involves retrieving abundant knowledge of various types of main entities from a large knowledge graph (KG), which makes the current graph-to-sequence models severely suffer from the problems of information loss and parameter explosion while generating the descriptions. We address these challenges by proposing a multi-graph structure that is able to represent the original graph information more comprehensively. Furthermore, we also incorporate aggregation methods that learn to extract the rich graph information. 
Extensive experiments demonstrate the effectiveness of our model architecture.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.90,38938972 +main.1707,Pointer: Constrained Text Generation via Insertion-based Generative Pre-training,Yizhe Zhang|Guoyin Wang|Chunyuan Li|Zhe Gan|Chris Brockett|Bill Dolan,"Large-scale pre-trained language models, such as BERT and GPT-2, have achieved excellent performance in language representation learning and free-form text generation. However, these models cannot be directly employed to generate text under specified lexical constraints. To address this challenge, we present POINTER (PrOgressive INsertion-based TransformER), a simple yet novel insertion-based approach for hard-constrained text generation. The proposed method operates by progressively inserting new tokens between existing tokens in a parallel manner. This procedure is recursively applied until a sequence is completed. The resulting coarse-to-fine hierarchy makes the generation process intuitive and interpretable. We pre-train our model with the proposed progressive insertion-based objective on a 12GB Wikipedia dataset, and fine-tune it on downstream hard-constrained generation tasks. Non-autoregressive decoding yields a logarithmic time complexity during inference time. Experimental results on both News and Yelp datasets demonstrate that Pointer achieves state-of-the-art performance on constrained text generation. We released the pre-trained models and the source code to facilitate future research.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.698,38938973 +main.1720,SeqMix: Augmenting Active Sequence Labeling via Sequence Mixup,Rongzhi Zhang|Yue Yu|Chao Zhang,"Active learning is an important technique for low-resource sequence labeling tasks. However, current active sequence labeling methods use the queried samples alone in each iteration, which is an inefficient way of leveraging human annotations. We propose a simple but effective data augmentation method to improve label efficiency of active sequence labeling. Our method, SeqMix, simply augments the queried samples by generating extra labeled sequences in each iteration. The key difficulty is to generate plausible sequences along with token-level labels. In SeqMix, we address this challenge by performing mixup for both sequences and token-level labels of the queried samples. Furthermore, we design a discriminator during sequence mixup, which judges whether the generated sequences are plausible or not. Our experiments on Named Entity Recognition and Event Detection tasks show that SeqMix can improve the standard active sequence labeling method by $2.27\%$--$3.75\%$ in terms of $F_1$ scores. The code and data for SeqMix can be found at https://github.com/rz-zhang/SeqMix.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.691,38938974 +main.1733,Domain Adaptation of Thai Word Segmentation Models Using Stacked Ensemble,Peerat Limkonchotiwat|Wannaphong Phatthiyaphaibun|Raheem Sarwar|Ekapol Chuangsuwanich|Sarana Nutanong,"Like many Natural Language Processing tasks, Thai word segmentation is domain-dependent. Researchers have been relying on transfer learning to adapt an existing model to a new domain. However, this approach is inapplicable to cases where we can interact with only input and output layers of the models, also known as ""black boxes''. 
We propose a filter-and-refine solution based on the stacked-ensemble learning paradigm to address this black-box limitation. We conducted extensive experimental studies comparing our method against state-of-the-art models and transfer learning. Experimental results show that our proposed solution is an effective domain adaptation method and has a similar performance as the transfer learning method.",,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.emnlp-main.315,38938975 +main.1734,Recall and Learn: Fine-tuning Deep Pretrained Language Models with Less Forgetting,Sanyuan Chen|Yutai Hou|Yiming Cui|Wanxiang Che|Ting Liu|Xiangzhan Yu,"Deep pretrained language models have achieved great success in the way of pretraining first and then fine-tuning. But such a sequential transfer learning paradigm often confronts the catastrophic forgetting problem and leads to sub-optimal performance. To fine-tune with less forgetting, we propose a recall and learn mechanism, which adopts the idea of multi-task learning and jointly learns pretraining tasks and downstream tasks. Specifically, we introduce a Pretraining Simulation mechanism to recall the knowledge from pretraining tasks without data, and an Objective Shifting mechanism to focus the learning on downstream tasks gradually. Experiments show that our method achieves state-of-the-art performance on the GLUE benchmark. Our method also enables BERT-base to achieve better average performance than directly fine-tuning of BERT-large. Further, we provide the open-source RecAdam optimizer, which integrates the proposed mechanisms into Adam optimizer, to facility the NLP community.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.634,38938976 +main.1738,Coarse-to-Fine Pre-training for Named Entity Recognition,Xue Mengge|Bowen Yu|Zhenyu Zhang|Tingwen Liu|Yue Zhang|Bin Wang,"More recently, Named Entity Recognition has achieved great advances aided by pre-training approaches such as BERT. However, current pre-training techniques focus on building language modeling objectives to learn a general representation, ignoring the named entity-related knowledge. To this end, we propose a NER-specific pre-training framework to inject coarse-to-fine automatically mined entity knowledge into pre-trained models. Specifically, we first warm-up the model via an entity span identification task by training it with Wikipedia anchors, which can be deemed as general-typed entities. Then we leverage the gazetteer-based distant supervision strategy to train the model extract coarse-grained typed entities. Finally, we devise a self-supervised auxiliary task to mine the fine-grained named entity knowledge via clustering. Empirical studies on three public NER datasets demonstrate that our framework achieves significant improvements against several pre-trained baselines, establishing the new state-of-the-art performance on three benchmarks. Besides, we show that our framework gains promising results without using human-labeled training data, demonstrating its effectiveness in label-few and low-resource scenarios.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.514,38938977 +main.1739,Point to the Expression: Solving Algebraic Word Problems Using the Expression-Pointer Transformer Model,Bugeun Kim|Kyung Seo Ki|Donggeon Lee|Gahgene Gweon,"Solving algebraic word problems has recently emerged as an important natural language processing task. 
To solve algebraic word problems, recent studies suggested neural models that generate solution equations by using 'Op (operator/operand)' tokens as a unit of input/output. However, such a neural model suffered two issues: expression fragmentation and operand-context separation. To address each of these two issues, we propose a pure neural model, Expression-Pointer Transformer (EPT), which uses (1) 'Expression' token and (2) operand-context pointers when generating solution equations. The performance of the EPT model is tested on three datasets: ALG514, DRAW-1K, and MAWPS. Compared to the state-of-the-art (SoTA) models, the EPT model achieved a comparable performance accuracy in each of the three datasets; 81.3% on ALG514, 59.5% on DRAW-1K, and 84.5% on MAWPS. The contribution of this paper is two-fold; (1) We propose a pure neural model, EPT, which can address the expression fragmentation and the operand-context separation. (2) The fully automatic EPT model, which does not use hand-crafted features, yields comparable performance to existing models using hand-crafted features, and achieves better performance than existing pure neural models by at most 40%.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.308,38938978 +main.1749,Incremental Event Detection via Knowledge Consolidation Networks,Pengfei Cao|Yubo Chen|Jun Zhao|Taifeng Wang,"Conventional approaches to event detection usually require a fixed set of pre-defined event types. Such a requirement is often challenged in real-world applications, as new events continually occur. Due to huge computation cost and storage budge, it is infeasible to store all previous data and re-train the model with all previous data and new data, every time new events arrive. We formulate such challenging scenarios as incremental event detection, which requires a model to learn new classes incrementally without performance degradation on previous classes. However, existing incremental learning methods cannot handle semantic ambiguity and training data imbalance problems between old and new classes in the task of incremental event detection. In this paper, we propose a Knowledge Consolidation Network (KCN) to address the above issues. Specifically, we devise two components, prototype enhanced retrospection and hierarchical distillation, to mitigate the adverse effects of semantic ambiguity and class imbalance, respectively. Experimental results demonstrate the effectiveness of the proposed method, outperforming the state-of-the-art model by 19% and 13.4% of whole F1 score on ACE benchmark and TAC KBP benchmark, respectively.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.52,38938979 +main.1750,Exploring the Role of Argument Structure in Online Debate Persuasion,Jialu Li|Esin Durmus|Claire Cardie,"Online debate forums provide users a platform to express their opinions on controversial topics while being exposed to opinions from diverse set of viewpoints. Existing work in Natural Language Processing (NLP) has shown that linguistic features extracted from the debate text and features encoding the characteristics of the audience are both critical in persuasion studies. In this paper, we aim to further investigate the role of discourse structure of the arguments from online debates in their persuasiveness. 
In particular, we use the factor graph model to obtain features for the argument structure of debates from an online debating platform and incorporate these features to an LSTM-based model to predict the debater that makes the most convincing arguments. We find that incorporating argument structure features play an essential role in achieving the best predictive performance in assessing the persuasiveness of the arguments on online debates.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.716,38938980 +main.1754,Graph Convolutions over Constituent Trees for Syntax-Aware Semantic Role Labeling,Diego Marcheggiani|Ivan Titov,"Semantic role labeling (SRL) is the task of identifying predicates and labeling argument spans with semantic roles. Even though most semantic-role formalisms are built upon constituent syntax, and only syntactic constituents can be labeled as arguments (e.g., FrameNet and PropBank), all the recent work on syntax-aware SRL relies on dependency representations of syntax. In contrast, we show how graph convolutional networks (GCNs) can be used to encode constituent structures and inform an SRL system. Nodes in our SpanGCN correspond to constituents. The computation is done in 3 stages. First, initial node representations are produced by `composing' word representations of the first and last words in the constituent. Second, graph convolutions relying on the constituent tree are performed, yielding syntactically-informed constituent representations. Finally, the constituent representations are `decomposed' back into word representations, which are used as input to the SRL classifier. We evaluate SpanGCN against alternatives, including a model using GCNs over dependency trees, and show its effectiveness on standard English SRL benchmarks CoNLL-2005, CoNLL-2012, and FrameNet.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.322,38938981 +main.1755,HIT: Nested Named Entity Recognition via Head-Tail Pair and Token Interaction,Yu Wang|Yun Li|Hanghang Tong|Ziye Zhu,"Named Entity Recognition (NER) is a fundamental task in natural language processing. In order to identify entities with nested structure, many sophisticated methods have been recently developed based on either the traditional sequence labeling approaches or directed hypergraph structures. Despite being successful, these methods often fall short in striking a good balance between the expression power for nested structure and the model complexity. To address this issue, we present a novel nested NER model named HIT. Our proposed HIT model leverages two key properties pertaining to the (nested) named entity, including (1) explicit boundary tokens and (2) tight internal connection between tokens within the boundary. Specifically, we design (1) Head-Tail Detector based on the multi-head self-attention mechanism and bi-affine classifier to detect boundary tokens, and (2) Token Interaction Tagger based on traditional sequence labeling approaches to characterize the internal token connection within the boundary. 
Experiments on three public NER datasets demonstrate that the proposed HIT achieves state-of-the-art performance.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.486,38938982 +main.1766,Convolution over Hierarchical Syntactic and Lexical Graphs for Aspect Level Sentiment Analysis,Mi Zhang|Tieyun Qian,"The state-of-the-art methods in aspect-level sentiment classification have leveraged the graph based models to incorporate the syntactic structure of a sentence. While being effective, these methods ignore the corpus level word co-occurrence information, which reflect the collocations in linguistics like “nothing special”. Moreover, they do not distinguish the different types of syntactic dependency, e.g., a nominal subject relation “food-was” is treated equally as an adjectival complement relation “was-okay” in “food was okay”. To tackle the above two limitations, we propose a novel architecture which convolutes over hierarchical syntactic and lexical graphs. Specifically, we employ a global lexical graph to encode the corpus level word co-occurrence information. Moreover, we build a concept hierarchy on both the syntactic and lexical graphs for differentiating various types of dependency relations or lexical word pairs. Finally, we design a bi-level interactive graph convolution network to fully exploit these two graphs. Extensive experiments on five benchmark datasets show that our method outperforms the state-of-the-art baselines.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.286,38938983 +main.1770,Token-level Adaptive Training for Neural Machine Translation,Shuhao Gu|Jinchao Zhang|Fandong Meng|Yang Feng|Wanying Xie|Jie Zhou|Dong Yu,"There exists a token imbalance phenomenon in natural language as different tokens appear with different frequencies, which leads to different learning difficulties for tokens in Neural Machine Translation (NMT). The vanilla NMT model usually adopts trivial equal-weighted objectives for target tokens with different frequencies and tends to generate more high-frequency tokens and less low-frequency tokens compared with the golden token distribution. However, low-frequency tokens may carry critical semantic information that will affect the translation quality once they are neglected. In this paper, we explored target token-level adaptive objectives based on token frequencies to assign appropriate weights for each target token during training. We aimed that those meaningful but relatively low-frequency words could be assigned with larger weights in objectives to encourage the model to pay more attention to these tokens. Our method yields consistent improvements in translation quality on ZH-EN, EN-RO, and EN-DE translation tasks, especially on sentences that contain more low-frequency tokens where we can get 1.68, 1.02, and 0.52 BLEU increases compared with baseline, respectively. Further analyses show that our method can also improve the lexical diversity of translation.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.76,38938984 +main.1782,EXAMS: A Multi-subject High School Examinations Dataset for Cross-lingual and Multilingual Question Answering,Momchil Hardalov|Todor Mihaylov|Dimitrina Zlatkova|Yoan Dinkov|Ivan Koychev|Preslav Nakov,"We propose EXAMS – a new benchmark dataset for cross-lingual and multilingual question answering for high school examinations. 
We collected more than 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others. EXAMS offers unique fine-grained evaluation framework across multiple languages and subjects, which allows precise analysis and comparison of the proposed models. We perform various experiments with existing top-performing multilingual pre-trained models and show that EXAMS offers multiple challenges that require multilingual knowledge and reasoning in multiple domains. We hope that EXAMS will enable researchers to explore challenging reasoning and knowledge transfer methods and pre-trained models for school question answering in various languages which was not possible by now. The data, code, pre-trained models, and evaluation are available at http://github.com/mhardalov/exams-qa.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.438,38938985 +main.1784,IGSQL: Database Schema Interaction Graph Based Neural Model for Context-Dependent Text-to-SQL Generation,Yitao Cai|Xiaojun Wan,"Context-dependent text-to-SQL task has drawn much attention in recent years. Previous models on context-dependent text-to-SQL task only concentrate on utilizing historic user inputs. In this work, in addition to using encoders to capture historic information of user inputs, we propose a database schema interaction graph encoder to utilize historic information of database schema items. In decoding phase, we introduce a gate mechanism to weigh the importance of different vocabularies and then make the prediction of SQL tokens. We evaluate our model on the benchmark SParC and CoSQL datasets, which are two large complex context-dependent cross-domain text-to-SQL datasets. Our model outperforms previous state-of-the-art model by a large margin and achieves new state-of-the-art results on the two datasets. The comparison and ablation results demonstrate the efficacy of our model and the usefulness of the database schema interaction graph encoder.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.560,38938986 +main.1787,"Exploring and Evaluating Attributes, Values, and Structure for Entity Alignment",Zhiyuan Liu|Yixin Cao|Liangming Pan|Juanzi Li|Zhiyuan Liu|Tat-Seng Chua,"Entity alignment (EA) aims at building a unified Knowledge Graph (KG) of rich content by linking the equivalent entities from various KGs. GNN-based EA methods present promising performance by modeling the KG structure defined by relation triples. However, attribute triples can also provide crucial alignment signal but have not been well explored yet. In this paper, we propose to utilize an attributed value encoder and partition the KG into subgraphs to model the various types of attribute triples efficiently. Besides, the performances of current EA methods are overestimated because of the name-bias of existing EA datasets. To make an objective evaluation, we propose a hard experimental setting where we select equivalent entity pairs with very different names as the test set. Under both the regular and hard settings, our method achieves significant improvements (5.10% on average Hits@1 in DBP15k) over 12 baselines in cross-lingual and monolingual datasets. Ablation studies on different subgraphs and a case study about attribute types further demonstrate the effectiveness of our method. 
Source code and data can be found at \url{https://github.com/thunlp/explore-and-evaluate}.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.515,38938987 +main.1788,Look at the First Sentence: Position Bias in Question Answering,Miyoung Ko|Jinhyuk Lee|Hyunjae Kim|Gangwoo Kim|Jaewoo Kang,"Many extractive question answering models are trained to predict start and end positions of answers. The choice of predicting answers as positions is mainly due to its simplicity and effectiveness. In this study, we hypothesize that when the distribution of the answer positions is highly skewed in the training set (e.g., answers lie only in the k-th sentence of each passage), QA models predicting answers as positions can learn spurious positional cues and fail to give answers in different positions. We first illustrate this position bias in popular extractive QA models such as BiDAF and BERT and thoroughly examine how position bias propagates through each layer of BERT. To safely deliver position information without position bias, we train models with various de-biasing methods including entropy regularization and bias ensembling. Among them, we found that using the prior distribution of answer positions as a bias model is very effective at reducing position bias, recovering the performance of BERT from 37.48% to 81.64% when trained on a biased SQuAD dataset.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.84,38938988 +main.179,Semantically-Aligned Universal Tree-Structured Solver for Math Word Problems,Jinghui Qin|Lihui Lin|Xiaodan Liang|Rumin Zhang|Liang Lin,"A practical automatic textual math word problems (MWPs) solver should be able to solve various textual MWPs while most existing works only focused on one-unknown linear MWPs. Herein, we propose a simple but efficient method called Universal Expression Tree (UET) to make the first attempt to represent the equations of various MWPs uniformly. Then a semantically-aligned universal tree-structured solver (SAU-Solver) based on an encoder-decoder framework is proposed to resolve multiple types of MWPs in a unified model, benefiting from our UET representation. Our SAU-Solver generates a universal expression tree explicitly by deciding which symbol to generate according to the generated symbols’ semantic meanings like human solving MWPs. Besides, our SAU-Solver also includes a novel subtree-level semanticallyaligned regularization to further enforce the semantic constraints and rationality of the generated expression tree by aligning with the contextual information. Finally, to validate the universality of our solver and extend the research boundary of MWPs, we introduce a new challenging Hybrid Math Word Problems dataset (HMWP), consisting of three types of MWPs. Experimental results on several MWPs datasets show that our model can solve universal types of MWPs and outperforms several state-of-the-art models.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.309,38938662 +main.1797,Like Hiking? You Probably Enjoy Nature: Persona-grounded Dialog with Commonsense Expansions,Bodhisattwa Prasad Majumder|Harsh Jhamtani|Taylor Berg-Kirkpatrick|Julian McAuley,"Existing persona-grounded dialog models often fail to capture simple implications of given persona descriptions, something which humans are able to do seamlessly. For example, state-of-the-art models cannot infer that interest in hiking might imply love for nature or longing for a break. 
In this paper, we propose to expand available persona sentences using existing commonsense knowledge bases and paraphrasing resources to imbue dialog models with access to an expanded and richer set of persona descriptions. Additionally, we introduce fine-grained grounding on personas by encouraging the model to make a discrete choice among persona sentences while synthesizing a dialog response. Since such a choice is not observed in the data, we model it using a discrete latent random variable and use variational learning to sample from hundreds of persona expansions. Our model outperforms competitive baselines on the Persona-Chat dataset in terms of dialog quality and diversity while achieving persona-consistent and controllable dialog generation.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.739,38938989 +main.1798,Multi-Unit Transformers for Neural Machine Translation,Jianhao Yan|Fandong Meng|Jie Zhou,"Transformer models achieve remarkable success in Neural Machine Translation. Many efforts have been devoted to deepening the Transformer by stacking several units (i.e., a combination of Multihead Attentions and FFN) in a cascade, while the investigation over multiple parallel units draws little attention. In this paper, we propose the Multi-Unit Transformer (MUTE) , which aim to promote the expressiveness of the Transformer by introducing diverse and complementary units. Specifically, we use several parallel units and show that modeling with multiple units improves model performance and introduces diversity. Further, to better leverage the advantage of the multi-unit setting, we design biased module and sequential dependency that guide and encourage complementariness among different units. Experimental results on three machine translation tasks, the NIST Chinese-to-English, WMT'14 English-to-German and WMT'18 Chinese-to-English, show that the MUTE models significantly outperform the Transformer-Base, by up to +1.52, +1.90 and +1.10 BLEU points, with only a mild drop in inference speed (about 3.1\%). In addition, our methods also surpass the Transformer-Big model, with only 54\% of its parameters. These results demonstrate the effectiveness of the MUTE, as well as its efficiency in both the inference process and parameter usage.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.77,38938990 +main.1803,MAD-X: An Adapter-based Framework for Multi-task Cross-lingual Transfer,Jonas Pfeiffer|Ivan Vulić|Iryna Gurevych|Sebastian Ruder,"The main goal behind state-of-the-art pre-trained multilingual models such as multilingual BERT and XLM-R is enabling and bootstrapping NLP applications in low-resource languages through zero-shot or few-shot cross-lingual transfer. However, due to limited model capacity, their transfer performance is the weakest exactly on such low-resource languages and languages unseen during pre-training. We propose MAD-X, an adapter-based framework that enables high portability and parameter-efficient transfer to arbitrary tasks and languages by learning modular language and task representations. In addition, we introduce a novel invertible adapter architecture and a strong baseline method for adapting a pre-trained multilingual model to a new language. 
MAD-X outperforms the state of the art in cross lingual transfer across a representative set of typologically diverse languages on named entity recognition and causal commonsense reasoning, and achieves competitive results on question answering. Our code and adapters are available at AdapterHub.ml.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.617,38938991 +main.1817,AxCell: Automatic Extraction of Results from Machine Learning Papers,Marcin Kardas|Piotr Czapla|Pontus Stenetorp|Sebastian Ruder|Sebastian Riedel|Ross Taylor|Robert Stojnic,"Tracking progress in machine learning has become increasingly difficult with the recent explosion in the number of papers. In this paper, we present AxCell, an automatic machine learning pipeline for extracting results from papers. AxCell uses several novel components, including a table segmentation subtask, to learn relevant structural knowledge that aids extraction. When compared with existing methods, our approach significantly improves the state of the art for results extraction. We also release a structured, annotated dataset for training models for results extraction, and a dataset for evaluating the performance of models on this task. Lastly, we show the viability of our approach enables it to be used for semi-automated results extraction in production, suggesting our improvements make this task practically viable for the first time. Code is available on GitHub.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.692,38938992 +main.1832,Latent Geographical Factors for Analyzing the Evolution of Dialects in Contact,Yugo Murawaki,"Analyzing the evolution of dialects remains a challenging problem because contact phenomena hinder the application of the standard tree model. Previous statistical approaches to this problem resort to admixture analysis, where each dialect is seen as a mixture of latent ancestral populations. However, such ancestral populations are hardly interpretable in the context of the tree model. In this paper, we propose a probabilistic generative model that represents latent factors as geographical distributions. We argue that the proposed model has higher affinity with the tree model because a tree can alternatively be represented as a set of geographical distributions. Experiments involving synthetic and real data suggest that the proposed method is both quantitatively and qualitatively superior to the admixture model.",,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Long,https://www.aclweb.org/anthology/2020.emnlp-main.69,38938993 +main.1834,Discriminatively-Tuned Generative Classifiers for Robust Natural Language Inference,Xiaoan Ding|Tianyu Liu|Baobao Chang|Zhifang Sui|Kevin Gimpel,"While discriminative neural network classifiers are generally preferred, recent work has shown advantages of generative classifiers in term of data efficiency and robustness. In this paper, we focus on natural language inference ({NLI}). We propose {G}en{NLI}, a generative classifier for {NLI} tasks, and empirically characterize its performance by comparing it to five baselines, including discriminative models and large-scale pretrained language representation models like {BERT}. We explore training objectives for discriminative fine-tuning of our generative classifiers, showing improvements over log loss fine-tuning from prior work (Lewis and Fan, 2019). 
In particular, we find strong results with a simple unbounded modification to log loss, which we call the ``infinilog loss''. Our experiments show that {GenNLI} outperforms both discriminative and pretrained baselines across several challenging {NLI} experimental settings, including small training sets, imbalanced label distributions, and label noise.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.657,38938994 +main.1835,On Extractive and Abstractive Neural Document Summarization with Transformer Language Models,Jonathan Pilault|Raymond Li|Sandeep Subramanian|Chris Pal,"We present a method to produce abstractive summaries of long documents that exceed several thousand words via neural abstractive summarization. We perform a simple extractive step before generating a summary, which is then used to condition the transformer language model on relevant information before being tasked with generating a summary. We also show that this approach produces more abstractive summaries compared to prior work that employs a copy mechanism while still achieving higher ROUGE scores. We provide extensive comparisons with strong baseline methods, prior state of the art work as well as multiple variants of our approach including those using only transformers, only extractive techniques and combinations of the two. We examine these models using four different summarization tasks and datasets: arXiv papers, PubMed papers, the Newsroom and BigPatent datasets. We find that transformer based methods produce summaries with fewer n-gram copies, leading to n-gram copying statistics that are more similar to human generated abstracts. We include a human evaluation, finding that transformers are ranked highly for coherence and fluency, but purely extractive methods score higher for informativeness and relevance. We hope that these architectures and experiments may serve as strong points of comparison for future work. Note: The abstract above was collaboratively written by the authors and one of the models presented in this paper based on an earlier draft of this paper.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.748,38938995 +main.1837,Don't Read Too Much into It: Adaptive Computation for Open-Domain Question Answering,Yuxiang Wu|Sebastian Riedel|Pasquale Minervini|Pontus Stenetorp,"Most approaches to Open-Domain Question Answering consist of a light-weight retriever that selects a set of candidate passages, and a computationally expensive reader that examines the passages to identify the correct answer. Previous works have shown that as the number of retrieved passages increases, so does the performance of the reader. However, they assume all retrieved passages are of equal importance and allocate the same amount of computation to them, leading to a substantial increase in computational cost. To reduce this cost, we propose the use of adaptive computation to control the computational budget allocated for the passages to be read. We first introduce a technique operating on individual passages in isolation which relies on anytime prediction and a per-layer estimation of early exit probability. We then introduce SKYLINEBUILDER, an approach for dynamically deciding on which passage to allocate computation at each step, based on a resource allocation policy trained via reinforcement learning. 
Our results on SQuAD-Open show that adaptive computation with global prioritisation improves over several strong static and adaptive methods, leading to a 4.3x reduction in computation while retaining 95% performance of the full model.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.244,38938996 +main.1846,MinTL: Minimalist Transfer Learning for Task-Oriented Dialogue Systems,Zhaojiang Lin|Andrea Madotto|Genta Indra Winata|Pascale Fung,"In this paper, we propose Minimalist Transfer Learning (MinTL) to simplify the system design process of task-oriented dialogue systems and alleviate the over-dependency on annotated data. MinTL is a simple yet effective transfer learning framework, which allows us to plug-and-play pre-trained seq2seq models, and jointly learn dialogue state tracking and dialogue response generation. Unlike previous approaches, which use a copy mechanism to ""carryover'' the old dialogue states to the new one, we introduce Levenshtein belief spans (Lev), that allows efficient dialogue state tracking with a minimal generation length. We instantiate our learning framework with two pre-trained backbones: T5 and BART, and evaluate them on MultiWOZ. Extensive experiments demonstrate that: 1) our systems establish new state-of-the-art results on end-to-end response generation, 2) MinTL-based systems are more robust than baseline methods in the low resource setting, and they achieve competitive results with only 20\% training data, and 3) Lev greatly improves the inference efficiency.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.273,38938997 +main.1857,Task-oriented Domain-specific Meta-Embedding for Text Classification,Xin Wu|Yi Cai|Yang Kai|Tao Wang|Qing Li,"Meta-embedding learning, which combines complementary information in different word embeddings, have shown superior performances across different Natural Language Processing tasks. However, domain-specific knowledge is still ignored by existing meta-embedding methods, which results in unstable performances across specific domains. Moreover, the importance of general and domain word embeddings is related to downstream tasks, how to regularize meta-embedding to adapt downstream tasks is an unsolved problem. In this paper, we propose a method to incorporate both domain-specific and task-oriented information into meta-embeddings. We conducted extensive experiments on four text classification datasets and the results show the effectiveness of our proposed method.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.282,38938998 +main.1862,Let's Stop Error Propagation in the End-to-End Relation Extraction Literature!,Bruno Taillé|Vincent Guigue|Geoffrey Scoutheeten|patrick Gallinari,"Despite efforts to distinguish three different evaluation setups (Bekoulis et al., 2018), numerous end-to-end Relation Extraction (RE) articles present unreliable performance comparison to previous work. In this paper, we first identify several patterns of invalid comparisons in published papers and describe them to avoid their propagation. We then propose a small empirical study to quantify the most common mistake's impact and evaluate it leads to overestimating the final RE performance by around 5% on ACE05. We also seize this opportunity to study the unexplored ablations of two recent developments: the use of language model pretraining (specifically BERT) and span-level NER. 
This meta-analysis emphasizes the need for rigor in the report of both the evaluation setting and the dataset statistics. We finally call for unifying the evaluation setting in end-to-end RE.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.301,38938999 +main.1863,Online Conversation Disentanglement with Pointer Networks,Tao Yu|Shafiq Joty,"Huge amounts of textual conversations occur online every day, where multiple conversations take place concurrently. Interleaved conversations lead to difficulties in not only following the ongoing discussions but also extracting relevant information from simultaneous messages. Conversation disentanglement aims to separate intermingled messages into detached conversations. However, existing disentanglement methods rely mostly on handcrafted features that are dataset specific, which hinders generalization and adaptability. In this work, we propose an end-to-end online framework for conversation disentanglement that avoids time-consuming domain-specific feature engineering. We design a novel way to embed the whole utterance that comprises timestamp, speaker, and message text, and propose a custom attention mechanism that models disentanglement as a pointing problem while effectively capturing inter-utterance interactions in an end-to-end fashion. We also introduce a joint-learning objective to better capture contextual information. Our experiments on the Ubuntu IRC dataset show that our method achieves state-of-the-art performance in both link and conversation prediction tasks.",,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.512,38939000 +main.1866,"""What Do You Mean by That?"" - a Parser-Independent Interactive Approach for Enhancing Text-to-SQL",Yuntao Li|Bei Chen|Qian Liu|Yan Gao|Jian-Guang LOU|Yan Zhang|Dongmei Zhang,"In Natural Language Interfaces to Databases systems, the text-to-SQL technique allows users to query databases by using natural language questions. Though significant progress in this area has been made recently, most parsers may fall short when they are deployed in real systems. One main reason stems from the difficulty of fully understanding the users' natural language questions. In this paper, we include human in the loop and present a novel parser-independent interactive approach (PIIA) that interacts with users using multi-choice questions and can easily work with arbitrary parsers. Experiments were conducted on two cross-domain datasets, the WikiSQL and the more complex Spider, with five state-of-the-art parsers. These demonstrated that PIIA is capable of enhancing the text-to-SQL performance with limited interaction turns by using both simulation and human evaluation.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.561,38939001 +main.1877,End-to-End Synthetic Data Generation for Domain Adaptation of Question Answering Systems,Siamak Shakeri|Cicero Nogueira dos Santos|Henghui Zhu|Patrick Ng|Feng Nan|Zhiguo Wang|Ramesh Nallapati|Bing Xiang,"We propose an end-to-end approach for synthetic QA data generation. Our model comprises a single transformer-based encoder-decoder network that is trained end-to-end to generate both answers and questions. In a nutshell, we feed a passage to the encoder and ask the decoder to generate a question and an answer token-by-token. 
The likelihood produced in the generation process is used as a filtering score, which avoids the need for a separate filtering model. Our generator is trained by fine-tuning a pretrained LM using maximum likelihood estimation. The experimental results indicate significant improvements in the domain adaptation of QA models outperforming current state-of-the-art methods.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.439,38939002 +main.1892,SLM: Learning a Discourse Language Representation with Sentence Unshuffling,Haejun Lee|Drew A. Hudson|Kangwook Lee|Christopher D. Manning,"We introduce Sentence-level Language Modeling, a new pre-training objective for learning a discourse language representation in a fully self-supervised manner. Recent pre-training methods in NLP focus on learning either bottom or top-level language representations: contextualized word representations derived from language model objectives at one extreme and a whole sequence representation learned by order classification of two given textual segments at the other. However, these models are not directly encouraged to capture representations of intermediate-size structures that exist in natural languages such as sentences and the relationships among them. To that end, we propose a new approach to encourage learning of a contextualized sentence-level representation by shuffling the sequence of input sentences and training a hierarchical transformer model to reconstruct the original ordering. Through experiments on downstream tasks such as GLUE, SQuAD, and DiscoEval, we show that this feature of our model improves the performance of the original BERT by large margins.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.120,38939003 +main.1898,Unsupervised Text Style Transfer with Masked Language Models,Eric Malmi|Aliaksei Severyn|Sascha Rothe,"We propose Masker, an unsupervised text-editing method for style transfer. To tackle cases when no parallel source--target pairs are available, we train masked language models (MLMs) for both the source and the target domain. Then we find the text spans where the two models disagree the most in terms of likelihood. This allows us to identify the source tokens to delete to transform the source text to match the style of the target domain. The deleted tokens are replaced with the target MLM, and by using a padded MLM variant, we avoid having to predetermine the number of inserted tokens. Our experiments on sentence fusion and sentiment transfer demonstrate that Masker performs competitively in a fully unsupervised setting. Moreover, in low-resource settings, it improves supervised methods' accuracy by over 10 percentage points when pre-training them on silver training data generated by Masker.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.699,38939004 +main.1901,Are All Good Word Vector Spaces Isomorphic?,Ivan Vulić|Sebastian Ruder|Anders Søgaard,"Existing algorithms for aligning cross-lingual word vector spaces assume that vector spaces are approximately isomorphic. As a result, they perform poorly or fail completely on non-isomorphic spaces. Such non-isomorphism has been hypothesised to result from typological differences between languages. In this work, we ask whether non-isomorphism is also crucially a sign of degenerate word vector spaces. 
We present a series of experiments across diverse languages which show that variance in performance across language pairs is not only due to typological differences, but can mostly be attributed to the size of the monolingual resources available, and to the properties and duration of monolingual training (e.g. ""under-training"").",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.257,38939005 +main.1904,Suicidal Risk Detection for Military Personnel,Sungjoon Park|Kiwoong Park|Jaimeen Ahn|Alice Oh,"We analyze social media for detecting the suicidal risk of military personnel, which is especially crucial for countries with compulsory military service such as the Republic of Korea. From a widely-used Korean social Q\&A site, we collect posts containing military-relevant content written by active-duty military personnel. We then annotate the posts with two groups of experts: military experts and mental health experts. Our dataset includes 2,791 posts with 13,955 corresponding expert annotations of suicidal risk levels, and this dataset is available to researchers who consent to research ethics agreement. Using various fine-tuned state-of-the-art language models, we predict the level of suicide risk, reaching .88 F1 score for classifying the risks.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.198,38939006 +main.1906,Towards Modeling Revision Requirements in wikiHow Instructions,Irshad Bhat|Talita Anthonio|Michael Roth,"wikiHow is a resource of how-to guides that describe the steps necessary to accomplish a goal. Guides in this resource are regularly edited by a community of users, who try to improve instructions in terms of style, clarity and correctness. In this work, we test whether the need for such edits can be predicted automatically. For this task, we extend an existing resource of textual edits with a complementary set of approx. 4 million sentences that remain unedited over time and report on the outcome of two revision modeling experiments.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.675,38939007 +main.1908,Predicting Clinical Trial Results by Implicit Evidence Integration,Qiao Jin|Chuanqi Tan|Mosha Chen|Xiaozhong Liu|Songfang Huang,"Clinical trials provide essential guidance for practicing Evidence-Based Medicine, though often accompanying with unendurable costs and risks. To optimize the design of clinical trials, we introduce a novel Clinical Trial Result Prediction (CTRP) task. In the CTRP framework, a model takes a PICO-formatted clinical trial proposal with its background as input and predicts the result, i.e. how the Intervention group compares with the Comparison group in terms of the measured Outcome in the studied Population. While structured clinical evidence is prohibitively expensive for manual collection, we exploit large-scale unstructured sentences from medical literature that implicitly contain PICOs and results as evidence. Specifically, we pre-train a model to predict the disentangled results from such implicit evidence and fine-tune the model with limited data on the downstream datasets. Experiments on the benchmark Evidence Integration dataset show that the proposed model outperforms the baselines by large margins, e.g., with a 10.7% relative gain over BioBERT in macro-F1. 
Moreover, the performance improvement is also validated on another dataset composed of clinical trials related to COVID-19.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.114,38939008 +main.1923,Collecting Entailment Data for Pretraining: New Protocols and Negative Results,Samuel R. Bowman|Jennimaria Palomaki|Livio Baldini Soares|Emily Pitler,"Natural language inference (NLI) data has proven useful in benchmarking and, especially, as pretraining data for tasks requiring language understanding. However, the crowdsourcing protocol that was used to collect this data has known issues and was not explicitly optimized for either of these purposes, so it is likely far from ideal. We propose four alternative protocols, each aimed at improving either the ease with which annotators can produce sound training examples or the quality and diversity of those examples. Using these alternatives and a fifth baseline protocol, we collect and compare five new 8.5k-example training sets. In evaluations focused on transfer learning applications, our results are solidly negative, with models trained on our baseline dataset yielding good transfer performance to downstream tasks, but none of our four new methods (nor the recent ANLI) showing any improvements over that baseline. In a small silver lining, we observe that all four new protocols, especially those where annotators edit *pre-filled* text boxes, reduce previously observed issues with annotation artifacts.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.658,38939009 +main.1928,HUSH: A Dataset and Platform for Human-in-the-Loop Story Generation,Nader Akoury|Shufan Wang|Josh Whiting|Stephen Hood|Nanyun Peng|Mohit Iyyer,"Systems for story generation are asked to produce plausible and enjoyable stories given an input context. This task is underspecified, as a vast number of diverse stories can originate from a single input. The large output space makes it difficult to build and evaluate story generation models, as (1) existing datasets lack rich enough contexts to meaningfully guide models, and (2) existing evaluations (both crowdsourced and automatic) are unreliable for assessing long-form creative text. To address these issues, we introduce a dataset and evaluation platform built from STORIUM, an online collaborative storytelling community. Our author-generated dataset contains 6K lengthy stories (125M tokens) with fine-grained natural language annotations (e.g., character goals and attributes) interspersed throughout each narrative, forming a robust source for guiding models. We evaluate language models fine-tuned on our dataset by integrating them onto STORIUM, where real authors can query a model for suggested story continuations and then edit them. Automatic metrics computed over these edits correlate well with both user ratings of generated stories and qualitative feedback from semi-structured user interviews. We release both the STORIUM dataset and evaluation platform to spur more principled research into story generation.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.525,38939010 +main.1935,Don't Neglect the Obvious: On the Role of Unambiguous Words in Word Sense Disambiguation,Daniel Loureiro|Jose Camacho-Collados,"State-of-the-art methods for Word Sense Disambiguation (WSD) combine two different features: the power of pre-trained language models and a propagation method to extend the coverage of such models. 
This propagation is needed as current sense-annotated corpora lack coverage of many instances in the underlying sense inventory (usually WordNet). At the same time, unambiguous words make for a large portion of all words in WordNet, while being poorly covered in existing sense-annotated corpora. In this paper, we propose a simple method to provide annotations for most unambiguous words in a large corpus. We introduce the UWA (Unambiguous Word Annotations) dataset and show how a state-of-the-art propagation-based model can use it to extend the coverage and quality of its word sense embeddings by a significant margin, improving on its original results on WSD.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.283,38939011 +main.1938,Character-level Representations Still Improve Semantic Parsing in the Age of BERT,Rik van Noord|Antonio Toral|Johan Bos,"We combine character-level and contextual language model representations to improve performance on Discourse Representation Structure parsing. Character representations can easily be added in a sequence-to-sequence model in either one encoder or as a fully separate encoder, with improvements that are robust to different language models, languages and data sets. For English, these improvements are larger than adding individual sources of linguistic information or adding non-contextual embeddings. A new method of analysis based on semantic tags demonstrates that the character-level representations improve performance across a subset of selected semantic phenomena.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.371,38939012 +main.1942,Explainable Clinical Decision Support from Text,Jinyue Feng|Chantal Shaib|Frank Rudzicz,"Clinical prediction models often use structured variables and provide outcomes that are not readily interpretable by clinicians. Further, free-text medical notes may contain information not immediately available in structured variables. We propose a hierarchical CNN-transformer model with explicit attention as an interpretable, multi-task clinical language model, which achieves an AUROC of 0.75 and 0.78 on sepsis and mortality prediction, respectively. We also explore the relationships between learned features from structured and unstructured variables using projection-weighted canonical correlation analysis. Finally, we outline a protocol to evaluate model usability in a clinical decision support context. From domain-expert evaluations, our model generates informative rationales that have promising real-life applications.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.115,38939013 +main.1943,Please Mind the Root: Decoding Arborescences for Dependency Parsing,Ran Zmigrod|Tim Vieira|Ryan Cotterell,"The connection between dependency trees and spanning trees is exploited by the NLP community to train and to decode graph-based dependency parsers. However, the NLP literature has missed an important difference between the two structures: only one edge may emanate from the root in a dependency tree. We analyzed the output of state-of-the-art parsers on many languages from the Universal Dependency Treebank: although these parsers are often able to learn that trees which violate the constraint should be assigned lower probabilities, their ability to do so unsurprisingly degrades as the size of the training set decreases. In fact, the worst constraint-violation rate we observe is 24%.
Prior work has proposed an inefficient algorithm to enforce the constraint, which adds a factor of n to the decoding runtime. We adapt an algorithm due to Gabow and Tarjan (1984) to dependency parsing, which satisfies the constraint without compromising the original runtime.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.390,38939014 +main.1949,Beyond [CLS] through Ranking by Generation,Cicero Nogueira dos Santos|Xiaofei Ma|Ramesh Nallapati|zhiheng huang|Bing Xiang,"Generative models for Information Retrieval, where ranking of documents is viewed as the task of generating a query from a document's language model, were very successful in various IR tasks in the past. However, with the advent of modern deep neural networks, attention has shifted to discriminative ranking functions that model the semantic similarity of documents and queries instead. Recently, deep generative models such as GPT2 and BART have been shown to be excellent text generators, but their effectiveness as rankers have not been demonstrated yet. In this work, we revisit the generative framework for information retrieval and show that our generative approaches are as effective as state-of-the-art semantic similarity-based discriminative models for the answer selection task. Additionally, we demonstrate the effectiveness of unlikelihood losses for IR.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.134,38939015 +main.1952,Modeling Content Importance for Summarization with Pre-trained Language Models,Liqiang Xiao|Lu Wang|Hao He|Yaohui Jin,"Modeling content importance is an essential yet challenging task for summarization. Previous work is mostly based on statistical methods that estimate word-level salience, which does not consider semantics and larger context when quantifying importance. It is thus hard for these methods to generalize to semantic units of longer text spans. In this work, we apply information theory on top of pre-trained language models and define the concept of importance from the perspective of information amount. It considers both the semantics and context when evaluating the importance of each semantic unit. With the help of pre-trained language models, it can easily generalize to different kinds of semantic units n-grams or sentences. Experiments on CNN/Daily Mail and New York Times datasets demonstrate that our method can better model the importance of content than prior work based on F1 and ROUGE scores.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.293,38939016 +main.1957,Semantic Role Labeling as Syntactic Dependency Parsing,Tianze Shi|Igor Malioutov|Ozan Irsoy,"We reduce the task of (span-based) PropBank-style semantic role labeling (SRL) to syntactic dependency parsing. Our approach is motivated by our empirical analysis that shows three common syntactic patterns account for over 98% of the SRL annotations for both English and Chinese data. Based on this observation, we present a conversion scheme that packs SRL annotations into dependency tree representations through joint labels that permit highly accurate recovery back to the original format. This representation allows us to train statistical dependency parsers to tackle SRL and achieve competitive performance with the current state of the art. 
Our findings show the promise of syntactic dependency trees in encoding semantic role relations within their syntactic domain of locality, and point to potential further integration of syntactic methods into semantic role labeling in the future.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.610,38939017 +main.1960,On the Sparsity of Neural Machine Translation Models,Yong Wang|Longyue Wang|Victor Li|Zhaopeng Tu,"Modern neural machine translation (NMT) models employ a large number of parameters, which leads to serious over-parameterization and typically causes the underutilization of computational resources. In response to this problem, we empirically investigate whether the redundant parameters can be reused to achieve better performance. Experiments and analyses are systematically conducted on different datasets and NMT architectures. We show that: 1) the pruned parameters can be rejuvenated to improve the baseline model by up to +0.8 BLEU points; 2) the rejuvenated parameters are reallocated to enhance the ability of modeling low-level lexical information.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.78,38939018 +main.1970,"Compositional and Lexical Semantics in RoBERTa, BERT and DistilBERT: A Case Study on CoQA",Ieva Staliūnaitė|Ignacio Iacobacci,"Many NLP tasks have benefited from transferring knowledge from contextualized word embeddings, however the picture of what type of knowledge is transferred is incomplete. This paper studies the types of linguistic phenomena accounted for by language models in the context of a Conversational Question Answering (CoQA) task. We identify the problematic areas for the finetuned RoBERTa, BERT and DistilBERT models through systematic error analysis - basic arithmetic (counting phrases), compositional semantics (negation and Semantic Role Labeling), and lexical semantics (surprisal and antonymy). When enhanced with the relevant linguistic knowledge through multitask learning, the models improve in performance. Ensembles of the enhanced models yield a boost between 2.2 and 2.7 points in F1 score overall, and up to 42.1 points in F1 on the hardest question classes. The results show differences in ability to represent compositional and lexical information between RoBERTa, BERT and DistilBERT.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.573,38939019 +main.1972,"""I’D Rather Just Go to Bed"": Understanding Indirect Answers",Annie Louis|Dan Roth|Filip Radlinski,"We revisit a pragmatic inference problem in dialog: Understanding indirect responses to questions. Humans can interpret 'I’m starving.’ in response to ‘Hungry?’, even without direct cue words such as 'yes' and 'no'. In dialog systems, allowing natural responses rather than closed vocabularies would be similarly beneficial. However, today’s systems are only as sensitive to these pragmatic moves as their language model allows. We create and release the first large-scale English language corpus 'Circa’ with 34,268 (polar question, indirect answer) pairs to enable progress on this task. The data was collected via elaborate crowdsourcing, and contains utterances with yes/no meaning, as well as uncertain, middle-ground, and conditional responses. We also present BERT-based neural models to predict such categories for a question-answer pair. 
We find that while transfer learning from entailment works reasonably, performance is not yet sufficient for robust dialog. Our models reach 82-88% accuracy for a 4-class distinction, and 74-85% for 6 classes.",,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.601,38939020 +main.1974,A Rigorous Study on Named Entity Recognition: Can Fine-tuning Pretrained Model Lead to the Promised Land?,Hongyu Lin|Yaojie Lu|Jialong Tang|Xianpei Han|Le Sun|Zhicheng Wei|Nicholas Jing Yuan,"Fine-tuning pretrained model has achieved promising performance on standard NER benchmarks. Generally, these benchmarks are blessed with strong name regularity, high mention coverage and sufficient context diversity. Unfortunately, when scaling NER to open situations, these advantages may no longer exist. And therefore it raises a critical question of whether previous creditable approaches can still work well when facing these challenges. As there is no currently available dataset to investigate this problem, this paper proposes to conduct randomization test on standard benchmarks. Specifically, we erase name regularity, mention coverage and context diversity respectively from the benchmarks, in order to explore their impact on the generalization ability of models. To further verify our conclusions, we also construct a new open NER dataset that focuses on entity types with weaker name regularity and lower mention coverage to verify our conclusion. From both randomization test and empirical experiments, we draw the conclusions that 1) name regularity is critical for the models to generalize to unseen mentions; 2) high mention coverage may undermine the model generalization ability and 3) context patterns may not require enormous data to capture when using pretrained encoders.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.592,38939021 +main.1975,Question Directed Graph Attention Network for Numerical Reasoning over Text,Kunlong Chen|Weidi Xu|Xingyi Cheng|Zou Xiaochuan|Yuyu Zhang|Le Song|Taifeng Wang|Yuan Qi|Wei Chu,"Numerical reasoning over texts, such as addition, subtraction, sorting and counting, is a challenging machine reading comprehension task, since it requires both natural language understanding and arithmetic computation. To address this challenge, we propose a heterogeneous graph representation for the context of the passage and question needed for such reasoning, and design a question directed graph attention network to drive multi-step numerical reasoning over this context graph. Our model, which combines deep learning and graph reasoning, achieves remarkable results in benchmark datasets such as DROP.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.549,38939022 +main.1977,Event Extraction as Machine Reading Comprehension,Jian Liu|Yubo Chen|Kang Liu|Wei Bi|Xiaojiang Liu,"Event extraction (EE) is a crucial information extraction task that aims to extract event information in texts. Previous methods for EE typically model it as a classification task, which are usually prone to the data scarcity problem. In this paper, we propose a new learning paradigm of EE, by explicitly casting it as a machine reading comprehension problem (MRC). Our approach includes an unsupervised question generation process, which can transfer event schema into a set of natural questions, followed by a BERT-based question-answering process to retrieve answers as EE results. 
This learning paradigm enables us to strengthen the reasoning process of EE, by introducing sophisticated models in MRC, and relieve the data scarcity problem, by introducing the large-scale datasets in MRC. The empirical results show that: i) our approach attains state-of-the-art performance by considerable margins over previous methods. ii) Our model is excelled in the data-scarce scenario, for example, obtaining 49.8\% in F1 for event argument extraction with only 1\% data, compared with 2.2\% of the previous method. iii) Our model also fits with zero-shot scenarios, achieving $37.0\%$ and $16\%$ in F1 on two datasets without using any EE training data.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.128,38939023 +main.1986,Revisiting Modularized Multilingual NMT to Meet Industrial Demands,Sungwon Lyu|Bokyung Son|Kichang Yang|Jaekyoung Bae,"The complete sharing of parameters for multilingual translation (1-1) has been the mainstream approach in current research. However, degraded performance due to the capacity bottleneck and low maintainability hinders its extensive adoption in industries. In this study, we revisit the multilingual neural machine translation model that only share modules among the same languages (M2) as a practical alternative to 1-1 to satisfy industrial requirements. Through comprehensive experiments, we identify the benefits of multi-way training and demonstrate that the M2 can enjoy these benefits without suffering from the capacity bottleneck. Furthermore, the interlingual space of the M2 allows convenient modification of the model. By leveraging trained modules, we find that incrementally added modules exhibit better performance than singly trained models. The zero-shot performance of the added modules is even comparable to supervised models. Our findings suggest that the M2 can be a competent candidate for multilingual translation in industries.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.476,38939024 +main.1996,Cold-start and Interpretability: Turning Regular Expressions into Trainable Recurrent Neural Networks,Chengyue Jiang|Yinggong Zhao|Shanbo Chu|Libin Shen|Kewei Tu,"Neural networks can achieve impressive performance on many natural language processing applications, but they typically need large labeled data for training and are not easily interpretable. On the other hand, symbolic rules such as regular expressions are interpretable, require no training, and often achieve decent accuracy; but rules cannot benefit from labeled data when available and hence underperform neural networks in rich-resource scenarios. In this paper, we propose a type of recurrent neural networks called FA-RNNs that combine the advantages of neural networks and regular expression rules. An FA-RNN can be converted from regular expressions and deployed in zero-shot and cold-start scenarios. It can also utilize labeled data for training to achieve improved prediction accuracy. After training, an FA-RNN often remains interpretable and can be converted back into regular expressions. 
We apply FA-RNNs to text classification and observe that FA-RNNs significantly outperform previous neural approaches in both zero-shot and low-resource settings and remain very competitive in rich-resource settings.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.258,38939025 +main.1997,Multilingual Offensive Language Identification with Cross-lingual Embeddings,Tharindu Ranasinghe|Marcos Zampieri,"Offensive content is pervasive in social media and a reason for concern to companies and government organizations. Several studies have been recently published investigating methods to detect the various forms of such content (e.g. hate speech, cyberbulling, and cyberaggression). The clear majority of these studies deal with English partially because most annotated datasets available contain English data. In this paper, we take advantage of English data available by applying cross-lingual contextual word embeddings and transfer learning to make predictions in languages with less resources. We project predictions on comparable data in Bengali, Hindi, and Spanish and we report results of 0.8415 F1 macro for Bengali, 0.8568 F1 macro for Hindi, and 0.7513 F1 macro for Spanish. Finally, we show that our approach compares favorably to the best systems submitted to recent shared tasks on these three languages, confirming the robustness of cross-lingual contextual embeddings and transfer learning for this task.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.470,38939026 +main.2005,Adversarial Semantic Collisions,Congzheng Song|Alexander Rush|Vitaly Shmatikov,"We study \emph{semantic collisions}: texts that are semantically unrelated but judged as similar by NLP models. We develop gradient-based approaches for generating semantic collisions and demonstrate that state-of-the-art models for many tasks which rely on analyzing the meaning and similarity of texts\textemdash including paraphrase identification, document retrieval, response suggestion, and extractive summarization\textemdash are vulnerable to semantic collisions. For example, given a target query, inserting a crafted collision into an irrelevant document can shift its retrieval rank from 1000 to top 3. We show how to generate semantic collisions that evade perplexity-based filtering and discuss other potential mitigations. Our code is available at \url{https://github.com/csong27/collision-bert}.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.344,38939027 +main.2012,De-biased Court's View Generation with Causality,Yiquan Wu|Kun Kuang|Yating Zhang|Xiaozhong Liu|Changlong Sun|Jun Xiao|Yueting Zhuang|Luo Si|Fei Wu,"Court's view generation is a novel but essential task for legal AI, aiming at improving the interpretability of judgment prediction results and enabling automatic legal document generation. While prior text-to-text natural language generation (NLG) approaches can be used to address this problem, neglecting the confounding bias from the data generation mechanism can limit the model performance, and the bias may pollute the learning outcomes. In this paper, we propose a novel Attentional and Counterfactual based Natural Language Generation (AC-NLG) method, consisting of an attentional encoder and a pair of innovative counterfactual decoders. 
The attentional encoder leverages the plaintiff's claim and fact description as input to learn a claim-aware encoder from which the claim-related information in fact description can be emphasized. The counterfactual decoders are employed to eliminate the confounding bias in data and generate judgment-discriminative court's views (both supportive and non-supportive views) by incorporating with a synergistic judgment predictive model. Comprehensive experiments show the effectiveness of our method under both quantitative and qualitative evaluation metrics.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.56,38939028 +main.204,Retrofitting Structure-aware Transformer Language Model for End Tasks,Hao Fei|Yafeng Ren|Donghong Ji,"We consider retrofitting structure-aware Transformer language model for facilitating end tasks by proposing to exploit syntactic distance to encode both the phrasal constituency and dependency connection into the language model. A middle-layer structural learning strategy is leveraged for structure integration, accomplished with main semantic task training under multi-task learning scheme. Experimental results show that the retrofitted structure-aware Transformer language model achieves improved perplexity, meanwhile inducing accurate syntactic phrases. By performing structure-aware fine-tuning, our model achieves significant improvements for both semantic- and syntactic-dependent tasks.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.168,38938663 +main.2040,Learning Explainable Linguistic Expressions with Neural Inductive Logic Programming for Sentence Classification,Prithviraj Sen|Marina Danilevsky|Yunyao Li|Siddhartha Brahma|Matthias Boehm|Laura Chiticariu|Rajasekar Krishnamurthy,"Interpretability of predictive models is becoming increasingly important with growing adoption in the real-world. We present RuleNN, a neural network architecture for learning transparent models for sentence classification. The models are in the form of rules expressed in first-order logic, a dialect with well-defined, human-understandable semantics. More precisely, RuleNN learns linguistic expressions (LE) built on top of predicates extracted using shallow natural language understanding. Our experimental results show that RuleNN outperforms statistical relational learning and other neuro-symbolic methods, and performs comparably with black-box recurrent neural networks. Our user studies confirm that the learned LEs are explainable and capture domain semantics. Moreover, allowing domain experts to modify LEs and instill more domain knowledge leads to human-machine co-creation of models with better performance.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.345,38939029 +main.2042,Personal Information Leakage Detection in Conversations,Qiongkai Xu|Lizhen Qu|Zeyu Gao|Gholamreza Haffari,"The global market size of conversational assistants (chatbots) is expected to grow to USD 9.4 billion by 2024, according to MarketsandMarkets. Despite the wide use of chatbots, leakage of personal information through chatbots poses serious privacy concerns for their users. In this work, we propose to protect personal information by warning users of detected suspicious sentences generated by conversational assistants. The detection task is formulated as an alignment optimization problem and a new dataset PERSONA-LEAKAGE is collected for evaluation. 
In this paper, we propose two novel constrained alignment models, which consistently outperform baseline methods on PERSONA-LEAKAGE. Moreover, we conduct analysis on the behavior of recently proposed personalized chit-chat dialogue systems. The empirical results show that those systems suffer more from personal information disclosure than the widely used Seq2Seq model and the language model. In those cases, a significant number of information leaking utterances can be detected by our models with high precision.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.532,38939030 +main.2048,MAVEN: A Massive General Domain Event Detection Dataset,Xiaozhi Wang|Ziqi Wang|Xu Han|Wangyi Jiang|Rong Han|Zhiyuan Liu|Juanzi Li|Peng Li|Yankai Lin|Jie Zhou,"Event detection (ED), which means identifying event trigger words and classifying event types, is the first and most fundamental step for extracting event knowledge from plain text. Most existing datasets exhibit the following issues that limit further development of ED: (1) Data scarcity. Existing small-scale datasets are not sufficient for training and stably benchmarking increasingly sophisticated modern neural methods. (2) Low coverage. Limited event types of existing datasets cannot well cover general-domain events, which restricts the applications of ED models. To alleviate these problems, we present a MAssive eVENt detection dataset (MAVEN), which contains 4,480 Wikipedia documents, 118,732 event mention instances, and 168 event types. MAVEN alleviates the data scarcity problem and covers much more general event types. We reproduce the recent state-of-the-art ED models and conduct a thorough evaluation on MAVEN. The experimental results show that existing ED methods cannot achieve promising results on MAVEN as on the small datasets, which suggests that ED in the real world remains a challenging task and requires further research efforts. We also discuss further directions for general domain ED with empirical analyses. The source code and dataset can be obtained from https://github.com/THU-KEG/MAVEN-dataset.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.129,38939031 +main.2050,Response Selection for Multi-Party Conversations with Dynamic Topic Tracking,Weishi Wang|Steven C.H. Hoi|Shafiq Joty,"While participants in a multi-party multi-turn conversation simultaneously engage in multiple conversation topics, existing response selection methods are developed mainly focusing on a two-party single-conversation scenario. Hence, the prolongation and transition of conversation topics are ignored by current methods. In this work, we frame response selection as a dynamic topic tracking task to match the topic between the response and relevant conversation context. With this new formulation, we propose a novel multi-task learning framework that supports efficient encoding through large pretrained models with only two utterances at once to perform dynamic topic disentanglement and response selection. We also propose Topic-BERT an essential pretraining step to embed topic information into BERT with self-supervised learning.
Experimental results on the DSTC-8 Ubuntu IRC dataset show state-of-the-art results in response selection and topic disentanglement tasks outperforming existing methods by a good margin.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.533,38939032 +main.2054,PRover: Proof Generation for Interpretable Reasoning over Rules,Swarnadeep Saha|Sayan Ghosh|Shashank Srivastava|Mohit Bansal,"Recent work by Clark et al. (2020) shows that transformers can act as ""soft theorem provers'' by answering questions over explicitly provided knowledge in natural language. In our work, we take a step closer to emulating formal theorem provers, by proposing PRover, an interpretable transformer-based model that jointly answers binary questions over rule-bases and generates the corresponding proofs. Our model learns to predict nodes and edges corresponding to proof graphs in an efficient constrained training paradigm. During inference, a valid proof, satisfying a set of global constraints is generated. We conduct experiments on synthetic, hand-authored, and human-paraphrased rule-bases to show promising results for QA and proof generation, with strong generalization performance. First, PRover generates proofs with an accuracy of 87%, while retaining or improving performance on the QA task, compared to RuleTakers (up to 6% improvement on zero-shot evaluation). Second, when trained on questions requiring lower depths of reasoning, it generalizes significantly better to higher depths (up to 15% improvement). Third, PRover obtains near perfect QA accuracy of 98% using only 40% of the training data. However, generating proofs for questions requiring higher depths of reasoning becomes challenging, and the accuracy drops to 65% for ""depth 5"", indicating significant scope for future work.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.9,38939033 +main.2055,Translation Artifacts in Cross-lingual Transfer Learning,Mikel Artetxe|Gorka Labaka|Eneko Agirre,"Both human and machine translation play a central role in cross-lingual transfer learning: many multilingual datasets have been created through professional translation services, and using machine translation to translate either the test set or the training set is a widely used transfer technique. In this paper, we show that such translation process can introduce subtle artifacts that have a notable impact in existing cross-lingual models. For instance, in natural language inference, translating the premise and the hypothesis independently can reduce the lexical overlap between them, which current models are highly sensitive to. We show that some previous findings in cross-lingual transfer learning need to be reconsidered in the light of this phenomenon. Based on the gained insights, we also improve the state-of-the-art in XNLI for the translate-test and zero-shot approaches by 4.3 and 2.8 points, respectively.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.618,38939034 +main.2057,Comparative Evaluation of Label Agnostic Selection Bias in Multilingual Hate Speech Datasets,Nedjma Ousidhoum|Yangqiu Song|Dit-Yan Yeung,"Work on bias in hate speech typically aims to improve classification performance while relatively overlooking the quality of the data. We examine selection bias in hate speech in a language and label independent fashion. 
We first use topic models to discover latent semantics in eleven hate speech corpora, then, we present two bias evaluation metrics based on the semantic similarity between topics and search words frequently used to build corpora. We discuss the possibility of revising the data collection process by comparing datasets and analyzing contrastive case studies.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.199,38939035 +main.2058,"Refer, Reuse, Reduce: Grounding Subsequent References in Visual and Conversational Contexts",Ece Takmaz|Mario Giulianelli|Sandro Pezzelle|Arabella Sinclair|Raquel Fernández,"Dialogue participants often refer to entities or situations repeatedly within a conversation, which contributes to its cohesiveness. Subsequent references exploit the common ground accumulated by the interlocutors and hence have several interesting properties, namely, they tend to be shorter and reuse expressions that were effective in previous mentions. In this paper, we tackle the generation of first and subsequent references in visually grounded dialogue. We propose a generation model that produces referring utterances grounded in both the visual and the conversational context. To assess the referring effectiveness of its output, we also implement a reference resolution system. Our experiments and analyses show that the model produces better, more effective referring utterances than a model not grounded in the dialogue context, and generates subsequent references that exhibit linguistic patterns akin to humans.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.353,38939036 +main.2061,Translation Quality Estimation by Jointly Learning to Score and Rank,Jingyi Zhang|Josef van Genabith,"The translation quality estimation (QE) task, particularly the QE as a Metric task, aims to evaluate the general quality of a translation based on the translation and the source sentence without using reference translations. Supervised learning of this QE task requires human evaluation of translation quality as training data. Human evaluation of translation quality can be performed in different ways, including assigning an absolute score to a translation or ranking different translations. In order to make use of different types of human evaluation data for supervised learning, we present a multi-task learning QE model that jointly learns two tasks: score a translation and rank two translations. Our QE model exploits cross-lingual sentence embeddings from pre-trained multilingual language models. We obtain new state-of-the-art results on the WMT 2019 QE as a Metric task and outperform sentBLEU on the WMT 2019 Metrics task.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.205,38939037 +main.2064,Automatic Extraction of Rules Governing Morphological Agreement,Aditi Chaudhary|Antonios Anastasopoulos|Adithya Pratapa|David R. Mortensen|Zaid Sheikh|Yulia Tsvetkov|Graham Neubig,"Creating a descriptive grammar of a language is an indispensable step for language documentation and preservation. However, at the same time it is a tedious, time-consuming task. In this paper, we take steps towards automating this process by devising an automated framework for extracting a first-pass grammatical specification from raw text in a concise, human- and machine-readable format. 
We focus on extracting rules describing agreement, a morphosyntactic phenomenon at the core of the grammars of many of the world’s languages. We apply our framework to all languages included in the Universal Dependencies project, with promising results. Using cross-lingual transfer, even with no expert annotations in the language of interest, our framework extracts a grammatical specification which is nearly equivalent to those created with large amounts of gold-standard annotated data. We confirm this finding with human expert evaluations of the rules that our framework produces, which have an average accuracy of 78%. We release an interface demonstrating the extracted rules at https://neulab.github.io/lase/",,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.emnlp-main.422,38939038 +main.2066,Improving Out-of-Scope Detection in Intent Classification by Using Embeddings of the Word Graph Space of the Classes,Paulo Cavalin|Victor Henrique Alves Ribeiro|Ana Appel|Claudio Pinhanez,"This paper explores how intent classification can be improved by representing the class labels not as a discrete set of symbols but as a space where the word graphs associated to each class are mapped using typical graph embedding techniques. The approach, inspired by a previous algorithm used for an inverse dictionary task, allows the classification algorithm to take in account inter-class similarities provided by the repeated occurrence of some words in the training examples of the different classes. The classification is carried out by mapping text embeddings to the word graph embeddings of the classes. Focusing solely on improving the representation of the class label set, we show in experiments conducted in both private and public intent classification datasets, that better detection of out-of-scope examples (OOS) is achieved and, as a consequence, that the overall accuracy of intent classification is also improved. In particular, using the recently-released \emph{Larson dataset}, an error of about 9.9% has been achieved for OOS detection, beating the previous state-of-the-art result by more than 31 percentage points.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.324,38939039 +main.2068,Textual Data Augmentation for Efficient Active Learning on Tiny Datasets,Husam Quteineh|Spyridon Samothrakis|Richard Sutcliffe,"In this paper we propose a novel data augmentation approach where guided outputs of a language generation model, e.g. GPT-2, when labeled, can improve the performance of text classifiers through an active learning process. We transform the data generation task into an optimization problem which maximizes the usefulness of the generated output, using Monte Carlo Tree Search (MCTS) as the optimization strategy and incorporating entropy as one of the optimization criteria. We test our approach against a Non-Guided Data Generation (NGDG) process that does not optimize for a reward function. Starting with a small set of data, our results show an increased performance with MCTS of 26% on the TREC-6 Questions dataset, and 10% on the Stanford Sentiment Treebank SST-2 dataset. 
Compared with NGDG, we are able to achieve increases of 3% and 5% on TREC-6 and SST-2.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.600,38939040 +main.207,Span-based Discontinuous Constituency Parsing: A Family of Exact Chart-based Algorithms with Time Complexities from O(n^6) Down to O(n^3),Caio Corro,"We introduce a novel chart-based algorithm for span-based parsing of discontinuous constituency trees of block degree two, including ill-nested structures. In particular, we show that we can build variants of our parser with smaller search spaces and time complexities ranging from O(n^6) down to O(n^3). The cubic time variant covers 98% of constituents observed in linguistic treebanks while having the same complexity as continuous constituency parsers. We evaluate our approach on German and English treebanks (Negra, Tiger, and DPTB) and report state-of-the-art results in the fully supervised setting. We also experiment with pre-trained word embeddings and Bert-based neural networks.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.219,38938664 +main.2070,Variational Hierarchical Dialog Autoencoder for Dialog State Tracking Data Augmentation,Kang Min Yoo|Hanbit Lee|Franck Dernoncourt|Trung Bui|Walter Chang|Sang-goo Lee,"Recent works have shown that generative data augmentation, where synthetic samples generated from deep generative models complement the training dataset, benefit NLP tasks. In this work, we extend this approach to the task of dialog state tracking for goal-oriented dialogs. Due to the inherent hierarchical structure of goal-oriented dialogs over utterances and related annotations, the deep generative model must be capable of capturing the coherence among different hierarchies and types of dialog features. We propose the Variational Hierarchical Dialog Autoencoder (VHDA) for modeling the complete aspects of goal-oriented dialogs, including linguistic features and underlying structured annotations, namely speaker information, dialog acts, and goals. The proposed architecture is designed to model each aspect of goal-oriented dialogs using inter-connected latent variables and learns to generate coherent goal-oriented dialogs from the latent spaces. To overcome training issues that arise from training complex variational models, we propose appropriate training strategies. Experiments on various dialog datasets show that our model improves the downstream dialog trackers’ robustness via generative data augmentation. We also discover additional benefits of our unified approach to modeling goal-oriented dialogs – dialog response generation and user simulation, where our model outperforms previous strong baselines.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.274,38939041 +main.2072,PowerTransformer: Unsupervised Controllable Revision for Biased Language Correction,Xinyao Ma|Maarten Sap|Hannah Rashkin|Yejin Choi,"Unconscious biases continue to be prevalent in modern text and media, calling for algorithms that can assist writers with bias correction. For example, a female character in a story is often portrayed as passive and powerless (""_She daydreams about being a doctor_"") while a man is portrayed as more proactive and powerful (""_He pursues his dream of being a doctor_""). We formulate **Controllable Debiasing**, a new revision task that aims to rewrite a given text to correct the implicit and potentially undesirable bias in character portrayals.
We then introduce PowerTransformer as an approach that debiases text through the lens of connotation frames (Sap et al., 2017), which encode pragmatic knowledge of implied power dynamics with respect to verb predicates. One key challenge of our task is the lack of parallel corpora. To address this challenge, we adopt an unsupervised approach using auxiliary supervision with related tasks such as paraphrasing and self-supervision based on a reconstruction loss, building on pretrained language models. Through comprehensive experiments based on automatic and human evaluations, we demonstrate that our approach outperforms ablations and existing methods from related tasks. Furthermore, we demonstrate the use of PowerTransformer as a step toward mitigating the well-documented gender bias in character portrayal in movie scripts.",,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.602,38939042 +main.2075,TED-CDB: A Large-Scale Chinese Discourse Relation Dataset on TED Talks,Wanqiu Long|Bonnie Webber|Deyi Xiong,"As different genres are known to differ in their communicative properties and as previously, for Chinese, discourse relations have only been annotated over news text, we have created the TED-CDB dataset. TED-CDB comprises a large set of TED talks in Chinese that have been manually annotated according to the goals and principles of Penn Discourse Treebank, but adapted to features that are not present in English. It serves as a unique Chinese corpus of spoken discourse. Benchmark experiments show that TED-CDB poses a challenge for state-of-the-art discourse relation classifiers, whose F1 performance on 4-way classification is \textless 60\%. This is a dramatic drop of 35\% from performance on the news text in the Chinese Discourse Treebank. Transfer learning experiments have been carried out with the TED-CDB for both same-language cross-domain transfer and same-domain cross-language transfer. Both demonstrate that the TED-CDB can improve the performance of systems being developed for languages other than Chinese and would be helpful for insufficient or unbalanced data in other corpora. The dataset and our Chinese annotation guidelines will be made freely available.",,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.223,38939043 +main.2076,When Hearst Is Not Enough: Improving Hypernymy Detection from Corpus with Distributional Models,Changlong Yu|Jialong Han|Peifeng Wang|Yangqiu Song|Hongming Zhang|Wilfred Ng|Shuming Shi,"We address hypernymy detection, i.e., whether an is-a relationship exists between words (x ,y), with the help of large textual corpora. Most conventional approaches to this task have been categorized to be either pattern-based or distributional. Recent studies suggest that pattern-based ones are superior, if large-scale Hearst pairs are extracted and fed, with the sparsity of unseen (x ,y) pairs relieved. However, they become invalid in some specific sparsity cases, where x or y is not involved in any pattern. For the first time, this paper quantifies the non-negligible existence of those specific cases. We also demonstrate that distributional methods are ideal to make up for pattern-based ones in such cases. We devise a complementary framework, under which a pattern-based and a distributional model collaborate seamlessly in cases which they each prefer. 
On several benchmark datasets, our framework demonstrates improvements that are both competitive and explainable.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.502,38939044 +main.2078,Multi-Stage Pretraining for Low-Resource Domain Adaptation,Rong Zhang|Revanth Gangi Reddy|Md Arafat Sultan|Vittorio Castelli|Anthony Ferritto|Radu Florian|Efsun Sarioglu Kayi|Salim Roukos|Avi Sil|Todd Ward,"Transfer learning techniques are particularly useful for NLP tasks where a sizable amount of high-quality annotated data is difficult to obtain. Current approaches directly adapt a pretrained language model (LM) on in-domain text before fine-tuning to downstream tasks. We show that extending the vocabulary of the LM with domain-specific terms leads to further gains. To a bigger effect, we utilize structure in the unlabeled data to create auxiliary synthetic tasks, which helps the LM transfer to downstream tasks. We apply these approaches incrementally on a pretrained Roberta-large LM and show considerable performance gain on three tasks in the IT domain: Extractive Reading Comprehension, Document Ranking and Duplicate Question Detection.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.440,38939045 +main.2083,An Effective Framework for Weakly-Supervised Phrase Grounding,Qinxin Wang|Hao Tan|Sheng Shen|Michael Mahoney|Zhewei Yao,"Phrase localization is a task that studies the mapping from textual phrases to regions of an image. Given difficulties in annotating phrase-to-object datasets at scale, we develop a Multimodal Alignment Framework (MAF) to leverage more widely-available caption-image datasets, which can then be used as a form of weak supervision. We first present algorithms to model phrase-object relevance by leveraging fine-grained visual representations and visually-aware language representations. By adopting a contrastive objective, our method uses information in caption-image pairs to boost the performance in weakly-supervised scenarios. Experiments conducted on the widely-adopted Flickr30k dataset show a significant improvement over existing weakly-supervised methods. With the help of the visually-aware language representations, we can also improve the previous best unsupervised result by 5.56%. We conduct ablation studies to show that both our novel model and our weakly-supervised strategies significantly contribute to our strong results.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.159,38939046 +main.2087,Exploring and Predicting Transferability across NLP Tasks,Tu Vu|Tong Wang|Tsendsuren Munkhdalai|Alessandro Sordoni|Adam Trischler|Andrew Mattarella-Micke|Subhransu Maji|Mohit Iyyer,"Recent advances in NLP demonstrate the effectiveness of training large-scale language models and transferring them to downstream tasks. Can fine-tuning these models on tasks other than language modeling further improve performance? In this paper, we conduct an extensive study of the transferability between 33 NLP tasks across three broad classes of problems (text classification, question answering, and sequence labeling). Our results show that transfer learning is more beneficial than previously thought, especially when target task data is scarce, and can improve performance even with low-data source tasks that differ substantially from the target task (e.g., part-of-speech tagging transfers well to the DROP QA dataset). 
We also develop task embeddings that can be used to predict the most transferable source tasks for a given target task, and we validate their effectiveness in experiments controlled for source and target data size. Overall, our experiments reveal that factors such as data size, task and domain similarity, and task complexity all play a role in determining transferability.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.635,38939047 +main.2094,"BERT Knows Punta Cana Is Not Just Beautiful, It's Gorgeous: Ranking Scalar Adjectives with Contextualised Representations",Aina Garí Soler|Marianna Apidianaki,"Adjectives like pretty, beautiful and gorgeous describe positive properties of the nouns they modify but with different intensity. These differences are important for natural language understanding and reasoning. We propose a novel BERT-based approach to intensity detection for scalar adjectives. We model intensity by vectors directly derived from contextualised representations and show they can successfully rank scalar adjectives. We evaluate our models both intrinsically, on gold standard datasets, and on an Indirect Question Answering task. Our results demonstrate that BERT encodes rich knowledge about the semantics of scalar adjectives, and is able to provide better quality intensity rankings than static embeddings and previous models with access to dedicated resources.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.598,38939048 +main.2098,Improving AMR Parsing with Sequence-to-Sequence Pre-training,Dongqin Xu|Junhui Li|Muhua Zhu|Min Zhang|Guodong Zhou,"In the literature, the research on abstract meaning representation (AMR) parsing is much restricted by the size of human-curated dataset which is critical to build an AMR parser with good performance. To alleviate such data size restriction, pre-trained models have been drawing more and more attention in AMR parsing. However, previous pre-trained models, like BERT, are implemented for general purpose which may not work as expected for the specific task of AMR parsing. In this paper, we focus on sequence-to-sequence (seq2seq) AMR parsing and propose a seq2seq pre-training approach to build pre-trained models in both single and joint way on three relevant tasks, i.e., machine translation, syntactic parsing, and AMR parsing itself. Moreover, we extend the vanilla fine-tuning method to a multi-task learning fine-tuning method that optimizes for the performance of AMR parsing while endeavors to preserve the response of pre-trained models. Extensive experimental results on two English benchmark datasets show that both the single and joint pre-trained models significantly improve the performance (e.g., from 71.5 to 80.2 on AMR 2.0), which reaches the state of the art. The result is very encouraging since we achieve this with seq2seq models rather than complex models. We make our code and model available at https://github.com/xdqkid/S2S-AMR-Parser.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.196,38939049 +main.210,Small but Mighty: New Benchmarks for Split and Rephrase,Li Zhang|Huaiyu Zhu|Siddhartha Brahma|Yunyao Li,"Split and Rephrase is a text simplification task of rewriting a complex sentence into simpler ones. As a relatively new task, it is paramount to ensure the soundness of its evaluation benchmark and metric.
We find that the widely used benchmark dataset universally contains easily exploitable syntactic cues caused by its automatic generation process. Taking advantage of such cues, we show that even a simple rule-based model can perform on par with the state-of-the-art model. To remedy such limitations, we collect and release two crowdsourced benchmark datasets. We not only make sure that they contain significantly more diverse syntax, but also carefully control for their quality according to a well-defined set of criteria. While no satisfactory automatic metric exists, we apply fine-grained manual evaluation based on these criteria using crowdsourcing, showing that our datasets better represent the task and are significantly more challenging for the models.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.91,38938665 +main.2100,Incorporating a Local Translation Mechanism into Non-autoregressive Translation,Xiang Kong|Zhisong Zhang|Eduard Hovy,"In this work, we introduce a novel local autoregressive translation (LAT) mechanism into non-autoregressive translation (NAT) models so as to capture local dependencies among target outputs. Specifically, for each target decoding position, instead of only one token, we predict a short sequence of tokens in an autoregressive way. We further design an efficient merging algorithm to align and merge the output pieces into one final output sequence. We integrate LAT into the conditional masked language model (CMLM) (Ghazvininejad et al., 2019) and similarly adopt iterative decoding. Empirical results on five translation tasks show that compared with CMLM, our method achieves comparable or better performance with fewer decoding iterations, bringing a 2.5x speedup. Further analysis indicates that our method reduces repeated translations and performs better at longer sentences. Our code will be released to the public.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.79,38939050 +main.2112,Interpreting Open-Domain Modifiers: Decomposition of Wikipedia Categories into Disambiguated Property-Value Pairs,Marius Pasca,"This paper proposes an open-domain method for automatically annotating modifier constituents ('20th-century') within Wikipedia categories (20th-century male writers) with properties (date of birth). The annotations offer a semantically-anchored understanding of the role of the constituents in defining the underlying meaning of the categories. In experiments over an evaluation set of Wikipedia categories, the proposed method annotates constituent modifiers as semantically-anchored properties, rather than as mere strings in a previous method. It does so at a better trade-off between precision and recall.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.503,38939051 +main.2114,Towards Detecting and Exploiting Disambiguation Biases in Neural Machine Translation,Denis Emelin|Ivan Titov|Rico Sennrich,"Word sense disambiguation is a well-known source of translation errors in NMT. We posit that some of the incorrect disambiguation choices are due to models' over-reliance on dataset artifacts found in training data, specifically superficial word co-occurrences, rather than a deeper understanding of the source text. We introduce a method for the prediction of disambiguation errors based on statistical data properties, demonstrating its effectiveness across several domains and model types.
Moreover, we develop a simple adversarial attack strategy that minimally perturbs sentences in order to elicit disambiguation errors to further probe the robustness of translation models. Our findings indicate that disambiguation robustness varies substantially between domains and that different models trained on the same data are vulnerable to different attacks.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.616,38939052 +main.2117,Generating Fact Checking Briefs,Angela Fan|Aleksandra Piktus|Fabio Petroni|Guillaume Wenzek|Marzieh Saeidi|Andreas Vlachos|Antoine Bordes|Sebastian Riedel,"Fact checking at scale is difficult---while the number of active fact checking websites is growing, it remains too small for the needs of the contemporary media ecosystem. However, despite good intentions, contributions from volunteers are often error-prone, and thus in practice restricted to claim detection. We investigate how to increase the accuracy and efficiency of fact checking by providing information about the claim before performing the check, in the form of natural language briefs. We investigate passage-based briefs, containing a relevant passage from Wikipedia, entity-centric ones consisting of Wikipedia pages of mentioned entities, and Question-Answering Briefs, with questions decomposing the claim, and their answers. To produce QABriefs, we develop QABriefer, a model that generates a set of questions conditioned on the claim, searches the web for evidence, and generates answers. To train its components, we introduce QABriefDataset We show that fact checking with briefs --- in particular QABriefs --- increases the accuracy of crowdworkers by 10% while slightly decreasing the time taken. For volunteer (unpaid) fact checkers, QABriefs slightly increase accuracy and reduce the time required by around 20%.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.580,38939053 +main.2120,ISAAQ - Mastering Textbook Questions with Pre-trained Transformers and Bottom-Up and Top-Down Attention,Jose Manuel Gomez-Perez|Raúl Ortega,"Textbook Question Answering is a complex task in the intersection of Machine Comprehension and Visual Question Answering that requires reasoning with multimodal information from text and diagrams. For the first time, this paper taps on the potential of transformer language models and bottom-up and top-down attention to tackle the language and visual understanding challenges this task entails. Rather than training a language-visual transformer from scratch we rely on pre-trained transformers, fine-tuning and ensembling. We add bottom-up and top-down attention to identify regions of interest corresponding to diagram constituents and their relationships, improving the selection of relevant visual information for each question and answer options. Our system ISAAQ reports unprecedented success in all TQA question types, with accuracies of 81.36%, 71.11% and 55.12% on true/false, text-only and diagram multiple choice questions. ISAAQ also demonstrates its broad applicability, obtaining state-of-the-art results in other demanding datasets.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.441,38939054 +main.2122,Intrinsic Probing through Dimension Selection,Lucas Torroba Hennigen|Adina Williams|Ryan Cotterell,"Most modern NLP systems make use of pre-trained contextual representations that attain astonishingly high performance on a variety of tasks. 
Such high performance should not be possible unless some form of linguistic structure inheres in these representations, and a wealth of research has sprung up on probing for it. In this paper, we draw a distinction between intrinsic probing, which examines how linguistic information is structured within a representation, and the extrinsic probing popular in prior work, which only argues for the presence of such information by showing that it can be successfully extracted. To enable intrinsic probing, we propose a novel framework based on a decomposable multivariate Gaussian probe that allows us to determine whether the linguistic information in word embeddings is dispersed or focal. We then probe fastText and BERT for various morphosyntactic attributes across 36 languages. We find that most attributes are reliably encoded by only a few neurons, with fastText concentrating its linguistic structure more than BERT.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.15,38939055 +main.2125,Multi-Fact Correction in Abstractive Text Summarization,Yue Dong|Shuohang Wang|Zhe Gan|Yu Cheng|Jackie Chi Kit Cheung|Jingjing Liu,"Pre-trained neural abstractive summarization systems have dominated extractive strategies on news summarization performance, at least in terms of ROUGE. However, system-generated abstractive summaries often face the pitfall of factual inconsistency: generating incorrect facts with respect to the source text. To address this challenge, we propose Span-Fact, a suite of two factual correction models that leverages knowledge learned from question answering models to make corrections in system-generated summaries via span selection. Our models employ single or multi-masking strategies to either iteratively or auto-regressively replace entities in order to ensure semantic consistency w.r.t. the source text, while retaining the syntactic structure of summaries generated by abstractive summarization models. Experiments show that our models significantly boost the factual consistency of system-generated summaries without sacrificing summary quality in terms of both automatic metrics and human evaluation.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.749,38939056 +main.2131,The Secret Is in the Spectra: Predicting Cross-lingual Task Performance with Spectral Similarity Measures,Haim Dubossarsky|Ivan Vulić|Roi Reichart|Anna Korhonen,"Performance in cross-lingual NLP tasks is impacted by the (dis)similarity of languages at hand: e.g., previous work has suggested there is a connection between the expected success of bilingual lexicon induction (BLI) and the assumption of (approximate) isomorphism between monolingual embedding spaces. In this work we present a large-scale study focused on the correlations between monolingual embedding space similarity and task performance, covering thousands of language pairs and four different tasks: BLI, parsing, POS tagging and MT. We hypothesize that statistics of the spectrum of each monolingual embedding space indicate how well they can be aligned. We then introduce several isomorphism measures between two embedding spaces, based on the relevant statistics of their individual spectra. 
We empirically show that (1) language similarity scores derived from such spectral isomorphism measures are strongly associated with performance observed in different cross-lingual tasks, and (2) our spectral-based measures consistently outperform previous standard isomorphism measures, while being computationally more tractable and easier to interpret. Finally, our measures capture complementary information to typologically driven language distance measures, and the combination of measures from the two families yields even higher task performance correlations.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.186,38939057 +main.2133,Argument Pair Extraction from Peer Review and Rebuttal via Multi-task Learning,Liying Cheng|Lidong Bing|Qian Yu|Wei Lu|Luo Si,"Peer review and rebuttal, with rich interactions and argumentative discussions in between, are naturally a good resource to mine arguments. However, few works study both of them simultaneously. In this paper, we introduce a new argument pair extraction (APE) task on peer review and rebuttal in order to study the contents, the structure and the connections between them. We prepare a challenging dataset that contains 4,764 fully annotated review-rebuttal passage pairs from an open review platform to facilitate the study of this task. To automatically detect argumentative propositions and extract argument pairs from this corpus, we cast it as the combination of a sequence labeling task and a text relation classification task. Thus, we propose a multi-task learning framework based on hierarchical LSTM networks. Extensive experiments and analysis demonstrate the effectiveness of our multi-task framework, and also show the challenges of the new task as well as motivate future research directions.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.569,38939058 +main.214,Incomplete Utterance Rewriting as Semantic Segmentation,Qian Liu|Bei Chen|Jian-Guang LOU|Bin Zhou|Dongmei Zhang,"In recent years, the task of incomplete utterance rewriting has attracted increasing attention. Previous works usually shape it as a machine translation task and employ a sequence-to-sequence architecture with a copy mechanism. In this paper, we present a novel and extensive approach, which formulates it as a semantic segmentation task. Instead of generating from scratch, such a formulation introduces edit operations and shapes the problem as prediction of a word-level edit matrix. Benefiting from being able to capture both local and global information, our approach achieves state-of-the-art performance on several public datasets. Furthermore, our approach is four times faster than the standard approach in inference.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.227,38938666 +main.2141,Doc2Dial: A Goal-Oriented Document-Grounded Dialogue Dataset,Song Feng|Hui Wan|Chulaka Gunasekara|Siva Patel|Sachindra Joshi|Luis Lastras,"We introduce doc2dial, a new dataset of goal-oriented dialogues that are grounded in the associated documents. Inspired by how the authors compose documents for guiding end users, we first construct dialogue flows based on the content elements that correspond to higher-level relations across text sections as well as lower-level relations between discourse units within a section. Then we present these dialogue flows to crowd contributors to create conversational utterances.
The dataset includes over 4500 annotated conversations with an average of 14 turns that are grounded in over 450 documents from four domains. Compared to the prior document-grounded dialogue datasets, this dataset covers a variety of dialogue scenes in information-seeking conversations. For evaluating the versatility of the dataset, we introduce multiple dialogue modeling tasks and present baseline approaches.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.652,38939059 +main.215,Multi-turn Response Selection Using Dialogue Dependency Relations,Qi Jia|Yizhu Liu|Siyu Ren|Kenny Zhu|Haifeng Tang,"Multi-turn response selection is a task designed for developing dialogue agents. The performance on this task has improved remarkably with pre-trained language models. However, these models simply concatenate the turns in dialogue history as the input and largely ignore the dependencies between the turns. In this paper, we propose a dialogue extraction algorithm to transform a dialogue history into threads based on their dependency relations. Each thread can be regarded as a self-contained sub-dialogue. We also propose a Thread-Encoder model to encode threads and candidates into compact representations by pre-trained Transformers and finally get the matching score through an attention layer. The experiments show that dependency relations are helpful for dialogue context understanding, and our model outperforms the state-of-the-art baselines on both DSTC7 and DSTC8*, with competitive results on UbuntuV2.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.150,38938667 +main.2151,Within-Between Lexical Relation Classification Using Path-based and Distributional Data,Oren Barkan|Avi Caciularu|Ido Dagan,"We propose the novel Within-Between Relation model for recognizing lexical-semantic relations between words. Our model integrates relational and distributional signals, forming an effective sub-space representation for each relation. We show that the proposed model is competitive and outperforms other baselines across various benchmarks.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.284,38939060 +main.2163,Direct Segmentation Models for Streaming Speech Translation,Javier Iranzo-Sánchez|Adrià Giménez Pastor|Joan Albert Silvestre-Cerdà|Pau Baquero-Arnal|Jorge Civera Saiz|Alfons Juan,"The cascade approach to Speech Translation (ST) is based on a pipeline that concatenates an Automatic Speech Recognition (ASR) system followed by a Machine Translation (MT) system. These systems are usually connected by a segmenter that splits the ASR output into hopefully semantically self-contained chunks to be fed into the MT system. This is especially challenging in the case of streaming ST, where latency requirements must also be taken into account. This work proposes novel segmentation models for streaming ST that incorporate not only textual, but also acoustic information to decide when the ASR output is split into a chunk. An extensive and thorough experimental setup is carried out on the Europarl-ST dataset to prove the contribution of acoustic information to the performance of the segmentation model in terms of BLEU score in a streaming ST scenario.
Finally, comparative results with previous work also show the superiority of the segmentation models proposed in this work.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.206,38939061 +main.2164,Filtering Noisy Dialogue Corpora by Connectivity and Content Relatedness,Reina Akama|Sho Yokoi|Jun Suzuki|Kentaro Inui,"Large-scale dialogue datasets have recently become available for training neural dialogue agents. However, these datasets have been reported to contain a non-negligible number of unacceptable utterance pairs. In this paper, we propose a method for scoring the quality of utterance pairs in terms of their connectivity and relatedness. The proposed scoring method is designed based on findings widely shared in the dialogue and linguistics research communities. We demonstrate that it has a relatively good correlation with the human judgment of dialogue quality. Furthermore, the method is applied to filter out potentially unacceptable utterance pairs from a large-scale noisy dialogue corpus to ensure its quality. We experimentally confirm that training data filtered by the proposed method improves the quality of neural dialogue agents in response generation.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.68,38939062 +main.2167,An Empirical Study on Large-Scale Multi-Label Text Classification Including Few and Zero-Shot Labels,Ilias Chalkidis|Manos Fergadiotis|Sotiris Kotitsas|Prodromos Malakasiotis|Nikolaos Aletras|Ion Androutsopoulos,"Large-scale Multi-label Text Classification (LMTC) has a wide range of Natural Language Processing (NLP) applications and presents interesting challenges. First, not all labels are well represented in the training set, due to the very large label set and the skewed label distributions of LMTC datasets. Also, label hierarchies and differences in human labelling guidelines may affect graph-aware annotation proximity. Finally, the label hierarchies are periodically updated, requiring LMTC models capable of zero-shot generalization. Current state-of-the-art LMTC models employ Label-Wise Attention Networks (LWANs), which (1) typically treat LMTC as flat multi-label classification; (2) may use the label hierarchy to improve zero-shot learning, although this practice is vastly understudied; and (3) have not been combined with pre-trained Transformers (e.g. BERT), which have led to state-of-the-art results in several NLP benchmarks. Here, for the first time, we empirically evaluate a battery of LMTC methods from vanilla LWANs to hierarchical classification approaches and transfer learning, on frequent, few, and zero-shot learning on three datasets from different domains. We show that hierarchical methods based on Probabilistic Label Trees (PLTs) outperform LWANs. Furthermore, we show that Transformer-based approaches outperform the state-of-the-art in two of the datasets, and we propose a new state-of-the-art method which combines BERT with LWAN.
Finally, we propose new models that leverage the label hierarchy to improve few and zero-shot learning, considering on each dataset a graph-aware annotation proximity measure that we introduce.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.607,38939063 +main.2179,COGS: A Compositional Generalization Challenge Based on Semantic Interpretation,Najoung Kim|Tal Linzen,"Natural language is characterized by compositionality: the meaning of a complex expression is constructed from the meanings of its constituent parts. To facilitate the evaluation of the compositional abilities of language processing architectures, we introduce COGS, a semantic parsing dataset based on a fragment of English. The evaluation portion of COGS contains multiple systematic gaps that can only be addressed by compositional generalization; these include new combinations of familiar syntactic structures, or new combinations of familiar words and familiar structures. In experiments with Transformers and LSTMs, we found that in-distribution accuracy on the COGS test set was near-perfect (96--99%), but generalization accuracy was substantially lower (16--35%) and showed high sensitivity to random seed (+-6--8%). These findings indicate that contemporary standard NLP models are limited in their compositional generalization capacity, and position COGS as a good way to measure progress.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.731,38939064 +main.2181,H2KGAT: Hierarchical Hyperbolic Knowledge Graph Attention Network,Shen Wang|Xiaokai Wei|Cicero Nogueira dos Santos|Zhiguo Wang|Ramesh Nallapati|Andrew Arnold|Bing Xiang|Philip S. Yu,"Knowledge Graphs encode rich relationships among large number of entities. Embedding entities and relations in low-dimensional space has shed light on representing knowledge graphs and reasoning over them, e.g., predicting missing relations between pairs of entities. Existing knowledge graph embedding approaches concentrate on modeling symmetry/asymmetry, inversion, and composition typed relations but overlook the hierarchical nature of relations. Recent studies have observed that there exist rich semantic hierarchical relations in knowledge graphs such as WordNet, where synsets are linked together in a hierarchy. To fill this gap, in this paper, we propose Hierarchical Hyperbolic Knowledge Graph Attention Network (H2KGAT), a novel knowledge graph embedding framework, which is able to better model and infer hierarchical relation patterns. Specifically, H2KGAT defines each entity in a hyperbolic polar embedding space. In addition, we propose an attentional neural context aggregator to enhance embedding learning, which can adaptively integrate the relational context. Our empirical study offers insights into the efficacy of modeling the semantic hierarchies in knowledge graphs, and we achieve significant performance gains compared to existing state-of-the-art methods on benchmark datasets for link prediction task, particularly at low dimensionality.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.401,38939065 +main.2198,Consistency of a Recurrent Language Model with Respect to Incomplete Decoding,Sean Welleck|Ilia Kulikov|Jaedeok Kim|Richard Yuanzhe Pang|Kyunghyun Cho,"Despite strong performance on a variety of tasks, neural sequence models trained with maximum likelihood have been shown to exhibit issues such as length bias and degenerate repetition. 
We study the related issue of receiving infinite-length sequences from a recurrent language model when using common decoding algorithms. To analyze this issue, we first define inconsistency of a decoding algorithm, meaning that the algorithm can yield an infinite-length sequence that has zero probability under the model. We prove that commonly used incomplete decoding algorithms – greedy search, beam search, top-k sampling, and nucleus sampling – are inconsistent, despite the fact that recurrent language models are trained to produce sequences of finite length. Based on these insights, we propose two remedies which address inconsistency: consistent variants of top-k and nucleus sampling, and a self-terminating recurrent language model. Empirical results show that inconsistency occurs in practice, and that the proposed methods prevent inconsistency.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.448,38939066 +main.2205,MEGA RST Discourse Treebanks with Structure and Nuclearity from Scalable Distant Sentiment Supervision,Patrick Huber|Giuseppe Carenini,"The lack of large and diverse discourse treebanks hinders the application of data-driven approaches, such as deep-learning, to RST-style discourse parsing. In this work, we present a novel scalable methodology to automatically generate discourse treebanks using distant supervision from sentiment annotated datasets, creating and publishing MEGA-DT, a new large-scale discourse-annotated corpus. Our approach generates discourse trees incorporating structure and nuclearity for documents of arbitrary length by relying on an efficient heuristic beam-search strategy, extended with a stochastic component. Experiments on multiple datasets indicate that a discourse parser trained on our MEGA-DT treebank delivers promising inter-domain performance gains when compared to parsers trained on human-annotated discourse corpora.",,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.603,38939067 +main.2208,Learning to Pronounce Chinese without a Pronunciation Dictionary,Christopher Chu|Scot Fang|Kevin Knight,"We demonstrate a program that learns to pronounce Chinese text in Mandarin, without a pronunciation dictionary. From non-parallel streams of Chinese characters and Chinese pinyin syllables, it establishes a many-to-many mapping between characters and pronunciations. Using unsupervised methods, the program effectively deciphers writing into speech. Its token-level character-to-syllable accuracy is 89%, which significantly exceeds the 22% accuracy of prior work.",,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.emnlp-main.458,38939068 +main.2209,Parallel Interactive Networks for Multi-Domain Dialogue State Generation,Junfan Chen|Richong Zhang|Yongyi Mao|Jie Xu,"The dependencies between system and user utterances in the same turn and across different turns are not fully considered in existing multidomain dialogue state tracking (MDST) models. In this study, we argue that the incorporation of these dependencies is crucial for the design of MDST and propose Parallel Interactive Networks (PIN) to model these dependencies. Specifically, we integrate an interactive encoder to jointly model the in-turn dependencies and cross-turn dependencies. The slot-level context is introduced to extract more expressive features for different slots. And a distributed copy mechanism is utilized to selectively copy words from historical system utterances or historical user utterances. 
Empirical studies demonstrated the superiority of the proposed PIN model.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.151,38939069 +main.2212,Bridging the Gap between Prior and Posterior Knowledge Selection for Knowledge-Grounded Dialogue Generation,Xiuyi Chen|Fandong Meng|Peng Li|Feilong Chen|Shuang Xu|Bo Xu|Jie Zhou,"Knowledge selection plays an important role in knowledge-grounded dialogue, which is a challenging task to generate more informative responses by leveraging external knowledge. Recently, latent variable models have been proposed to deal with the diversity of knowledge selection by using both prior and posterior distributions over knowledge and achieve promising performance. However, these models suffer from a huge gap between prior and posterior knowledge selection. Firstly, the prior selection module may not learn to select knowledge properly because of lacking the necessary posterior information. Secondly, latent variable models suffer from the exposure bias that dialogue generation is based on the knowledge selected from the posterior distribution at training but from the prior distribution at inference. Here, we deal with these issues on two aspects: (1) We enhance the prior selection module with the necessary posterior information obtained from the specially designed Posterior Information Prediction Module (PIPM); (2) We propose a Knowledge Distillation Based Training Strategy (KDBTS) to train the decoder with the knowledge selected from the prior distribution, removing the exposure bias of knowledge selection. Experimental results on two knowledge-grounded dialogue datasets show that both PIPM and KDBTS achieve performance improvement over the state-of-the-art latent variable model and their combination shows further improvement.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.275,38939070 +main.2215,"When BERT Plays the Lottery, All Tickets Are Winning",Sai Prasanna|Anna Rogers|Anna Rumshisky,"Large Transformer-based models were shown to be reducible to a smaller number of self-attention heads and layers. We consider this phenomenon from the perspective of the lottery ticket hypothesis, using both structured and magnitude pruning. For fine-tuned BERT, we show that (a) it is possible to find subnetworks achieving performance that is comparable with that of the full model, and (b) similarly-sized subnetworks sampled from the rest of the model perform worse. Strikingly, with structured pruning even the worst possible subnetworks remain highly trainable, indicating that most pre-trained BERT weights are potentially useful. We also study the 'good' subnetworks to see if their success can be attributed to superior linguistic knowledge, but find them unstable, and not explained by meaningful self-attention patterns.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.259,38939071 +main.2216,Solving Historical Dictionary Codes with a Neural Language Model,Christopher Chu|Raphael Valenti|Kevin Knight,"We solve difficult word-based substitution codes by constructing a decoding lattice and searching that lattice with a neural language model. We apply our method to a set of enciphered letters exchanged between US Army General James Wilkinson and agents of the Spanish Crown in the late 1700s and early 1800s, obtained from the US Library of Congress.
We are able to decipher 75.1% of the cipher-word tokens correctly.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.471,38939072 +main.2218,Digital Voicing of Silent Speech,David Gaddy|Dan Klein,"In this paper, we consider the task of digitally voicing silent speech, where silently mouthed words are converted to audible speech based on electromyography (EMG) sensor measurements that capture muscle impulses. While prior work has focused on training speech synthesis models from EMG collected during vocalized speech, we are the first to train from EMG collected during silently articulated speech. We introduce a method of training on silent EMG by transferring audio targets from vocalized to silent signals. Our method greatly improves intelligibility of audio generated from silent EMG compared to a baseline that only trains with vocalized data, decreasing transcription word error rate from 64% to 4% in one data condition and 88% to 68% in another. To spur further development on this task, we share our new dataset of silent and vocalized facial EMG measurements.",,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.445,38939073 +main.2221,DORB: Dynamically Optimizing Multiple Rewards with Bandits,Ramakanth Pasunuru|Han Guo|Mohit Bansal,"Policy gradients-based reinforcement learning has proven to be a promising approach for directly optimizing non-differentiable evaluation metrics for language generation tasks. However, optimizing for a specific metric reward leads to improvements in mostly that metric only, suggesting that the model is gaming the formulation of that metric in a particular way without often achieving real qualitative improvements. Hence, it is more beneficial to make the model optimize multiple diverse metric rewards jointly. While appealing, this is challenging because one needs to manually decide the importance and scaling weights of these metric rewards. Further, it is important to consider using a dynamic combination and curriculum of metric rewards that flexibly changes over time. Considering the above aspects, in our work, we automate the optimization of multiple metric rewards simultaneously via a multi-armed bandit approach (DORB), where at each round, the bandit chooses which metric reward to optimize next, based on expected arm gains. We use the Exp3 algorithm for bandits and formulate two approaches for bandit rewards: (1) Single Multi-reward Bandit (SM-Bandit); (2) Hierarchical Multi-reward Bandit (HM-Bandit). We empirically show the effectiveness of our approaches via various automatic metrics and human evaluation on two important NLG tasks: question generation and data-to-text generation. Finally, we present interpretable analyses of the learned bandit curriculum over the optimized rewards.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.625,38939074 +main.2225,Deep Attentive Learning for Stock Movement Prediction from Social Media Text and Company Correlations,Ramit Sawhney|Shivam Agarwal|Arnav Wadhwa|Rajiv Ratn Shah,"In the financial domain, risk modeling and profit generation heavily rely on the sophisticated and intricate stock movement prediction task. Stock forecasting is complex, given the stochastic dynamics and non-stationary behavior of the market. Stock movements are influenced by varied factors beyond the conventionally studied historical prices, such as social media and correlations among stocks. 
The rising ubiquity of online content and knowledge mandates an exploration of models that factor in such multimodal signals for accurate stock forecasting. We introduce an architecture that achieves a potent blend of chaotic temporal signals from financial data, social media, and inter-stock relationships via a graph neural network in a hierarchical temporal fashion. Through experiments on real-world S&P 500 index data and English tweets, we show the practical applicability of our model as a tool for investment decision making and trading.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.676,38939075 +main.2228,Learning to Explain: Datasets and Models for Identifying Valid Reasoning Chains in Multihop Question-Answering,Harsh Jhamtani|Peter Clark,"Despite the rapid progress in multihop question-answering (QA), models still have trouble explaining why an answer is correct, with limited explanation training data available to learn from. To address this, we introduce three explanation datasets in which explanations formed from corpus facts are annotated. Our first dataset, eQASC, contains over 98K explanation annotations for the multihop question answering dataset QASC, and is the first that annotates multiple candidate explanations for each answer. The second dataset, eQASC-perturbed, is constructed by crowd-sourcing perturbations (while preserving their validity) of a subset of explanations in QASC, to test consistency and generalization of explanation prediction models. The third dataset, eOBQA, is constructed by adding explanation annotations to the OBQA dataset to test generalization of models trained on eQASC. We show that this data can be used to significantly improve explanation quality (+14% absolute F1 over a strong retrieval baseline) using a BERT-based classifier, but still behind the upper bound, offering a new challenge for future research. We also explore a delexicalized chain representation in which repeated noun phrases are replaced by variables, thus turning them into generalized reasoning chains (for example: ""X is a Y"" AND ""Y has Z"" IMPLIES ""X has Z""). We find that generalized chains maintain performance while also being more robust to certain perturbations. Code and datasets can be found at https://allenai.org/data/eqasc.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.10,38939076 +main.2238,With Little Power Comes Great Responsibility,Dallas Card|Peter Henderson|Urvashi Khandelwal|Robin Jia|Kyle Mahowald|Dan Jurafsky,"Despite its importance to experimental design, statistical power (the probability that, given a real effect, an experiment will reject the null hypothesis) has largely been ignored by the NLP community. Underpowered experiments make it more difficult to discern the difference between statistical noise and meaningful model improvements, and increase the chances of exaggerated findings. By meta-analyzing a set of existing NLP papers and datasets, we characterize typical power for a variety of settings and conclude that underpowered experiments are common in the NLP literature. In particular, for several tasks in the popular GLUE benchmark, small test sets mean that most attempted comparisons to state of the art models will not be adequately powered. Similarly, based on reasonable assumptions, we find that the most typical experimental design for human rating studies will be underpowered to detect small model differences, of the sort that are frequently studied.
For machine translation, we find that typical test sets of 2000 sentences have approximately 75% power to detect differences of 1 BLEU point. To improve the situation going forward, we give an overview of best practices for power analysis in NLP and release a series of notebooks to assist with future power analyses.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.745,38939077 +main.2251,With More Contexts Comes Better Performance: Contextualized Sense Embeddings for All-Round Word Sense Disambiguation,Bianca Scarlini|Tommaso Pasini|Roberto Navigli,"Contextualized word embeddings have been employed effectively across several tasks in Natural Language Processing, as they have proved to carry useful semantic information. However, it is still hard to link them to structured sources of knowledge. In this paper we present ARES (context-AwaRe Embeddings of Senses), a semi-supervised approach to producing sense embeddings for the lexical meanings within a lexical knowledge base that lie in a space that is comparable to that of contextualized word vectors. ARES representations enable a simple 1 Nearest-Neighbour algorithm to outperform state-of-the-art models, not only in the English Word Sense Disambiguation task, but also in the multilingual one, whilst training on sense-annotated data in English only. We further assess the quality of our embeddings in the Word-in-Context task, where, when used as an external source of knowledge, they consistently improve the performance of a neural model, leading it to compete with other more complex architectures. ARES embeddings for all WordNet concepts and the automatically-extracted contexts used for creating the sense representations are freely available at http://sensembert.org/ares.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.285,38939078 +main.2253,Multi-Step Inference for Reasoning over Paragraphs,Jiangming Liu|Matt Gardner|Shay B. Cohen|Mirella Lapata,"Complex reasoning over text requires understanding and chaining together free-form predicates and logical connectives. Prior work has largely tried to do this either symbolically or with black-box transformers. We present a middle ground between these two extremes: a compositional model reminiscent of neural module networks that can perform chained logical reasoning. This model first finds relevant sentences in the context and then chains them together using neural modules. Our model gives significant performance improvements (up to 29% relative error reduction when combined with a reranker) on ROPES, a recently-introduced complex reasoning dataset.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.245,38939079 +main.2258,What Do Models Learn from Question Answering Datasets?,Priyanka Sen|Amir Saffari,"While models have reached superhuman performance on popular question answering (QA) datasets such as SQuAD, they have yet to outperform humans on the task of question answering itself. In this paper, we investigate if models are learning reading comprehension from QA datasets by evaluating BERT-based models across five datasets. We evaluate models on their generalizability to out-of-domain examples, responses to missing or incorrect data, and ability to handle question variations. We find that no single dataset is robust to all of our experiments and identify shortcomings in both datasets and evaluation methods. 
Following our analysis, we make recommendations for building future QA datasets that better evaluate the task of question answering through reading comprehension. We also release code to convert QA datasets to a shared format for easier experimentation at https://github.com/amazon-research/qa-dataset-converter",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.190,38939080 +main.2261,Multi-resolution Annotations for Emoji Prediction,Weicheng Ma|Ruibo Liu|Lili Wang|Soroush Vosoughi,"Emojis are able to express various linguistic components, including emotions, sentiments, events, etc. Predicting the proper emojis associated with text provides a way to summarize the text accurately, and it has been proven to be a good auxiliary task to many Natural Language Understanding (NLU) tasks. Labels in existing emoji prediction datasets are all passage-based and are usually under the multi-class classification setting. However, in many cases, one single emoji cannot fully cover the theme of a piece of text. It is thus useful to infer the part of text related to each emoji. The lack of multi-label and aspect-level emoji prediction datasets is one of the bottlenecks for this task. This paper annotates an emoji prediction dataset with passage-level multi-class/multi-label, and aspect-level multi-class annotations. We also present a novel annotation method with which we generate the aspect-level annotations. The annotations are generated heuristically, taking advantage of the self-attention mechanism in Transformer networks. We validate the annotations both automatically and manually to ensure their quality. We also benchmark the dataset with a pre-trained BERT model.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.542,38939081 +main.2268,"The Curse of Performance Instability in Analysis Datasets: Consequences, Source, and Suggestions",Xiang Zhou|Yixin Nie|Hao Tan|Mohit Bansal,"We find that the performance of state-of-the-art models on Natural Language Inference (NLI) and Reading Comprehension (RC) analysis/stress sets can be highly unstable. This raises three questions: (1) How will the instability affect the reliability of the conclusions drawn based on these analysis sets? (2) Where does this instability come from? (3) How should we handle this instability and what are some potential solutions? For the first question, we conduct a thorough empirical study over analysis sets and find that in addition to the unstable final performance, the instability exists all along the training curve. We also observe lower-than-expected correlations between the analysis validation set and standard validation set, questioning the effectiveness of the current model-selection routine. Next, to answer the second question, we give both theoretical explanations and empirical evidence regarding the source of the instability, demonstrating that the instability mainly comes from high inter-example correlations within analysis sets. Finally, for the third question, we discuss an initial attempt to mitigate the instability and suggest guidelines for future work such as reporting the decomposed variance for more interpretable results and fair comparison across models.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.659,38939082 +main.2271,Does the Objective Matter? 
Comparing Training Objectives for Pronoun Resolution,Yordan Yordanov|Oana-Maria Camburu|Vid Kocijan|Thomas Lukasiewicz,"Hard cases of pronoun resolution have been used as a long-standing benchmark for commonsense reasoning. In the recent literature, pre-trained language models have been used to obtain state-of-the-art results on pronoun resolution. Overall, four categories of training and evaluation objectives have been introduced. The variety of training datasets and pre-trained language models used in these works makes it unclear whether the choice of training objective is critical. In this work, we make a fair comparison of the performance and seed-wise stability of four models that represent the four categories of objectives. Our experiments show that the objective of sequence ranking performs the best in-domain, while the objective of semantic similarity between candidates and pronoun performs the best out-of-domain. We also observe a seed-wise instability of the model using sequence ranking, which is not the case when the other objectives are used.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.402,38939083 +main.2273,Finding Domain-Specific Grounding in Noisy Visual-Textual Documents,Gregory Yauney|Jack Hessel|David Mimno,"Images can give us insights into the contextual meanings of words, but current image-text grounding approaches require detailed annotations. Such granular annotation is rare, expensive, and unavailable in most domain-specific contexts. In contrast, unlabeled multi-image, multi-sentence documents are abundant. Can lexical grounding be learned from such documents, even though they have significant lexical and visual overlap? Working with a case study dataset of real estate listings, we demonstrate the challenge of distinguishing highly correlated grounded terms, such as 'kitchen' and 'bedroom', and introduce metrics to assess this document similarity. We present a simple unsupervised clustering-based method that increases precision and recall beyond object detection and image tagging baselines when evaluated on labeled subsets of the dataset. The proposed method is particularly effective for local contextual meanings of a word, for example associating 'granite' with countertops in the real estate dataset and with rocky landscapes in a Wikipedia dataset.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.160,38939084 +main.2278,LAReQA: Language-agnostic Answer Retrieval from a Multilingual Pool,Uma Roy|Noah Constant|Rami Al-Rfou|Aditya Barua|Aaron Phillips|Yinfei Yang,"We present LAReQA, a challenging new benchmark for language-agnostic answer retrieval from a multilingual candidate pool. Unlike previous cross-lingual tasks, LAReQA tests for 'strong' cross-lingual alignment, requiring semantically related cross-language pairs to be closer in representation space than unrelated same-language pairs. This level of alignment is important for the practical task of cross-lingual information retrieval. Building on multilingual BERT (mBERT), we study different strategies for achieving strong alignment. We find that augmenting training data via machine translation is effective, and improves significantly over using mBERT out-of-the-box. Interestingly, model performance on zero-shot variants of our task that only target 'weak' alignment is not predictive of performance on LAReQA.
This finding underscores our claim that language-agnostic retrieval is a substantively new kind of cross-lingual evaluation, and suggests that measuring both weak and strong alignment will be important for improving cross-lingual systems going forward. We release our dataset and evaluation code at https://github.com/google-research-datasets/lareqa.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.477,38939085 +main.2281,Spot the Bot: A Robust and Efficient Framework for the Evaluation of Conversational Dialogue Systems,Jan Deriu|Don Tuggener|Pius von Däniken|Jon Ander Campos|Alvaro Rodrigo|Thiziri Belkacem|Aitor Soroa|Eneko Agirre|Mark Cieliebak,"The lack of time-efficient and reliable evaluation methods is hampering the development of conversational dialogue systems (chat bots). Evaluations that require humans to converse with chat bots are time and cost intensive, put high cognitive demands on the human judges, and tend to yield low quality results. In this work, we introduce Spot The Bot, a cost-efficient and robust evaluation framework that replaces human-bot conversations with conversations between bots. Human judges then only annotate for each entity in a conversation whether they think it is human or not (assuming there are human participants in these conversations). These annotations then allow us to rank chat bots regarding their ability to mimic conversational behaviour of humans. Since we expect that all bots are eventually recognized as such, we incorporate a metric that measures which chat bot is able to uphold human-like behavior the longest, i.e. Survival Analysis. This metric has the ability to correlate a bot’s performance to certain of its characteristics (e.g. fluency or sensibleness), yielding interpretable results. The comparably low cost of our framework allows for frequent evaluations of chat bots during their evaluation cycle. We empirically validate our claims by applying Spot The Bot to three domains, evaluating several state-of-the-art chat bots, and drawing comparisons to related work. The framework is released as a ready-to-use tool.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.326,38939086 +main.2289,META: Metadata-Empowered Weak Supervision for Text Classification,Dheeraj Mekala|Xinyang Zhang|Jingbo Shang,"Recent advances in weakly supervised learning enable training high-quality text classifiers by only providing a few user-provided seed words. Existing methods mainly use text data alone to generate pseudo-labels despite the fact that metadata information (e.g., author and timestamp) is widely available across various domains. Strong label indicators exist in the metadata and it has long been overlooked mainly due to the following challenges: (1) metadata is multi-typed, requiring systematic modeling of different types and their combinations, (2) metadata is noisy: some metadata entities (e.g., authors, venues) are more compelling label indicators than others. In this paper, we propose a novel framework, META, which goes beyond the existing paradigm and leverages metadata as an additional source of weak supervision. Specifically, we organize the text data and metadata together into a text-rich network and adopt network motifs to capture appropriate combinations of metadata. Based on seed words, we rank and filter motif instances to distill highly label-indicative ones as “seed motifs”, which provide additional weak supervision.
Following a bootstrapping manner, we train the classifier and expand the seed words and seed motifs iteratively. Extensive experiments and case studies on real-world datasets demonstrate superior performance and significant advantages of leveraging metadata as weak supervision.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.670,38939087 +main.2298,"Not Low-Resource Anymore: Aligner Ensembling, Batch Filtering, and New Datasets for Bengali-English Machine Translation",Tahmid Hasan|Abhik Bhattacharjee|Kazi Samin|Masum Hasan|Madhusudan Basak|M. Sohel Rahman|Rifat Shahriyar,"Despite being the seventh most widely spoken language in the world, Bengali has received much less attention in machine translation literature due to being low in resources. Most publicly available parallel corpora for Bengali are not large enough; and have rather poor quality, mostly because of incorrect sentence alignments resulting from erroneous sentence segmentation, and also because of a high volume of noise present in them. In this work, we build a customized sentence segmenter for Bengali and propose two novel methods for parallel corpus creation on low-resource setups: aligner ensembling and batch filtering. With the segmenter and the two methods combined, we compile a high-quality Bengali-English parallel corpus comprising of 2.75 million sentence pairs, more than 2 million of which were not available before. Training on neural models, we achieve an improvement of more than 9 BLEU score over previous approaches to Bengali-English machine translation. We also evaluate on a new test set of 1000 pairs made with extensive quality control. We release the segmenter, parallel corpus, and the evaluation set, thus elevating Bengali from its low-resource status. To the best of our knowledge, this is the first ever large scale study on Bengali-English machine translation. We believe our study will pave the way for future research on Bengali-English machine translation as well as other low-resource languages. Our data and code are available at https://github.com/csebuetnlp/banglanmt.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.207,38939088 +main.2307,An Information Bottleneck Approach for Controlling Conciseness in Rationale Extraction,Bhargavi Paranjape|Mandar Joshi|John Thickstun|Hannaneh Hajishirzi|Luke Zettlemoyer,"Decisions of complex models for language understanding can be explained by limiting the inputs they are provided to a relevant subsequence of the original text --- a rationale. Models that condition predictions on a concise rationale, while being more interpretable, tend to be less accurate than models that are able to use the entire context. In this paper, we show that it is possible to better manage the trade-off between concise explanations and high task accuracy by optimizing a bound on the Information Bottleneck (IB) objective. Our approach jointly learns an explainer that predicts sparse binary masks over input sentences without explicit supervision, and an end-task predictor that considers only the residual sentences. Using IB, we derive a learning objective that allows direct control of mask sparsity levels through a tunable sparse prior. Experiments on the ERASER benchmark demonstrate significant gains over previous work for both task performance and agreement with human rationales. 
Furthermore, we find that in the semi-supervised setting, a modest amount of gold rationales (25% of training examples with gold masks) can close the performance gap with a model that uses the full input.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.153,38939089 +main.2313,CAT-Gen: Improving Robustness in NLP Models via Controlled Adversarial Text Generation,Tianlu Wang|Xuezhi Wang|Yao Qin|Ben Packer|Kang Li|Jilin Chen|Alex Beutel|Ed Chi,"NLP models are shown to suffer from robustness issues, i.e., a model's prediction can be easily changed under small perturbations to the input. In this work, we present a Controlled Adversarial Text Generation (CAT-Gen) model that, given an input text, generates adversarial texts through controllable attributes that are known to be invariant to task labels. For example, in order to attack a model for sentiment classification over product reviews, we can use the product categories as the controllable attribute which would not change the sentiment of the reviews. Experiments on real-world NLP datasets demonstrate that our method can generate more diverse and fluent adversarial texts, compared to many existing adversarial text generation approaches. We further use our generated adversarial examples to improve models through adversarial training, and we demonstrate that our generated attacks are more robust against model re-training and different model architectures.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.417,38939090 +main.2322,CapWAP: Captioning with a Purpose,Adam Fisch|Kenton Lee|Ming-Wei Chang|Jonathan Clark|Regina Barzilay,"The traditional image captioning task uses generic reference captions to provide textual information about images. Different user populations, however, will care about different visual aspects of images. In this paper, we propose a new task, Captioning with A Purpose (CapWAP). Our goal is to develop systems that can be tailored to be useful for the information needs of an intended population, rather than merely provide generic information about an image. In this task, we use question-answer (QA) pairs---a natural expression of information need---from users, instead of reference captions, for both training and post-inference evaluation. We show that it is possible to use reinforcement learning to directly optimize for the intended information need, by rewarding outputs that allow a question answering model to provide correct answers to sampled user questions. We convert several visual question answering datasets into CapWAP datasets, and demonstrate that under a variety of scenarios our purposeful captioning system learns to anticipate and fulfill specific information needs better than its generic counterparts, as measured by QA performance on user questions from unseen images, when using the caption alone as context.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.705,38939091 +main.233,MedDialog: A Large-scale Medical Dialogue Dataset,Guangtao Zeng|Wenmian Yang|Zeqian Ju|Yue Yang|Sicheng Wang|Ruisi Zhang|Meng Zhou|Jiaqi Zeng|Xiangyu Dong|Ruoyu Zhang|Hongchao Fang|Penghui Zhu|Shu Chen|Pengtao Xie,"Medical dialogue systems are promising in assisting in telemedicine to increase access to healthcare services, improve the quality of patient care, and reduce medical costs. 
To facilitate the research and development of medical dialogue systems, we build large-scale medical dialogue datasets -- MedDialog, which contain 1) a Chinese dataset with 3.4 million conversations between patients and doctors, 11.3 million utterances, 660.2 million tokens, covering 172 specialties of diseases, and 2) an English dataset with 0.26 million conversations, 0.51 million utterances, 44.53 million tokens, covering 96 specialties of diseases. To our best knowledge, MedDialog is the largest medical dialogue dataset to date. We pretrain several dialogue generation models on the Chinese MedDialog dataset, including Transformer, GPT, BERT-GPT, and compare their performance. It is shown that models trained on MedDialog are able to generate clinically correct and doctor-like medical dialogues. We also study the transferability of models trained on MedDialog to low-resource medical dialogue generation tasks. It is shown that via transfer learning which finetunes the models pretrained on MedDialog, the performance on medical dialogue generation tasks with small datasets can be greatly improved, as shown in human evaluation and automatic evaluation. The datasets and code are available at https://github.com/UCSD-AI4H/Medical-Dialogue-System",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.743,38938668 +main.2331,To BERT or Not to BERT: Comparing Task-specific and Task-agnostic Semi-Supervised Approaches for Sequence Tagging,Kasturi Bhattacharjee|Miguel Ballesteros|Rishita Anubhai|Smaranda Muresan|Jie Ma|Faisal Ladhak|Yaser Al-Onaizan,"Leveraging large amounts of unlabeled data using Transformer-like architectures, like BERT, has gained popularity in recent times owing to their effectiveness in learning general representations that can then be further fine-tuned for downstream tasks to much success. However, training these models can be costly both from an economic and environmental standpoint. In this work, we investigate how to effectively use unlabeled data: by exploring the task-specific semi-supervised approach, Cross-View Training (CVT) and comparing it with task-agnostic BERT in multiple settings that include domain and task relevant English data. CVT uses a much lighter model architecture and we show that it achieves similar performance to BERT on a set of sequence tagging tasks, with lesser financial and environmental impact.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.636,38939092 +main.2337,Learning a Cost-Effective Annotation Policy for Question Answering,Bernhard Kratzwald|Stefan Feuerriegel|Huan Sun,"State-of-the-art question answering (QA) relies upon large amounts of training data for which labeling is time consuming and thus expensive. For this reason, customizing QA systems is challenging. As a remedy, we propose a novel framework for annotating QA datasets that entails learning a cost-effective annotation policy and a semi-supervised annotation scheme. The latter reduces the human effort: it leverages the underlying QA system to suggest potential candidate annotations. Human annotators then simply provide binary feedback on these candidates. Our system is designed such that past annotations continuously improve the future performance and thus overall annotation cost. To the best of our knowledge, this is the first paper to address the problem of annotating questions with minimal annotation cost. 
We compare our framework against traditional manual annotations in an extensive set of experiments. We find that our approach can reduce up to 21.1% of the annotation cost.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.246,38939093 +main.2342,Universal Natural Language Processing with Limited Annotations: Try Few-shot Textual Entailment as a Start,Wenpeng Yin|Nazneen Fatema Rajani|Dragomir Radev|Richard Socher|Caiming Xiong,"A standard way to address different NLP problems is by first constructing a problem-specific dataset, then building a model to fit this dataset. To build the ultimate artificial intelligence, we desire a single machine that can handle diverse new problems, for which task-specific annotations are limited. We bring up textual entailment as a unified solver for such NLP problems. However, current research of textual entailment has not spilled much ink on the following questions: (i) How well does a pretrained textual entailment system generalize across domains with only a handful of domain-specific examples? and (ii) When is it worth transforming an NLP task into textual entailment? We argue that the transforming is unnecessary if we can obtain rich annotations for this task. Textual entailment really matters particularly when the target NLP task has insufficient annotations. Universal NLP can be probably achieved through different routines. In this work, we introduce Universal Few-shot textual Entailment (UFO-Entail). We demonstrate that this framework enables a pretrained entailment model to work well on new entailment domains in a few-shot setting, and show its effectiveness as a unified solver for several downstream NLP tasks such as question answering and coreference resolution when the end-task annotations are limited.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.660,38939094 +main.2343,Pre-Training Transformers as Energy-Based Cloze Models,Kevin Clark|Minh-Thang Luong|Quoc Le|Christopher D. Manning,"We introduce Electric, an energy-based cloze model for representation learning over text. Like BERT, it is a conditional generative model of tokens given their contexts. However, Electric does not use masking or output a full distribution over tokens that could occur in a context. Instead, it assigns a scalar energy score to each input token indicating how likely it is given its context. We train Electric using an algorithm based on noise-contrastive estimation and elucidate how this learning objective is closely related to the recently proposed ELECTRA pre-training method. Electric performs well when transferred to downstream tasks and is particularly effective at producing likelihood scores for text: it re-ranks speech recognition n-best lists better than language models and much faster than masked language models. Furthermore, it offers a clearer and more principled view of what ELECTRA learns during pre-training.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.20,38939095 +main.2349,"Generationary or: ""How We Went beyond Word Sense Inventories and Learned to Gloss""",Michele Bevilacqua|Marco Maru|Roberto Navigli,"Mainstream computational lexical semantics embraces the assumption that word senses can be represented as discrete items of a predefined inventory. In this paper we show this needs not be the case, and propose a unified model that is able to produce contextually appropriate definitions. 
In our model, Generationary, we employ a novel span-based encoding scheme which we use to fine-tune an English pre-trained Encoder-Decoder system to generate glosses. We show that, even though we drop the need of choosing from a predefined sense inventory, our model can be employed effectively: not only does Generationary outperform previous approaches in the generative task of Definition Modeling in many settings, but it also matches or surpasses the state of the art in discriminative tasks such as Word Sense Disambiguation and Word-in-Context. Finally, we show that Generationary benefits from training on data from multiple inventories, with strong gains on various zero-shot benchmarks, including a novel dataset of definitions for free adjective-noun phrases. The software and reproduction materials are available at http://generationary.org.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.585,38939096 +main.2357,Improving Grammatical Error Correction Models with Purpose-Built Adversarial Examples,Lihao Wang|Xiaoqing Zheng,"A sequence-to-sequence (seq2seq) learning with neural networks empirically shows to be an effective framework for grammatical error correction (GEC), which takes a sentence with errors as input and outputs the corrected one. However, the performance of GEC models with the seq2seq framework heavily relies on the size and quality of the corpus on hand. We propose a method inspired by adversarial training to generate more meaningful and valuable training examples by continually identifying the weak spots of a model, and to enhance the model by gradually adding the generated adversarial examples to the training set. Extensive experimental results show that such adversarial training can improve both the generalization and robustness of GEC models.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.228,38939097 +main.2363,Probing Pretrained Language Models for Lexical Semantics,Ivan Vulić|Edoardo Maria Ponti|Robert Litschko|Goran Glavaš|Anna Korhonen,"The success of large pretrained language models (LMs) such as BERT and RoBERTa has sparked interest in probing their representations, in order to unveil what types of knowledge they implicitly capture. While prior research focused on morphosyntactic, semantic, and world knowledge, it remains unclear to which extent LMs also derive lexical type-level knowledge from words in context. In this work, we present a systematic empirical analysis across six typologically diverse languages and five different lexical tasks, addressing the following questions: 1) How do different lexical knowledge extraction strategies (monolingual versus multilingual source LM, out-of-context versus in-context encoding, inclusion of special tokens, and layer-wise averaging) impact performance? How consistent are the observed effects across tasks and languages? 2) Is lexical knowledge stored in few parameters, or is it scattered throughout the network? 3) How do these representations fare against traditional static word vectors in lexical tasks? 4) Does the lexical information emerging from independently trained monolingual LMs display latent similarities? Our main results indicate patterns and best practices that hold universally, but also point to prominent variations across languages and tasks.
Moreover, we validate the claim that lower Transformer layers carry more type-level lexical knowledge, but also show that this knowledge is distributed across multiple layers.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.586,38939098 +main.2367,Text Segmentation by Cross Segment Attention,Michal Lukasik|Boris Dadachev|Kishore Papineni|Goncalo Simoes,"Document and discourse segmentation are two fundamental NLP tasks pertaining to breaking up text into constituents, which are commonly used to help downstream tasks such as information retrieval or text summarization. In this work, we propose three transformer-based architectures and provide comprehensive comparisons with previously proposed approaches on three standard datasets. We establish a new state-of-the-art, reducing in particular the error rates by a large margin in all cases. We further analyze model sizes and find that we can build models with many fewer parameters while keeping good performance, thus facilitating real-world applications.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.380,38939099 +main.237,Connecting the Dots: Event Graph Schema Induction with Path Language Modeling,Manling Li|Qi Zeng|Ying Lin|Kyunghyun Cho|Heng Ji|Jonathan May|Nathanael Chambers|Clare Voss,"Event schemas can guide our understanding and ability to make predictions with respect to what might happen next. We propose a new Event Graph Schema, where two event types are connected through multiple paths involving entities that fill important roles in a coherent story. We then introduce Path Language Model, an auto-regressive language model trained on event-event paths, and select salient and coherent paths to probabilistically construct these graph schemas. We design two evaluation metrics, instance coverage and instance coherence, to evaluate the quality of graph schema induction, by checking when coherent event instances are covered by the schema graph. Intrinsic evaluations show that our approach is highly effective at inducing salient and coherent schemas. Extrinsic evaluations show the induced schema repository provides significant improvement to downstream end-to-end Information Extraction over a state-of-the-art joint neural extraction model, when used as additional global features to unfold instance graphs.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.50,38938669 +main.2370,Word Rotator's Distance,Sho Yokoi|Ryo Takahashi|Reina Akama|Jun Suzuki|Kentaro Inui,"One key principle for assessing textual similarity is measuring the degree of semantic overlap between texts by considering the word alignment. Such alignment-based approaches are both intuitive and interpretable; however, they are empirically inferior to the simple cosine similarity between general-purpose sentence vectors. We focus on the fact that the norm of word vectors is a good proxy for word importance, and the angle of them is a good proxy for word similarity. However, alignment-based approaches do not distinguish the norm and direction, whereas sentence-vector approaches automatically use the norm as the word importance. Accordingly, we propose decoupling word vectors into their norm and direction then computing the alignment-based similarity with the help of earth mover's distance (optimal transport), which we refer to as word rotator's distance. 
Furthermore, we demonstrate how to grow the norm and direction of word vectors (vector converter); this is a new systematic approach derived from the sentence-vector estimation methods, which can significantly improve the performance of the proposed method. On several STS benchmarks, the proposed methods outperform not only alignment-based approaches but also strong baselines. The source code is available at https://github.com/eumesy/wrd",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.236,38939100 +main.2377,A Centering Approach for Discourse Structure-aware Coherence Modeling,Sungho Jeon|Michael Strube,"Previous neural coherence models have focused on identifying semantic relations between adjacent sentences. However, they do not have the means to exploit structural information. In this work, we propose a coherence model which takes discourse structural information into account without relying on human annotations. We approximate a linguistic theory of coherence, Centering theory, which we use to track the changes of focus between discourse segments. Our model first identifies the focus of each sentence, recognized with regards to the context, and constructs the structural relationship for discourse segments by tracking the changes of the focus. The model then incorporates this structural information into a structure-aware transformer. We evaluate our model on two tasks, automated essay scoring and assessing writing quality. Our results demonstrate that our model, built on top of a pretrained language model, achieves state-of-the-art performance on both tasks. We next statistically examine the identified trees of texts assigned to different quality scores. Finally, we investigate what our model learns in terms of theoretical claims.",,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.604,38939101 +main.2380,ProtoQA: A Question Answering Dataset for Prototypical Common-Sense Reasoning,Michael Boratko|Xiang Li|Tim O'Gorman|Rajarshi Das|Dan Le|Andrew McCallum,"Given questions regarding some prototypical situation --- such as Name something that people usually do before they leave the house for work? --- a human can easily answer them via acquired experiences. There can be multiple right answers for such questions, with some more common for a situation than others. This paper introduces a new question answering dataset for training and evaluating common sense reasoning capabilities of artificial intelligence systems in such prototypical situations. The training set is gathered from an existing set of questions played in a long-running international trivia game show -- Family Feud. The hidden evaluation set is created by gathering answers for each question from 100 crowd-workers. We also propose a generative evaluation task where a model has to output a ranked list of answers, ideally covering all prototypical answers for a question. After presenting multiple competitive baseline models, we find that human performance still exceeds model scores on all evaluation metrics with a meaningful gap, supporting the challenging nature of the task.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.85,38939102 +main.2382,Substance over Style: Document-Level Targeted Content Transfer,Allison Hegel|Sudha Rao|Asli Celikyilmaz|Bill Dolan,"Existing language models excel at writing from scratch, but many real-world scenarios require rewriting an existing document to fit a set of constraints.
Although sentence-level rewriting has been fairly well-studied, little work has addressed the challenge of rewriting an entire document coherently. In this work, we introduce the task of document-level targeted content transfer and address it in the recipe domain, with a recipe as the document and a dietary restriction (such as vegan or dairy-free) as the targeted constraint. We propose a novel model for this task based on the generative pre-trained language model (GPT-2) and train on a large number of roughly-aligned recipe pairs. Both automatic and human evaluations show that our model out-performs existing methods by generating coherent and diverse rewrites that obey the constraint while remaining close to the original document. Finally, we analyze our model's rewrites to assess progress toward the goal of making language generation more attuned to constraints that are substantive rather than stylistic.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.526,38939103 +main.2383,Widget Captioning: Generating Natural Language Description for Mobile User Interface Elements,Yang Li|Gang Li|Luheng He|Jingjie Zheng|Hong Li|Zhiwei Guan,"Natural language descriptions of user interface (UI) elements such as alternative text are crucial for accessibility and language-based interaction in general. Yet, these descriptions are constantly missing in mobile UIs. We propose widget captioning, a novel task for automatically generating language descriptions for UI elements from multimodal input including both the image and the structural representations of user interfaces. We collected a large-scale dataset for widget captioning with crowdsourcing. Our dataset contains 162,860 language phrases created by human workers for annotating 61,285 UI elements across 21,750 unique UI screens. We thoroughly analyze the dataset, and train and evaluate a set of deep model configurations to investigate how each feature modality as well as the choice of learning strategies impact the quality of predicted captions. The task formulation and the dataset as well as our benchmark models contribute a solid basis for this novel multimodal captioning task that connects language and user interfaces.",,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.443,38939104 +main.2389,Improving Text Generation with Student-Forcing Optimal Transport,Jianqiao Li|Chunyuan Li|Guoyin Wang|Hao Fu|Yuhchen Lin|Liqun Chen|Yizhe Zhang|Chenyang Tao|Ruiyi Zhang|Wenlin Wang|Dinghan Shen|Qian Yang|Lawrence Carin,"Neural language models are often trained with maximum likelihood estimation (MLE), where the next word is generated conditioned on the ground-truth word tokens. During testing, however, the model is instead conditioned on previously generated tokens, resulting in what is termed exposure bias. To reduce this gap between training and testing, we propose using optimal transport (OT) to match the sequences generated in these two modes. We examine the necessity of adding Student-Forcing scheme during training with an imitation learning interpretation. An extension is further proposed to improve the OT learning for long sequences, based on the structural and contextual information of the text sequences. 
The effectiveness of the proposed method is validated on machine translation, text summarization, and text generation tasks.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.735,38939105 +main.2391,RussianSuperGLUE: A Russian Language Understanding Evaluation Benchmark,Tatiana Shavrina|Alena Fenogenova|Emelyanov Anton|Denis Shevelev|Ekaterina Artemova|Valentin Malykh|Vladislav Mikhailov|Maria Tikhonova|Andrey Chertok|Andrey Evlampiev,"In this paper, we introduce an advanced Russian general language understanding evaluation benchmark -- Russian SuperGLUE. Recent advances in the field of universal language models and transformers require the development of a methodology for their broad diagnostics and testing for general intellectual skills - detection of natural language inference, commonsense reasoning, ability to perform simple logical operations regardless of text subject or lexicon. For the first time, a benchmark of nine tasks, collected and organized analogically to the SuperGLUE methodology, was developed from scratch for the Russian language. We also provide baselines, human level evaluation, open-source framework for evaluating models, and an overall leaderboard of transformer models for the Russian language. Besides, we present the first results of comparing multilingual models in the translated diagnostic test set and offer the first steps to further expanding or assessing State-of-the-art models independently of language.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.381,38939106 +main.2396,An Empirical Study of Pre-trained Transformers for Arabic Information Extraction,Wuwei Lan|Yang Chen|Wei Xu|Alan Ritter,"Multilingual pre-trained Transformers, such as mBERT (Devlin et al., 2019) and XLM-RoBERTa (Conneau et al., 2020a), have been shown to enable effective cross-lingual zero-shot transfer. However, their performance on Arabic information extraction (IE) tasks is not very well studied. In this paper, we pre-train a customized bilingual BERT, dubbed GigaBERT, that is designed specifically for Arabic NLP and English-to-Arabic zero-shot transfer learning. We study GigaBERT's effectiveness on zero-short transfer across four IE tasks: named entity recognition, part-of-speech tagging, argument role labeling, and relation extraction. Our best model significantly outperforms mBERT, XLM-RoBERTa, and AraBERT (Antoun et al., 2020) in both the supervised and zero-shot transfer settings. We have made our pre-trained models publicly available at: https://github.com/lanwuwei/GigaBERT.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.382,38939107 +main.2406,Message Passing for Hyper-Relational Knowledge Graphs,Mikhail Galkin|Priyansh Trivedi|Gaurav Maheshwari|Ricardo Usbeck|Jens Lehmann,"Hyper-relational knowledge graphs (KGs) (e.g., Wikidata) enable associating additional key-value pairs along with the main triple to disambiguate, or restrict the validity of a fact. In this work, we propose a message passing based graph encoder - StarE capable of modeling such hyper-relational KGs. Unlike existing approaches, StarE can encode an arbitrary number of additional information (qualifiers) along with the main triple while keeping the semantic roles of qualifiers and triples intact. We also demonstrate that existing benchmarks for evaluating link prediction (LP) performance on hyper-relational KGs suffer from fundamental flaws and thus develop a new Wikidata-based dataset - WD50K. 
Our experiments demonstrate that StarE based LP model outperforms existing approaches across multiple benchmarks. We also confirm that leveraging qualifiers is vital for link prediction with gains up to 25 MRR points compared to triple-based representations.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.596,38939108 +main.2410,Human-centric Dialog Training via Offline Reinforcement Learning,Natasha Jaques|Judy Hanwen Shen|Asma Ghandeharioun|Craig Ferguson|Agata Lapedriza|Noah Jones|Shixiang Gu|Rosalind Picard,"How can we train a dialog model to produce better conversations by learning from human feedback, without the risk of humans teaching it harmful chat behaviors? We start by hosting models online, and gather human feedback from real-time, open-ended conversations, which we then use to train and improve the models using offline reinforcement learning (RL). We identify implicit conversational cues including language similarity, elicitation of laughter, sentiment, and more, which indicate positive human feedback, and embed these in multiple reward functions. A well-known challenge is that learning an RL policy in an offline setting usually fails due to the lack of ability to explore and the tendency to make over-optimistic estimates of future reward. These problems become even harder when using RL for language models, which can easily have a 20,000 action vocabulary and many possible reward functions. We solve the challenge by developing a novel class of offline RL algorithms. These algorithms use KL-control to penalize divergence from a pre-trained prior language model, and use a new strategy to make the algorithm pessimistic, instead of optimistic, in the face of uncertainty. We test the resulting dialog model with ratings from 80 users in an open-domain setting and find it achieves significant improvements over existing deep offline RL approaches. The novel offline RL method is viable for improving any existing generative dialog model using a static dataset of human feedback.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.327,38939109 +main.2412,Language Adapters for Zero Shot Neural Machine Translation,Jerin Philip|Alexandre Berard|Matthias Gallé|Laurent Besacier,"We propose a novel adapter layer formalism for adapting multilingual models. They are more parameter-efficient than existing adapter layers while obtaining as good or better performance. The layers are specific to one language (as opposed to bilingual adapters) allowing to compose them and generalize to unseen language-pairs. In this zero-shot setting, they obtain a median improvement of +2.77 BLEU points over a strong 20-language multilingual Transformer baseline trained on TED talks.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.361,38939110 +main.2414,Analyzing Individual Neurons in Pre-trained Language Models,Nadir Durrani|Hassan Sajjad|Fahim Dalvi|Yonatan Belinkov,"While a lot of analysis has been carried out to demonstrate linguistic knowledge captured by the representations learned within deep NLP models, very little attention has been paid towards individual neurons. We carry out a neuron-level analysis using core linguistic tasks of predicting morphology, syntax and semantics, on pre-trained language models, with questions like: i) do individual neurons in pre-trained models capture linguistic information? ii) which parts of the network learn more about certain linguistic phenomena?
iii) how distributed or focused is the information? and iv) how do various architectures differ in learning these properties? We found small subsets of neurons to predict linguistic tasks, with lower level tasks (such as morphology) localized in fewer neurons, compared to higher level task of predicting syntax. Our study also reveals interesting cross architectural comparisons. For example, we found neurons in XLNet to be more localized and disjoint when predicting properties compared to BERT and others, where they are more distributed and coupled.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.395,38939111 +main.2415,ConjNLI: Natural Language Inference over Conjunctive Sentences,Swarnadeep Saha|Yixin Nie|Mohit Bansal,"Reasoning about conjuncts in conjunctive sentences is important for a deeper understanding of conjunctions in English and also how their usages and semantics differ from conjunctive and disjunctive boolean logic. Existing NLI stress tests do not consider non-boolean usages of conjunctions and use templates for testing such model knowledge. Hence, we introduce ConjNLI, a challenge stress-test for natural language inference over conjunctive sentences, where the premise differs from the hypothesis by conjuncts removed, added, or replaced. These sentences contain single and multiple instances of coordinating conjunctions (""and"", ""or"", ""but"", ""nor"") with quantifiers, negations, and requiring diverse boolean and non-boolean inferences over conjuncts. We find that large-scale pre-trained language models like RoBERTa do not understand conjunctive semantics well and resort to shallow heuristics to make inferences over such sentences. As some initial solutions, we first present an iterative adversarial fine-tuning method that uses synthetically created training data based on boolean and non-boolean heuristics. We also propose a direct model advancement by making RoBERTa aware of predicate semantic roles. While we observe some performance gains, ConjNLI is still challenging for current methods, thus encouraging interesting future work for better understanding of conjunctions.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.661,38939112 +main.2416,An Analysis of Natural Language Inference Benchmarks through the Lens of Negation,Md Mosharaf Hossain|Venelin Kovatchev|Pranoy Dutta|Tiffany Kao|Elizabeth Wei|Eduardo Blanco,"Negation is underrepresented in existing natural language inference benchmarks. Additionally, one can often ignore the few negations in existing benchmarks and still make the right inference judgments. In this paper, we present a new benchmark for natural language inference in which negation plays a critical role. We also show that state-of-the-art transformers struggle making inference judgments with the new pairs.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.732,38939113 +main.2419,Fast Semantic Parsing with Well-typedness Guarantees,Matthias Lindemann|Jonas Groschwitz|Alexander Koller,"AM dependency parsing is a linguistically principled method for neural semantic parsing with high accuracy across multiple graphbanks. It relies on a type system that models semantic valency but makes existing parsers slow. 
We describe an A* parser and a transition-based parser for AM dependency parsing which guarantee well-typedness and improve parsing speed by up to 3 orders of magnitude, while maintaining or improving accuracy.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.323,38939114 +main.2422,Online Back-Parsing for AMR-to-Text Generation,Xuefeng Bai|Linfeng Song|Yue Zhang,"AMR-to-text generation aims to recover a text containing the same meaning as an input AMR graph. Current research develops increasingly powerful graph encoders to better represent AMR graphs, with decoders based on standard language modeling being used to generate outputs. We propose a decoder that back predicts projected AMR graphs on the target sentence during text generation. As the result, our outputs can better preserve the input meaning than standard decoders. Experiments on two AMR benchmarks show the superiority of our model over the previous state-of-the-art system based on graph Transformer.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.92,38939115 +main.2424,DagoBERT: Generating Derivational Morphology with a Pretrained Language Model,Valentin Hofmann|Janet Pierrehumbert|Hinrich Schütze,"Can pretrained language models (PLMs) generate derivationally complex words? We present the first study investigating this question, taking BERT as the example PLM. We examine BERT’s derivational capabilities in different settings, ranging from using the unmodified pretrained model to full finetuning. Our best model, DagoBERT (Derivationally and generatively optimized BERT), clearly outperforms the previous state of the art in derivation generation (DG). Furthermore, our experiments show that the input segmentation crucially impacts BERT’s derivational knowledge, suggesting that the performance of PLMs could be further improved if a morphologically informed vocabulary of units were used.",,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.emnlp-main.316,38939116 +main.2426,Knowledge-guided Open Attribute Value Extraction with Reinforcement Learning,Ye Liu|Sheng Zhang|Rui Song|Suo Feng|Yanghua Xiao,"Open attribute value extraction for emerging entities is an important but challenging task. A lot of previous works formulate the problem as a question-answering (QA) task. While the collections of articles from web corpus provide updated information about the emerging entities, the retrieved texts can be noisy, irrelevant, thus leading to inaccurate answers. Effectively filtering out noisy articles as well as bad answers is the key to improve extraction accuracy. Knowledge graph (KG), which contains rich, well organized information about entities, provides a good resource to address the challenge. In this work, we propose a knowledge-guided reinforcement learning (RL) framework for open attribute value extraction. Informed by relevant knowledge in KG, we trained a deep Q-network to sequentially compare extracted answers to improve extraction accuracy. The proposed framework is applicable to different information extraction system. 
Our experimental results show that our method outperforms the baselines by 16.5 - 27.8%.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.693,38939117 +main.2427,Semi-supervised New Event Type Induction and Event Detection,Lifu Huang|Heng Ji,"Most previous event extraction studies assume a set of target event types and corresponding event annotations are given, which could be very expensive. In this paper, we work on a new task of semi-supervised event type induction, aiming to automatically discover a set of unseen types from a given corpus by leveraging annotations available for a few seen types. We design a Semi-Supervised Vector Quantized Variational Autoencoder framework to automatically learn a discrete latent type representation for each seen and unseen type and optimize them using seen type event annotations. A variational autoencoder is further introduced to enforce the reconstruction of each event mention conditioned on its latent type distribution. Experiments show that our approach can not only achieve state-of-the-art performance on supervised event detection but also discover high-quality new event types.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.53,38939118 +main.2430,Do Sequence-to-sequence VAEs Learn Global Features of Sentences?,Tom Bosc|Pascal Vincent,"Autoregressive language models are powerful and relatively easy to train. However, these models are usually trained without explicit conditioning labels and do not offer easy ways to control global aspects such as sentiment or topic during generation. Bowman et al. (2016) adapted the Variational Autoencoder (VAE) for natural language with the sequence-to-sequence architecture and claimed that the latent vector was able to capture such global features in an unsupervised manner. We question this claim. We measure which words benefit most from the latent information by decomposing the reconstruction loss per position in the sentence. Using this method, we find that VAEs are prone to memorizing the first words and the sentence length, producing local features of limited usefulness. To alleviate this, we investigate alternative architectures based on bag-of-words assumptions and language model pretraining. These variants learn latent variables that are more global, i.e., more predictive of topic or sentiment labels. Moreover, using reconstructions, we observe that they decrease memorization: the first word and the sentence length are not recovered as accurately as with the baselines, consequently yielding more diverse reconstructions.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.350,38939119 +main.2437,Factual Error Correction for Abstractive Summarization Models,Meng Cao|Yue Dong|Jiapeng Wu|Jackie Chi Kit Cheung,"Neural abstractive summarization systems have achieved promising progress, thanks to the availability of large-scale datasets and models pre-trained with self-supervised methods. However, ensuring the factual consistency of the generated summaries for abstractive summarization systems is a challenge. We propose a post-editing corrector module to address this issue by identifying and correcting factual errors in generated summaries. The neural corrector model is pre-trained on artificial examples that are created by applying a series of heuristic transformations on reference summaries. These transformations are inspired by the error analysis of state-of-the-art summarization model outputs.
Experimental results show that our model is able to correct factual errors in summaries generated by other neural summarization models and outperforms previous models on factual consistency evaluation on the CNN/DailyMail dataset. We also find that transferring from artificial error correction to downstream settings is still very challenging.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.506,38939120 +main.2438,Dissecting Span Identification Tasks with Performance Prediction,Sean Papay|Roman Klinger|Sebastian Padó,"Span identification (in short, span ID) tasks such as chunking, NER, or code-switching detection, ask models to identify and classify relevant spans in a text. Despite being a staple of NLP, and sharing a common structure, there is little insight on how these tasks' properties influence their difficulty, and thus little guidance on what model families work well on span ID tasks, and why. We analyze span ID tasks via performance prediction, estimating how well neural architectures do on different tasks. Our contributions are: (a) we identify key properties of span ID tasks that can inform performance prediction; (b) we carry out a large-scale experiment on English data, building a model to predict performance for unseen span ID tasks that can support architecture choices; (c), we investigate the parameters of the meta model, yielding new insights on how model and task properties interact to affect span ID performance. We find, e.g., that span frequency is especially important for LSTMs, and that CRFs help when spans are infrequent and boundaries non-distinctive.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.396,38939121 +main.2444,Interview: Large-scale Modeling of Media Dialog with Discourse Patterns and Knowledge Grounding,Bodhisattwa Prasad Majumder|Shuyang Li|Jianmo Ni|Julian McAuley,"In this work, we perform the first large-scale analysis of discourse in media dialog and its impact on generative modeling of dialog turns, with a focus on interrogative patterns and use of external knowledge. Discourse analysis can help us understand modes of persuasion, entertainment, and information elicitation in such settings, but has been limited to manual review of small corpora. We introduce **Interview**---a large-scale (105K conversations) media dialog dataset collected from news interview transcripts---which allows us to investigate such patterns at scale. We present a dialog model that leverages external knowledge as well as dialog acts via auxiliary losses and demonstrate that our model quantitatively and qualitatively outperforms strong discourse-agnostic baselines for dialog modeling---generating more specific and topical responses in interview-style conversations.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.653,38939122 +main.2448,Seq2Edits: Sequence Transduction Using Span-level Edit Operations,Felix Stahlberg|Shankar Kumar,"We propose Seq2Edits, an open-vocabulary approach to sequence editing for natural language processing (NLP) tasks with a high degree of overlap between input and output texts. In this approach, each sequence-to-sequence transduction is represented as a sequence of edit operations, where each operation either replaces an entire source span with target tokens or keeps it unchanged. 
We evaluate our method on five NLP tasks (text normalization, sentence fusion, sentence splitting & rephrasing, text simplification, and grammatical error correction) and report competitive results across the board. For grammatical error correction, our method speeds up inference by up to 5.2x compared to full sequence models because inference time depends on the number of edits rather than the number of target tokens. For text normalization, sentence fusion, and grammatical error correction, our approach improves explainability by associating each edit operation with a human-readable tag.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.418,38939123 +main.2452,Is the Best Better? Bayesian Statistical Model Comparison for Natural Language Processing,Piotr Szymański|Kyle Gorman,"Recent work raises concerns about the use of standard splits to compare natural language processing models. We propose a Bayesian statistical model comparison technique which uses k-fold cross-validation across multiple data sets to estimate the likelihood that one model will outperform the other, or that the two will produce practically equivalent results. We use this technique to rank six English part-of-speech taggers across two data sets and three evaluation metrics.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.172,38939124 +main.246,CSP: Code-Switching Pre-training for Neural Machine Translation,Zhen Yang|Bojie Hu|ambyera han|shen huang|Qi Ju,"This paper proposes a new pre-training method, called Code-Switching Pre-training (CSP for short) for Neural Machine Translation (NMT). Unlike traditional pre-training method which randomly masks some fragments of the input sentence, the proposed CSP randomly replaces some words in the source sentence with their translation words in the target language. Specifically, we firstly perform lexicon induction with unsupervised word embedding mapping between the source and target languages, and then randomly replace some words in the input sentence with their translation words according to the extracted translation lexicons. CSP adopts the encoder-decoder framework: its encoder takes the code-mixed sentence as input, and its decoder predicts the replaced fragment of the input sentence. In this way, CSP is able to pre-train the NMT model by explicitly making the most of the alignment information extracted from the source and target monolingual corpus. Additionally, we relieve the pretrain-finetune discrepancy caused by the artificial symbols like [mask]. To verify the effectiveness of the proposed method, we conduct extensive experiments on unsupervised and supervised NMT. Experimental results show that CSP achieves significant improvements over baselines without pre-training or with other pre-training methods.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.208,38938670 +main.247,Neural Mask Generator: Learning to Generate Adaptive Word Maskings for Language Model Adaptation,Minki Kang|Moonsu Han|Sung Ju Hwang,"We propose a method to automatically generate a domain- and task-adaptive maskings of the given text for self-supervised pre-training, such that we can effectively adapt the language model to a particular target task (e.g. question answering). 
Specifically, we present a novel reinforcement learning-based framework which learns the masking policy, such that using the generated masks for further pre-training of the target language model helps improve task performance on unseen texts. We use off-policy actor-critic with entropy regularization and experience replay for reinforcement learning, and propose a Transformer-based policy network that can consider the relative importance of words in a given text. We validate our Neural Mask Generator (NMG) on several question answering and text classification datasets using BERT and DistilBERT as the language models, on which it outperforms rule-based masking strategies, by automatically learning optimal adaptive maskings.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.493,38938671 +main.2470,Compressive Summarization with Plausibility and Salience Modeling,Shrey Desai|Jiacheng Xu|Greg Durrett,"Compressive summarization systems typically rely on a seed set of syntactic rules to determine under what circumstances deleting a span is permissible, then learn which compressions to actually apply by optimizing for ROUGE. In this work, we propose to relax these explicit syntactic constraints on candidate spans, and instead leave the decision about what to delete to two data-driven criteria: plausibility and salience. Deleting a span is plausible if removing it maintains the grammaticality and factuality of a sentence, and it is salient if it removes important information from the summary. Each of these is judged by a pre-trained Transformer model, and only deletions that are both plausible and not salient can be applied. When integrated into a simple extraction-compression pipeline, our method achieves strong in-domain results on benchmark datasets, and human evaluation shows that the plausibility model generally selects for grammatical and factual deletions. Furthermore, the flexibility of our approach allows it to generalize cross-domain, and we show that our system fine-tuned on only 500 samples from a new domain can match or exceed a strong in-domain extractive model.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.507,38939125 +main.2476,DualTKB: A Dual Learning Bridge between Text and Knowledge Base,Pierre Dognin|Igor Melnyk|Inkit Padhi|Cicero Nogueira dos Santos|Payel Das,"In this work, we present a dual learning approach for unsupervised text to path and path to text transfers in Commonsense Knowledge Bases (KBs). We investigate the impact of weak supervision by creating a weakly supervised dataset and show that even a slight amount of supervision can significantly improve the model performance and enable better-quality transfers. We examine different model architectures, and evaluation metrics, proposing a novel Commonsense KB completion metric tailored for generative models. Extensive experimental results show that the proposed method compares very favorably to the existing baselines. This approach is a viable step towards a more advanced system for automatic KB construction/expansion and the reverse operation of KB conversion to coherent textual descriptions.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.694,38939126 +main.2490,Do Explicit Alignments Robustly Improve Massively Multilingual Encoders?,Shijie Wu|Mark Dredze,"Multilingual BERT (mBERT), XLM-RoBERTa (XLMR) and other unsupervised multilingual encoders can effectively learn cross-lingual representation. 
Explicit alignment objectives based on bitexts like Europarl or MultiUN have been shown to further improve these representations. However, word-level alignments are often suboptimal and such bitexts are unavailable for many languages. In this paper, we propose a new contrastive alignment objective that can better utilize such signal, and examine whether these previous alignment methods can be adapted to noisier sources of aligned data: a randomly sampled 1 million pair subset of the OPUS collection. Additionally, rather than report results on a single dataset with a single model run, we report the mean and standard deviation of multiple runs with different seeds, on four datasets and tasks. Our more extensive analysis finds that, while our new objective outperforms previous work, overall these methods do not improve performance with a more robust evaluation framework. Furthermore, the gains from using a better underlying model eclipse any benefits from alignment training. These negative results dictate more care in evaluating these methods and suggest limitations in applying explicit alignment objectives.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.362,38939127 +main.2491,On Losses for Modern Language Models,Stéphane Aroca-Ouellette|Frank Rudzicz,"BERT set many state-of-the-art results over varied NLU benchmarks by pre-training over two tasks: masked language modelling (MLM) and next sentence prediction (NSP), the latter of which has been highly criticized. In this paper, we 1) clarify NSP's effect on BERT pre-training, 2) explore fourteen possible auxiliary pre-training tasks, of which seven are novel to modern language models, and 3) investigate different ways to include multiple tasks into pre-training. We show that NSP is detrimental to training due to its context splitting and shallow semantic signal. We also identify six auxiliary pre-training tasks -- sentence ordering, adjacent sentence prediction, TF prediction, TF-IDF prediction, a FastSent variant, and a Quick Thoughts variant -- that outperform a pure MLM baseline. Finally, we demonstrate that using multiple tasks in a multi-task pre-training framework provides better results than using any single auxiliary task. Using these methods, we outperform BERT\textsubscript{Base} on the GLUE benchmark using fewer than a quarter of the training tokens.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.403,38939128 +main.2493,OCR Post-Correction for Endangered Language Texts,Shruti Rijhwani|Antonios Anastasopoulos|Graham Neubig,"There is little to no data available to build natural language processing models for most endangered languages. However, textual data in these languages often exists in formats that are not machine-readable, such as paper books and scanned images. In this work, we address the task of extracting text from these resources. We create a benchmark dataset of transcriptions for scanned books in three critically endangered languages and present a systematic analysis of how general-purpose OCR tools are not robust to the data-scarce setting of endangered languages.
We develop an OCR post-correction method tailored to ease training in this data-scarce setting, reducing the recognition error rate by 34% on average across the three languages.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.478,38939129 +main.2500,From Zero to Hero: On the Limitations of Zero-Shot Language Transfer with Multilingual Transformers,Anne Lauscher|Vinit Ravishankar|Ivan Vulić|Goran Glavaš,"Massively multilingual transformers (MMTs) pretrained via language modeling (e.g., mBERT, XLM-R) have become a default paradigm for zero-shot language transfer in NLP, offering unmatched transfer performance. Current evaluations, however, verify their efficacy in transfers (a) to languages with sufficiently large pretraining corpora, and (b) between close languages. In this work, we analyze the limitations of downstream language transfer with MMTs, showing that, much like cross-lingual word embeddings, they are substantially less effective in resource-lean scenarios and for distant languages. Our experiments, encompassing three lower-level tasks (POS tagging, dependency parsing, NER) and two high-level tasks (NLI, QA), empirically correlate transfer performance with linguistic proximity between source and target languages, but also with the size of target language corpora used in MMT pretraining. Most importantly, we demonstrate that the inexpensive few-shot transfer (i.e., additional fine-tuning on a few target-language instances) is surprisingly effective across the board, warranting more research efforts reaching beyond the limiting zero-shot conditions.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.363,38939130 +main.2506,Evaluating the Factual Consistency of Abstractive Text Summarization,Wojciech Kryscinski|Bryan McCann|Caiming Xiong|Richard Socher,"The most common metrics for assessing summarization algorithms do not account for whether summaries are factually consistent with source documents. We propose a weakly-supervised, model-based approach for verifying factual consistency and identifying conflicts between source documents and generated summaries. Training data is generated by applying a series of rule-based transformations to the sentences of source documents. The factual consistency model is then trained jointly for three tasks: 1) predict whether each summary sentence is factually consistent or not, 2) in either case, extract a span in the source document to support this consistency prediction, 3) for each summary sentence that is deemed inconsistent, extract the inconsistent span from it. Transferring this model to summaries generated by several neural models reveals that this highly scalable approach outperforms previous models, including those trained with strong supervision using datasets from related domains, such as natural language inference and fact checking. Additionally, human evaluation shows that the auxiliary span extraction tasks provide useful assistance in the process of verifying factual consistency. We also release a manually annotated dataset for factual consistency verification, code for training data generation, and trained model weights at https://github.com/salesforce/factCC.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.750,38939131 +main.2508,Weakly Supervised Subevent Knowledge Acquisition,Wenlin Yao|Zeyu Dai|Maitreyi Ramaswamy|Bonan Min|Ruihong Huang,"Subevents elaborate an event and widely exist in event descriptions.
Subevent knowledge is useful for discourse analysis and event-centric applications. Acknowledging the scarcity of subevent knowledge, we propose a weakly supervised approach to extract subevent relation tuples from text and build the first large scale subevent knowledge base. We first obtain the initial set of event pairs that are likely to have the subevent relation, by exploiting two observations that 1) subevents are temporally contained by the parent event, and 2) the definitions of the parent event can be used to further guide the identification of subevents. Then, we collect rich weak supervision using the initial seed subevent pairs to train a contextual classifier using BERT and apply the classifier to identify new subevent pairs. The evaluation showed that the acquired subevent tuples (239K) are of high quality (90.1% accuracy) and cover a wide range of event types. The acquired subevent knowledge has been shown useful for discourse analysis and identifying a range of event-event relations.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.430,38939132 +main.2510,A Time-Aware Transformer Based Model for Suicide Ideation Detection on Social Media,Ramit Sawhney|Harshit Joshi|Saumya Gandhi|Rajiv Ratn Shah,"Social media's ubiquity fosters a space for users to exhibit suicidal thoughts outside of traditional clinical settings. Understanding the build-up of such ideation is critical for the identification of at-risk users and suicide prevention. Suicide ideation is often linked to a history of mental depression. The emotional spectrum of a user's historical activity on social media can be indicative of their mental state over time. In this work, we focus on identifying suicidal intent in English tweets by augmenting linguistic models with historical context. We propose STATENet, a time-aware transformer based model for preliminary screening of suicidal risk on social media. STATENet outperforms competitive methods, demonstrating the utility of emotional and temporal contextual cues for suicide risk assessment. We discuss the empirical, qualitative, practical, and ethical aspects of STATENet for suicide ideation detection.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.619,38939133 +main.2511,PAIR: Planning and Iterative Refinement in Pre-trained Transformers for Long Text Generation,Xinyu Hua|Lu Wang,"Pre-trained Transformers have enabled impressive breakthroughs in generating long and fluent text, yet their outputs are often “rambling” without coherently arranged content. In this work, we present a novel content-controlled text generation framework, PAIR, with planning and iterative refinement, which is built upon a large model, BART. We first adapt the BERT model to automatically construct the content plans, consisting of keyphrase assignments and their corresponding sentence-level positions. The BART model is employed for generation without modifying its structure. We then propose a refinement algorithm to gradually enhance the generation quality within the sequence-to-sequence framework. Evaluation with automatic metrics shows that adding planning consistently improves the generation quality on three distinct domains, with an average of 20 BLEU points and 12 METEOR points improvements. 
In addition, human judges rate our system outputs to be more relevant and coherent than comparisons without planning.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.57,38939134 +main.2512,"QADiscourse - Discourse Relations as QA Pairs: Representation, Crowdsourcing and Baselines",Valentina Pyatkin|Ayal Klein|Reut Tsarfaty|Ido Dagan,"Discourse relations describe how two propositions relate to one another, and identifying them automatically is an integral part of natural language understanding. However, annotating discourse relations typically requires expert annotators. Recently, different semantic aspects of a sentence have been represented and crowd-sourced via question-and-answer (QA) pairs. This paper proposes a novel representation of discourse relations as QA pairs, which in turn allows us to crowd-source wide-coverage data annotated with discourse relations, via an intuitively appealing interface for composing such questions and answers. Based on our proposed representation, we collect a novel and wide-coverage QADiscourse dataset, and present baseline algorithms for predicting QADiscourse relations.",,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.224,38939135 +main.2515,TNT: Text Normalization Based Pre-training of Transformers for Content Moderation,Fei Tan|Yifan Hu|Changwei Hu|Keqian Li|Kevin Yen,"In this work, we present a new language pre-training model TNT (Text Normalization based pre-training of Transformers) for content moderation. Inspired by the masking strategy and text normalization, TNT is developed to learn language representation by training transformers to reconstruct text from four operation types typically seen in text manipulation: substitution, transposition, deletion, and insertion. Furthermore, the normalization involves the prediction of both operation types and token labels, enabling TNT to learn from more challenging tasks than the standard task of masked word recovery. As a result, the experiments demonstrate that TNT outperforms strong baselines on the hate speech classification task. Additional text normalization experiments and case studies show that TNT is a new potential approach to misspelling correction.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.383,38939136 +main.252,UNION: An Unreferenced Metric for Evaluating Open-ended Story Generation,Jian Guan|Minlie Huang,"Despite the success of existing referenced metrics (e.g., BLEU and MoverScore), they correlate poorly with human judgments for open-ended text generation including story or dialog generation because of the notorious one-to-many issue: there are many plausible outputs for the same input, which may differ substantially in literal or semantics from the limited number of given references. To alleviate this issue, we propose UNION, a learnable UNreferenced metrIc for evaluating Open-eNded story generation, which measures the quality of a generated story without any reference. Built on top of BERT, UNION is trained to distinguish human-written stories from negative samples and recover the perturbation in negative stories. We propose an approach of constructing negative samples by mimicking the errors commonly observed in existing NLG models, including repeated plots, conflicting logic, and long-range incoherence. 
Experiments on two story datasets demonstrate that UNION is a reliable measure for evaluating the quality of generated stories, which correlates better with human judgments and is more generalizable than existing state-of-the-art metrics.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.736,38938672 +main.2520,VolTAGE: Volatility Forecasting via Text-Audio Fusion with Graph Convolution Networks for Earnings Calls,Ramit Sawhney|Piyush Khanna|Arshiya Aggarwal|Taru Jain|Puneet Mathur|Rajiv Ratn Shah,"Natural language processing has recently made stock movement forecasting and volatility forecasting advances, leading to improved financial forecasting. Transcripts of companies' earnings calls are well studied for risk modeling, offering unique investment insight into stock performance. However, vocal cues in the speech of company executives present an underexplored rich source of natural language data for estimating financial risk. Additionally, most existing approaches ignore the correlations between stocks. Building on existing work, we introduce a neural model for stock volatility prediction that accounts for stock interdependence via graph convolutions while fusing verbal, vocal, and financial features in a semi-supervised multi-task risk forecasting formulation. Our proposed model, VolTAGE, outperforms existing methods demonstrating the effectiveness of multimodal learning for volatility prediction.",,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.643,38939137 +main.2529,Supertagging Combinatory Categorial Grammar with Attentive Graph Convolutional Networks,Yuanhe Tian|Yan Song|Fei Xia,"Supertagging is conventionally regarded as an important task for combinatory categorial grammar (CCG) parsing, where effective modeling of contextual information is highly important to this task. However, existing studies have made limited efforts to leverage contextual features except for applying powerful encoders (e.g., bi-LSTM). In this paper, we propose attentive graph convolutional networks to enhance neural CCG supertagging through a novel solution of leveraging contextual information. Specifically, we build the graph from chunks (n-grams) extracted from a lexicon and apply attention over the graph, so that different word pairs from the contexts within and across chunks are weighted in the model and facilitate the supertagging accordingly. The experiments performed on the CCGbank demonstrate that our approach outperforms all previous studies in terms of both supertagging and parsing. Further analyses illustrate the effectiveness of each component in our approach to discriminatively learn from word pairs to enhance CCG supertagging.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.487,38939138 +main.2533,PARADE: A New Dataset for Paraphrase Identification Requiring Computer Science Domain Knowledge,Yun He|Zhuoer Wang|Yin Zhang|Ruihong Huang|James Caverlee,"We present a new benchmark dataset called PARADE for paraphrase identification that requires specialized domain knowledge. PARADE contains paraphrases that overlap very little at the lexical and syntactic level but are semantically equivalent based on computer science domain knowledge, as well as non-paraphrases that overlap greatly at the lexical and syntactic level but are not semantically equivalent based on this domain knowledge. 
Experiments show that both state-of-the-art neural models and non-expert human annotators have poor performance on PARADE. For example, BERT after fine-tuning achieves an F1 score of 0.709, which is much lower than its performance on other paraphrase identification datasets. PARADE can serve as a resource for researchers interested in testing models that incorporate domain knowledge. We make our data and code freely available.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.611,38939139 +main.2535,Understanding the Mechanics of SPIGOT: Surrogate Gradients for Latent Structure Learning,Tsvetomila Mihaylova|Vlad Niculae|André F. T. Martins,"Latent structure models are a powerful tool for modeling language data: they can mitigate the error propagation and annotation bottleneck in pipeline systems, while simultaneously uncovering linguistic insights about the data. One challenge with end-to-end training of these models is the argmax operation, which has null gradient. In this paper, we focus on surrogate gradients, a popular strategy to deal with this problem. We explore latent structure learning through the angle of pulling back the downstream learning objective. In this paradigm, we discover a principled motivation for both the straight-through estimator (STE) as well as the recently-proposed SPIGOT -- a variant of STE for structured models. Our perspective leads to new algorithms in the same family. We empirically compare the known and the novel pulled-back estimators against the popular alternatives, yielding new insight for practitioners and revealing intriguing failure cases.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.171,38939140 +main.2549,Affective Event Classification with Discourse-enhanced Self-training,Yuan Zhuang|Tianyu Jiang|Ellen Riloff,"Prior research has recognized the need to associate affective polarities with events and has produced several techniques and lexical resources for identifying affective events. Our research introduces new classification models to assign affective polarity to event phrases. First, we present a BERT-based model for affective event classification and show that the classifier achieves substantially better performance than a large affective event knowledge base. Second, we present a discourse-enhanced self-training method that iteratively improves the classifier with unlabeled data. The key idea is to exploit event phrases that occur with a coreferent sentiment expression. The discourse-enhanced self-training algorithm iteratively labels new event phrases based on both the classifier's predictions and the polarities of the event's coreferent sentiment expressions. Our results show that discourse-enhanced self-training further improves both recall and precision for affective event classification.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.452,38939141 +main.2553,Detecting Fine-Grained Cross-Lingual Semantic Divergences without Supervision by Learning to Rank,Eleftheria Briakou|Marine Carpuat,"Detecting fine-grained differences in content conveyed in different languages matters for cross-lingual NLP and multilingual corpora analysis, but it is a challenging machine learning problem since annotation is expensive and hard to scale. This work improves the prediction and annotation of fine-grained semantic divergences. 
We introduce a training strategy for multilingual BERT models by learning to rank synthetic divergent examples of varying granularity. We evaluate our models on the Rationalized English-French Semantic Divergences, a new dataset released with this work, consisting of English-French sentence-pairs annotated with semantic divergence classes and token-level rationales. Learning to rank helps detect fine-grained sentence-level divergences more accurately than a strong sentence-level similarity model, while token-level predictions have the potential of further distinguishing between coarse and fine-grained divergences.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.121,38939142 +main.2561,Condolences and Empathy in Online Communities,Naitian Zhou|David Jurgens,"Offering condolence is a natural reaction to hearing someone's distress. Individuals frequently express distress in social media, where some communities can provide support. However, not all condolence is equal—trite responses offer little actual support despite their good intentions. Here, we develop computational tools to create a massive dataset of 11.4M expressions of distress and 2.8M corresponding offerings of condolence in order to examine the dynamics of condolence online. Our study reveals widespread disparity in what types of distress receive supportive condolence rather than just engagement. Building on studies from social psychology, we analyze the language of condolence and develop a new dataset for quantifying the empathy in a condolence using appraisal theory. Finally, we demonstrate that the features of condolence individuals find most helpful online differ substantially in their features from those seen in interpersonal settings.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.45,38939143 +main.2570,Hierarchical Evidence Set Modeling for Automated Fact Extraction and Verification,Shyam Subramanian|Kyumin Lee,"Automated fact extraction and verification is a challenging task that involves finding relevant evidence sentences from a reliable corpus to verify the truthfulness of a claim. Existing models either (i) concatenate all the evidence sentences, leading to the inclusion of redundant and noisy information; or (ii) process each claim-evidence sentence pair separately and aggregate all of them later, missing the early combination of related sentences for more accurate claim verification. Unlike the prior works, in this paper, we propose Hierarchical Evidence Set Modeling (HESM), a framework to extract evidence sets (each of which may contain multiple evidence sentences), and verify a claim to be supported, refuted or not enough info, by encoding and attending the claim and evidence sets at different levels of hierarchy. Our experimental results show that HESM outperforms 7 state-of-the-art methods for fact extraction and claim verification. Our source code is available at https://github.com/ShyamSubramanian/HESM.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.627,38939144 +main.2574,Interactive Fiction Game Playing as Multi-Paragraph Reading Comprehension with Reinforcement Learning,Xiaoxiao Guo|Mo Yu|Yupeng Gao|Chuang Gan|Murray Campbell|Shiyu Chang,"Interactive Fiction (IF) games with real human-written natural language texts provide a new natural evaluation for language understanding techniques. 
In contrast to previous text games with mostly synthetic texts, IF games pose language understanding challenges on the human-written textual descriptions of diverse and sophisticated game worlds and language generation challenges on the action command generation from less restricted combinatorial space. We take a novel perspective of IF game solving and re-formulate it as Multi-Passage Reading Comprehension (MPRC) tasks. Our approaches utilize the context-query attention mechanisms and the structured prediction in MPRC to efficiently generate and evaluate action outputs and apply an object-centric historical observation retrieval strategy to mitigate the partial observability of the textual observations. Extensive experiments on the recent IF benchmark (Jericho) demonstrate clear advantages of our approaches achieving high winning rates and low data requirements compared to all previous approaches.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.624,38939145 +main.2579,Which *BERT? A Survey Organizing Contextualized Encoders,Patrick Xia|Shijie Wu|Benjamin Van Durme,"Pretrained contextualized text encoders are now a staple of the NLP community. We present a survey on language representation learning with the aim of consolidating a series of shared lessons learned across a variety of recent efforts. While significant advancements continue at a rapid pace, we find that enough has now been discovered, in different directions, that we can begin to organize advances according to common themes. Through this organization, we highlight important considerations when interpreting recent contributions and choosing which model to use.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.608,38939146 +main.2581,An Empirical Study of Generation Order for Machine Translation,William Chan|Mitchell Stern|Jamie Kiros|Jakob Uszkoreit,"In this work, we present an empirical study of generation order for machine translation. Building on recent advances in insertion-based modeling, we first introduce a soft order-reward framework that enables us to train models to follow arbitrary oracle generation policies. We then make use of this framework to explore a large variety of generation orders, including uninformed orders, location-based orders, frequency-based orders, content-based orders, and model-based orders. Curiously, we find that for the WMT'14 English $\to$ German and WMT'18 English $\to$ Chinese translation tasks, order does not have a substantial impact on output quality. Moreover, for English $\to$ German, we even discover that unintuitive orderings such as alphabetical and shortest-first can match the performance of a standard Transformer, suggesting that traditional left-to-right generation may not be necessary to achieve high performance.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.464,38939147 +main.2583,Supervised Seeded Iterated Learning for Interactive Language Learning,Yuchen Lu|Soumye Singhal|Florian Strub|Olivier Pietquin|Aaron Courville,"Language drift has been one of the major obstacles to train language models through interaction. When word-based conversational agents are trained towards completing a task, they tend to invent their language rather than leveraging natural language. In recent literature, two general methods partially counter this phenomenon: Supervised Selfplay (S2P) and Seeded Iterated Learning (SIL). 
While S2P jointly trains interactive and supervised losses to counter the drift, SIL changes the training dynamics to prevent language drift from occurring. In this paper, we first highlight their respective weaknesses, i.e., late-stage training collapses and higher negative likelihood when evaluated on human corpus. Given these observations, we introduce Supervised Seeded Iterated Learning (SSIL) to combine both methods to minimize their respective weaknesses. We then show the effectiveness of SSIL in the language-drift translation game.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.325,38939148 +main.2585,Learning Variational Word Masks to Improve the Interpretability of Neural Text Classifiers,Hanjie Chen|Yangfeng Ji,"To build an interpretable neural text classifier, most of the prior work has focused on designing inherently interpretable models or finding faithful explanations. A new line of work on improving model interpretability has just started, and many existing methods require either prior information or human annotations as additional inputs in training. To address this limitation, we propose the variational word mask (VMASK) method to automatically learn task-specific important words and reduce irrelevant information on classification, which ultimately improves the interpretability of model predictions. The proposed method is evaluated with three neural text classifiers (CNN, LSTM, and BERT) on seven benchmark text classification datasets. Experiments show the effectiveness of VMASK in improving both model prediction accuracy and interpretability.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.347,38939149 +main.2586,Training Question Answering Models from Synthetic Data,Raul Puri|Ryan Spring|Mohammad Shoeybi|Mostofa Patwary|Bryan Catanzaro,"Question and answer generation is a data augmentation method that aims to improve question answering (QA) models given the limited amount of human labeled data. However, a considerable gap remains between synthetic and human-generated question-answer pairs. This work aims to narrow this gap by taking advantage of large language models and explores several factors such as model size, quality of pretrained models, scale of data synthesized, and algorithmic choices. On the SQuAD1.1 question answering task, we achieve higher accuracy using solely synthetic questions and answers than when using the SQuAD1.1 training set questions alone. Removing access to real Wikipedia data, we synthesize questions and answers from a synthetic text corpus generated by an 8.3 billion parameter GPT-2 model and achieve 88.4 Exact Match (EM) and 93.9 F1 score on the SQuAD1.1 dev set. We further apply our methodology to SQuAD2.0 and show a 2.8 absolute gain on EM score compared to prior work using synthetic data.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.468,38939150 +main.2587,Dense Passage Retrieval for Open-Domain Question Answering,Vladimir Karpukhin|Barlas Oguz|Sewon Min|Patrick Lewis|Ledell Wu|Sergey Edunov|Danqi Chen|Wen-tau Yih,"Open-domain question answering relies on efficient passage retrieval to select candidate contexts, where traditional sparse vector space models, such as TF-IDF or BM25, are the de facto method. In this work, we show that retrieval can be practically implemented using dense representations alone, where embeddings are learned from a small number of questions and passages by a simple dual-encoder framework. 
When evaluated on a wide range of open-domain QA datasets, our dense retriever outperforms a strong Lucene-BM25 system greatly by 9%-19% absolute in terms of top-20 passage retrieval accuracy, and helps our end-to-end QA system establish new state-of-the-art on multiple open-domain QA benchmarks.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.550,38939151 +main.2590,Template Guided Text Generation for Task Oriented Dialogue,Mihir Kale|Abhinav Rastogi,"Virtual assistants such as Google Assistant, Amazon Alexa, and Apple Siri enable users to interact with a large number of services and APIs on the web using natural language. In this work, we investigate two methods for Natural Language Generation (NLG) using a single domain-independent model across a large number of APIs. First, we propose a schema-guided approach which conditions the generation on a schema describing the API in natural language. Our second method investigates the use of a small number of templates, growing linearly in number of slots, to convey the semantics of the API. To generate utterances for an arbitrary slot combination, a few simple templates are first concatenated to give a semantically correct, but possibly incoherent and ungrammatical utterance. A pre-trained language model is subsequently employed to rewrite it into coherent, natural sounding text. Through automatic metrics and human evaluation, we show that our method improves over strong baselines, is robust to out-of-domain inputs and shows improved sample efficiency.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.527,38939152 +main.2596,Compositional Demographic Word Embeddings,Charles Welch|Jonathan K. Kummerfeld|Verónica Pérez-Rosas|Rada Mihalcea,"Word embeddings are usually derived from corpora containing text from many individuals, thus leading to general purpose representations rather than individually personalized representations. While personalized embeddings can be useful to improve language model performance and other language processing tasks, they can only be computed for people with a large amount of longitudinal data, which is not the case for new users. We propose a new form of personalized word embeddings that use demographic-specific word representations derived compositionally from full or partial demographic information for a user (i.e., gender, age, location, religion). We show that the resulting demographic-aware word representations outperform generic word representations on two tasks for English: language modeling and word associations. We further explore the trade-off between the number of available attributes and their relative effectiveness and discuss the ethical implications of using them.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.334,38939153 +main.26,Self-Paced Learning for Neural Machine Translation,Yu Wan|Baosong Yang|Derek F. Wong|Yikai Zhou|Lidia S. Chao|Haibo Zhang|Boxing Chen,"Recent studies have proven that the training of neural machine translation (NMT) can be facilitated by mimicking the learning process of humans. Nevertheless, achievements of such kind of curriculum learning rely on the quality of artificial schedule drawn up with the handcrafted features, e.g. sentence length or word rarity. 
We ameliorate this procedure with a more flexible manner by proposing self-paced learning, where NMT model is allowed to 1) automatically quantify the learning confidence over training examples; and 2) flexibly govern its learning via regulating the loss in each iteration step. Experimental results over multiple translation tasks demonstrate that the proposed model yields better performance than strong baselines and those models trained with human-designed curricula on both translation quality and convergence speed.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.80,38938638 +main.2608,Biomedical Event Extraction as Sequence Labeling,Alan Ramponi|Rob van der Goot|Rosario Lombardo|Barbara Plank,"We introduce Biomedical Event Extraction as Sequence Labeling (BeeSL), a joint end-to-end neural information extraction model. BeeSL recasts the task as sequence labeling, taking advantage of a multi-label aware encoding strategy and jointly modeling the intermediate tasks via multi-task learning. BeeSL is fast, accurate, end-to-end, and unlike current methods does not require any external knowledge base or preprocessing tools. BeeSL outperforms the current best system (Li et al., 2019) on the Genia 2011 benchmark by 1.57% absolute F1 score reaching 60.22% F1, establishing a new state of the art for the task. Importantly, we also provide first results on biomedical event extraction without gold entity information. Empirical results show that BeeSL's speed and accuracy makes it a viable approach for large-scale real-world scenarios.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.431,38939154 +main.2612,INSPIRED: Toward Sociable Recommendation Dialog Systems,Shirley Anugrah Hayati|Dongyeop Kang|Qingxiaoyang Zhu|Weiyan Shi|Zhou Yu,"In recommendation dialogs, humans commonly disclose their preference and make recommendations in a friendly manner. However, this is a challenge when developing a sociable recommendation dialog system, due to the lack of dialog dataset annotated with such sociable strategies. Therefore, we present INSPIRED, a new dataset of 1,001 human-human dialogs for movie recommendation with measures for successful recommendations. To better understand how humans make recommendations in communication, we design an annotation scheme related to recommendation strategies based on social science theories and annotate these dialogs. Our analysis shows that sociable recommendation strategies, such as sharing personal opinions or communicating with encouragement, more frequently lead to successful recommendations. Based on our dataset, we train end-to-end recommendation dialog systems with and without our strategy labels. In both automatic and human evaluation, our model with strategy incorporation outperforms the baseline model. This work is a first step for building sociable recommendation dialog systems with a basis of social science theories.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.654,38939155 +main.2614,Fortifying Toxic Speech Detectors against Disguised Toxicity,Xiaochuang Han|Yulia Tsvetkov,"Modern toxic speech detectors are incompetent in recognizing disguised offensive language, such as adversarial attacks that deliberately avoid known toxic lexicons, or manifestations of implicit bias. Building a large annotated dataset for such veiled toxicity can be very expensive. 
In this work, we propose a framework aimed at fortifying existing toxic speech detectors without a large labeled corpus of veiled toxicity. Just a handful of probing examples are used to surface orders of magnitude more disguised offenses. We augment the toxic speech detector's training data with these discovered offensive examples, thereby making it more robust to veiled toxicity while preserving its utility in detecting overt toxicity.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.622,38939156 +main.2615,Calibration of Pre-trained Transformers,Shrey Desai|Greg Durrett,"Pre-trained Transformers are now ubiquitous in natural language processing, but despite their high end-task performance, little is known empirically about whether they are calibrated. Specifically, do these models' posterior probabilities provide an accurate empirical measure of how likely the model is to be correct on a given example? We focus on BERT and RoBERTa in this work, and analyze their calibration across three tasks: natural language inference, paraphrase detection, and commonsense reasoning. For each task, we consider in-domain as well as challenging out-of-domain settings, where models face more examples they should be uncertain about. We show that: (1) when used out-of-the-box, pre-trained models are calibrated in-domain, and compared to baselines, their calibration error out-of-domain can be as much as 3.5x lower; (2) temperature scaling is effective at further reducing calibration error in-domain, and using label smoothing to deliberately increase empirical uncertainty helps calibrate posteriors out-of-domain.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.21,38939157 +main.2630,X-FACTR: Multilingual Factual Knowledge Retrieval from Pretrained Language Models,Zhengbao Jiang|Antonios Anastasopoulos|Jun Araki|Haibo Ding|Graham Neubig,"Language models (LMs) have proven surprisingly successful at capturing factual knowledge by completing cloze-style fill-in-the-blank questions such as ""Punta Cana is located in _."" However, while knowledge is both written and queried in many languages, studies on LMs' factual representation ability have almost invariably been performed on English. To assess factual knowledge retrieval in LMs in different languages, we create a multilingual benchmark of cloze-style probes for \langnum typologically diverse languages. To properly handle language variations, we expand probing methods from single- to multi-word entities, and develop several decoding algorithms to generate multi-token predictions. Extensive experimental results provide insights about how well (or poorly) current state-of-the-art LMs perform at this task in languages with more or fewer available resources. We further propose a code-switching-based method to improve the ability of multilingual LMs to access knowledge, and verify its effectiveness on several benchmark languages. Benchmark data and code have been released at https://x-factr.github.io.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.479,38939158 +main.2632,Autoregressive Knowledge Distillation through Imitation Learning,Alexander Lin|Jeremy Wohlwend|Howard Chen|Tao Lei,"The performance of autoregressive models on natural language generation tasks has dramatically improved due to the adoption of deep, self-attentive architectures. 
However, these gains have come at the cost of hindering inference speed, making state-of-the-art models cumbersome to deploy in real-world, time-sensitive settings. We develop a compression technique for autoregressive models that is driven by an imitation learning perspective on knowledge distillation. The algorithm is designed to address the exposure bias problem. On prototypical language generation tasks such as translation and summarization, our method consistently outperforms other distillation algorithms, such as sequence-level knowledge distillation. Student models trained with our method attain 1.4 to 4.8 BLEU/ROUGE points higher than those trained from scratch, while increasing inference speed by up to 14 times in comparison to the teacher model.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.494,38939159 +main.2635,Cross-Thought for Sentence Encoder Pre-training,Shuohang Wang|Yuwei Fang|Siqi Sun|Zhe Gan|Yu Cheng|Jingjing Liu|Jing Jiang,"In this paper, we propose Cross-Thought, a novel approach to pre-training sequence encoder, which is instrumental in building reusable sequence embeddings for large-scale NLP tasks such as question answering. Instead of using the original signals of full sentences, we train a Transformer-based sequence encoder over a large set of short sequences, which allows the model to automatically select the most useful information for predicting masked words. Experiments on question answering and textual entailment tasks demonstrate that our pre-trained encoder can outperform state-of-the-art encoders trained with continuous sentence signals as well as traditional masked language modeling baselines. Our proposed approach also achieves new state of the art on HotpotQA (full-wiki setting) by improving intermediate information retrieval performance.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.30,38939160 +main.2636,An Exploration of Arbitrary-Order Sequence Labeling via Energy-Based Inference Networks,Lifu Tu|Tianyu Liu|Kevin Gimpel,"Many tasks in natural language processing involve predicting structured outputs, e.g., sequence labeling, semantic role labeling, parsing, and machine translation. Researchers are increasingly applying deep representation learning to these problems, but the structured component of these approaches is usually quite simplistic. In this work, we propose several high-order energy terms to capture complex dependencies among labels in sequence labeling, including several that consider the entire label sequence. We use neural parameterizations for these energy terms, drawing from convolutional, recurrent, and self-attention networks. We use the framework of learning energy-based inference networks (Tu and Gimpel, 2018) for dealing with the difficulties of training and inference with such models. We empirically demonstrate that this approach achieves substantial improvement using a variety of high-order energy terms on four sequence labeling tasks, while having the same decoding speed as simple, local classifiers. 
We also find high-order energies to help in noisy data conditions.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.449,38939161 +main.2638,Predicting Reference: What Do Language Models Learn about Discourse Models?,Shiva Upadhye|Leon Bergen|Andrew Kehler,"Whereas there is a growing literature that probes neural language models to assess the degree to which they have latently acquired grammatical knowledge, little if any research has investigated their acquisition of discourse modeling ability. We address this question by drawing on a rich psycholinguistic literature that has established how different contexts affect referential biases concerning who is likely to be referred to next. The results reveal that, for the most part, the prediction behavior of neural language models does not resemble that of human language users.",,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Long,https://www.aclweb.org/anthology/2020.emnlp-main.70,38939162 +main.2640,Self-Supervised Knowledge Triplet Learning for Zero-shot Question Answering,Pratyay Banerjee|Chitta Baral,"The aim of all Question Answering (QA) systems is to generalize to unseen questions. Current supervised methods are reliant on expensive data annotation. Moreover, such annotations can introduce unintended annotator bias, making systems focus more on the bias than the actual task. This work proposes Knowledge Triplet Learning (KTL), a self-supervised task over knowledge graphs. We propose heuristics to create synthetic graphs for commonsense and scientific knowledge. We propose using KTL to perform zero-shot question answering, and our experiments show considerable improvements over large pre-trained transformer language models.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.11,38939163 +main.2641,End-to-End Slot Alignment and Recognition for Cross-Lingual {NLU},Weijia Xu|Batool Haider|Saab Mansour,"Natural language understanding (NLU) in the context of goal-oriented dialog systems typically includes intent classification and slot labeling tasks. Existing methods to expand an NLU system to new languages use machine translation with slot label projection from source to the translated utterances, and thus are sensitive to projection errors. In this work, we propose a novel end-to-end model that learns to align and predict target slot labels jointly for cross-lingual transfer. We introduce MultiATIS++, a new multilingual NLU corpus that extends the Multilingual ATIS corpus to nine languages across four language families, and evaluate our method using the corpus. Results show that our method outperforms a simple label projection method using fast-align on most languages, and achieves competitive performance to the more complex, state-of-the-art projection method with only half of the training time. We release our MultiATIS++ corpus to the community to continue future research on cross-lingual NLU.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.410,38939164 +main.2644,CrowS-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models,Nikita Nangia|Clara Vania|Rasika Bhalerao|Samuel R. Bowman,"Pretrained language models, especially masked language models (MLMs) have seen success across many NLP tasks. However, there is ample evidence that they use the cultural biases that are undoubtedly present in the corpora they are trained on, implicitly creating harm with biased representations. 
To measure some forms of social bias in language models against protected demographic groups in the US, we introduce the Crowdsourced Stereotype Pairs benchmark (CrowS-Pairs). CrowS-Pairs has 1508 examples that cover stereotypes dealing with nine types of bias, like race, religion, and age. In CrowS-Pairs a model is presented with two sentences: one that is more stereotyping and another that is less stereotyping. The data focuses on stereotypes about historically disadvantaged groups and contrasts them with advantaged groups. We find that all three of the widely-used MLMs we evaluate substantially favor sentences that express stereotypes in every category in CrowS-Pairs. As work on building less biased models advances, this dataset can be used as a benchmark to evaluate progress.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.154,38939165 +main.2650,Understanding Neural Abstractive Summarization Models via Uncertainty,Jiacheng Xu|Shrey Desai|Greg Durrett,"An advantage of seq2seq abstractive summarization models is that they generate text in a free-form manner, but this flexibility makes it difficult to interpret model behavior. In this work, we analyze summarization decoders in both blackbox and whitebox ways by studying on the entropy, or uncertainty, of the model's token-level predictions. For two strong pre-trained models, PEGASUS and BART on two summarization datasets, we find a strong correlation between low prediction entropy and where the model copies tokens rather than generating novel text. The decoder's uncertainty also connects to factors like sentence position and syntactic distance between adjacent pairs of tokens, giving a sense of what factors make a context particularly selective for the model's next output token. Finally, we study the relationship of decoder uncertainty and attention behavior to understand how attention gives rise to these observed effects in the model. We show that uncertainty is a useful perspective for analyzing summarization and text generation models more broadly.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.508,38939166 +main.2651,An Embedding Model for Estimating Legislative Preferences from the Frequency and Sentiment of Tweets,Gregory Spell|Brian Guay|Sunshine Hillygus|Lawrence Carin,"Legislator preferences are typically represented as measures of general ideology estimated from roll call votes on legislation, potentially masking important nuances in legislators’ political attitudes. In this paper we introduce a method of measuring more specific legislator attitudes using an alternative expression of preferences: tweeting. Specifically, we present an embedding-based model for predicting the frequency and sentiment of legislator tweets. To illustrate our method, we model legislators’ attitudes towards President Donald Trump as vector embeddings that interact with embeddings for Trump himself constructed using a neural network from the text of his daily tweets. We demonstrate the predictive performance of our model on tweets authored by members of the U.S. House and Senate related to the president from November 2016 to February 2018. 
We further assess the quality of our learned representations for legislators by comparing to traditional measures of legislator preferences.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.46,38939167 +main.2661,Distilling Multiple Domains for Neural Machine Translation,Anna Currey|Prashant Mathur|Georgiana Dinu,"Neural machine translation achieves impressive results in high-resource conditions, but performance often suffers when the input domain is low-resource. The standard practice of adapting a separate model for each domain of interest does not scale well in practice from both a quality perspective (brittleness under domain shift) as well as a cost perspective (added maintenance and inference complexity). In this paper, we propose a framework for training a single multi-domain neural machine translation model that is able to translate several domains without increasing inference time or memory usage. We show that this model can improve translation on both high- and low-resource domains over strong multi-domain baselines. In addition, our proposed model is effective when domain labels are unknown during training, as well as robust under noisy data conditions.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.364,38939168 +main.267,Making Monolingual Sentence Embeddings Multilingual Using Knowledge Distillation,Nils Reimers|Iryna Gurevych,"We present an easy and efficient method to extend existing sentence embedding models to new languages. This allows to create multilingual versions from previously monolingual models. The training is based on the idea that a translated sentence should be mapped to the same location in the vector space as the original sentence. We use the original (monolingual) model to generate sentence embeddings for the source language and then train a new system on translated sentences to mimic the original model. Compared to other methods for training multilingual sentence embeddings, this approach has several advantages: It is easy to extend existing models with relatively few samples to new languages, it is easier to ensure desired properties for the vector space, and the hardware requirements for training are lower. We demonstrate the effectiveness of our approach for 50+ languages from various language families. Code to extend sentence embeddings models to more than 400 languages is publicly available.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.365,38938673 +main.2674,Data Rejuvenation: Exploiting Inactive Training Examples for Neural Machine Translation,Wenxiang Jiao|Xing Wang|Shilin He|Irwin King|Michael Lyu|Zhaopeng Tu,"Large-scale training datasets lie at the core of the recent success of neural machine translation (NMT) models. However, the complex patterns and potential noises in the large-scale data make training NMT models difficult. In this work, we explore to identify the inactive training examples which contribute less to the model performance, and show that the existence of inactive examples depends on the data distribution. We further introduce data rejuvenation to improve the training of NMT models on large-scale datasets by exploiting inactive examples. The proposed framework consists of three phases. First, we train an identification model on the original training data, and use it to distinguish inactive examples and active examples by their sentence-level output probabilities. 
Then, we train a rejuvenation model on the active examples, which is used to re-label the inactive examples with forward-translation. Finally, the rejuvenated examples and the active examples are combined to train the final NMT model. Experimental results on WMT14 English-German and English-French datasets show that the proposed data rejuvenation consistently and significantly improves performance for several strong NMT models. Extensive analyses reveal that our approach stabilizes and accelerates the training process of NMT models, resulting in final models with better generalization capability.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.176,38939169 +main.2675,Tackling the Low-resource Challenge for Canonical Segmentation,Manuel Mager|Özlem Çetinoğlu|Katharina Kann,"Canonical morphological segmentation consists of dividing words into their standardized morphemes. Here, we are interested in approaches for the task when training data is limited. We compare model performance in a simulated low-resource setting for the high-resource languages German, English, and Indonesian to experiments on new datasets for the truly low-resource languages Popoluca and Tepehua. We explore two new models for the task, borrowing from the closely related area of morphological generation: an LSTM pointer-generator and a sequence-to-sequence model with hard monotonic attention trained with imitation learning. We find that, in the low-resource setting, the novel approaches out-perform existing ones on all languages by up to 11.4% accuracy. However, while accuracy in emulated low-resource scenarios is over 50% for all languages, for the truly low-resource languages Popoluca and Tepehua, our best model only obtains 37.4% and 28.4% accuracy, respectively. Thus, we conclude that canonical segmentation is still a challenging task for low-resource languages.",,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.emnlp-main.423,38939170 +main.2684,On the Role of Supervision in Unsupervised Constituency Parsing,Haoyue Shi|Karen Livescu|Kevin Gimpel,"We analyze several recent unsupervised constituency parsing models, which are tuned with respect to the parsing F1 score on the Wall Street Journal (WSJ) development set (1,700 sentences). We introduce strong baselines for them, by training an existing supervised parsing model (Kitaev and Klein, 2018) on the same labeled examples they access. When training on the 1,700 examples, or even when using only 50 examples for training and 5 for development, such a few-shot parsing approach can outperform all the unsupervised parsing methods by a significant margin. Few-shot parsing can be further improved by a simple data augmentation method and self-training. This suggests that, in order to arrive at fair conclusions, we should carefully consider the amount of labeled data used for model development. 
We propose two protocols for future work on unsupervised parsing: (i) use fully unsupervised criteria for hyperparameter tuning and model selection; (ii) use as few labeled examples as possible for model development, and compare to few-shot parsing trained on the same labeled examples.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.614,38939171 +main.2688,Quantitative Argument Summarization and beyond: Cross-Domain Key Point Analysis,Roy Bar-Haim|Yoav Kantor|Lilach Eden|Roni Friedman|Dan Lahav|Noam Slonim,"When summarizing a collection of views, arguments or opinions on some topic, it is often desirable not only to extract the most salient points, but also to quantify their prevalence. Work on multi-document summarization has traditionally focused on creating textual summaries, which lack this quantitative aspect. Recent work has proposed to summarize arguments by mapping them to a small set of expert-generated key points, where the salience of each key point corresponds to the number of its matching arguments. The current work advances key point analysis in two important respects: first, we develop a method for automatic extraction of key points, which enables fully automatic analysis, and is shown to achieve performance comparable to a human expert. Second, we demonstrate that the applicability of key point analysis goes well beyond argumentation data. Using models trained on publicly available argumentation datasets, we achieve promising results in two additional domains: municipal surveys and user reviews. An additional contribution is an in-depth evaluation of argument-to-key point matching models, where we substantially outperform previous results.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.3,38939172 +main.2696,On the Ability of Self-Attention Networks to Recognize Counter Languages,Satwik Bhattamishra|Kabir Ahuja|Navin Goyal,"Transformers have supplanted recurrent models in a large number of NLP tasks. However, the differences in their abilities to model different syntactic properties remain largely unknown. Past works suggest that LSTMs generalize very well on regular languages and have close connections with counter languages. In this work, we systematically study the ability of Transformers to model such languages as well as the role of its individual components in doing so. We first provide a construction of Transformers for a subclass of counter languages, including well-studied languages such as n-ary Boolean Expressions, Dyck-1, and its generalizations. In experiments, we find that Transformers do well on this subclass, and their learned mechanism strongly correlates with our construction. Perhaps surprisingly, in contrast to LSTMs, Transformers do well only on a subset of regular languages with degrading performance as we make languages more complex according to a well-known measure of complexity. Our analysis also provides insights on the role of self-attention mechanism in modeling certain behaviors and the influence of positional encoding schemes on the learning and generalization abilities of the model.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.576,38939173 +main.2702,Visually Grounded Compound PCFGs,Yanpeng Zhao|Ivan Titov,"Exploiting visual groundings for language understanding has recently been drawing much attention. 
In this work, we study visually grounded grammar induction and learn a constituency parser from both unlabeled text and its visual groundings. Existing work on this task (Shi et al., 2019) optimizes a parser via Reinforce and derives the learning signal only from the alignment of images and sentences. While their model is relatively accurate overall, its error distribution is very uneven, with low performance on certain constituents types (e.g., 26.2% recall on verb phrases, VPs) and high on others (e.g., 79.6% recall on noun phrases, NPs). This is not surprising as the learning signal is likely insufficient for deriving all aspects of phrase-structure syntax and gradient estimates are noisy. We show that using an extension of probabilistic context-free grammar model we can do fully-differentiable end-to-end visually grounded learning. Additionally, this enables us to complement the image-text alignment loss with a language modeling objective. On the MSCOCO test captions, our model establishes a new state of the art, outperforming its non-grounded version and, thus, confirming the effectiveness of visual groundings in constituency grammar induction. It also substantially outperforms the previous grounded model, with largest improvements on more `abstract' categories (e.g., +55.1% recall on VPs).",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.354,38939174 +main.2705,Dataset Cartography: Mapping and Diagnosing Datasets with Training Dynamics,Swabha Swayamdipta|Roy Schwartz|Nicholas Lourie|Yizhong Wang|Hannaneh Hajishirzi|Noah A. Smith|Yejin Choi,"Large datasets have become commonplace in NLP research. However, the increased emphasis on data quantity has made it challenging to assess the quality of data. We introduce Data Maps---a model-based tool to characterize and diagnose datasets. We leverage a largely ignored source of information: the behavior of the model on individual instances during training (training dynamics) for building data maps. This yields two intuitive measures for each example---the model's confidence in the true class, and the variability of this confidence across epochs---obtained in a single run of training. Experiments on four datasets show that these model-dependent measures reveal three distinct regions in the data map, each with pronounced characteristics. First, our data maps show the presence of ""ambiguous"" regions with respect to the model, which contribute the most towards out-of-distribution generalization. Second, the most populous regions in the data are ""easy to learn"" for the model, and play an important role in model optimization. Finally, data maps uncover a region with instances that the model finds ""hard to learn""; these often correspond to labeling errors. Our results indicate that a shift in focus from quantity to quality of data could lead to robust models and improved out-of-distribution generalization.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.746,38939175 +main.2707,A Computational Approach to Understanding Empathy Expressed in Text-Based Mental Health Support,Ashish Sharma|Adam Miner|David Atkins|Tim Althoff,"Empathy is critical to successful mental health support. Empathy measurement has predominantly occurred in synchronous, face-to-face settings, and may not translate to asynchronous, text-based contexts. 
Because millions of people use text-based platforms for mental health support, understanding empathy in these contexts is crucial. In this work, we present a computational approach to understanding how empathy is expressed in online mental health platforms. We develop a novel unifying theoretically-grounded framework for characterizing the communication of empathy in text-based conversations. We collect and share a corpus of 10k (post, response) pairs annotated using this empathy framework with supporting evidence for annotations (rationales). We develop a multi-task RoBERTa-based bi-encoder model for identifying empathy in conversations and extracting rationales underlying its predictions. Experiments demonstrate that our approach can effectively identify empathic conversations. We further apply this model to analyze 235k mental health interactions and show that users do not self-learn empathy over time, revealing opportunities for empathy training and feedback.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.425,38939176 +main.2712,Zero-Shot Stance Detection: A Dataset and Model Using Generalized Topic Representations,Emily Allaway|Kathleen McKeown,"Stance detection is an important component of understanding hidden influences in everyday life. Since there are thousands of potential topics to take a stance on, most with little to no training data, we focus on zero-shot stance detection: classifying stance from no training examples. In this paper, we present a new dataset for zero-shot stance detection that captures a wider range of topics and lexical variation than in previous datasets. Additionally, we propose a new model for stance detection that implicitly captures relationships between topics using generalized topic representations and show that this model improves performance on a number of challenging linguistic phenomena.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.717,38939177 +main.2718,Bridging Linguistic Typology and Multilingual Machine Translation with Multi-view Language Representations,Arturo Oncevay|Barry Haddow|Alexandra Birch,"Sparse language vectors from linguistic typology databases and learned embeddings from tasks like multilingual machine translation have been investigated in isolation, without analysing how they could benefit from each other's language characterisation. We propose to fuse both views using singular vector canonical correlation analysis and study what kind of information is induced from each source. By inferring typological features and language phylogenies, we observe that our representations embed typology and strengthen correlations with language relationships. We then take advantage of our multi-view language vector space for multilingual machine translation, where we achieve competitive overall translation accuracy in tasks that require information about language similarities, such as language clustering and ranking candidates for multilingual transfer. 
With our method, we can easily project and assess new languages without expensive retraining of massive multilingual or ranking models, which are major disadvantages of related approaches.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.187,38939178 +main.2721,More Bang for Your Buck: Natural Perturbation for Robust Question Answering,Daniel Khashabi|Tushar Khot|Ashish Sabharwal,"Deep learning models for linguistic tasks require large training datasets, which are expensive to create. As an alternative to the traditional approach of creating new instances by repeating the process of creating one instance, we propose doing so by first collecting a set of seed examples and then applying human-driven natural perturbations (as opposed to rule-based machine perturbations), which often change the gold label as well. Such perturbations have the advantage of being relatively easier (and hence cheaper) to create than writing out completely new examples. Further, they help address the issue that even models achieving human-level scores on NLP datasets are known to be considerably sensitive to small changes in input. To evaluate the idea, we consider a recent question-answering dataset (BOOLQ) and study our approach as a function of the perturbation cost ratio, the relative cost of perturbing an existing question vs. creating a new one from scratch. We find that when natural perturbations are moderately cheaper to create (cost ratio under 60%), it is more effective to use them for training BOOLQ models: such models exhibit 9% higher robustness and 4.5% stronger generalization, while retaining performance on the original BOOLQ dataset.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.12,38939179 +main.2724,Program Enhanced Fact Verification with Verbalization and Graph Attention Network,Xiaoyu Yang|Feng Nie|Yufei Feng|Quan Liu|Zhigang Chen|Xiaodan Zhu,"Performing fact verification based on structured data is important for many real-life applications and is a challenging research problem, particularly when it involves both symbolic operations and informal inference based on language understanding. In this paper, we present a Program-enhanced Verbalization and Graph Attention Network (ProgVGAT) to integrate programs and execution into textual inference models. Specifically, a verbalization with program execution model is proposed to accumulate evidences that are embedded in operations over the tables. Built on that, we construct the graph attention verification networks, which are designed to fuse different sources of evidences from verbalized program execution, program structures, and the original statements and tables, to make the final verification decision. To support the above framework, we propose a program selection module optimized with a new training strategy based on margin loss, to produce more accurate programs, which is shown to be effective in enhancing the final verification results. 
Experimental results show that the proposed framework achieves the new state-of-the-art performance, a 74.4% accuracy, on the benchmark dataset TABFACT.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.628,38939180 +main.2733,An Effective Data Augmentation Method for Low-resource Tagging Tasks,BOSHENG DING|Linlin Liu|Lidong Bing|Canasai Kruengkrai|Thien Hai Nguyen|Shafiq Joty|Luo Si|Chunyan Miao,"Data augmentation techniques have been widely used to improve machine learning performance as they facilitate generalization. In this work, we propose a novel augmentation method to generate high quality synthetic data for low-resource tagging tasks with language models trained on the linearized labeled sentences. Our method is applicable to both supervised and semi-supervised settings. For the supervised settings, we conduct extensive experiments on named entity recognition (NER), part of speech (POS) tagging and end-to-end target based sentiment analysis (E2E-TBSA) tasks. For the semi-supervised settings, we evaluate our method on the NER task under the conditions of given unlabeled data only and unlabeled data plus a knowledge base. The results show that our method can consistently outperform the baselines, particularly when the given gold training data are less.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.488,38939181 +main.2739,Exposing Shallow Heuristics of Relation Extraction Models with Challenge Data,Shachar Rosenman|Alon Jacovi|Yoav Goldberg,"The process of collecting and annotating training data may introduce distribution artifacts which may limit the ability of models to learn correct generalization behavior. We identify failure modes of SOTA relation extraction (RE) models trained on TACRED, which we attribute to limitations in the data annotation process. We collect and annotate a challenge-set we call Challenging RE (CRE), based on naturally occurring corpus examples, to benchmark this behavior. Our experiments with four state-of-the-art RE models show that they have indeed adopted shallow heuristics that do not generalize to the challenge-set data. Further, we find that alternative question answering modeling performs significantly better than the SOTA models on the challenge-set, despite worse overall TACRED performance. By adding some of the challenge data as training examples, the performance of the model improves. Finally, we provide concrete suggestion on how to improve RE data collection to alleviate this behavior.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.302,38939182 +main.2746,A Massive Collection of Cross-Lingual Web-Document Pairs,Ahmed El-Kishky|Vishrav Chaudhary|Francisco Guzmán|Philipp Koehn,"Cross-lingual document alignment aims to identify pairs of documents in two distinct languages that are of comparable content or translations of each other. In this paper, we exploit the signals embedded in URLs to label web documents at scale with an average precision of 94.5% across different language pairs. We mine sixty-eight snapshots of the Common Crawl corpus and identify web document pairs that are translations of each other. We release a new web dataset consisting of over 392 million URL pairs from Common Crawl covering documents in 8144 language pairs of which 137 pairs include English. 
In addition to curating this massive dataset, we introduce baseline methods that leverage cross-lingual representations to identify aligned documents based on their textual content. Finally, we demonstrate the value of this parallel documents dataset through a downstream task of mining parallel sentences and measuring the quality of machine translations from models trained on this mined data. Our objective in releasing this dataset is to foster new research in cross-lingual NLP across a variety of low, medium, and high-resource languages.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.480,38939183 +main.2750,Constrained Fact Verification for FEVER,Adithya Pratapa|Sai Muralidhar Jayanthi|Kavya Nerella,"Fact-verification systems are well explored in the NLP literature with growing attention owing to shared tasks like FEVER. Though the task requires reasoning on extracted evidence to verify a claim's factuality, there is little work on understanding the reasoning process. In this work, we propose a new methodology for fact-verification, specifically FEVER, that enforces a closed-world reliance on extracted evidence. We present an extensive evaluation of state-of-the-art verification models under these constraints.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.629,38939184 +main.2756,Neural Topic Modeling with Cycle-Consistent Adversarial Training,Xuemeng Hu|Rui Wang|Deyu Zhou|Yuxuan Xiong,"Advances on deep generative models have attracted significant research interest in neural topic modeling. The recently proposed Adversarial-neural Topic Model models topics with an adversarially trained generator network and employs Dirichlet prior to capture the semantic patterns in latent topics. It is effective in discovering coherent topics but unable to infer topic distributions for given documents or utilize available document labels. To overcome such limitations, we propose Topic Modeling with Cycle-consistent Adversarial Training (ToMCAT) and its supervised version sToMCAT. ToMCAT employs a generator network to interpret topics and an encoder network to infer document topics. Adversarial training and cycle-consistent constraints are used to encourage the generator and the encoder to produce realistic samples that coordinate with each other. sToMCAT extends ToMCAT by incorporating document labels into the topic modeling process to help discover more coherent topics. The effectiveness of the proposed models is evaluated on unsupervised/supervised topic modeling and text classification. The experimental results show that our models can produce both coherent and informative topics, outperforming a number of competitive baselines.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.725,38939185 +main.2758,Reading between the Lines: Exploring Infilling in Visual Narratives,Khyathi Raghavi Chandu|Ruo-Ping Dong|Alan W Black,"Generating long form narratives such as stories and procedures from multiple modalities has been a long standing dream for artificial intelligence. In this regard, there is often crucial subtext that is derived from the surrounding contexts. The general seq2seq training methods render the models shorthanded while attempting to bridge the gap between these neighbouring contexts. 
In this paper, we tackle this problem by using infilling techniques involving prediction of missing steps in a narrative while generating textual descriptions from a sequence of images. We also present a new large scale visual procedure telling (ViPT) dataset with a total of 46,200 procedures and around 340k pairwise images and textual descriptions that is rich in such contextual dependencies. Generating steps using infilling technique demonstrates the effectiveness in visual procedures with more coherent texts. We conclusively show a METEOR score of 27.51 on procedures which is higher than the state-of-the-art on visual storytelling. We also demonstrate the effects of interposing new text with missing images during inference. The code and the dataset will be publicly available at https://visual-narratives.github.io/Visual-Narratives/.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.93,38939186 +main.2761,Hierarchical Graph Network for Multi-hop Question Answering,Yuwei Fang|Siqi Sun|Zhe Gan|Rohit Pillai|Shuohang Wang|Jingjing Liu,"In this paper, we present Hierarchical Graph Network (HGN) for multi-hop question answering. To aggregate clues from scattered texts across multiple paragraphs, a hierarchical graph is created by constructing nodes on different levels of granularity (questions, paragraphs, sentences, entities), the representations of which are initialized with pre-trained contextual encoders. Given this hierarchical graph, the initial node representations are updated through graph propagation, and multi-hop reasoning is performed via traversing through the graph edges for each subsequent sub-task (e.g., paragraph selection, supporting facts extraction, answer prediction). By weaving heterogeneous nodes into an integral unified graph, this hierarchical differentiation of node granularity enables HGN to support different question answering sub-tasks simultaneously. Experiments on the HotpotQA benchmark demonstrate that the proposed model achieves new state of the art, outperforming existing multi-hop QA approaches.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.710,38939187 +main.2763,Eliciting Knowledge from Language Models Using Automatically Generated Prompts,Taylor Shin|Yasaman Razeghi|Robert L Logan IV|Eric Wallace|Sameer Singh,"The remarkable success of pretrained language models has motivated the study of what kinds of knowledge these models learn during pretraining. Reformulating tasks as fill-in-the-blanks problems (e.g., cloze tests) is a natural approach for gauging such knowledge, however, its usage is limited by the manual effort and guesswork required to write suitable prompts. To address this, we develop AutoPrompt, an automated method to create prompts for a diverse set of tasks, based on a gradient-guided search. Using AutoPrompt, we show that masked language models (MLMs) have an inherent capability to perform sentiment analysis and natural language inference without additional parameters or finetuning, sometimes achieving performance on par with recent state-of-the-art supervised models. We also show that our prompts elicit more accurate factual knowledge from MLMs than the manually created prompts on the LAMA benchmark, and that MLMs can be used as relation extractors more effectively than supervised relation extraction models. 
These results demonstrate that automatically generated prompts are a viable parameter-free alternative to existing probing methods, and as pretrained LMs become more sophisticated and capable, potentially a replacement for finetuning.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.346,38939188 +main.2764,Neural Topic Modeling by Incorporating Document Relationship Graph,Deyu Zhou|Xuemeng Hu|Rui Wang,"Graph Neural Networks (GNNs) that capture the relationships between graph nodes via message passing have been a hot research direction in the natural language processing community. In this paper, we propose Graph Topic Model (GTM), a GNN based neural topic model that represents a corpus as a document relationship graph. Documents and words in the corpus become nodes in the graph and are connected based on document-word co-occurrences. By introducing the graph structure, the relationships between documents are established through their shared words and thus the topical representation of a document is enriched by aggregating information from its neighboring nodes using graph convolution. Extensive experiments on three datasets were conducted and the results demonstrate the effectiveness of the proposed approach.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.310,38939189 +main.2766,Homophonic Pun Generation with Lexically Constrained Rewriting,Zhiwei Yu|Hongyu Zang|Xiaojun Wan,"Punning is a creative way to make conversation enjoyable and literary writing elegant. In this paper, we focus on the task of generating a pun sentence given a pair of homophones. We first find the constraint words supporting the semantic incongruity for a sentence. Then we rewrite the sentence with explicit positive and negative constraints. Our model achieves the state-of-the-art results in both automatic and human evaluations. We further make an error analysis and discuss the challenges for the computational pun models.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.229,38939190 +main.2767,A Streaming Approach for Efficient Batched Beam Search,Kevin Yang|Violet Yao|John DeNero|Dan Klein,"We propose an efficient batching strategy for variable-length decoding on GPU architectures. During decoding, when candidates terminate or are pruned according to heuristics, our streaming approach periodically ""refills"" the batch before proceeding with a selected subset of candidates. We apply our method to variable-width beam search on a state-of-the-art machine translation model. Our method decreases runtime by up to 71% compared to a fixed-width beam search baseline and 17% compared to a variable-width baseline, while matching baselines' BLEU. Finally, experiments show that our method can speed up decoding in other domains, such as semantic and syntactic parsing.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.366,38939191 +main.2777,Data and Representation for Turkish Natural Language Inference,Emrah Budur|Rıza Özçelik|Tunga Gungor|Christopher Potts,"Large annotated datasets in NLP are overwhelmingly in English. This is an obstacle to progress in other languages. Unfortunately, obtaining new annotated resources for each task in each language would be prohibitively expensive. At the same time, commercial machine translation systems are now robust. Can we leverage these systems to translate English-language datasets automatically? 
In this paper, we offer a positive response for natural language inference (NLI) in Turkish. We translated two large English NLI datasets into Turkish and had a team of experts validate their translation quality and fidelity to the original labels. Using these datasets, we address core issues of representation for Turkish NLI. We find that in-language embeddings are essential and that morphological parsing can be avoided where the training set is large. Finally, we show that models trained on our machine-translated datasets are successful on human-translated evaluation sets. We share all code, models, and data publicly.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.662,38939192 +main.2779,Routing Enforced Generative Model for Recipe Generation,Zhiwei Yu|Hongyu Zang|Xiaojun Wan,"One of the most challenging parts of recipe generation is to deal with the complex restrictions among the input ingredients. Previous research simplifies the problem by treating the inputs independently and generating recipes containing as much information as possible. In this work, we propose a routing method to dive into the content selection under the internal restrictions. The routing enforced generative model (RGM) can generate appropriate recipes according to the given ingredients and user preferences. Our model yields new state-of-the-art results on the recipe generation task with significant improvements on BLEU, F1 and human evaluation.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.311,38939193 +main.2783,TernaryBERT: Distillation-aware Ultra-low Bit BERT,Wei Zhang|Lu Hou|Yichun Yin|Lifeng Shang|Xiao Chen|Xin Jiang|Qun Liu,"Transformer-based pre-training models like BERT have achieved remarkable performance in many natural language processing tasks. However, these models are both computation and memory expensive, hindering their deployment to resource-constrained devices. In this work, we propose TernaryBERT, which ternarizes the weights in a fine-tuned BERT model. Specifically, we use both approximation-based and loss-aware ternarization methods and empirically investigate the ternarization granularity of different parts of BERT. Moreover, to reduce the accuracy degradation caused by lower capacity of low bits, we leverage the knowledge distillation technique in the training process. Experiments on the GLUE benchmark and SQuAD show that our proposed TernaryBERT outperforms the other BERT quantization methods, and even achieves comparable performance as the full-precision model while being 14.9x smaller.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.37,38939194 +main.2784,We Can Detect Your Bias: Predicting the Political Ideology of News Articles,Ramy Baly|Giovanni Da San Martino|James Glass|Preslav Nakov,"We explore the task of predicting the leading political ideology or bias of news articles. First, we collect and release a large dataset of 34,737 articles that were manually annotated for political ideology --left, center, or right--, which is well-balanced across both topics and media. We further use a challenging experimental setup where the test examples come from media that were not seen during training, which prevents the model from learning to detect the source of the target news article instead of predicting its political ideology. From a modeling perspective, we propose an adversarial media adaptation, as well as a specially adapted triplet loss. 
We further add background information about the source, and we show that it is quite helpful for improving article-level prediction. Our experimental results show very sizable improvements over using state-of-the-art pre-trained Transformers in this challenging setup.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.404,38939195 +main.279,Learning Physical Common Sense as Knowledge Graph Completion via BERT Data Augmentation and Constrained Tucker Factorization,Zhenjie Zhao|Evangelos Papalexakis|Xiaojuan Ma,"Physical common sense plays an essential role in the cognition abilities of robots for human-robot interaction. Machine learning methods have shown promising results on physical commonsense learning in natural language processing but still suffer from model generalization. In this paper, we formulate physical commonsense learning as a knowledge graph completion problem to better use the latent relationships among training samples. Compared with completing general knowledge graphs, completing a physical commonsense knowledge graph has three unique characteristics: training data are scarce, not all facts can be mined from existing texts, and the number of relationships is small. To deal with these problems, we first use a pre-training language model BERT to augment training data, and then employ constrained tucker factorization to model complex relationships by constraining types and adding negative relationships. We compare our method with existing state-of-the-art knowledge graph embedding methods and show its superior performance.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.266,38938674 +main.2790,Semantic Label Smoothing for Sequence to Sequence Problems,Michal Lukasik|Himanshu Jain|Aditya Menon|Seungyeon Kim|Srinadh Bhojanapalli|Felix Yu|Sanjiv Kumar,"Label smoothing has been shown to be an effective regularization strategy in classification, that prevents overfitting and helps in label de-noising. However, extending such methods directly to seq2seq settings, such as Machine Translation, is challenging: the large target output space of such problems makes it intractable to apply label smoothing over all possible outputs. Most existing approaches for seq2seq settings either do token level smoothing, or smooth over sequences generated by randomly substituting tokens in the target sequence. Unlike these works, in this paper, we propose a technique that smooths over \emph{well formed} relevant sequences that not only have sufficient n-gram overlap with the target sequence, but are also \emph{semantically similar}. Our method shows a consistent and significant improvement over the state-of-the-art techniques on different datasets.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.405,38939196 +main.2792,Tired of Topic Models? Clusters of Pretrained Word Embeddings Make for Fast and Good Topics Too!,Suzanna Sia|Ayush Dalmia|Sabrina J. Mielke,"Topic models are a useful analysis tool to uncover the underlying themes within document collections. The dominant approach is to use probabilistic topic models that posit a generative story, but in this paper we propose an alternative way to obtain topics: clustering pre-trained word embeddings while incorporating document information for weighted clustering and reranking top words. 
We provide benchmarks for the combination of different word embeddings and clustering algorithms, and analyse their performance under dimensionality reduction with PCA. The best performing combination for our approach performs as well as classical topic models, but with lower runtime and computational complexity.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.135,38939197 +main.2793,Self-Supervised Meta-Learning for Few-Shot Natural Language Classification Tasks,Trapit Bansal|Rishikesh Jha|Tsendsuren Munkhdalai|Andrew McCallum,"Self-supervised pre-training of transformer models has revolutionized NLP applications. Such pre-training with language modeling objectives provides a useful initial point for parameters that generalize well to new tasks with fine-tuning. However, fine-tuning is still data inefficient --- when there are few labeled examples, accuracy can be low. Data efficiency can be improved by optimizing pre-training directly for future fine-tuning with few examples; this can be treated as a meta-learning problem. However, standard meta-learning techniques require many training tasks in order to generalize; unfortunately, finding a diverse set of such supervised tasks is usually difficult. This paper proposes a self-supervised approach to generate a large, rich, meta-learning task distribution from unlabeled text. This is achieved using a cloze-style objective, but creating separate multi-class classification tasks by gathering tokens-to-be blanked from among only a handful of vocabulary terms. This yields as many unique meta-training tasks as the number of subsets of vocabulary terms. We meta-train a transformer model on this distribution of tasks using a recent meta-learning framework. On 17 NLP tasks, we show that this meta-training leads to better few-shot generalization than language-model pre-training followed by finetuning. Furthermore, we show how the self-supervised tasks can be combined with supervised tasks for meta-learning, providing substantial accuracy gains over previous supervised meta-learning.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.38,38939198 +main.2795,"Lightweight, Dynamic Graph Convolutional Networks for AMR-to-Text Generation",Yan Zhang|Zhijiang Guo|Zhiyang Teng|Wei Lu|Shay B. Cohen|ZUOZHU LIU|Lidong Bing,"AMR-to-text generation is used to transduce Abstract Meaning Representation structures (AMR) into text. A key challenge in this task is to efficiently learn effective graph representations. Previously, Graph Convolution Networks (GCNs) were used to encode input AMRs, however, vanilla GCNs are not able to capture non-local information and additionally, they follow a local (first-order) information aggregation scheme. To account for these issues, larger and deeper GCN models are required to capture more complex interactions. In this paper, we introduce a dynamic fusion mechanism, proposing Lightweight Dynamic Graph Convolutional Networks (LDGCNs) that capture richer non-local interactions by synthesizing higher order information from the input graphs. We further develop two novel parameter saving strategies based on the group graph convolutions and weight tied convolutions to reduce memory usage and model complexity. With the help of these strategies, we are able to train a model with fewer parameters while maintaining the model capacity. 
Experiments demonstrate that LDGCNs outperform state-of-the-art models on two benchmark datasets for AMR-to-text generation with significantly fewer parameters.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.169,38939199 +main.2799,Frustratingly Simple Few-Shot Named Entity Recognition with Structured Nearest Neighbor Learning,Yi Yang|Arzoo Katiyar,"We present a simple few-shot named entity recognition (NER) system based on nearest neighbor learning and structured inference. Our system uses a supervised NER model trained on the source domain, as a feature extractor. Across several test domains, we show that a nearest neighbor classifier in this feature-space is far more effective than the standard meta-learning approaches. We further propose a cheap but effective method to capture the label dependencies between entity tags without expensive CRF training. We show that our method of combining structured decoding with nearest neighbor learning achieves state-of-the-art performance on standard few-shot NER evaluation tasks, improving F1 scores by $6\%$ to $16\%$ absolute points over prior meta-learning based systems.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.516,38939200 +main.2809,A Preliminary Exploration of GANs for Keyphrase Generation,Avinash Swaminathan|Haimin Zhang|Debanjan Mahata|Rakesh Gosangi|Rajiv Ratn Shah|Amanda Stent,"We introduce a new keyphrase generation approach using Generative Adversarial Networks (GANs). For a given document, the generator produces a sequence of keyphrases, and the discriminator distinguishes between human-curated and machine-generated keyphrases. We evaluated this approach on standard benchmark datasets. We observed that our model achieves state-of-the-art performance in the generation of abstractive keyphrases and is comparable to the best performing extractive techniques. Although we achieve promising results using GANs, they are not significantly better than the state-of-the-art generative models. To our knowledge, this is one of the first works that use GANs for keyphrase generation. We present a detailed analysis of our observations and expect that these findings would help other researchers to further study the use of GANs for the task of keyphrase generation.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.645,38939201 +main.2814,Evaluating and Characterizing Human Rationales,Samuel Carton|Anirudh Rathore|Chenhao Tan,"Two main approaches for evaluating the quality of machine-generated rationales are: 1) using human rationales as a gold standard; and 2) automated metrics based on how rationales affect model behavior. An open question, however, is how human rationales fare with these automatic metrics. Analyzing a variety of datasets and models, we find that human rationales do not necessarily perform well on these metrics. To unpack this finding, we propose improved metrics to account for model-dependent baseline performance. We then propose two methods to further characterize rationale quality, one based on model retraining and one on using ``fidelity curves'' to reveal properties such as irrelevance and redundancy. 
Our work leads to actionable suggestions for evaluating and characterizing rationales.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.747,38939202 +main.2818,Improving Bilingual Lexicon Induction for Low Frequency Words,Jiaji Huang|Xingyu Cai|Kenneth Church,"This paper designs a Monolingual Lexicon Induction task and observes that two factors accompany the degraded accuracy of bilingual lexicon induction for rare words. First, a diminishing margin between similarities in low frequency regime, and secondly, exacerbated hubness at low frequency. Based on the observation, we further propose two methods to address these two factors, respectively. The larger issue is hubness. Addressing that improves induction accuracy significantly, especially for low-frequency words.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.100,38939203 +main.2825,SLEDGE: A Simple Yet Effective Zero-Shot Baseline for Coronavirus Scientific Knowledge Search,Sean MacAvaney|Arman Cohan|Nazli Goharian,"With worldwide concerns surrounding the Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2), there is a rapidly growing body of scientific literature on the virus. Clinicians, researchers, and policy-makers need to be able to search these articles effectively. In this work, we present a zero-shot ranking algorithm that adapts to COVID-related scientific literature. Our approach filters training data from another collection down to medical-related queries, uses a neural re-ranking model pre-trained on scientific text (SciBERT), and filters the target document collection. This approach ranks top among zero-shot methods on the TREC COVID Round 1 leaderboard, and exhibits a P@5 of 0.80 and an nDCG@10 of 0.68 when evaluated on both Round 1 and 2 judgments. Despite not relying on TREC-COVID data, our method outperforms models that do. As one of the first search methods to thoroughly evaluate COVID-19 search, we hope that this serves as a strong baseline and helps in the global crisis.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.341,38939204 +main.2834,"Ensemble Distillation for Structured Prediction: Calibrated, Accurate, Fast—Choose Three",Steven Reich|David Mueller|Nicholas Andrews,"Modern neural networks do not always produce well-calibrated predictions, even when trained with a proper scoring function such as cross-entropy. In classification settings, simple methods such as isotonic regression or temperature scaling may be used in conjunction with a held-out dataset to calibrate model outputs. However, extending these methods to structured prediction is not always straightforward or effective; furthermore, a held-out calibration set may not always be available. In this paper, we study \emph{ensemble distillation} as a general framework for producing well-calibrated structured prediction models while avoiding the prohibitive inference-time cost of ensembles. We validate this framework on two tasks: named-entity recognition and machine translation. 
We find that, across both tasks, ensemble distillation produces models which retain much of, and occasionally improve upon, the performance and calibration benefits of ensembles, while only requiring a single model during test-time.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.450,38939205 +main.2838,Efficient Meta Lifelong-Learning with Limited Memory,Zirui Wang|Sanket Vaibhav Mehta|Barnabas Poczos|Jaime Carbonell,"Current natural language processing models work well on a single task, yet they often fail to continuously learn new tasks without forgetting previous ones as they are re-trained throughout their lifetime, a challenge known as lifelong learning. State-of-the-art lifelong language learning methods store past examples in episodic memory and replay them at both training and inference time. However, as we show later in our experiments, there are three significant impediments: (1) needing unrealistically large memory module to achieve good performance, (2) suffering from negative transfer, (3) requiring multiple local adaptation steps for each test example that significantly slows down the inference speed. In this paper, we identify three common principles of lifelong learning methods and propose an efficient meta-lifelong framework that combines them in a synergistic fashion. To achieve sample efficiency, our method trains the model in a manner that it learns a better initialization for local adaptation. Extensive experiments on text classification and question answering benchmarks demonstrate the effectiveness of our framework by achieving state-of-the-art performance using merely 1% memory size and narrowing the gap with multi-task learning. We further show that our method alleviates both catastrophic forgetting and negative transfer at the same time.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.39,38939206 +main.2839,What Is More Likely to Happen Next? Video-and-Language Future Event Prediction,Jie Lei|Licheng Yu|Tamara Berg|Mohit Bansal,"Given a video with aligned dialogue, people can often infer what is more likely to happen next. Making such predictions requires not only a deep understanding of the rich dynamics underlying the video and dialogue, but also a significant amount of commonsense knowledge. In this work, we explore whether AI models are able to learn to make such multimodal commonsense next-event predictions. To support research in this direction, we collect a new dataset, named Video-and-Language Event Prediction (VLEP), with 28,726 future event prediction examples (along with their rationales) from 10,234 diverse TV Show and YouTube Lifestyle Vlog video clips. In order to promote the collection of non-trivial challenging examples, we employ an adversarial human-and-model-in-the-loop data collection procedure. We also present a strong baseline incorporating information from video, dialogue, and commonsense knowledge. 
Experiments show that each type of information is useful for this challenging task, and that compared to the high human performance on VLEP, our model provides a good starting point but leaves large room for future work.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.706,38939207 +main.284,"X-LXMERT: Paint, Caption and Answer Questions with Multi-Modal Transformers",Jaemin Cho|jiasen lu|Dustin Schwenk|Hannaneh Hajishirzi|Aniruddha Kembhavi,"Mirroring the success of masked language models, vision-and-language counterparts like VILBERT, LXMERT and UNITER have achieved state of the art performance on a variety of multimodal discriminative tasks like visual question answering and visual grounding. Recent work has also successfully adapted such models towards the generative task of image captioning. This begs the question: Can these models go the other way and generate images from pieces of text? Our analysis of a popular representative from this model family – LXMERT – finds that it is unable to generate rich and semantically meaningful imagery with its current training setup. We introduce X-LXMERT, an extension to LXMERT with training refinements including: discretizing visual representations, using uniform masking with a large range of masking ratios and aligning the right pre-training datasets to the right objectives which enables it to paint. X-LXMERT’s image generation capabilities rival state of the art generative models while its question answering and captioning abilities remain comparable to LXMERT. Finally, we demonstrate the generality of these training refinements by adding image generation capabilities into UNITER to produce X-UNITER.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.707,38938675 +main.2847,IGT2P: From Interlinear Glossed Texts to Paradigms,Sarah Moeller|Ling Liu|Changbing Yang|Katharina Kann|Mans Hulden,"An intermediate step in the linguistic analysis of an under-documented language is to find and organize inflected forms that are attested in natural speech. From this data, linguists generate unseen inflected word forms in order to test hypotheses about the language's inflectional patterns and to complete inflectional paradigm tables. To get the data, linguists spend many hours manually creating interlinear glossed texts (IGTs). We introduce a new task that speeds this process and automatically generates new morphological resources for natural language processing systems: IGT-to-paradigms (IGT2P). IGT2P generates entire morphological paradigms from IGT input. We show that existing morphological reinflection models can solve the task with 21% to 64% accuracy, depending on the language. We further find that (i) having a language expert spend only a few hours cleaning the noisy IGT data improves performance by as much as 21 percentage points, and (ii) POS tags, which are generally considered a necessary part of NLP morphological reinflection input, have no effect on the accuracy of the models considered here.",,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.emnlp-main.424,38939208 +main.2849,TESA: A Task in Entity Semantic Aggregation for Abstractive Summarization,Clément Jumel|Annie Louis|Jackie Chi Kit Cheung,"Human-written texts contain frequent generalizations and semantic aggregation of content. 
In a document, they may refer to a pair of named entities such as 'London' and 'Paris' with different expressions: ""the major cities'', ""the capital cities'' and ""two European cities''. Yet generation, especially, abstractive summarization systems have so far focused heavily on paraphrasing and simplifying the source content, to the exclusion of such semantic abstraction capabilities. In this paper, we present a new dataset and task aimed at the semantic aggregation of entities. TESA contains a dataset of 5.3K crowd-sourced entity aggregations of Person, Organization, and Location named entities. The aggregations are document-appropriate, meaning that they are produced by annotators to match the situational context of a given news article from the New York Times. We then build baseline models for generating aggregations given a tuple of entities and document context. We finetune on TESA an encoder-decoder language model and compare it with simpler classification methods based on linguistically informed features. Our quantitative and qualitative evaluations show reasonable performance in making a choice from a given list of expressions, but free-form expressions are understandably harder to generate and evaluate.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.646,38939209 +main.2851,Structural Supervision Improves Few-Shot Learning and Syntactic Generalization in Neural Language Models,Ethan Wilcox|Peng Qian|Richard Futrell|Ryosuke Kohita|Roger Levy|Miguel Ballesteros,"Humans can learn structural properties about a word from minimal experience, and deploy their learned syntactic representations uniformly in different grammatical contexts. We assess the ability of modern neural language models to reproduce this behavior in English and evaluate the effect of structural supervision on learning outcomes. First, we assess few-shot learning capabilities by developing controlled experiments that probe models' syntactic nominal number and verbal argument structure generalizations for tokens seen as few as two times during training. Second, we assess invariance properties of learned representation: the ability of a model to transfer syntactic generalizations from a base context (e.g., a simple declarative active-voice sentence) to a transformed context (e.g., an interrogative sentence). We test four models trained on the same dataset: an n-gram baseline, an LSTM, and two LSTM-variants trained with explicit structural supervision. We find that in most cases, the neural models are able to induce the proper syntactic generalizations after minimal exposure, often from just two examples during training, and that the two structurally supervised models generalize more accurately than the LSTM model. All neural models are able to leverage information learned in base contexts to drive expectations in transformed contexts, indicating that they have learned some invariance properties of syntax.",,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Long,https://www.aclweb.org/anthology/2020.emnlp-main.375,38939210 +main.2853,Hero: Hierarchical Encoder for Video+Language Omni-representation Pre-training,Linjie Li|Yen-Chun Chen|Yu Cheng|Zhe Gan|Licheng Yu|Jingjing Liu,"We present HERO, a novel framework for large-scale video+language omni-representation learning. 
HERO encodes multimodal inputs in a hierarchical structure, where local context of a video frame is captured by a Cross-modal Transformer via multimodal fusion, and global video context is captured by a Temporal Transformer. In addition to standard Masked Language Modeling (MLM) and Masked Frame Modeling (MFM) objectives, we design two new pre-training tasks: (i) Video-Subtitle Matching (VSM), where the model predicts both global and local temporal alignment; and (ii) Frame Order Modeling (FOM), where the model predicts the right order of shuffled video frames. HERO is jointly trained on HowTo100M and large-scale TV datasets to gain deep understanding of complex social dynamics with multi-character interactions. Comprehensive experiments demonstrate that HERO achieves new state of the art on multiple benchmarks over Text-based Video/Video-moment Retrieval, Video Question Answering (QA), Video-and-language Inference and Video Captioning tasks across different domains. We also introduce two new challenging benchmarks How2QA and How2R for Video QA and Retrieval, collected from diverse video content over multimodalities.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.161,38939211 +main.286,Multi-document Summarization with Maximal Marginal Relevance-guided Reinforcement Learning,Yuning Mao|Yanru Qu|Yiqing Xie|Xiang Ren|Jiawei Han,"While neural sequence learning methods have made significant progress in single-document summarization (SDS), they produce unsatisfactory results on multi-document summarization (MDS). We observe two major challenges when adapting SDS advances to MDS: (1) MDS involves larger search space and yet more limited training data, setting obstacles for neural methods to learn adequate representations; (2) MDS needs to resolve higher information redundancy among the source documents, which SDS methods are less effective to handle. To close the gap, we present RL-MMR, Maximal Margin Relevance-guided Reinforcement Learning for MDS, which unifies advanced neural SDS methods and statistical measures used in classical MDS. RL-MMR casts MMR guidance on fewer promising candidates, which restrains the search space and thus leads to better representation learning. Additionally, the explicit redundancy measure in MMR helps the neural representation of the summary to better capture redundancy. Extensive experiments demonstrate that RL-MMR achieves state-of-the-art performance on benchmark MDS datasets. In particular, we show the benefits of incorporating MMR into end-to-end learning when adapting SDS to MDS in terms of both learning effectiveness and efficiency.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.136,38938676 +main.2864,MOCHA: A Dataset for Training and Evaluating Generative Reading Comprehension Metrics,Anthony Chen|Gabriel Stanovsky|Sameer Singh|Matt Gardner,"Posing reading comprehension as a generation problem provides a great deal of flexibility, allowing for open-ended questions with few restrictions on possible answers. However, progress is impeded by existing generation metrics, which rely on token overlap and are agnostic to the nuances of reading comprehension. To address this, we introduce a benchmark for training and evaluating generative reading comprehension metrics: MOdeling Correctness with Human Annotations. 
MOCHA contains 40K human judgement scores on model outputs from 6 diverse question answering datasets and an additional set of minimal pairs for evaluation. Using MOCHA, we train a Learned Evaluation metric for Reading Comprehension, LERC, to mimic human judgement scores. LERC outperforms baseline metrics by 10 to 36 absolute Pearson points on held-out annotations. When we evaluate robustness on minimal pairs, LERC achieves 80% accuracy, outperforming baselines by 14 to 26 absolute percentage points while leaving significant room for improvement. MOCHA presents a challenging problem for developing accurate and robust generative reading comprehension metrics.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.528,38939212 +main.2865,Learning VAE-LDA Models with Rounded Reparameterization Trick,Runzhi Tian|Yongyi Mao|Richong Zhang,"The introduction of VAE provides an efficient framework for the learning of generative models, including generative topic models. However, when the topic model is a Latent Dirichlet Allocation (LDA) model, a central technique of VAE, the reparameterization trick, fails to be applicable. This is because no reparameterization form of Dirichlet distributions is known to date that allows the use of the reparameterization trick. In this work, we propose a new method, which we call Rounded Reparameterization Trick (RRT), to reparameterize Dirichlet distributions for the learning of VAE-LDA models. This method, when applied to a VAE-LDA model, is shown experimentally to outperform the existing neural topic models on several benchmark datasets and on a synthetic dataset.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.101,38939213 +main.287,Learning Structured Representations of Entity Names Using Active Learning and Weak Supervision,Kun Qian|Poornima Chozhiyath Raman|Yunyao Li|Lucian Popa,"Structured representations of entity names are useful for many entity-related tasks such as entity normalization and variant generation. Learning the implicit structured representations of entity names without context and external knowledge is particularly challenging. In this paper, we present a novel learning framework that combines active learning and weak supervision to solve this problem. Our experimental evaluation shows that this framework enables the learning of high-quality models from merely a dozen or so labeled examples.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.517,38938677 +main.2873,Disentangle-based Continual Graph Representation Learning,Xiaoyu Kou|Yankai Lin|Shaobo Liu|Peng Li|Jie Zhou|Yan Zhang,"Graph embedding (GE) methods embed nodes (and/or edges) in a graph into a low-dimensional semantic space, and have shown their effectiveness in modeling multi-relational data. However, existing GE models are not practical in real-world applications since they overlook the streaming nature of incoming data. To address this issue, we study the problem of continual graph representation learning which aims to continually train a GE model on new data to learn incessantly emerging multi-relational data while avoiding catastrophically forgetting old learned knowledge. Moreover, we propose a disentangle-based continual graph representation learning (DiCGRL) framework inspired by the human’s ability to learn procedural knowledge. The experimental results show that DiCGRL could effectively alleviate the catastrophic forgetting problem and outperform state-of-the-art continual learning models. 
The code and datasets are released on https://github.com/KXY-PUBLIC/DiCGRL.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.237,38939214 +main.2877,Knowledge Graph Alignment with Entity-Pair Embedding,Zhichun Wang|Jinjian Yang|Xiaoju Ye,"Knowledge Graph (KG) alignment is to match entities in different KGs, which is important to knowledge fusion and integration. Recently, a number of embedding-based approaches for KG alignment have been proposed and achieved promising results. These approaches first embed entities in low-dimensional vector spaces, and then obtain entity alignments by computations on their vector representations. Although continuous improvements have been achieved by recent work, the performances of existing approaches are still not satisfactory. In this work, we present a new approach that directly learns embeddings of entity-pairs for KG alignment. Our approach first generates a pair-wise connectivity graph (PCG) of two KGs, whose nodes are entity-pairs and edges correspond to relation-pairs; it then learns node (entity-pair) embeddings of the PCG, which are used to predict equivalent relations of entities. To get desirable embeddings, a convolutional neural network is used to generate similarity features of entity-pairs from their attributes; and a graph neural network is employed to propagate the similarity features and get the final embeddings of entity-pairs. Experiments on five real-world datasets show that our approach can achieve the state-of-the-art KG alignment results.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.130,38939215 +main.2886,LOGAN: Local Group Bias Detection by Clustering,Jieyu Zhao|Kai-Wei Chang,"Machine learning techniques have been widely used in natural language processing (NLP). However, as revealed by many recent studies, machine learning models often inherit and amplify the societal biases in data. Various metrics have been proposed to quantify biases in model predictions. In particular, several of them evaluate disparity in model performance between protected groups and advantaged groups in the test corpus. However, we argue that evaluating bias at the corpus level is not enough for understanding how biases are embedded in a model. In fact, a model with similar aggregated performance between different groups on the entire data may behave differently on instances in a local region. To analyze and detect such local bias, we propose LOGAN, a new bias detection technique based on clustering. Experiments on toxicity classification and object classification tasks show that LOGAN identifies bias in a local region and allows us to better analyze the biases in model predictions.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.155,38939216 +main.2890,Multitask Learning for Cross-Lingual Transfer of Broad-coverage Semantic Dependencies,Maryam Aminian|Mohammad Sadegh Rasooli|Mona Diab,We describe a method for developing broad-coverage semantic dependency parsers for languages for which no semantically annotated resource is available. We leverage a multitask learning framework coupled with annotation projection. We use syntactic parsing as the auxiliary task in our multitask setup. 
Our annotation projection experiments from English to Czech show that our multitask setup yields a 3.1% (4.2%) improvement in labeled F1-score on the in-domain (out-of-domain) test set compared to a single-task baseline.,,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.663,38939217 +main.2891,Improving Word Sense Disambiguation with Translations,Yixing Luan|Bradley Hauer|Lili Mou|Grzegorz Kondrak,"It has been conjectured that multilingual information can help monolingual word sense disambiguation (WSD). However, existing WSD systems rarely consider multilingual information, and no effective method has been proposed for improving WSD by generating translations. In this paper, we present a novel approach that improves the performance of a base WSD system using machine translation. Since our approach is language independent, we perform WSD experiments on several languages. The results demonstrate that our methods can consistently improve the performance of WSD systems, and obtain state-of-the-art results in both English and multilingual WSD. To facilitate the use of lexical translation information, we also propose BABALIGN, a precise bitext alignment algorithm which is guided by multilingual lexical correspondences from BabelNet.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.332,38939218 +main.2893,Learning Helpful Inductive Biases from Self-Supervised Pretraining,Alex Warstadt|Yian Zhang|Xiaocheng Li|Haokun Liu|Samuel R. Bowman,"One reason pretraining on self-supervised linguistic tasks is effective is that it teaches models features that are helpful for language understanding. However, we want pretrained models to learn not only to represent linguistic features, but also to use those features preferentially during fine-tuning. With this goal in mind, we introduce a new English-language diagnostic set called MSGS (the Mixed Signals Generalization Set), which consists of 20 ambiguous binary classification tasks that we use to test whether a pretrained model prefers linguistic or surface generalizations during finetuning. We pretrain RoBERTa from scratch on quantities of data ranging from 1M to 1B words and compare their performance on MSGS to the publicly available RoBERTa_BASE. We find that models can learn to represent linguistic features with little pretraining data, but require far more data to learn to prefer linguistic generalizations over surface ones. Eventually, with about 30B words of pretraining data, RoBERTa_BASE does consistently demonstrate a linguistic bias with some regularity. We conclude that while self-supervised pretraining is an effective way to learn helpful inductive biases, there is likely room to improve the rate at which models learn which features matter.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.16,38939219 +main.2894,Measuring Information Propagation in Literary Social Networks,Matthew Sims|David Bamman,"We present the task of modeling information propagation in literature, in which we seek to identify pieces of information passing from character A to character B to character C, only given a description of their activity in text. We describe a new pipeline for measuring information propagation in this domain and publish a new dataset for speaker attribution, enabling the evaluation of an important component of this pipeline on a wider range of literary texts than previously studied. 
Using this pipeline, we analyze the dynamics of information propagation in over 5,000 works of fiction, finding that information flows through characters that fill structural holes connecting different communities, and that characters who are women are depicted as filling this role much more frequently than characters who are men.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.47,38939220 +main.2895,Adversarial Attack and Defense of Structured Prediction Models,Wenjuan Han|Liwen Zhang|Yong Jiang|Kewei Tu,"Building an effective adversarial attacker and elaborating on countermeasures for adversarial attacks for natural language processing (NLP) have attracted a lot of research in recent years. However, most of the existing approaches focus on classification problems. In this paper, we investigate attacks and defenses for structured prediction tasks in NLP. Besides the difficulty of perturbing discrete words and the sentence fluency problem faced by attackers in any NLP tasks, there is a specific challenge to attackers of structured prediction models: the structured output of structured prediction models is sensitive to small perturbations in the input. To address these problems, we propose a novel and unified framework that learns to attack a structured prediction model using a sequence-to-sequence model with feedbacks from multiple reference models of the same structured prediction task. Based on the proposed attack, we further reinforce the victim model with adversarial training, making its prediction more robust and accurate. We evaluate the proposed framework in dependency parsing and part-of-speech tagging. Automatic and human evaluations show that our proposed framework succeeds in both attacking state-of-the-art structured prediction models and boosting them with adversarial training.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.182,38939221 +main.2900,A Spectral Method for Unsupervised Multi-Document Summarization,Kexiang Wang|Baobao Chang|Zhifang Sui,"Multi-document summarization (MDS) aims at producing a good-quality summary for several related documents. In this paper, we propose a spectral-based hypothesis, which states that the goodness of summary candidate is closely linked to its so-called spectral impact. Here spectral impact considers the perturbation to the dominant eigenvalue of affinity matrix when dropping the summary candidate from the document cluster. The hypothesis is validated by three theoretical perspectives: semantic scaling, propagation dynamics and matrix perturbation. According to the hypothesis, we formulate the MDS task as the combinatorial optimization of spectral impact and propose an accelerated greedy solution based on a surrogate of spectral impact. The evaluation results on various datasets demonstrate: (1) The performance of the summary candidate is positively correlated with its spectral impact, which accords with our hypothesis; (2) Our spectral-based method has a competitive result as compared to state-of-the-art MDS systems.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.32,38939222 +main.2914,T3: Tree-Autoencoder Regularized Adversarial Text Generation for Targeted Attack,Boxin Wang|Hengzhi Pei|Boyuan Pan|Qian Chen|Shuohang Wang|Bo Li,"Adversarial attacks against natural language processing systems, which perform seemingly innocuous modifications to inputs, can induce arbitrary mistakes to the target models. 
Though they have raised great concerns, such adversarial attacks can be leveraged to estimate the robustness of NLP models. Compared with the adversarial example generation in continuous data domain (e.g., image), generating adversarial text that preserves the original meaning is challenging since the text space is discrete and non-differentiable. To handle these challenges, we propose a target-controllable adversarial attack framework T3, which is applicable to a range of NLP tasks. In particular, we propose a tree-based autoencoder to embed the discrete text data into a continuous representation space, upon which we optimize the adversarial perturbation. A novel tree-based decoder is then applied to regularize the syntactic correctness of the generated text and manipulate it on either sentence (T3(Sent)) or word (T3(Word)) level. We consider two of the most representative NLP tasks: sentiment analysis and question answering (QA). Extensive experimental results and human studies show that T3 generated adversarial texts can successfully manipulate the NLP models to output the targeted incorrect answer without misleading the human. Moreover, we show that the generated adversarial texts have high transferability which enables the black-box attacks in practice. Our work sheds light on an effective and general way to examine the robustness of NLP models. Our code is publicly available at https://github.com/AI-secure/T3/.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.495,38939223 +main.2915,Effectively Pretraining a Speech Translation Decoder with Machine Translation Data,Ashkan Alinejad|Anoop Sarkar,"Directly translating from speech to text using an end-to-end approach is still challenging for many language pairs due to insufficient data. Although pretraining the encoder parameters using the Automatic Speech Recognition (ASR) task improves the results in low resource settings, attempting to use pretrained parameters from the Neural Machine Translation (NMT) task has been largely unsuccessful in previous works. In this paper, we will show that by using an adversarial regularizer, we can bring the encoder representations of the ASR and NMT tasks closer even though they are in different modalities, and how this helps us effectively use a pretrained NMT decoder for speech translation.",,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.644,38939224 +main.2916,Controllable Meaning Representation to Text Generation: Linearization and Data Augmentation Strategies,Chris Kedzie|Kathleen McKeown,"We study the degree to which neural sequence-to-sequence models exhibit fine-grained controllability when performing natural language generation from a meaning representation. Using two task-oriented dialogue generation benchmarks, we systematically compare the effect of four input linearization strategies on controllability and faithfulness. Additionally, we evaluate how a phrase-based data augmentation method can improve performance. We find that properly aligning input sequences during training leads to highly controllable generation, both when training from scratch or when fine-tuning a larger pre-trained model. Data augmentation further improves control on difficult, randomly generated utterance plans.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.419,38939225 +main.2920,Do “Undocumented Immigrants” == “Illegal Aliens”? 
Differentiating Denotation and Connotation in Vector Space,Albert Webson|Zhizhong Chen|Carsten Eickhoff|Ellie Pavlick,"In politics, neologisms are frequently invented for partisan objectives. For example, “undocumented workers” and “illegal aliens” refer to the same group of people (i.e., they have the same denotation), but they carry clearly different connotations. Examples like these have traditionally posed a challenge to reference-based semantic theories and led to increasing acceptance of alternative theories (e.g., Two-Factor Semantics) among philosophers and cognitive scientists. In NLP, however, popular pretrained models encode both denotation and connotation as one entangled representation. In this study, we propose an adversarial neural network that decomposes a pretrained representation into independent denotation and connotation representations. For intrinsic interpretability, we show that words with the same denotation but different connotations (e.g., “immigrants” vs. “aliens”, “estate tax” vs. “death tax”) move closer to each other in denotation space while moving further apart in connotation space. For extrinsic application, we train an information retrieval system with our disentangled representations and show that the denotation vectors improve the viewpoint diversity of document rankings.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.335,38939226 +main.2922,Annotating Temporal Dependency Graphs via Crowdsourcing,Jiarui Yao|Haoling Qiu|Bonan Min|Nianwen Xue,"We present the construction of a corpus of 500 Wikinews articles annotated with temporal dependency graphs (TDGs) that can be used to train systems to understand temporal relations in text. We argue that temporal dependency graphs, built on previous research on narrative times and temporal anaphora, provide a representation scheme that achieves a good trade-off between completeness and practicality in temporal annotation. We also provide a crowdsourcing strategy to annotate TDGs, and demonstrate the feasibility of this approach with an evaluation of the quality of the annotation, and the utility of the resulting data set by training a machine learning model on this data set. The data set is publicly available.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.432,38939227 +main.2927,A Visually-grounded First-person Dialogue Dataset with Verbal and Non-verbal Responses,Hisashi Kamezawa|Noriki Nishida|Nobuyuki Shimizu|Takashi Miyazaki|Hideki Nakayama,"In real-world dialogue, first-person visual information about where the other speakers are and what they are paying attention to is crucial to understand their intentions. Non-verbal responses also play an important role in social interactions. In this paper, we propose a visually-grounded first-person dialogue (VFD) dataset with verbal and non-verbal responses. The VFD dataset provides manually annotated (1) first-person images of agents, (2) utterances of human speakers, (3) eye-gaze locations of the speakers, and (4) the agents' verbal and non-verbal responses. We present experimental results obtained using the proposed VFD dataset and recent neural network models (e.g., BERT, ResNet). The results demonstrate that first-person vision helps neural network models correctly understand human intentions, and the production of non-verbal responses is a challenging task like that of verbal responses.
Our dataset is publicly available.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.267,38939228 +main.2931,Improving Neural Topic Models Using Knowledge Distillation,Alexander Miserlis Hoyle|Pranav Goel|Philip Resnik,"Topic models are often used to identify human-interpretable topics to help make sense of large document collections. We use knowledge distillation to combine the best attributes of probabilistic topic models and pretrained transformers. Our modular method can be straightforwardly applied with any neural topic model to improve topic quality, which we demonstrate using two models having disparate architectures, obtaining state-of-the-art topic coherence. We show that our adaptable framework not only improves performance in the aggregate over all estimated topics, as is commonly reported, but also in head-to-head comparisons of aligned topics.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.137,38939229 +main.2938,FedED: Federated Learning via Ensemble Distillation for Medical Relation Extraction,Dianbo Sui|Yubo Chen|Jun Zhao|Yantao Jia|Yuantao Xie|Weijian Sun,"Unlike other domains, medical texts are inevitably accompanied by private information, so sharing or copying these texts is strictly restricted. However, training a medical relation extraction model requires collecting these privacy-sensitive texts and storing them on one machine, which comes in conflict with privacy protection. In this paper, we propose a privacy-preserving medical relation extraction model based on federated learning, which enables training a central model with no single piece of private local data being shared or exchanged. Though federated learning has distinct advantages in privacy protection, it suffers from the communication bottleneck, which is mainly caused by the need to upload cumbersome local parameters. To overcome this bottleneck, we leverage a strategy based on knowledge distillation. Such a strategy uses the uploaded predictions of ensemble local models to train the central model without requiring uploading local parameters. Experiments on three publicly available medical relation extraction datasets demonstrate the effectiveness of our method.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.165,38939230 +main.2943,A Simple Yet Strong Pipeline for HotpotQA,Dirk Groeneveld|Tushar Khot|Mausam|Ashish Sabharwal,"State-of-the-art models for multi-hop question answering typically augment large-scale language models like BERT with additional, intuitively useful capabilities such as named entity recognition, graph-based reasoning, and question decomposition. However, does their strong performance on popular multi-hop datasets really justify this added design complexity? Our results suggest that the answer may be no, because even our simple pipeline based on BERT, named Quark, performs surprisingly well. Specifically, on HotpotQA, Quark outperforms these models on both question answering and support identification (and achieves performance very close to a RoBERTa model). Our pipeline has three steps: 1) use BERT to identify potentially relevant sentences independently of each other; 2) feed the set of selected sentences as context into a standard BERT span prediction model to choose an answer; and 3) use the sentence selection model, now with the chosen answer, to produce supporting sentences.
The strong performance of Quark resurfaces the importance of carefully exploring simple model designs before using popular benchmarks to justify the value of complex techniques.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.711,38939231 +main.2947,Entity Enhanced BERT Pre-training for Chinese NER,Chen Jia|Yuefeng Shi|Qinrong Yang|Yue Zhang,"Character-level BERT pre-trained in Chinese suffers a limitation of lacking lexicon information, which shows effectiveness for Chinese NER. To integrate the lexicon into pre-trained LMs for Chinese NER, we investigate a semi-supervised entity enhanced BERT pre-training method. In particular, we first extract an entity lexicon from the relevant raw text using a new-word discovery method. We then integrate the entity information into BERT using Char-Entity-Transformer, which augments the self-attention using a combination of character and entity representations. In addition, an entity classification task helps inject the entity information into model parameters in pre-training. The pre-trained models are used for NER fine-tuning. Experiments on a news dataset and two datasets annotated by ourselves for NER in long-text show that our method is highly effective and achieves the best results.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.518,38939232 +main.2958,Human-in-the-loop Debugging Deep Text Classifiers,Piyawat Lertvittayakumjorn|Lucia Specia|Francesca Toni,"Since obtaining a perfect training dataset (i.e., a dataset which is considerably large, unbiased, and well-representative of unseen cases) is hardly possible, many real-world text classifiers are trained on the available, yet imperfect, datasets. These classifiers are thus likely to have undesirable properties. For instance, they may have biases against some sub-populations or may not work effectively in the wild due to overfitting. In this paper, we propose FIND -- a framework which enables humans to debug deep learning text classifiers by disabling irrelevant hidden features. Experiments show that by using FIND, humans can improve CNN text classifiers which were trained under different types of imperfect datasets (including datasets with biases and datasets with dissimilar train-test distributions).",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.24,38939233 +main.2959,Calibrated Fine-Tuning for Pre-trained Language Models via Manifold Smoothing,Lingkai Kong|Haoming Jiang|Yuchen Zhuang|Jie Lyu|Tuo Zhao|Chao Zhang,"Fine-tuned pre-trained language models can suffer from severe miscalibration for both in-distribution and out-of-distribution (OOD) data due to over-parameterization. To mitigate this issue, we propose a regularized fine-tuning method. Our method introduces two types of regularization for better calibration: (1) On-manifold regularization, which generates pseudo on-manifold samples through interpolation within the data manifold. Augmented training with these pseudo samples imposes a smoothness regularization to improve in-distribution calibration. (2) Off-manifold regularization, which encourages the model to output uniform distributions for pseudo off-manifold samples to address the over-confidence issue for OOD data. Our experiments demonstrate that the proposed method outperforms existing calibration methods for text classification in terms of expectation calibration error, misclassification detection, and OOD detection on six datasets. 
Our code can be found at https://github.com/Lingkai-Kong/Calibrated-BERT-Fine-Tuning.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.102,38939234 +main.2962,Fact or Fiction: Verifying Scientific Claims,David Wadden|Shanchuan Lin|Kyle Lo|Lucy Lu Wang|Madeleine van Zuylen|Arman Cohan|Hannaneh Hajishirzi,"We introduce scientific claim verification, a new task to select abstracts from the research literature containing evidence that SUPPORTS or REFUTES a given scientific claim, and to identify rationales justifying each decision. To study this task, we construct SciFact, a dataset of 1.4K expert-written scientific claims paired with evidence-containing abstracts annotated with labels and rationales. We develop baseline models for SciFact, and demonstrate that simple domain adaptation techniques substantially improve performance compared to models trained on Wikipedia or political news. We show that our system is able to verify claims related to COVID-19 by identifying evidence from the CORD-19 corpus. Our experiments indicate that SciFact will provide a challenging testbed for the development of new systems designed to retrieve and reason over corpora containing specialized domain knowledge. Data and code for this new task are publicly available at https://github.com/allenai/scifact. A leaderboard and COVID-19 fact-checking demo are available at https://scifact.apps.allenai.org.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.609,38939235 +main.2972,Domain Knowledge Empowered Structured Neural Net for End-to-End Event Temporal Relation Extraction,Rujun Han|Yichao Zhou|Nanyun Peng,"Extracting event temporal relations is a critical task for information extraction and plays an important role in natural language understanding. Prior systems leverage deep learning and pre-trained language models to improve the performance of the task. However, these systems often suffer from two shortcomings: 1) when performing maximum a posteriori (MAP) inference based on neural models, previous systems only used structured knowledge that is assumed to be absolutely correct, i.e., hard constraints; 2) biased predictions on dominant temporal relations when training with a limited amount of data. To address these issues, we propose a framework that enhances deep neural network with distributional constraints constructed by probabilistic domain knowledge. We solve the constrained inference problem via Lagrangian Relaxation and apply it to end-to-end event temporal relation extraction tasks. Experimental results show our framework is able to improve the baseline neural network models with strong statistical significance on two widely used datasets in news and clinical domains.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.461,38939236 +main.2973,IIRC: A Dataset of Incomplete Information Reading Comprehension Questions,James Ferguson|Matt Gardner|Hannaneh Hajishirzi|Tushar Khot|Pradeep Dasigi,"Humans often have to read multiple documents to address their information needs. However, most existing reading comprehension (RC) tasks only focus on questions for which the contexts provide all the information required to answer them, thus not evaluating a system's performance at identifying a potential lack of sufficient information and locating sources for that information. 
To fill this gap, we present a dataset, IIRC, with more than 13K questions over paragraphs from English Wikipedia that provide only partial information to answer them, with the missing information occurring in one or more linked documents. The questions were written by crowd workers who did not have access to any of the linked documents, leading to questions that have little lexical overlap with the contexts where the answers appear. This process also gave many questions without answers, and those that require discrete reasoning, increasing the difficulty of the task. We follow recent modeling work on various reading comprehension datasets to construct a baseline model for this dataset, finding that it achieves 31.1% F1 on this task, while estimated human performance is 88.4%. The dataset, code for the baseline system, and a leaderboard can be found at https://allennlp.org/iirc.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.86,38939237 +main.2974,Scalable Zero-shot Entity Linking with Dense Entity Retrieval,Ledell Wu|Fabio Petroni|Martin Josifoski|Sebastian Riedel|Luke Zettlemoyer,"This paper introduces a conceptually simple, scalable, and highly effective BERT-based entity linking model, along with an extensive evaluation of its accuracy-speed trade-off. We present a two-stage zero-shot linking algorithm, where each entity is defined only by a short textual description. The first stage does retrieval in a dense space defined by a bi-encoder that independently embeds the mention context and the entity descriptions. Each candidate is then re-ranked with a cross-encoder, that concatenates the mention and entity text. Experiments demonstrate that this approach is state of the art on recent zero-shot benchmarks (6 point absolute gains) and also on more established non-zero-shot evaluations (e.g. TACKBP-2010), despite its relative simplicity (e.g. no explicit entity embeddings or manually engineered mention tables). We also show that bi-encoder linking is very fast with nearest neighbor search (e.g. linking with 5.9 million candidates in 2 milliseconds), and that much of the accuracy gain from the more expensive cross-encoder can be transferred to the bi-encoder via knowledge distillation. Our code and models are available at https://github.com/facebookresearch/BLINK.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.519,38939238 +main.2975,Counterfactual Off-Policy Training for Neural Dialogue Generation,Qingfu Zhu|Wei-Nan Zhang|Ting Liu|William Yang Wang,"Open-domain dialogue generation suffers from the data insufficiency problem due to the vast size of potential responses. In this paper, we propose to explore potential responses by counterfactual reasoning. Given an observed response, the counterfactual reasoning model automatically infers the outcome of an alternative policy that could have been taken. The resulting counterfactual response synthesized in hindsight is of higher quality than the response synthesized from scratch. Training on the counterfactual responses under the adversarial learning framework helps to explore the high-reward area of the potential response space. 
An empirical study on the DailyDialog dataset shows that our approach significantly outperforms the HRED model as well as the conventional adversarial learning approaches.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.276,38939239 +main.298,A Synset Relation-enhanced Framework with a Try-again Mechanism for Word Sense Disambiguation,Ming Wang|Yinglin Wang,"Contextual embeddings are proved to be overwhelmingly effective to the task of Word Sense Disambiguation (WSD) compared with other sense representation techniques. However, these embeddings fail to embed sense knowledge in semantic networks. In this paper, we propose a Synset Relation-Enhanced Framework (SREF) that leverages sense relations for both sense embedding enhancement and a try-again mechanism that implements WSD again, after obtaining basic sense embeddings from augmented WordNet glosses. Experiments on all-words and lexical sample datasets show that the proposed system achieves new state-of-the-art results, defeating previous knowledge-based systems by at least 5.5 F1 measure. When the system utilizes sense embeddings learned from SemCor, it outperforms all previous supervised systems with only 20% SemCor data.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.504,38938678 +main.2982,Content Planning for Neural Story Generation with Aristotelian Rescoring,Seraphina Goldfarb-Tarrant|Tuhin Chakrabarty|Ralph Weischedel|Nanyun Peng,"Long-form narrative text generated from large language models manages a fluent impersonation of human writing, but only at the local sentence level, and lacks structure or global cohesion. We posit that many of the problems of story generation can be addressed via high-quality content planning, and present a system that focuses on how to learn good plot structures to guide story generation. We utilize a plot-generation language model along with an ensemble of rescoring models that each implement an aspect of good story-writing as detailed in Aristotle's Poetics. We find that stories written with our more principled plot-structure are both more relevant to a given prompt and higher quality than baselines that do not content plan, or that plan in an unprincipled way.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.351,38939240 +main.2989,"Infusing Disease Knowledge into BERT for Health Question Answering, Medical Inference and Disease Name Recognition",Yun He|Ziwei Zhu|Yin Zhang|Qin Chen|James Caverlee,"Knowledge of a disease includes information of various aspects of the disease, such as signs and symptoms, diagnosis and treatment. This disease knowledge is critical for many health-related and biomedical tasks, including consumer health question answering, medical language inference and disease name recognition. While pre-trained language models like BERT have shown success in capturing syntactic, semantic, and world knowledge from text, we find they can be further complemented by specific information like knowledge of symptoms, diagnoses, treatments, and other disease aspects. Hence, we integrate BERT with disease knowledge for improving these important tasks. Specifically, we propose a new disease knowledge infusion training procedure and evaluate it on a suite of BERT models including BERT, BioBERT, SciBERT, ClinicalBERT, BlueBERT, and ALBERT. Experiments over the three tasks show that these models can be enhanced in nearly all cases, demonstrating the viability of disease knowledge infusion. 
For example, the accuracy of BioBERT on consumer health question answering is improved from 68.29% to 72.09%, while new SOTA results are observed on two datasets. We make our data and code freely available.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.372,38939241 +main.2990,PyMT5: Multi-mode Translation of Natural Language and Python Code with Transformers,Colin Clement|Dawn Drain|Jonathan Timcheck|Alexey Svyatkovskiy|Neel Sundaresan,"Simultaneously modeling source code and natural language has many exciting applications in automated software development and understanding. Pursuant to achieving such technology, we introduce PyMT5, the Python method text-to-text transfer transformer, which is trained to translate between all pairs of Python method feature combinations: a single model that can both predict whole methods from natural language documentation strings (docstrings) and summarize code into docstrings of any common style. We present an analysis and modeling effort of a large-scale parallel corpus of 26 million Python methods and 7.7 million method-docstring pairs, demonstrating that for docstring and method generation, PyMT5 outperforms similarly-sized auto-regressive language models (GPT2) which were English pre-trained or randomly initialized. On the CodeSearchNet test set, our best model predicts 92.1% syntactically correct method bodies, achieves a BLEU score of 8.59 for method generation and 16.3 for docstring generation (summarization), and achieves a ROUGE-L F-score of 24.8 for method generation and 36.7 for docstring generation.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.728,38939242 +main.2991,Iterative Language-Based Image Editing via Self-Supervised Counterfactual Reasoning,Tsu-Jui Fu|Xin Wang|Scott Grafton|Miguel Eckstein|William Yang Wang,"Iterative Language-Based Image Editing (ILBIE) tasks follow iterative instructions to edit images step by step. Data scarcity is a significant issue for ILBIE as it is challenging to collect large-scale examples of images before and after instruction-based changes. Yet, humans still accomplish these editing tasks even when presented with an unfamiliar image-instruction pair. Such ability results from counterfactual thinking, the ability to think about possible alternatives to events that have happened already. In this paper, we introduce a Self-Supervised Counterfactual Reasoning (SSCR) framework that incorporates counterfactual thinking to overcome data scarcity. SSCR allows the model to consider out-of-distribution instructions paired with previous images. With the help of cross-task consistency (CTC), we train these counterfactual instructions in a self-supervised scenario. Extensive results show that SSCR improves the correctness of ILBIE in terms of both object identity and position, establishing a new state of the art (SOTA) on two ILBIE datasets (i-CLEVR and CoDraw). Even with only 50% of the training data, SSCR achieves a comparable result to using complete data.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.357,38939243 +main.2994,"MOSEAS: A Multimodal Language Dataset for Spanish, Portuguese, German and French",AmirAli Bagher Zadeh|Yansheng Cao|Simon Hessner|Paul Pu Liang|Soujanya Poria|Louis-Philippe Morency,"Modeling multimodal language is a core research area in natural language processing.
While languages such as English have relatively large multimodal language resources, other widely spoken languages across the globe have few or no large-scale datasets in this area. This disproportionately affects native speakers of languages other than English. As a step towards building more equitable and inclusive multimodal systems, we introduce the first large-scale multimodal language dataset for Spanish, Portuguese, German and French. The proposed dataset, called CMU-MOSEAS (CMU Multimodal Opinion Sentiment, Emotions and Attributes), is the largest of its kind with 40,000 total labelled sentences. It covers a diverse set of topics and speakers, and carries supervision of 20 labels including sentiment (and subjectivity), emotions, and attributes. Our evaluations on a state-of-the-art multimodal model demonstrate that CMU-MOSEAS enables further research for multilingual studies in multimodal language.",,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.141,38939244 +main.2995,Less Is More: Attention Supervision with Counterfactuals for Text Classification,Seungtaek Choi|Haeju Park|Jinyoung Yeo|Seung-won Hwang,"We aim to leverage human and machine intelligence together for attention supervision. Specifically, we show that human annotation cost can be kept reasonably low, while its quality can be enhanced by machine self-supervision. For this goal, we explore the advantage of counterfactual reasoning over associative reasoning typically used in attention supervision. Our empirical results show that this machine-augmented human attention supervision is more effective than existing methods requiring a higher annotation cost, in text classification tasks including sentiment analysis and news categorization.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.543,38939245 +main.2996,Unsupervised Stance Detection for Arguments from Consequences,Jonathan Kobbe|Ioana Hulpuș|Heiner Stuckenschmidt,"Social media platforms have become an essential venue for online deliberation where users discuss arguments, debate, and form opinions. In this paper, we propose an unsupervised method to detect the stance of argumentative claims with respect to a topic. Most related work focuses on topic-specific supervised models that need to be trained for every emergent debate topic. To address this limitation, we propose a topic-independent approach that focuses on a frequently encountered class of arguments, specifically, on arguments from consequences. We do this by extracting the effects that claims refer to, and proposing a means for inferring if the effect is a good or bad consequence. Our experiments provide promising results that are comparable to, and in particular regards even outperform, BERT. Furthermore, we publish a novel dataset of arguments relating to consequences, annotated with Amazon Mechanical Turk.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.4,38939246 +main.2999,Precise Task Formalization Matters in Winograd Schema Evaluations,Haokun Liu|William Huang|Dhara Mungra|Samuel R. Bowman,"Performance on the Winograd Schema Challenge (WSC), a respected English commonsense reasoning benchmark, recently rocketed from chance accuracy to 89% on the SuperGLUE leaderboard, with relatively little corroborating evidence of a correspondingly large improvement in reasoning ability.
We hypothesize that much of this improvement comes from recent changes in task formalization—the combination of input specification, loss function, and reuse of pretrained parameters—by users of the dataset, rather than improvements in the pretrained model’s reasoning ability. We perform an ablation on two Winograd Schema datasets that interpolates between the formalizations used before and after this surge, and find (i) framing the task as multiple choice improves performance dramatically and (ii)several additional techniques, including the reuse of a pretrained language modeling head, can mitigate the model’s extreme sensitivity to hyperparameters. We urge future benchmark creators to impose additional structure to minimize the impact of formalization decisions on reported results.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.664,38939247 +main.30,Short Text Topic Modeling with Topic Distribution Quantization and Negative Sampling Decoder,Xiaobao Wu|Chunping Li|Yan Zhu|Yishu Miao,"Topic models have been prevailing for many years on discovering latent semantics while modeling long documents. However, for short texts they generally suffer from data sparsity because of extremely limited word co-occurrences; thus tend to yield repetitive or trivial topics with low quality. In this paper, to address this issue, we propose a novel neural topic model in the framework of autoencoding with a new topic distribution quantization approach generating peakier distributions that are more appropriate for modeling short texts. Besides the encoding, to tackle this issue in terms of decoding, we further propose a novel negative sampling decoder learning from negative samples to avoid yielding repetitive topics. We observe that our model can highly improve short text topic modeling performance. Through extensive experiments on real-world datasets, we demonstrate our model can outperform both strong traditional and neural baselines under extreme data sparsity scenes, producing high-quality topics.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.138,38938639 +main.300,Adaptive Attentional Network for Few-Shot Knowledge Graph Completion,Jiawei Sheng|Shu Guo|Zhenyu Chen|Juwei Yue|Lihong Wang|Tingwen Liu|Hongbo Xu,"Few-shot Knowledge Graph (KG) completion is a focus of current research, where each task aims at querying unseen facts of a relation given its few-shot reference entity pairs. Recent attempts solve this problem by learning static representations of entities and references, ignoring their dynamic properties, i.e., entities may exhibit diverse roles within task relations, and references may make different contributions to queries. This work proposes an adaptive attentional network for few-shot KG completion by learning adaptive entity and reference representations. Specifically, entities are modeled by an adaptive neighbor encoder to discern their task-oriented roles, while references are modeled by an adaptive query-aware aggregator to differentiate their contributions. Through the attention mechanism, both entities and references can capture their fine-grained semantic meanings, and thus render more expressive representations. This will be more predictive for knowledge acquisition in the few-shot scenario. Evaluation in link prediction on two public datasets shows that our approach achieves new state-of-the-art results with different few-shot sizes. 
The source code is available at https://github.com/JiaweiSheng/FAAN.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.131,38938679 +main.3010,Self-Supervised Text Planning for Paragraph Completion Task,Dongyeop Kang|Eduard Hovy,"Despite the recent success of contextualized language models on various NLP tasks, a language model by itself cannot capture the textual coherence of a long, multi-sentence document (e.g., a paragraph). Humans often make structural decisions about what and how to say before making utterances. Guiding surface realization with such high-level decisions and structuring text in a coherent way is essentially called a planning process. Where can the model learn such high-level coherence? A paragraph itself contains various forms of inductive coherence signals, called self-supervision in this work, such as sentence orders, topical keywords, rhetorical structures, and so on. Motivated by that, this work proposes a new paragraph completion task, PARCOM: predicting masked sentences in a paragraph. However, the task suffers from predicting and selecting appropriate topical content with respect to the given context. To address that, we propose a self-supervised text planner, SSPlanner, that predicts what to say first (content prediction), then guides the pretrained language model (surface realization) using the predicted content. SSPlanner outperforms the baseline generation models on the paragraph completion task in both automatic and human evaluation. We also find that a combination of noun and verb types of keywords is the most effective for content selection. As more content keywords are provided, overall generation quality also increases.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.529,38939248 +main.3012,Unsupervised Reference-Free Summary Quality Evaluation via Contrastive Learning,Hanlu Wu|Tengfei Ma|Lingfei Wu|Tariro Manyumwa|Shouling Ji,"Evaluation of a document summarization system has been a critical factor in the success of the summarization task. Previous approaches, such as ROUGE, mainly consider the informativeness of the assessed summary and require human-generated references for each test summary. In this work, we propose to evaluate summary quality without reference summaries by unsupervised contrastive learning. Specifically, we design a new metric which covers both linguistic qualities and semantic informativeness based on BERT. To learn the metric, for each summary, we construct different types of negative samples with respect to different aspects of the summary qualities, and train our model with a ranking loss. Experiments on Newsroom and CNN/Daily Mail demonstrate that our new evaluation method outperforms other metrics even without reference summaries. Furthermore, we show that our method is general and transferable across datasets.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.294,38939249 +main.3013,MODE-LSTM: A Parameter-efficient Recurrent Network with Multi-Scale for Sentence Classification,Qianli Ma|Zhenxi Lin|Jiangyue Yan|Zipeng Chen|Liuhong Yu,"The central problem of sentence classification is to extract multi-scale n-gram features for understanding the semantic meaning of sentences. Most existing models tackle this problem by stacking CNN and RNN models, which easily leads to feature redundancy and overfitting because of relatively limited datasets.
In this paper, we propose a simple yet effective model called Multi-scale Orthogonal inDependEnt LSTM (MODE-LSTM), which not only has effective parameters and good generalization ability, but also considers multiscale n-gram features. We disentangle the hidden state of the LSTM into several independently updated small hidden states and apply an orthogonal constraint on their recurrent matrices. We then equip this structure with sliding windows of different sizes for extracting multi-scale n-gram features. Extensive experiments demonstrate that our model achieves better or competitive performance against state-of-the-art baselines on eight benchmark datasets. We also combine our model with BERT to further boost the generalization performance.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.544,38939250 +main.3022,Pre-training Entity Relation Encoder with Intra-span and Inter-span Information,Yijun Wang|Changzhi Sun|Yuanbin Wu|Junchi Yan|Peng Gao|Guotong Xie,"In this paper, we integrate span-related information into pre-trained encoder for entity relation extraction task. Instead of using general-purpose sentence encoder (e.g., existing universal pre-trained models), we introduce a span encoder and a span pair encoder to the pre-training network, which makes it easier to import intra-span and inter-span information into the pre-trained model. To learn the encoders, we devise three customized pre-training objectives from different perspectives, which target on tokens, spans, and span pairs. In particular, a span encoder is trained to recover a random shuffling of tokens in a span, and a span pair encoder is trained to predict positive pairs that are from the same sentences and negative pairs that are from different sentences using contrastive loss. Experimental results show that the proposed pre-training method outperforms distantly supervised pre-training, and achieves promising performance on two entity relation extraction benchmark datasets (ACE05, SciERC).",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.132,38939251 +main.3023,Feature Adaptation of Pre-Trained Language Models across Languages and Domains with Robust Self-Training,Hai Ye|Qingyu Tan|Ruidan He|Juntao Li|Hwee Tou Ng|Lidong Bing,"Adapting pre-trained language models (PrLMs) (e.g., BERT) to new domains has gained much attention recently. Instead of fine-tuning PrLMs as done in most previous work, we investigate how to adapt the features of PrLMs to new domains without fine-tuning. We explore unsupervised domain adaptation (UDA) in this paper. With the features from PrLMs, we adapt the models trained with labeled data from the source domain to the unlabeled target domain. Self-training is widely used for UDA, and it predicts pseudo labels on the target domain data for training. However, the predicted pseudo labels inevitably include noise, which will negatively affect training a robust model. To improve the robustness of self-training, in this paper we present class-aware feature self-distillation (CFd) to learn discriminative features from PrLMs, in which PrLM features are self-distilled into a feature adaptation module and the features from the same class are more tightly clustered. We further extend CFd to a cross-language setting, in which language discrepancy is studied. 
Experiments on two monolingual and multilingual Amazon review datasets show that CFd can consistently improve the performance of self-training in cross-domain and cross-language settings.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.599,38939252 +main.3028,Modeling Protagonist Emotions for Emotion-Aware Storytelling,Faeze Brahman|Snigdha Chaturvedi,"Emotions and their evolution play a central role in creating a captivating story. In this paper, we present the first study on modeling the emotional trajectory of the protagonist in neural storytelling. We design methods that generate stories that adhere to given story titles and desired emotion arcs for the protagonist. Our models include Emotion Supervision (EmoSup) and two Emotion-Reinforced (EmoRL) models. The EmoRL models use special rewards designed to regularize the story generation process through reinforcement learning. Our automatic and manual evaluations demonstrate that these models are significantly better at generating stories that follow the desired emotion arcs compared to baseline methods, without sacrificing story quality.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.426,38939253 +main.3032,A Dataset for Tracking Entities in Open Domain Procedural Text,Niket Tandon|Keisuke Sakaguchi|Bhavana Dalvi|Dheeraj Rajagopal|Peter Clark|Michal Guerquin|Kyle Richardson|Eduard Hovy,"We present the first dataset for tracking state changes in procedural text from arbitrary domains by using an unrestricted (open) vocabulary. For example, in a text describing fog removal using potatoes, a car window may transition between being foggy, sticky, opaque, and clear. Previous formulations of this task provide the text and entities involved, and ask how those entities change for just a small, pre-defined set of attributes (e.g., location), limiting their fidelity. Our solution is a new task formulation where given just a procedural text as input, the task is to generate a set of state change tuples (entity, attribute, before-state, after-state) for each step, where the entity, attribute, and state values must be predicted from an open vocabulary. Using crowdsourcing, we create OPENPI, a high-quality (91.5% coverage as judged by humans and completely vetted), and large-scale dataset comprising 29,928 state changes over 4,050 sentences from 810 procedural real-world paragraphs from WikiHow.com. A current state-of-the-art generation model on this task achieves 16.1% F1 based on BLEU metric, leaving enough room for novel model architectures.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.520,38939254 +main.3035,Is Multihop QA in DiRe Condition? Measuring and Reducing Disconnected Reasoning,Harsh Trivedi|Niranjan Balasubramanian|Tushar Khot|Ashish Sabharwal,"Has there been real progress in multi-hop question-answering? Models often exploit dataset artifacts to produce correct answers, without connecting information across multiple supporting facts. This limits our ability to measure true progress and defeats the purpose of building multi-hop QA datasets. We make three contributions towards addressing this. First, we formalize such undesirable behavior as disconnected reasoning across subsets of supporting facts. This allows developing a model-agnostic probe for measuring how much any model can cheat via disconnected reasoning. 
Second, using a notion of \emph{contrastive support sufficiency}, we introduce an automatic transformation of existing datasets that reduces the amount of disconnected reasoning. Third, our experiments suggest that there hasn't been much progress in multi-hop QA in the reading comprehension setting. For a recent large-scale model (XLNet), we show that only 18 points out of its answer F1 score of 72 on HotpotQA are obtained through multifact reasoning, roughly the same as that of a simpler RNN baseline. Our transformation substantially reduces disconnected reasoning (19 points in answer F1). It is complementary to adversarial approaches, yielding further reductions in conjunction.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.712,38939255 +main.3046,Unsupervised Cross-Lingual Part-of-Speech Tagging for Truly Low-Resource Scenarios,Ramy Eskander|Smaranda Muresan|Michael Collins,"We describe a fully unsupervised cross-lingual transfer approach for part-of-speech (POS) tagging under a truly low resource scenario. We assume access to parallel translations between the target language and one or more source languages for which POS taggers are available. We use the Bible as parallel data in our experiments: small size, out-of-domain and covering many diverse languages. Our approach innovates in three ways: 1) a robust approach of selecting training instances via cross-lingual annotation projection that exploits best practices of unsupervised type and token constraints, word-alignment confidence and density of projected POS, 2) a Bi-LSTM architecture that uses contextualized word embeddings, affix embeddings and hierarchical Brown clusters, and 3) an evaluation on 12 diverse languages in terms of language family and morphological typology. In spite of the use of limited and out-of-domain parallel data, our experiments demonstrate significant improvements in accuracy over previous work. In addition, we show that using multi-source information, either via projection or output combination, improves the performance for most target languages.",,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.emnlp-main.391,38939256 +main.3049,Natural Language Processing for Achieving Sustainable Development: The Case of Neural Labelling to Enhance Community Profiling,Costanza Conforti|Stephanie Hirmer|Dai Morgan|Marco Basaldella|Yau Ben Or,"In recent years, there has been an increasing interest in the application of Artificial Intelligence – and especially Machine Learning – to the field of Sustainable Development (SD). However, until now, NLP has not been systematically applied in this context. In this paper, we show the high potential of NLP to enhance project sustainability. In particular, we focus on the case of community profiling in developing countries, where, in contrast to the developed world, a notable data gap exists. Here, NLP could help to address the cost and time barrier of structuring qualitative data that prohibits its widespread use and associated benefits. We propose the new extreme multi-class multi-label Automatic UserPerceived Value classification task. We release Stories2Insights, an expert-annotated dataset of interviews carried out in Uganda, we provide a detailed corpus analysis, and we implement a number of strong neural baselines to address the task. 
Experimental results show that the problem is challenging, and leaves considerable room for future research at the intersection of NLP and SD.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.677,38939257 +main.3051,Inference Strategies for Sequence Generation with Conditional Masking,Julia Kreutzer|George Foster|Colin Cherry,"Conditional masked language model (CMLM) training has proven successful for non-autoregressive and semi-autoregressive sequence generation tasks, such as machine translation. Given a trained CMLM, however, it is not clear what the best inference strategy is. We formulate masked inference as a factorization of conditional probabilities of partial sequences, show that this does not harm performance, and investigate a number of simple heuristics motivated by this perspective. We identify a thresholding strategy that has advantages over the standard ``mask-predict'' algorithm, and provide analyses of its behavior on machine translation tasks.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.465,38939258 +main.3054,PathQG: Neural Question Generation from Facts,Siyuan Wang|Zhongyu Wei|Zhihao Fan|Zengfeng Huang|Weijian Sun|Qi ZHANG|Xuanjing Huang,"Existing research for question generation encodes the input text as a sequence of tokens without explicitly modeling fact information. These models tend to generate irrelevant and uninformative questions. In this paper, we explore to incorporate facts in the text for question generation in a comprehensive way. We present a novel task of question generation given a query path in the knowledge graph constructed from the input text. We divide the task into two steps, namely, query representation learning and query-based question generation. We formulate query representation learning as a sequence labeling problem for identifying the involved facts to form a query and employ an RNN-based generator for question generation. We first train the two modules jointly in an end-to-end fashion, and further enforce the interaction between these two modules in a variational framework. We construct the experimental datasets on top of SQuAD and results show that our model outperforms other state-of-the-art approaches, and the performance margin is larger when target questions are complex. Human evaluation also proves that our model is able to generate relevant and informative questions.\footnote{Our code is available at \url{https://github.com/WangsyGit/PathQG}.}",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.729,38939259 +main.3057,A Method for Building a Commonsense Inference Dataset Based on Basic Events,Kazumasa Omura|Daisuke Kawahara|Sadao Kurohashi,"We present a scalable, low-bias, and low-cost method for building a commonsense inference dataset that combines automatic extraction from a corpus and crowdsourcing. Each problem is a multiple-choice question that asks contingency between basic events. We applied the proposed method to a Japanese corpus and acquired 104k problems. While humans can solve the resulting problems with high accuracy (88.9%), the accuracy of a high-performance transfer learning model is reasonably low (76.0%). We also confirmed through dataset analysis that the resulting dataset contains low bias. 
We released the dataset to facilitate language understanding research.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.192,38939260 +main.3064,Diversified Multiple Instance Learning for Document-Level Multi-Aspect Sentiment Classification,Yunjie Ji|Hao Liu|Bolei He|Xinyan Xiao|Hua Wu|Yanhua Yu,"Neural Document-level Multi-aspect Sentiment Classification (DMSC) usually requires a lot of manual aspect-level sentiment annotations, which is time-consuming and laborious. As document-level sentiment labeled data are widely available from online services, it is valuable to perform DMSC with such free document-level annotations. To this end, we propose a novel Diversified Multiple Instance Learning Network (D-MILN), which is able to achieve aspect-level sentiment classification with only document-level weak supervision. Specifically, we connect aspect-level and document-level sentiment by formulating this problem as multiple instance learning, providing a way to learn an aspect-level classifier from the back propagation of document-level supervision. Two diversified regularizations are further introduced in order to avoid overfitting on document-level signals during training. Diversified textual regularization encourages the classifier to select aspect-relevant snippets, and diversified sentimental regularization prevents the aspect-level sentiments from being overly consistent with document-level sentiment. Experimental results on TripAdvisor and BeerAdvocate datasets show that D-MILN remarkably outperforms recent weakly-supervised baselines, and is also comparable to the supervised method.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.570,38939261 +main.3065,Data Boost: Text Data Augmentation through Reinforcement Learning Guided Conditional Generation,Ruibo Liu|Guangxuan Xu|Chenyan Jia|Weicheng Ma|Lili Wang|Soroush Vosoughi,"Data augmentation is proven to be effective in many NLU tasks, especially for those suffering from data scarcity. In this paper, we present a powerful and easy-to-deploy text augmentation framework, Data Boost, which augments data through reinforcement learning guided conditional generation. We evaluate Data Boost on three diverse text classification tasks under five different classifier architectures. The result shows that Data Boost can boost the performance of classifiers, especially in low-resource data scenarios. For instance, Data Boost improves F1 for the three tasks by 8.7% on average when given only 10% of the whole data for training. We also compare Data Boost with six prior text augmentation methods. Through human evaluations (N=178), we confirm that Data Boost augmentation has comparable quality to the original data with respect to readability and class consistency.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.726,38939262 +main.3068,Help! Need Advice on Identifying Advice,Venkata Subrahmanyan Govindarajan|Benjamin Chen|Rebecca Warholic|Katrin Erk|Junyi Jessy Li,"Humans use language to accomplish a wide variety of tasks - asking for and giving advice being one of them. In online advice forums, advice is mixed in with non-advice, like emotional support, and is sometimes stated explicitly, sometimes implicitly.
Understanding the language of advice would equip systems with a better grasp of language pragmatics; practically, the ability to identify advice would drastically increase the efficiency of advice-seeking online, as well as advice-giving in natural language generation systems. We present a dataset in English from two Reddit advice forums - r/AskParents and r/needadvice - annotated for whether sentences in posts contain advice or not. Our analysis reveals rich linguistic phenomena in advice discourse. We present preliminary models showing that while pre-trained language models are able to capture advice better than rule-based systems, advice identification is challenging, and we identify directions for future research.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.427,38939263 +main.3072,Keeping up Appearances: Computational Modeling of Face Acts in Persuasion Oriented Discussions,Ritam Dutt|Rishabh Joshi|Carolyn Rose,"The notion of face refers to the public self-image of an individual that emerges both from the individual's own actions as well as from the interaction with others. Modeling face and understanding its state changes throughout a conversation is critical to the study of maintenance of basic human needs in and through interaction. Grounded in the politeness theory of Brown and Levinson (1978), we propose a generalized framework for modeling face acts in persuasion conversations, resulting in a reliable coding manual, an annotated corpus, and computational models. The framework reveals insights about differences in face act utilization between asymmetric roles in persuasion conversations. Using computational models, we are able to successfully identify face acts as well as predict a key conversational outcome (e.g. donation success). Finally, we model a latent representation of the conversational state to analyze the impact of predicted face acts on the probability of a positive conversational outcome and observe several correlations that corroborate previous findings.",,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.605,38939264 +main.3074,Structured Pruning of Large Language Models,Ziheng Wang|Jeremy Wohlwend|Tao Lei,"Large language models have recently achieved state of the art performance across a wide variety of natural language tasks. Meanwhile, the size of these models and their latency have significantly increased, which makes their usage costly, and raises an interesting question: do language models need to be large? We study this question through the lens of model compression. We present a generic, structured pruning approach by parameterizing each weight matrix using its low-rank factorization, and adaptively removing rank-1 components during training. On language modeling tasks, our structured approach outperforms other unstructured and block-structured pruning baselines at various compression levels, while achieving significant speedups during both training and inference. We also demonstrate that our method can be applied to pruning adaptive word embeddings in large language models, and to pruning the BERT model on several downstream fine-tuning classification benchmarks.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.496,38939265 +main.3084,TeMP: Temporal Message Passing for Temporal Knowledge Graph Completion,Jiapeng Wu|Meng Cao|Jackie Chi Kit Cheung|William L. 
Hamilton,"Inferring missing facts in temporal knowledge graphs (TKGs) is a fundamental and challenging task. Previous works have approached this problem by augmenting methods for static knowledge graphs to leverage time-dependent representations. However, these methods do not explicitly leverage multi-hop structural information and temporal facts from recent time steps to enhance their predictions. Additionally, prior work does not explicitly address the temporal sparsity and variability of entity distributions in TKGs. We propose the Temporal Message Passing (TeMP) framework to address these challenges by combining graph neural networks, temporal dynamics models, data imputation and frequency-based gating techniques. Experiments on standard TKG tasks show that our approach provides substantial gains compared to the previous state of the art, achieving a 10.7% average relative improvement in Hits@10 across three standard benchmarks. Our analysis also reveals important sources of variability both within and across TKG datasets, and we introduce several simple but strong baselines that outperform the prior state of the art in certain settings.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.462,38939266 +main.3088,Cross-Media Keyphrase Prediction: A Unified Framework with Multi-Modality Multi-Head Attention and Image Wordings,Yue Wang|Jing Li|Michael Lyu|Irwin King,"Social media produces large amounts of contents every day. To help users quickly capture what they need, keyphrase prediction is receiving a growing attention. Nevertheless, most prior efforts focus on text modeling, largely ignoring the rich features embedded in the matching images. In this work, we explore the joint effects of texts and images in predicting the keyphrases for a multimedia post. To better align social media style texts and images, we propose: (1) a novel Multi-Modality MultiHead Attention (M3H-Att) to capture the intricate cross-media interactions; (2) image wordings, in forms of optical characters and image attributes, to bridge the two modalities. Moreover, we design a unified framework to leverage the outputs of keyphrase classification and generation and couple their advantages. Extensive experiments on a large-scale dataset newly collected from Twitter show that our model significantly outperforms the previous state of the art based on traditional attention mechanisms. Further analyses show that our multi-head attention is able to attend information from various aspects and boost classification or generation in diverse scenarios.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.268,38939267 +main.3093,Methods for Numeracy-Preserving Word Embeddings,Dhanasekar Sundararaman|Shijing Si|Vivek Subramanian|Guoyin Wang|Devamanyu Hazarika|Lawrence Carin,"Word embedding models are typically able to capture the semantics of words via the distributional hypothesis, but fail to capture the numerical properties of numbers that appear in the text. This leads to problems with numerical reasoning involving tasks such as question answering. We propose a new methodology to assign and learn embeddings for numbers. Our approach creates Deterministic, Independent-of-Corpus Embeddings (the model is referred to as DICE) for numbers, such that their cosine similarity reflects the actual distance on the number line. 
DICE outperforms a wide range of pre-trained word embedding models across multiple examples of two tasks: (i) evaluating the ability to capture numeration and magnitude; and (ii) to perform list maximum, decoding, and addition. We further explore the utility of these embeddings in downstream tasks, by initializing numbers with our approach for the task of magnitude prediction. We also introduce a regularization approach to learn model-based embeddings of numbers in a contextual setting.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.384,38939268 +main.3101,HENIN: Learning Heterogeneous Neural Interaction Networks for Explainable Cyberbullying Detection on Social Media,Hsin-Yu Chen|Cheng-Te Li,"In the computational detection of cyberbullying, existing work largely focused on building generic classifiers that rely exclusively on text analysis of social media sessions. Despite their empirical success, we argue that a critical missing piece is the model explainability, i.e., why a particular piece of media session is detected as cyberbullying. In this paper, therefore, we propose a novel deep model, HEterogeneous Neural Interaction Networks (HENIN), for explainable cyberbullying detection. HENIN contains the following components: a comment encoder, a post-comment co-attention sub-network, and session-session and post-post interaction extractors. Extensive experiments conducted on real datasets exhibit not only the promising performance of HENIN, but also highlight evidential comments so that one can understand why a media session is identified as cyberbullying.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.200,38939269 +main.3111,Friendly Topic Assistant for Transformer Based Abstractive Summarization,Zhengjue Wang|Zhibin Duan|Hao Zhang|chaojie wang|long tian|Bo Chen|Mingyuan Zhou,"Abstractive document summarization is a comprehensive task including document understanding and summary generation, in which area Transformer-based models have achieved the state-of-the-art performance. Compared with Transformers, topic models are better at learning explicit document semantics, and hence could be integrated into Transformers to further boost their performance. To this end, we rearrange and explore the semantics learned by a topic model, and then propose a topic assistant (TA) including three modules. TA is compatible with various Transformer-based models and user-friendly since i) TA is a plug-and-play model that does not break any structure of the original Transformer network, making users easily fine-tune Transformer+TA based on a well pre-trained model; ii) TA only introduces a small number of extra parameters. Experimental results on three datasets demonstrate that TA is able to improve the performance of several Transformer-based models.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.35,38939270 +main.3115,Investigating Lexical Variability in Language Models,Charles Yu|Ryan Sie|Nicolas Tedeschi|Leon Bergen,"Neural language models learn, to varying degrees of accuracy, the grammatical properties of natural languages. In this work, we investigate whether there are systematic sources of variation in the language models' accuracy. Focusing on subject-verb agreement and reflexive anaphora, we find that certain nouns are systematically understood better than others, an effect which is robust across grammatical tasks and different language models. 
Surprisingly, we find that across four orders of magnitude, corpus frequency is unrelated to a noun's performance on grammatical tasks. Finally, we find that a novel noun's grammatical properties can be few-shot learned from various types of training data. The results present a paradox: there should be less variation in grammatical performance than is actually observed.",,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Long,https://www.aclweb.org/anthology/2020.emnlp-main.331,38939271 +main.3116,Improving Multilingual Models with Language-Clustered Vocabularies,Hyung Won Chung|Dan Garrette|Kiat Chuan Tan|Jason Riesa,"State-of-the-art multilingual models depend on vocabularies that cover all of the languages the model will expect to see at inference time, but the standard methods for generating those vocabularies are not ideal for massively multilingual applications. In this work, we introduce a novel procedure for multilingual vocabulary generation that combines the separately trained vocabularies of several automatically derived language clusters, thus balancing the trade-off between cross-lingual subword sharing and language-specific vocabularies. Our experiments show improvements across languages on key multilingual benchmark tasks TyDi QA (+2.9 F1), XNLI (+2.1%), and WikiAnn NER (+2.8 F1) and factor of 8 reduction in out-of-vocabulary rate, all without increasing the size of the model or data.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.367,38939272 +main.3126,HABERTOR: An Efficient and Effective Deep Hatespeech Detector,Thanh Tran|Yifan Hu|Changwei Hu|Kevin Yen|Fei Tan|Kyumin Lee|Se Rim Park,"We present our HABERTOR model for detecting hatespeech in large scale user-generated content. Inspired by the recent success of the BERT model, we propose several modifications to BERT to enhance the performance on the downstream hatespeech classification task. HABERTOR inherits BERT's architecture, but is different in four aspects: (i) it generates its own vocabularies and is pre-trained from the scratch using the largest scale hatespeech dataset; (ii) it consists of Quaternion-based factorized components, resulting in a much smaller number of parameters, faster training and inferencing, as well as less memory usage; (iii) it uses our proposed multi-source ensemble heads with a pooling layer for separate input sources, to further enhance its effectiveness; and (iv) it uses a regularized adversarial training with our proposed fine-grained and adaptive noise magnitude to enhance its robustness. Through experiments on the large-scale real-world hatespeech dataset with 1.4M annotated comments, we show that HABERTOR works better than 15 state-of-the-art hatespeech detection methods, including fine-tuning Language Models. In particular, comparing with BERT, our HABERTOR is 4~5 times faster in the training/inferencing phase, uses less than 1/3 of the memory, and has better performance, even though we pre-train it by using less than 1% of the number of words. 
Our generalizability analysis shows that HABERTOR transfers well to other unseen hatespeech datasets and is a more efficient and effective alternative to BERT for the hatespeech classification.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.606,38939273 +main.3136,Multimodal Joint Attribute Prediction and Value Extraction for E-commerce Product,Tiangang Zhu|Yue Wang|Haoran Li|Youzheng Wu|Xiaodong He|Bowen Zhou,"Product attribute values are essential in many e-commerce scenarios, such as customer service robots, product recommendations, and product retrieval. While in the real world, the attribute values of a product are usually incomplete and vary over time, which greatly hinders the practical applications. In this paper, we propose a multimodal method to jointly predict product attributes and extract values from textual product descriptions with the help of the product images. We argue that product attributes and values are highly correlated, e.g., it will be easier to extract the values on condition that the product attributes are given. Thus, we jointly model the attribute prediction and value extraction tasks from multiple aspects towards the interactions between attributes and values. Moreover, product images have distinct effects on our tasks for different product attributes and values. Thus, we selectively draw useful visual information from product images to enhance our model. We annotate a multimodal product attribute value dataset that contains 87,194 instances, and the experimental results on this dataset demonstrate that explicitly modeling the relationship between attributes and values facilitates our method to establish the correspondence between them, and selectively utilizing visual product information is necessary for the task. Our code and dataset are available at https://github.com/jd-aig/JAVE.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.166,38939274 +main.3140,Unsupervised Adaptation of Question Answering Systems via Generative Self-training,Steven Rennie|Etienne Marcheret|Neil Mallinar|David Nahamoo|Vaibhava Goel,"BERT-era question answering systems have recently achieved impressive performance on several question-answering (QA) tasks. These systems are based on representations that have been pre-trained on self-supervised tasks such as word masking and sentence entailment, using massive amounts of data. Nevertheless, additional pre-training closer to the end-task, such as training on synthetic QA pairs, has been shown to improve performance. While recent work has considered augmenting labelled data and leveraging large unlabelled datasets to generate synthetic QA data, directly adapting to target data has received little attention. In this paper we investigate the iterative generation of synthetic QA pairs as a way to realize unsupervised self adaptation. Motivated by the success of the roundtrip consistency method for filtering generated QA pairs, we present iterative generalizations of the approach, which maximize an approximation of a lower bound on the probability of the adaptation data. By adapting on synthetic QA pairs generated on the target data, our method is able to improve QA systems significantly, using an order of magnitude less synthetic data and training computation than existing augmentation approaches.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.87,38939275 +main.3143,What Do Position Embeddings Learn? 
An Empirical Study of Pre-Trained Language Model Positional Encoding,Yu-An Wang|Yun-Nung Chen,"In recent years, pre-trained Transformers have dominated the majority of NLP benchmark tasks. Many variants of pre-trained Transformers have kept breaking out, and most focus on designing different pre-training objectives or variants of self-attention. Embedding the position information in the self-attention mechanism is also an indispensable factor in Transformers however is often discussed at will. Hence, we carry out an empirical study on position embedding of mainstream pre-trained Transformers mainly focusing on two questions: 1) Do position embeddings really learn the meaning of positions? 2) How do these different learned position embeddings affect Transformers for NLP tasks? This paper focuses on providing a new insight of pre-trained position embeddings by feature-level analysis and empirical experiments on most of iconic NLP tasks. It is believed that our experimental results can guide the future works to choose the suitable positional encoding function for specific tasks given the application property.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.555,38939276 +main.315,Coreferential Reasoning Learning for Language Representation,Deming Ye|Yankai Lin|Jiaju Du|Zhenghao Liu|Peng Li|Maosong Sun|Zhiyuan Liu,"Language representation models such as BERT could effectively capture contextual semantic information from plain text, and have been proved to achieve promising results in lots of downstream NLP tasks with appropriate fine-tuning. However, most existing language representation models cannot explicitly handle coreference, which is essential to the coherent understanding of the whole discourse. To address this issue, we present CorefBERT, a novel language representation model that can capture the coreferential relations in context. The experimental results show that, compared with existing baseline models, CorefBERT can achieve significant improvements consistently on various downstream NLP tasks that require coreferential reasoning, while maintaining comparable performance to previous models on other common NLP tasks. The source code and experiment details of this paper can be obtained from https://github.com/thunlp/CorefBERT.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.582,38938680 +main.3151,Explainable Automated Fact-Checking for Public Health Claims,Neema Kotonya|Francesca Toni,"Fact-checking is the task of verifying the veracity of claims by assessing their assertions against credible evidence. The vast majority of fact-checking studies focus exclusively on political claims. Very little research explores fact-checking for other topics, specifically subject matters for which expertise is required. We present the first study of explainable fact-checking for claims which require specific expertise. For our case study we choose the setting of public health. To support this case study we construct a new dataset PUBHEALTH of 11.8K claims accompanied by journalist crafted, gold standard explanations (i.e., judgments) to support the fact-check labels for claims. We explore two tasks: veracity prediction and explanation generation. We also define and evaluate, with humans and computationally, three coherence properties of explanation quality. 
Our results indicate that, by training on in-domain data, gains can be made in explainable, automated fact-checking for claims which require specific expertise.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.623,38939277 +main.3157,Regularizing Dialogue Generation by Imitating Implicit Scenarios,Shaoxiong Feng|Xuancheng Ren|Hongshen Chen|Bin Sun|Kan Li|Xu SUN,"Human dialogues are scenario-based and appropriate responses generally relate to the latent context knowledge entailed by the specific scenario. To enable responses that are more meaningful and context-specific, we propose to improve generative dialogue systems from the scenario perspective, where both dialogue history and future conversation are taken into account to implicitly reconstruct the scenario knowledge. More importantly, the conversation scenarios are further internalized using imitation learning framework, where the conventional dialogue model that has no access to future conversations is effectively regularized by transferring the scenario knowledge contained in hierarchical supervising signals from the scenario-based dialogue model, so that the future conversation is not required in actual inference. Extensive evaluations show that our approach significantly outperforms state-of-the-art baselines on diversity and relevance, and expresses scenario-specific knowledge.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.534,38939278 +main.317,The World Is Not Binary: Learning to Rank with Grayscale Data for Dialogue Response Selection,Zibo Lin|Deng Cai|Yan Wang|Xiaojiang Liu|Haitao Zheng|Shuming Shi,"Response selection plays a vital role in building retrieval-based conversation systems. Despite that response selection is naturally a learning-to-rank problem, most prior works take a point-wise view and train binary classifiers for this task: each response candidate is labeled either relevant (one) or irrelevant (zero). On the one hand, this formalization can be sub-optimal due to its ignorance of the diversity of response quality. On the other hand, annotating grayscale data for learning-to-rank can be prohibitively expensive and challenging. In this work, we show that grayscale data can be automatically constructed without human effort. Our method employs off-the-shelf response retrieval models and response generation models as automatic grayscale data generators. With the constructed grayscale data, we propose multi-level ranking objectives for training, which can (1) teach a matching model to capture more fine-grained context-response relevance difference and (2) reduce the train-test discrepancy in terms of distractor strength. Our method is simple, effective, and universal. Experiments on three benchmark datasets and four state-of-the-art matching models show that the proposed approach brings significant and consistent performance improvements.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.741,38938681 +main.3174,HSCNN: A Hybrid-Siamese Convolutional Neural Network for Extremely Imbalanced Multi-label Text Classification,Wenshuo Yang|Jiyi Li|Fumiyo Fukumoto|Yanming Ye,"The data imbalance problem is a crucial issue for the multi-label text classification. Some existing works tackle it by proposing imbalanced loss objectives instead of the vanilla cross-entropy loss, but their performances remain limited in the cases of extremely imbalanced data.
We propose a hybrid solution which adapts general networks for the head categories, and few-shot techniques for the tail categories. We propose a Hybrid-Siamese Convolutional Neural Network (HSCNN) with additional technical attributes, i.e., a multi-task architecture based on Single and Siamese networks; a category-specific similarity in the Siamese structure; a specific sampling method for training HSCNN. The results using two benchmark datasets and three loss objectives show that our method can improve the performance of Single networks with diverse loss objectives on the tail or entire categories.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.545,38939279 +main.3179,Generating Dialogue Responses from a Semantic Latent Space,Wei-Jen Ko|Avik Ray|Yilin Shen|Hongxia Jin,"Existing open-domain dialogue generation models are usually trained to mimic the gold response in the training set using cross-entropy loss on the vocabulary. However, a good response does not need to resemble the gold response, since there are multiple possible responses to a given prompt. In this work, we hypothesize that the current models are unable to integrate information from multiple semantically similar valid responses of a prompt, resulting in the generation of generic and uninformative responses. To address this issue, we propose an alternative to the end-to-end classification on vocabulary. We learn the pair relationship between the prompts and responses as a regression task on a latent space instead. In our novel dialog generation model, the representations of semantically related sentences are close to each other on the latent space. Human evaluation showed that learning the task on a continuous space can generate responses that are both relevant and informative.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.352,38939280 +main.318,Mitigating Gender Bias for Neural Dialogue Generation with Adversarial Learning,Haochen Liu|Wentao Wang|Yiqi Wang|Hui Liu|Zitao Liu|Jiliang Tang,"Dialogue systems play an increasingly important role in various aspects of our daily life. It is evident from recent research that dialogue systems trained on human conversation data are biased. In particular, they can produce responses that reflect people's gender prejudice. Many debiasing methods have been developed for various NLP tasks, such as word embedding. However, they are not directly applicable to dialogue systems because they are likely to force dialogue models to generate similar responses for different genders. This greatly degrades the diversity of the generated responses and immensely hurts the performance of the dialogue models. In this paper, we propose a novel adversarial learning framework Debiased-Chat to train dialogue models free from gender bias while keeping their performance. Extensive experiments on two real-world conversation datasets show that our framework significantly reduces gender bias in dialogue models while maintaining the response quality.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.64,38938682 +main.3181,Investigating Representations of Verb Bias in Neural Language Models,Robert Hawkins|Takateru Yamakoshi|Thomas Griffiths|Adele Goldberg,"Languages typically provide more than one grammatical construction to express certain types of messages. A speaker's choice of construction is known to depend on multiple factors, including the choice of main verb -- a phenomenon known as verb bias. 
Here we introduce DAIS, a large benchmark dataset containing 50K human judgments for 5K distinct sentence pairs in the English dative alternation. This dataset includes 200 unique verbs and systematically varies the definiteness and length of arguments. We use this dataset, as well as an existing corpus of naturally occurring data, to evaluate how well recent neural language models capture human preferences. Results show that larger models perform better than smaller models, and transformer architectures (e.g. GPT-2) tend to out-perform recurrent architectures (e.g. LSTMs) even under comparable parameter and training settings. Additional analyses of internal feature representations suggest that transformers may better integrate specific lexical information with grammatical constructions.",,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Long,https://www.aclweb.org/anthology/2020.emnlp-main.376,38939281 +main.3183,MUTANT: A Training Paradigm for Out-of-Distribution Generalization in Visual Question Answering,Tejas Gokhale|Pratyay Banerjee|Chitta Baral|Yezhou Yang,"While progress has been made on the visual question answering leaderboards, models often utilize spurious correlations and priors in datasets under the i.i.d. setting. As such, evaluation on out-of-distribution (OOD) test samples has emerged as a proxy for generalization. In this paper, we present \textit{MUTANT}, a training paradigm that exposes the model to perceptually similar, yet semantically distinct \textit{mutations} of the input, to improve OOD generalization, such as the VQA-CP challenge. Under this paradigm, models utilize a consistency-constrained training objective to understand the effect of semantic changes in input (question-image pair) on the output (answer). Unlike existing methods on VQA-CP, \textit{MUTANT} does not rely on the knowledge about the nature of train and test answer distributions. \textit{MUTANT} establishes a new state-of-the-art accuracy on VQA-CP with a $10.57\%$ improvement. Our work opens up avenues for the use of semantic input mutations for OOD generalization in question answering.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.63,38939282 +main.3184,Partially-Aligned Data-to-Text Generation with Distant Supervision,Zihao Fu|Bei Shi|Wai Lam|Lidong Bing|Zhiyuan Liu,"The Data-to-Text task aims to generate human-readable text for describing some given structured data enabling more interpretability. However, the typical generation task is confined to a few particular domains since it requires well-aligned data which is difficult and expensive to obtain. Using partially-aligned data is an alternative way of solving the dataset scarcity problem. This kind of data is much easier to obtain since it can be produced automatically. However, using this kind of data induces the over-generation problem posing difficulties for existing models, which tends to add unrelated excerpts during the generation procedure. In order to effectively utilize automatically annotated partially-aligned datasets, we extend the traditional generation task to a refined task called Partially-Aligned Data-to-Text Generation (PADTG) which is more practical since it utilizes automatically annotated data for training and thus considerably expands the application domains. To tackle this new task, we propose a novel distant supervision generation framework. 
It firstly estimates the input data's supportiveness for each target word with an estimator and then applies a supportiveness adaptor and a rebalanced beam search to harness the over-generation problem in the training and generation phases respectively. We also contribute a partially-aligned dataset (The data and source code of this paper can be obtained from https://github.com/fuzihaofzh/distant_supervision_nlg) by sampling sentences from Wikipedia and automatically extracting corresponding KB triples for each sentence from Wikidata. The experimental results show that our framework outperforms all baseline models as well as verify the feasibility of utilizing partially-aligned data.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.738,38939283 +main.3185,Multi-Instance Multi-Label Learning Networks for Aspect-Category Sentiment Analysis,Yuncong Li|Cunxiang Yin|Sheng-hua Zhong|Xu Pan,"Aspect-category sentiment analysis (ACSA) aims to predict sentiment polarities of sentences with respect to given aspect categories. To detect the sentiment toward a particular aspect category in a sentence, most previous methods first generate an aspect category-specific sentence representation for the aspect category, then predict the sentiment polarity based on the representation. These methods ignore the fact that the sentiment of an aspect category mentioned in a sentence is an aggregation of the sentiments of the words indicating the aspect category in the sentence, which leads to suboptimal performance. In this paper, we propose a Multi-Instance Multi-Label Learning Network for Aspect-Category sentiment analysis (AC-MIMLLN), which treats sentences as bags, words as instances, and the words indicating an aspect category as the key instances of the aspect category. Given a sentence and the aspect categories mentioned in the sentence, AC-MIMLLN first predicts the sentiments of the instances, then finds the key instances for the aspect categories, finally obtains the sentiments of the sentence toward the aspect categories by aggregating the key instance sentiments. Experimental results on three public datasets demonstrate the effectiveness of AC-MIMLLN.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.287,38939284 +main.3186,Inquisitive Question Generation for High Level Text Comprehension,Wei-Jen Ko|TE-YUAN CHEN|Yiyan Huang|Greg Durrett|Junyi Jessy Li,"Inquisitive probing questions come naturally to humans in a variety of settings, but is a challenging task for automatic systems. One natural type of question to ask tries to fill a gap in knowledge during text comprehension, like reading a news article: we might ask about background information, deeper reasons behind things occurring, or more. Despite recent progress with data-driven approaches, generating such questions is beyond the range of models trained on existing datasets. We introduce INQUISITIVE, a dataset of ~19K questions that are elicited while a person is reading through a document. Compared to existing datasets, INQUISITIVE questions target more towards high-level (semantic and discourse) comprehension of text. We show that readers engage in a series of pragmatic strategies to seek information. 
Finally, we evaluate question generation models based on GPT-2 and show that our model is able to generate reasonable questions although the task is challenging, and highlight the importance of context to generate INQUISITIVE questions.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.530,38939285 +main.319,Unsupervised Question Decomposition for Question Answering,Ethan Perez|Patrick Lewis|Wen-tau Yih|Kyunghyun Cho|Douwe Kiela,"We aim to improve question answering (QA) by decomposing hard questions into simpler sub-questions that existing QA systems are capable of answering. Since labeling questions with decompositions is cumbersome, we take an unsupervised approach to produce sub-questions, also enabling us to leverage millions of questions from the internet. Specifically, we propose an algorithm for One-to-N Unsupervised Sequence transduction (ONUS) that learns to map one hard, multi-hop question to many simpler, single-hop sub-questions. We answer sub-questions with an off-the-shelf QA model and give the resulting answers to a recomposition model that combines them into a final answer. We show large QA improvements on HotpotQA over a strong baseline on the original, out-of-domain, and multi-hop dev sets. ONUS automatically learns to decompose different kinds of questions, while matching the utility of supervised and heuristic decomposition methods for QA and exceeding those methods in fluency. Qualitatively, we find that using sub-questions is promising for shedding light on why a QA system makes a prediction.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.713,38938683 +main.32,``You Are Grounded!'': Latent Name Artifacts in Pre-trained Language Models,Vered Shwartz|Rachel Rudinger|Oyvind Tafjord,"Pre-trained language models (LMs) may perpetuate biases originating in their training corpus to downstream models. We focus on artifacts associated with the representation of given names (e.g., Donald), which, depending on the corpus, may be associated with specific entities, as indicated by next token prediction (e.g., Trump). While helpful in some contexts, grounding happens also in under-specified or inappropriate contexts. For example, endings generated for `Donald is a' substantially differ from those of other names, and often have more-than-average negative sentiment. We demonstrate the potential effect on downstream tasks with reading comprehension probes where name perturbation changes the model answers. As a silver lining, our experiments suggest that additional pre-training on different corpora may mitigate this bias.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.556,38938640 +main.3205,Semi-Supervised Bilingual Lexicon Induction with Two-way Interaction,Xu Zhao|Zihao Wang|Hao Wu|Yong Zhang,"Semi-supervision is a promising paradigm for Bilingual Lexicon Induction (BLI) with limited annotations. However, previous semisupervised methods do not fully utilize the knowledge hidden in annotated and nonannotated data, which hinders further improvement of their performance. In this paper, we propose a new semi-supervised BLI framework to encourage the interaction between the supervised signal and unsupervised alignment. We design two message-passing mechanisms to transfer knowledge between annotated and non-annotated data, named prior optimal transport and bi-directional lexicon update respectively. 
Then, we perform semi-supervised learning based on a cyclic or a parallel parameter feeding routine to update our models. Our framework is a general framework that can incorporate any supervised and unsupervised BLI methods based on optimal transport. Experimental results on MUSE and VecMap datasets show significant improvement of our models. Ablation study also proves that the two-way interaction between the supervised signal and unsupervised alignment accounts for the gain of the overall performance. Results on distant language pairs further illustrate the advantage and robustness of our proposed method.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.238,38939286 +main.3216,Entity Linking in 100 Languages,Jan A. Botha|Zifei Shan|Daniel Gillick,"We propose a new formulation for multilingual entity linking, where language-specific mentions resolve to a language-agnostic Knowledge Base. We train a dual encoder in this new setting, building on prior work with improved feature representation, negative mining, and an auxiliary entity-pairing task, to obtain a single entity retrieval model that covers 100+ languages and 20~million entities. The model outperforms state-of-the-art results from a far more limited cross-lingual linking task. Rare entities and low-resource languages pose challenges at this large-scale, so we advocate for an increased focus on zero- and few-shot evaluation. To this end, we provide Mewsli-9, a large new multilingual dataset matched to our setting, and show how frequency-based analysis provided key insights for our model and training enhancements.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.630,38939287 +main.3217,Discriminative Nearest Neighbor Few-Shot Intent Detection by Transferring Natural Language Inference,Jianguo Zhang|Kazuma Hashimoto|Wenhao Liu|Chien-Sheng Wu|Yao Wan|Philip Yu|Richard Socher|Caiming Xiong,"Intent detection is one of the core components of goal-oriented dialog systems, and detecting out-of-scope (OOS) intents is also a practically important skill. Few-shot learning is attracting much attention to mitigate data scarcity, but OOS detection becomes even more challenging. In this paper, we present a simple yet effective approach, discriminative nearest neighbor classification with deep self-attention. Unlike softmax classifiers, we leverage BERT-style pairwise encoding to train a binary classifier that estimates the best matched training example for a user input. We propose to boost the discriminative ability by transferring a natural language inference (NLI) model. Our extensive experiments on a large-scale multi-domain intent detection task show that our method achieves more stable and accurate in-domain and OOS detection accuracy than RoBERTa-based classifiers and embedding-based nearest neighbor approaches. More notably, the NLI transfer enables our 10-shot model to perform competitively with 50-shot or even full-shot classifiers, while we can keep the inference time constant by leveraging a faster embedding retrieval model.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.411,38939288 +main.3224,Sparsity Makes Sense: Word Sense Disambiguation Using Sparse Contextualized Word Representations,Gábor Berend,"In this paper, we demonstrate that by utilizing sparse word representations, it becomes possible to surpass the results of more complex task-specific models on the task of fine-grained all-words word sense disambiguation. 
Our proposed algorithm relies on an overcomplete set of semantic basis vectors that allows us to obtain sparse contextualized word representations. We introduce such an information theory-inspired synset representation based on the co-occurrence of word senses and non-zero coordinates for word forms which allows us to achieve an aggregated F-score of 78.8 over a combination of five standard word sense disambiguating benchmark datasets. We also demonstrate the general applicability of our proposed framework by evaluating it towards part-of-speech tagging on four different treebanks. Our results indicate a significant improvement over the application of the dense word representations.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.683,38939289 +main.3227,Targeted Finetuning for NMT with Conditional Generative-Discriminative Loss,Prathyusha Jwalapuram|Shafiq Joty|Youlin Shen,"Popular Neural Machine Translation model training uses strategies like backtranslation to improve BLEU scores, requiring large amounts of additional data and training. We introduce a class of conditional generative-discriminative hybrid losses that we use to fine-tune a trained machine translation model. Through a combination of targeted fine-tuning objectives and intuitive re-use of the training data the model has failed to adequately learn from, we improve the model performance of both a sentence-level and a contextual model without using any additional data. We target the improvement of pronoun translations through our fine-tuning and evaluate our models on a pronoun benchmark testset. Our sentence-level model shows a 0.5 BLEU improvement on both the WMT14 and the IWSLT13 De-En testsets, while our contextual model achieves the best results, improving from 31.81 to 32 BLEU on WMT14 De-En testset, and from 32.10 to 33.13 on the IWSLT13 De-En testset, with corresponding improvements in pronoun translation. We further show the generalizability of our method by reproducing the improvements on two additional language pairs, Fr-En and Cs-En.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.177,38939290 +main.3231,Introducing a New Dataset for Event Detection in Cybersecurity Texts,Hieu Man Duc Trong|Duc Trong Le|Amir Pouran Ben Veyseh|Thuat Nguyen|Thien Huu Nguyen,"Detecting cybersecurity events is necessary to keep us informed about the fast growing number of such events reported in text. In this work, we focus on the task of event detection (ED) to identify event trigger words for the cybersecurity domain. In particular, to facilitate the future research, we introduce a new dataset for this problem, characterizing the manual annotation for 30 important cybersecurity event types and a large dataset size to develop deep learning models. Comparing to the prior datasets for this task, our dataset involves more event types and supports the modeling of document-level information to improve the performance. We perform extensive evaluation with the current state-of-the-art methods for ED on the proposed dataset. 
Our experiments reveal the challenges of cybersecurity ED and present many research opportunities in this area for the future work.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.433,38939291 +main.3236,Learning Adaptive Segmentation Policy for Simultaneous Translation,Ruiqing Zhang|Chuanqiang Zhang|Zhongjun He|Hua Wu|Haifeng Wang,"Balancing accuracy and latency is a great challenge for simultaneous translation. To achieve high accuracy, the model usually needs to wait for more streaming text before translation, which results in increased latency. However, keeping low latency would probably hurt accuracy. Therefore, it is essential to segment the ASR output into appropriate units for translation. Inspired by human interpreters, we propose a novel adaptive segmentation policy for simultaneous translation. The policy learns to segment the source text by considering possible translations produced by the translation model, maintaining consistency between the segmentation and translation. Experimental results on Chinese-English and German-English translation show that our method achieves a better accuracy-latency trade-off over recently proposed state-of-the-art methods.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.178,38939292 +main.3239,Unsupervised Natural Language Inference via Decoupled Multimodal Contrastive Learning,Wanyun Cui|Guangyu Zheng|Wei Wang,"We propose to solve the natural language inference problem without any supervision from the inference labels via task-agnostic multimodal pretraining. Although recent studies of multimodal self-supervised learning also represent the linguistic and visual context, their encoders for different modalities are coupled. Thus they cannot incorporate visual information when encoding plain text alone. In this paper, we propose Multimodal Aligned Contrastive Decoupled learning (MACD) network. MACD forces the decoupled text encoder to represent the visual information via contrastive learning. Therefore, it embeds visual knowledge even for plain text inference. We conducted comprehensive experiments over plain text inference datasets (i.e. SNLI and STS-B). The unsupervised MACD even outperforms the fully-supervised BiLSTM and BiLSTM+ELMO on STS-B.",,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.444,38939293 +main.3240,Ad-hoc Document Retrieval Using Weak-Supervision with BERT and GPT2,Yosi Mass|Haggai Roitman,"We describe a weakly-supervised method for training deep learning models for the task of ad-hoc document retrieval. Our method is based on generative and discriminative models that are trained using weak-supervision just from the documents in the corpus. We present an end-to-end retrieval system that starts with traditional information retrieval methods, followed by two deep learning re-rankers. We evaluate our method on three different datasets: a COVID-19 related scientific literature dataset and two news datasets. 
We show that our method outperforms state-of-the-art methods; this without the need for the expensive process of manually labeling data.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.343,38939294 +main.3257,SLURP: A Spoken Language Understanding Resource Package,Emanuele Bastianelli|Andrea Vanzo|Pawel Swietojanski|Verena Rieser,"Spoken Language Understanding infers semantic meaning directly from audio data, and thus promises to reduce error propagation and misunderstandings in end-user applications. However, publicly available SLU resources are limited. In this paper, we release SLURP, a new SLU package containing the following: (1) A new challenging dataset in English spanning 18 domains, which is substantially bigger and linguistically more diverse than existing datasets; (2) Competitive baselines based on state-of-the-art NLU and ASR systems; (3) A new transparent metric for entity labelling which enables a detailed error analysis for identifying potential areas of improvement. SLURP is available at https://github.com/pswietojanski/slurp.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.588,38939295 +main.3259,Unsupervised Parsing with S-DIORA: Single Tree Encoding for Deep Inside-Outside Recursive Autoencoders,Andrew Drozdov|Subendhu Rongali|Yi-Pei Chen|Tim O'Gorman|Mohit Iyyer|Andrew McCallum,"The deep inside-outside recursive autoencoder (DIORA; Drozdov et al. 2019) is a self-supervised neural model that learns to induce syntactic tree structures for input sentences *without access to labeled training data*. In this paper, we discover that while DIORA exhaustively encodes all possible binary trees of a sentence with a soft dynamic program, its vector averaging approach is locally greedy and cannot recover from errors when computing the highest scoring parse tree in bottom-up chart parsing. To fix this issue, we introduce S-DIORA, an improved variant of DIORA that encodes a single tree rather than a softly-weighted mixture of trees by employing a hard argmax operation and a beam at each cell in the chart. Our experiments show that through *fine-tuning* a pre-trained DIORA with our new algorithm, we improve the state of the art in *unsupervised* constituency parsing on the English WSJ Penn Treebank by 2.2-6% F1, depending on the data used for fine-tuning.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.392,38939296 +main.327,Global-to-Local Neural Networks for Document-Level Relation Extraction,Difeng Wang|Wei Hu|Ermei Cao|Weijian Sun,"Relation extraction (RE) aims to identify the semantic relations between named entities in text. Recent years have witnessed it raised to the document level, which requires complex reasoning with entities and mentions throughout an entire document. In this paper, we propose a novel model to document-level RE, by encoding the document information in terms of entity global and local representations as well as context relation representations. Entity global representations model the semantic information of all entities in the document, entity local representations aggregate the contextual information of multiple mentions of specific entities, and context relation representations encode the topic information of other relations. Experimental results demonstrate that our model achieves superior performance on two public datasets for document-level RE. 
It is particularly effective in extracting relations between entities of long distance and having multiple mentions.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.303,38938684 +main.3270,Backpropagation-based Decoding for Unsupervised Counterfactual and Abductive Reasoning,Lianhui Qin|Vered Shwartz|Peter West|Chandra Bhagavatula|Jena D. Hwang|Ronan Le Bras|Antoine Bosselut|Yejin Choi,"Abductive and counterfactual reasoning, core abilities of everyday human cognition, require reasoning about what might have happened at time t, while conditioning on multiple contexts from the relative past and future. However, simultaneous incorporation of past and future contexts using generative language models (LMs) can be challenging, as they are trained either to condition only on the past context or to perform narrowly scoped text-infilling. In this paper, we propose DeLorean, a new unsupervised decoding algorithm that can flexibly incorporate both the past and future contexts using only off-the-shelf, left-to-right language models and no supervision. The key intuition of our algorithm is incorporating the future through back-propagation, during which, we only update the internal representation of the output while fixing the model parameters. By alternating between forward and backward propagation, DeLorean can decode the output representation that reflects both the left and right contexts. We demonstrate that our approach is general and applicable to two nonmonotonic reasoning tasks: abductive text generation and counterfactual story revision, where DeLorean outperforms a range of unsupervised and some supervised methods, based on automatic and human evaluation.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.58,38939297 +main.3272,Scaling Hidden Markov Language Models,Justin Chiu|Alexander Rush,"The hidden Markov model (HMM) is a fundamental tool for sequence modeling that cleanly separates the hidden state from the emission structure. However, this separation makes it difficult to fit HMMs to large datasets in modern NLP, and they have fallen out of use due to very poor performance compared to fully observed models. This work revisits the challenge of scaling HMMs to language modeling datasets, taking ideas from recent approaches to neural modeling. We propose methods for scaling HMMs to massive state spaces while maintaining efficient exact inference, a compact parameterization, and effective regularization. Experiments show that this approach leads to models that are much more accurate than previous HMMs and n-gram-based methods, making progress towards the performance of state-of-the-art NN models.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.103,38939298 +main.3278,Assessing Phrasal Representation and Composition in Transformers,Lang Yu|Allyson Ettinger,"Deep transformer models have pushed performance on NLP tasks to new limits, suggesting sophisticated treatment of complex linguistic inputs, such as phrases. However, we have limited understanding of how these models handle representation of phrases, and whether this reflects sophisticated composition of phrase meaning like that done by humans. In this paper, we present systematic analysis of phrasal representations in state-of-the-art pre-trained transformers. 
We use tests leveraging human judgments of phrase similarity and meaning shift, and compare results before and after control of word overlap, to tease apart lexical effects versus composition effects. We find that phrase representation in these models relies heavily on word content, with little evidence of nuanced composition. We also identify variations in phrase representation quality across models, layers, and representation types, and make corresponding recommendations for usage of representations from these models.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.397,38939299 +main.328,Near-imperceptible Neural Linguistic Steganography via Self-Adjusting Arithmetic Coding,Jiaming Shen|Heng Ji|Jiawei Han,"Linguistic steganography studies how to hide secret messages in natural language cover texts. Traditional methods aim to transform a secret message into an innocent text via lexical substitution or syntactical modification. Recently, advances in neural language models (LMs) enable us to directly generate cover text conditioned on the secret message. In this study, we present a new linguistic steganography method which encodes secret messages using self-adjusting arithmetic coding based on a neural language model. We formally analyze the statistical imperceptibility of this method and empirically show it outperforms the previous state-of-the-art methods on four datasets by 15.3% and 38.9% in terms of bits/word and KL metrics, respectively. Finally, human evaluations show that 51% of generated cover texts can indeed fool eavesdroppers.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.22,38938685 +main.3282,Position-Aware Tagging for Aspect Sentiment Triplet Extraction,Lu Xu|Hao Li|Wei Lu|Lidong Bing,"Aspect Sentiment Triplet Extraction (ASTE) is the task of extracting the triplets of target entities, their associated sentiment, and opinion spans explaining the reason for the sentiment. Existing research efforts mostly solve this problem using pipeline approaches, which break the triplet extraction process into several stages. Our observation is that the three elements within a triplet are highly related to each other, and this motivates us to build a joint model to extract such triplets using a sequence tagging approach. However, how to effectively design a tagging approach to extract the triplets that can capture the rich interactions among the elements is a challenging research question. In this work, we propose the first end-to-end model with a novel position-aware tagging scheme that is capable of jointly extracting the triplets. Our experimental results on several existing datasets show that jointly capturing elements in the triplet using our approach leads to improved performance over the existing approaches. We also conducted extensive experiments to investigate the model effectiveness and robustness.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.183,38939300 +main.3286,Aspect Based Sentiment Analysis with Aspect-Specific Opinion Spans,Lu Xu|Lidong Bing|Wei Lu|Fei Huang,"Aspect based sentiment analysis, predicting sentiment polarity of given aspects, has drawn extensive attention. Previous attention-based models emphasize using aspect semantics to help extract opinion features for classification. However, these works are either not able to capture opinion spans as a whole, or not able to capture variable-length opinion spans. 
In this paper, we present a neat and effective structured attention model by aggregating multiple linear-chain CRFs. Such a design allows the model to extract aspect-specific opinion spans and then evaluate sentiment polarity by exploiting the extracted opinion features. The experimental results on four datasets demonstrate the effectiveness of the proposed model, and our analysis demonstrates that our model can capture aspect-specific opinion spans.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.288,38939301 +main.3287,Two Are Better than One: Joint Entity and Relation Extraction with Table-Sequence Encoders,Jue WANG|Wei Lu,"Named entity recognition and relation extraction are two important fundamental problems. Joint learning algorithms have been proposed to solve both tasks simultaneously, and many of them cast the joint task as a table-filling problem. However, they typically focused on learning a single encoder (usually learning representation in the form of a table) to capture information required for both tasks within the same space. We argue that it can be beneficial to design two distinct encoders to capture such two different types of information in the learning process. In this work, we propose the novel table-sequence encoders where two different encoders -- a table encoder and a sequence encoder are designed to help each other in the representation learning process. Our experiments confirm the advantages of having two encoders over one encoder. On several standard datasets, our model shows significant improvements over existing approaches.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.133,38939302 +main.3291,Causal Inference of Script Knowledge,Noah Weber|Rachel Rudinger|Benjamin Van Durme,"When does a sequence of events define an everyday scenario and how can this knowledge be induced from text? Prior works in inducing such scripts have relied on, in one form or another, measures of correlation between instances of events in a corpus. We argue from both a conceptual and practical sense that a purely correlation-based approach is insufficient, and instead propose an approach to script induction based on the causal effect between events, formally defined via interventions. Through both human and automatic evaluations, we show that the output of our method based on causal effects better matches the intuition of what a script represents.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.612,38939303 +main.3292,VCDM: Leveraging Variational Bi-encoding and Deep Contextualized Word Representations for Improved Definition Modeling,Machel Reid|Edison Marrese-Taylor|Yutaka Matsuo,"In this paper, we tackle the task of definition modeling, where the goal is to learn to generate definitions of words and phrases. Existing approaches for this task are discriminative, combining distributional and lexical semantics in an implicit rather than direct way. To tackle this issue we propose a generative model for the task, introducing a continuous latent variable to explicitly model the underlying relationship between a phrase used within a context and its definition. We rely on variational inference for estimation and leverage contextualized word embeddings for improved performance. 
Our approach is evaluated on four existing challenging benchmarks with the addition of two new datasets, ""Cambridge"" and the first non-English corpus ""Robert"", which we release to complement our empirical study. Our Variational Contextual Definition Modeler (VCDM) achieves state-of-the-art performance in terms of automatic and human evaluation metrics, demonstrating the effectiveness of our approach.",,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.513,38939304 +main.3298,Named Entity Recognition for Social Media Texts with Semantic Augmentation,Yuyang Nie|Yuanhe Tian|Xiang Wan|Yan Song|Bo Dai,"Existing approaches for named entity recognition suffer from data sparsity problems when conducted on short and informal texts, especially user-generated social media content. Semantic augmentation is a potential way to alleviate this problem. Given that rich semantic information is implicitly preserved in pre-trained word embeddings, they are potential ideal resources for semantic augmentation. In this paper, we propose a neural-based approach to NER for social media texts where both local (from running text) and augmented semantics are taken into account. In particular, we obtain the augmented semantic information from a large-scale corpus, and propose an attentive semantic augmentation module and a gate module to encode and aggregate such information, respectively. Extensive experiments are performed on three benchmark datasets collected from English and Chinese social media platforms, where the results demonstrate the superiority of our approach to previous studies across all three datasets.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.107,38939305 +main.3299,Improving the Efficiency of Grammatical Error Correction with Erroneous Span Detection and Correction,Mengyun Chen|Tao Ge|Xingxing Zhang|Furu Wei|Ming Zhou,"We propose a novel language-independent approach to improve the efficiency for Grammatical Error Correction (GEC) by dividing the task into two subtasks: Erroneous Span Detection (ESD) and Erroneous Span Correction (ESC). ESD identifies grammatically incorrect text spans with an efficient sequence tagging model. Then, ESC leverages a seq2seq model to take the sentence with annotated erroneous spans as input and only outputs the corrected text for these spans. Experiments show our approach performs comparably to conventional seq2seq approaches in both English and Chinese GEC benchmarks with less than 50% time cost for inference.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.581,38939306 +main.3304,Birds Have Four Legs?! NumerSense: Probing Numerical Commonsense Knowledge of Pre-trained Language Models,Bill Yuchen Lin|Seyeon Lee|Rahul Khanna|Xiang Ren,"Recent works show that pre-trained language models (PTLMs), such as BERT, possess certain commonsense and factual knowledge. They suggest that it is promising to use PTLMs as ``neural knowledge bases'' via predicting masked words. Surprisingly, we find that this may not work for numerical commonsense knowledge (e.g., a bird usually has two legs). In this paper, we investigate whether and to what extent we can induce numerical commonsense knowledge from PTLMs as well as the robustness of this process. 
To study this, we introduce a novel probing task with a diagnostic dataset, NumerSense, containing 13.6k masked-word-prediction probes (10.5k for fine-tuning and 3.1k for testing). Our analysis reveals that: (1) BERT and its stronger variant RoBERTa perform poorly on the diagnostic dataset prior to any fine-tuning; (2) fine-tuning with distant supervision brings some improvement; (3) the best supervised model still performs poorly as compared to human performance (54.06% vs. 96.3% in accuracy).",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.557,38939307 +main.3318,A Probabilistic End-To-End Task-Oriented Dialog Model with Latent Belief States towards Semi-Supervised Learning,Yichi Zhang|Zhijian Ou|Min Hu|Junlan Feng,"Structured belief states are crucial for user goal tracking and database query in task-oriented dialog systems. However, training belief trackers often requires expensive turn-level annotations of every user utterance. In this paper we aim at alleviating the reliance on belief state labels in building end-to-end dialog systems, by leveraging unlabeled dialog data towards semi-supervised learning. We propose a probabilistic dialog model, called the LAtent BElief State (LABES) model, where belief states are represented as discrete latent variables and jointly modeled with system responses given user inputs. Such latent variable modeling enables us to develop semi-supervised learning under the principled variational learning framework. Furthermore, we introduce LABES-S2S, which is a copy-augmented Seq2Seq model instantiation of LABES. In supervised experiments, LABES-S2S obtains strong results on three benchmark datasets of different scales. In utilizing unlabeled dialog data, semi-supervised LABES-S2S significantly outperforms both supervised-only and semi-supervised baselines. Remarkably, we can reduce the annotation demands to 50% without performance loss on MultiWOZ.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.740,38939308 +main.3321,Wasserstein Distance Regularized Sequence Representation for Text Matching in Asymmetrical Domains,Weijie Yu|Chen Xu|Jun Xu|Liang Pang|Xiaopeng Gao|Xiaozhao Wang|Ji-Rong Wen,"One approach to matching texts from asymmetrical domains is projecting the input sequences into a common semantic space as feature vectors upon which the matching function can be readily defined and learned. In real-world matching practices, it is often observed that with the training goes on, the feature vectors projected from different domains tend to be indistinguishable. The phenomenon, however, is often overlooked in existing matching models. As a result, the feature vectors are constructed without any regularization, which inevitably increases the difficulty of learning the downstream matching functions. In this paper, we propose a novel match method tailored for text matching in asymmetrical domains, called WD-Match. In WD-Match, a Wasserstein distance-based regularizer is defined to regularize the features vectors projected from different domains. As a result, the method enforces the feature projection function to generate vectors such that those correspond to different domains cannot be easily discriminated. The training process of WD-Match amounts to a game that minimizes the matching loss regularized by the Wasserstein distance. WD-Match can be used to improve different text matching methods, by using the method as its underlying matching model. 
Four popular text matching methods have been exploited in the paper. Experimental results based on four publicly available benchmarks showed that WD-Match consistently outperformed the underlying methods and the baselines.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.239,38939309 +main.3327,Incorporating Context Structures for Query Generation,Ruey-Cheng Chen|Chia-Jung Lee,"Generative neural networks have been shown effective on query suggestion. Commonly posed as a conditional generation problem, the task aims to leverage earlier inputs from users in a search session to predict queries that they will likely issue at a later time. User inputs come in various forms such as querying and clicking, each of which can imply different semantic signals channeled through the corresponding behavioral patterns. This paper induces these behavioral biases as hypotheses for query generation, where a generic encoder-decoder Transformer framework is presented to aggregate arbitrary hypotheses of choice. Our experimental results show that the proposed approach leads to significant improvements on top-k word error rate and Bert F1 Score compared to a recent BART model.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.251,38939310 +main.3329,Emotion-Cause Pair Extraction as Sequence Labeling Based on a Novel Tagging Scheme,Chaofa Yuan|Chuang Fan|Jianzhu Bao|Ruifeng Xu,"The task of emotion-cause pair extraction deals with finding all emotions and the corresponding causes in unannotated emotion texts. Most recent studies are based on the likelihood of Cartesian product among all clause candidates, resulting in a high computational cost. Targeting this issue, we regard the task as a sequence labeling problem and propose a novel tagging scheme with coding the distance between linked components into the tags, so that emotions and the corresponding causes can be extracted simultaneously. Accordingly, an end-to-end model is presented to process the input texts from left to right, always with linear time complexity, leading to a speed up. Experimental results show that our proposed model achieves the best performance, outperforming the state-of-the-art method by 2.26% (p<0.001) in F1 measure.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.289,38939311 +main.3336,CHARM: Inferring Personal Attributes from Conversations,Anna Tigunova|Andrew Yates|Paramita Mirza|Gerhard Weikum,"Personal knowledge about users’ professions, hobbies, favorite food, and travel preferences, among others, is a valuable asset for individualized AI, such as recommenders or chatbots. Conversations in social media, such as Reddit, are a rich source of data for inferring personal facts. Prior work developed supervised methods to extract this knowledge, but these approaches can not generalize beyond attribute values with ample labeled training samples. This paper overcomes this limitation by devising CHARM: a zero-shot learning method that creatively leverages keyword extraction and document retrieval in order to predict attribute values that were never seen during training. 
Experiments with large datasets from Reddit show the viability of CHARM for open-ended attributes, such as professions and hobbies.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.434,38939312 +main.3337,Long-Short Term Masking Transformer: A Simple but Effective Baseline for Document-level Neural Machine Translation,Pei Zhang|Boxing Chen|Niyu Ge|Kai Fan,"Many document-level neural machine translation (NMT) systems have explored the utility of context-aware architecture, usually requiring an increasing number of parameters and computational complexity. However, few attention is paid to the baseline model. In this paper, we research extensively the pros and cons of the standard transformer in document-level translation, and find that the auto-regressive property can simultaneously bring both the advantage of the consistency and the disadvantage of error accumulation. Therefore, we propose a surprisingly simple long-short term masking self-attention on top of the standard transformer to both effectively capture the long-range dependence and reduce the propagation of errors. We examine our approach on the two publicly available document-level datasets. We can achieve a strong result in BLEU and capture discourse phenomena.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.81,38939313 +main.334,F^2-Softmax: Diversifying Neural Text Generation via Frequency Factorized Softmax,Byung-Ju Choi|Jimin Hong|David Park|Sang Wan Lee,"Despite recent advances in neural text generation, encoding the rich diversity in human language remains elusive. We argue that the sub-optimal text generation is mainly attributable to the imbalanced token distribution, which particularly misdirects the learning model when trained with the maximum-likelihood objective. As a simple yet effective remedy, we propose two novel methods, F^2-Softmax and MefMax, for a balanced training even with the skewed frequency distribution. MefMax assigns tokens uniquely to frequency classes, trying to group tokens with similar frequencies and equalize frequency mass between the classes. F^2-Softmax then decomposes a probability distribution of the target token into a product of two conditional probabilities of (1) frequency class, and (2) token from the target frequency class. Models learn more uniform probability distributions because they are confined to subsets of vocabularies. Significant performance gains on seven relevant metrics suggest the supremacy of our approach in improving not only the diversity but also the quality of generated texts.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.737,38938686 +main.3344,Assessing the Helpfulness of Learning Materials with Inference-Based Learner-Like Agent,Yun-Hsuan Jen|Chieh-Yang Huang|MeiHua Chen|Ting-Hao Huang|Lun-Wei Ku,"Many English-as-a-second language learners have trouble using near-synonym words (e.g., small vs.little; briefly vs.shortly) correctly, and often look for example sentences to learn how two nearly synonymous terms differ. Prior work uses hand-crafted scores to recommend sentences but has difficulty in adopting such scores to all the near-synonyms as near-synonyms differ in various ways. We notice that the helpfulness of the learning material would reflect on the learners’ performance. Thus, we propose the inference-based learner-like agent to mimic learner behavior and identify good learning materials by examining the agent’s performance. 
To enable the agent to behave like a learner, we leverage entailment modeling’s capability of inferring answers from the provided materials. Experimental results show that the proposed agent is equipped with good learner-like behavior to achieve the best performance in both fill-in-the-blank (FITB) and good example sentence selection tasks. We further conduct a classroom user study with college ESL learners. The results of the user study show that the proposed agent can find out example sentences that help students learn more easily and efficiently. Compared to other models, the proposed agent improves the score of more than 17% of students after learning.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.312,38939314 +main.3348,Training for Gibbs Sampling on Conditional Random Fields with Neural Scoring Factors,Sida Gao|Matthew R. Gormley,"Most recent improvements in NLP come from changes to the neural network architectures modeling the text input. Yet, state-of-the-art models often rely on simple approaches to model the label space, e.g. bigram Conditional Random Fields (CRFs) in sequence tagging. More expressive graphical models are rarely used due to their prohibitive computational cost. In this work, we present an approach for efficiently training and decoding hybrids of graphical models and neural networks based on Gibbs sampling. Our approach is the natural adaptation of SampleRank (Wick et al., 2011) to neural models, and is widely applicable to tasks beyond sequence tagging. We apply our approach to named entity recognition and present a neural skip-chain CRF model, for which exact inference is impractical. The skip-chain model improves over a strong baseline on three languages from CoNLL-02/03. We obtain new state-of-the-art results on Dutch.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.406,38939315 +main.3352,Quantifying Intimacy in Language,Jiaxin Pei|David Jurgens,"Intimacy is a fundamental aspect of how we relate to others in social settings. Language encodes the social information of intimacy through both topics and other more subtle cues (such as linguistic hedging and swearing). Here, we introduce a new computational framework for studying expressions of the intimacy in language with an accompanying dataset and deep learning model for accurately predicting the intimacy level of questions (Pearson r = 0.87). Through analyzing a dataset of 80.5M questions across social media, books, and films, we show that individuals employ interpersonal pragmatic moves in their language to align their intimacy with social settings. Then, in three studies, we further demonstrate how individuals modulate their intimacy to match social norms around gender, social distance, and audience, each validating key findings from studies in social psychology. Our work demonstrates that intimacy is a pervasive and impactful social dimension of language.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.428,38939316 +main.3353,How to Make Neural Natural Language Generation as Reliable as Templates in Task-Oriented Dialogue,Henry Elder|Alexander O'Connor|Jennifer Foster,"Neural Natural Language Generation (NLG) systems are well known for their unreliability. To overcome this issue, we propose a data augmentation approach which allows us to restrict the output of a network and guarantee reliability. 
While this restriction means generation will be less diverse than if randomly sampled, we include experiments that demonstrate the tendency of existing neural generation approaches to produce dull and repetitive text, and we argue that reliability is more important than diversity for this task. The system trained using this approach scored 100\% in semantic accuracy on the E2E NLG Challenge dataset, the same as a template system.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.230,38939317 +main.3357,Sentiment Analysis of Tweets Using Heterogeneous Multi-layer Network Representation and Embedding,Loitongbam Gyanendro Singh|Anasua Mitra|Sanasam Ranbir Singh,"Sentiment classification on tweets often needs to deal with the problems of under-specificity, noise, and multilingual content. This study proposes a heterogeneous multi-layer network-based representation of tweets to generate multiple representations of a tweet and address the above issues. The generated representations are further ensembled and classified using a neural-based early fusion approach. Further, we propose a centrality aware random-walk for node embedding and tweet representations suitable for the multi-layer network. From various experimental analysis, it is evident that the proposed method can address the problem of under-specificity, noisy text, and multilingual content present in a tweet and provides better classification performance than the text-based counterparts. Further, the proposed centrality aware based random walk provides better representations than unbiased and other biased counterparts.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.718,38939318 +main.3358,A Bilingual Generative Transformer for Semantic Sentence Embedding,John Wieting|Graham Neubig|Taylor Berg-Kirkpatrick,"Semantic sentence embedding models encode natural language sentences into vectors, such that closeness in embedding space indicates closeness in the semantics between the sentences. Bilingual data offers a useful signal for learning such embeddings: properties shared by both sentences in a translation pair are likely semantic, while divergent properties are likely stylistic or language-specific. We propose a deep latent variable model that attempts to perform source separation on parallel sentences, isolating what they have in common in a latent semantic vector, and explaining what is left over with language-specific latent vectors. Our proposed approach differs from past work on semantic sentence encoding in two ways. First, by using a variational probabilistic framework, we introduce priors that encourage source separation, and can use our model's posterior to predict sentence embeddings for monolingual data at test time. Second, we use high-capacity transformers as both data generating distributions and inference networks -- contrasting with most past work on sentence embeddings. In experiments, our approach substantially outperforms the state-of-the-art on a standard suite of unsupervised semantic similarity evaluations. 
Further, we demonstrate that our approach yields the largest gains on more difficult subsets of these evaluations where simple word overlap is not a good indicator of similarity.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.122,38939319 +main.3360,"Vokenization: Improving Language Understanding via Contextualized, Visually-Grounded Supervision",Hao Tan|Mohit Bansal,"Humans learn language by listening, speaking, writing, reading, and also, via interaction with the multimodal real world. Existing language pre-training frameworks show the effectiveness of text-only self-supervision while we explore the idea of a visually-supervised language model in this paper. We find that the main reason hindering this exploration is the large divergence in magnitude and distributions between the visually-grounded language datasets and pure-language corpora. Therefore, we develop a technique named “vokenization” that extrapolates multimodal alignments to language-only data by contextually mapping language tokens to their related images (which we call “vokens”). The “vokenizer” is trained on relatively small image captioning datasets and we then apply it to generate vokens for large language corpora. Trained with these contextually generated vokens, our visually-supervised language models show consistent improvements over self-supervised alternatives on multiple pure-language tasks such as GLUE, SQuAD, and SWAG.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.162,38939320 +main.3370,ChrEn: Cherokee-English Machine Translation for Endangered Language Revitalization,Shiyue Zhang|Benjamin Frey|Mohit Bansal,"Cherokee is a highly endangered Native American language spoken by the Cherokee people. The Cherokee culture is deeply embedded in its language. However, there are approximately only 2,000 fluent first language Cherokee speakers remaining in the world and the number is declining every year. To help save this endangered language, we introduce ChrEn, a Cherokee-English parallel dataset, to facilitate machine translation research between Cherokee and English. Compared to some popular machine translation language pairs, ChrEn is extremely low-resource, only containing 14k sentence pairs in total. We split our parallel data in ways that facilitate both in-domain and out-of-domain evaluation. We also collect 5k Cherokee monolingual data to enable semi-supervised learning. Besides these datasets, we propose several Cherokee-English and English-Cherokee machine translation systems. We compare SMT (phrase-based) versus NMT (RNN-based and Transformer-based) systems; supervised versus semi-supervised (via language model, back-translation, and BERT/Multilingual-BERT) methods; as well as transfer learning versus multilingual joint training with 4 other languages. 
Our best results are 15.8/12.7 BLEU for in-domain and 6.5/5.0 BLEU for out-of-domain Chr-En/EnChr translations, respectively; and we hope that our dataset and systems will encourage future work by the community for Cherokee language revitalization.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.43,38939321 +main.3375,Introducing Syntactic Structures into Target Opinion Word Extraction with Deep Learning,Amir Pouran Ben Veyseh|Nasim Nouri|Franck Dernoncourt|Dejing Dou|Thien Huu Nguyen,"Targeted opinion word extraction (TOWE) is a sub-task of aspect based sentiment analysis (ABSA) which aims to find the opinion words for a given aspect-term in a sentence. Despite their success for TOWE, the current deep learning models fail to exploit the syntactic information of the sentences that have been proved to be useful for TOWE in the prior research. In this work, we propose to incorporate the syntactic structures of the sentences into the deep learning models for TOWE, leveraging the syntax-based opinion possibility scores and the syntactic connections between the words. We also introduce a novel regularization technique to improve the performance of the deep learning models based on the representation distinctions between the words in TOWE. The proposed model is extensively analyzed and achieves the state-of-the-art performance on four benchmark datasets.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.719,38939322 +main.3389,Better Highlighting: Creating Sub-Sentence Summary Highlights,Sangwoo Cho|Kaiqiang Song|Chen Li|Dong Yu|Hassan Foroosh|Fei Liu,"Amongst the best means to summarize is highlighting. In this paper, we aim to generate summary highlights to be overlaid on the original documents to make it easier for readers to sift through a large amount of text. The method allows summaries to be understood in context to prevent a summarizer from distorting the original meaning, of which abstractive summarizers usually fall short. In particular, we present a new method to produce self-contained highlights that are understandable on their own to avoid confusion. Our method combines determinantal point processes and deep contextualized representations to identify an optimal set of sub-sentence segments that are both important and non-redundant to form summary highlights. To demonstrate the flexibility and modeling power of our method, we conduct extensive experiments on summarization datasets. Our analysis provides evidence that highlighting is a promising avenue of research towards future summarization.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.509,38939323 +main.3390,Event Detection: Gate Diversity and Syntactic Importance Scores for Graph Convolution Neural Networks,Viet Dac Lai|Tuan Ngo Nguyen|Thien Huu Nguyen,"Recent studies on event detection (ED) have shown that the syntactic dependency graph can be employed in graph convolution neural networks (GCN) to achieve state-of-the-art performance. However, the computation of the hidden vectors in such graph-based models is agnostic to the trigger candidate words, potentially leaving irrelevant information for the trigger candidate for event prediction. In addition, the current models for ED fail to exploit the overall contextual importance scores of the words, which can be obtained via the dependency tree, to boost the performance. 
In this study, we propose a novel gating mechanism to filter noisy information in the hidden vectors of the GCN models for ED based on the information from the trigger candidate. We also introduce novel mechanisms to achieve the contextual diversity for the gates and the importance score consistency for the graphs and models in ED. The experiments show that the proposed model achieves state-of-the-art performance on two ED datasets.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.435,38939324 +main.3391,Coding Textual Inputs Boosts the Accuracy of Neural Networks,Abdul Rafae Khan|Jia Xu|Weiwei Sun,"Natural Language Processing (NLP) tasks are usually performed word by word on textual inputs. We can use arbitrary symbols to represent the linguistic meaning of a word and use these symbols as inputs. As ``alternatives'' to a text representation, we introduce Soundex, MetaPhone, NYSIIS, logogram to NLP, and develop fixed-output-length coding and its extension using Huffman coding. Each of those codings combines different character/digital sequences and constructs a new vocabulary based on codewords. We find that the integration of those codewords with text provides more reliable inputs to Neural-Network-based NLP systems through redundancy than text-alone inputs. Experiments demonstrate that our approach outperforms the state-of-the-art models on the application of machine translation, language modeling, and part-of-speech tagging. The source code is available at https://github.com/abdulrafae/coding_nmt.",,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.emnlp-main.104,38939325 +main.3393,Simple Data Augmentation with the Mask Token Improves Domain Adaptation for Dialog Act Tagging,Semih Yavuz|Kazuma Hashimoto|Wenhao Liu|Nitish Shirish Keskar|Richard Socher|Caiming Xiong,"The concept of Dialogue Act (DA) is universal across different task-oriented dialogue domains - the act of ``request"" carries the same speaker intention whether it is for restaurant reservation or flight booking. However, DA taggers trained on one domain do not generalize well to other domains, which leaves us with the expensive need for a large amount of annotated data in the target domain. In this work, we investigate how to better adapt DA taggers to desired target domains with only unlabeled data. We propose MaskAugment, a controllable mechanism that augments text input by leveraging the pre-trained Mask token from BERT model. Inspired by consistency regularization, we use MaskAugment to introduce an unsupervised teacher-student learning scheme to examine the domain adaptation of DA taggers. Our extensive experiments on the Simulated Dialogue (GSim) and Schema-Guided Dialogue (SGD) datasets show that MaskAugment is useful in improving the cross-domain generalization for DA tagging.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.412,38939326 +main.3394,Contrastive Distillation on Intermediate Representations for Language Model Compression,Siqi Sun|Zhe Gan|Yuwei Fang|Yu Cheng|Shuohang Wang|Jingjing Liu,"Existing language model compression methods mostly use a simple L_2 loss to distill knowledge in the intermediate representations of a large BERT model to a smaller one. Although widely used, this objective by design assumes that all the dimensions of hidden representations are independent, failing to capture important structural knowledge in the intermediate layers of the teacher network. 
To achieve better distillation efficacy, we propose Contrastive Distillation on Intermediate Representations (CoDIR), a principled knowledge distillation framework where the student is trained to distill knowledge through intermediate layers of the teacher via a contrastive objective. By learning to distinguish positive sample from a large set of negative samples, CoDIR facilitates the student's exploitation of rich information in teacher's hidden layers. CoDIR can be readily applied to compress large-scale language models in both pre-training and finetuning stages, and achieves superb performance on the GLUE benchmark, outperforming state-of-the-art compression methods.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.36,38939327 +main.3398,Stepwise Extractive Summarization and Planning with Structured Transformers,Shashi Narayan|Joshua Maynez|Jakub Adamek|Daniele Pighin|Blaz Bratanic|Ryan McDonald,"We propose encoder-centric stepwise models for extractive summarization using structured transformers -- HiBERT and Extended Transformers. We enable stepwise summarization by injecting the previously generated summary into the structured transformer as an auxiliary sub-structure. Our models are not only efficient in modeling the structure of long inputs, but they also do not rely on task-specific redundancy-aware modeling, making them a general purpose extractive content planner for different tasks. When evaluated on CNN/DailyMail extractive summarization, stepwise models achieve state-of-the-art performance in terms of Rouge without any redundancy aware modeling or sentence filtering. This also holds true for Rotowire table-to-text generation, where our models surpass previously reported metrics for content selection, planning and ordering, highlighting the strength of stepwise modeling. Amongst the two structured transformers we test, stepwise Extended Transformers provides the best performance across both datasets and sets a new standard for these challenges.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.339,38939328 +main.3403,Blank Language Models,Tianxiao Shen|Victor Quach|Regina Barzilay|Tommi Jaakkola,"We propose Blank Language Model (BLM), a model that generates sequences by dynamically creating and filling in blanks. The blanks control which part of the sequence to expand, making BLM ideal for a variety of text editing and rewriting tasks. The model can start from a single blank or partially completed text with blanks at specified locations. It iteratively determines which word to place in a blank and whether to insert new blanks, and stops generating when no blanks are left to fill. BLM can be efficiently trained using a lower bound of the marginal data likelihood. On the task of filling missing text snippets, BLM significantly outperforms all other baselines in terms of both accuracy and fluency. Experiments on style transfer and damaged ancient text restoration demonstrate the potential of this framework for a wide range of applications.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.420,38939329 +main.3408,Predicting Stance and Rumor Veracity via Dual Hierarchical Transformer with Pretrained Encoders,Jianfei Yu|Jing Jiang|Ling Min Serena Khoo|Hai Leong Chieu|Rui Xia,"The prevalent use of social media enables rapid spread of rumors on a massive scale, which leads to the emerging need of automatic rumor verification (RV). 
A number of previous studies focus on leveraging stance classification to enhance RV with multi-task learning (MTL) methods. However, most of these methods failed to employ pre-trained contextualized embeddings such as BERT, and did not exploit inter-task dependencies by using predicted stance labels to improve the RV task. Therefore, in this paper, to extend BERT to obtain thread representations, we first propose a Hierarchical Transformer, which divides each long thread into shorter subthreads, and employs BERT to separately represent each subthread, followed by a global Transformer layer to encode all the subthreads. We further propose a Coupled Transformer Module to capture the inter-task interactions and a Post-Level Attention layer to use the predicted stance labels for RV, respectively. Experiments on two benchmark datasets show the superiority of our Coupled Hierarchical Transformer model over existing MTL approaches.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.108,38939330 +main.3419,Querying across Genres to Retrieve Research That Supports Medical Claims Made in News,Chaoyuan Zuo|Narayan Acharya|Ritwik Banerjee,"We present a query-based biomedical information retrieval task across two vastly different genres -- newswire and research literature -- where the goal is to find the research publication that supports the primary claim made in a health-related news article. For this task, we present a new dataset of 5,034 claims from news paired with research abstracts. Our approach consists of two steps: (i) selecting the most relevant candidates from a collection of 222k research abstracts, and (ii) re-ranking this list. We compare the classical IR approach using BM25 with more recent transformer-based models. Our results show that cross-genre medical IR is a viable task, but incorporating domain-specific knowledge is crucial.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.139,38939331 +main.3424,Social Media Attributions in the Context of Water Crisis,Rupak Sarkar|Sayantan Mahinder|Hirak Sarkar|Ashiqur KhudaBukhsh,"Attribution of natural disasters/collective misfortune is a widely-studied political science problem. However, such studies typically rely on surveys, or expert opinions, or external signals such as voting outcomes. In this paper, we explore the viability of using unstructured, noisy social media data to complement traditional surveys through automatically extracting attribution factors. We present a novel prediction task of \emph{attribution tie detection} of identifying the factors (e.g., poor city planning, exploding population etc.) held responsible for the crisis in a social media document. We focus on the 2019 Chennai water crisis that rapidly escalated into a discussion topic with global importance following alarming water-crisis statistics. On a challenging data set constructed from YouTube comments (72,098 comments posted by 43,859 users on 623 videos relevant to the crisis), we present a neural baseline to identify attribution ties that achieves a reasonable performance (accuracy: 87.34\% on attribution detection and 81.37\% on attribution resolution). 
We release the first annotated data set of 2,500 comments in this important domain.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.109,38939332 +main.3431,Selection and Generation: Learning towards Multi-Product Advertisement Post Generation,Zhangming Chan|Yuchi Zhang|Xiuying Chen|Shen Gao|Zhiqiang Zhang|Dongyan Zhao|Rui Yan,"As the E-commerce thrives, high-quality online advertising copywriting has attracted more and more attention. Different from the advertising copywriting for a single product, an advertisement (AD) post includes an attractive topic that meets the customer needs and description copywriting about several products under its topic. A good AD post can highlight the characteristics of each product, thus helps customers make a good choice among candidate products. Hence, multi-product AD post generation is meaningful and important. We propose a novel end-to-end model named S-MG Net to generate the AD post. Targeted at such a challenging real-world problem, we split the AD post generation task into two subprocesses: (1) select a set of products via the SelectNet (Selection Network). (2) generate a post including selected products via the MGenNet (Multi-Generator Network). Concretely, SelectNet first captures the post topic and the relationship among the products to output the representative products. Then, MGenNet generates the description copywriting of each product. Experiments conducted on a large-scale real-world AD post dataset demonstrate that our proposed model achieves impressive performance in terms of both automatic metrics as well as human evaluations.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.313,38939333 +main.3434,Effective Unsupervised Domain Adaptation with Adversarially Trained Language Models,Thuy-Trang Vu|Dinh Phung|Gholamreza Haffari,"Recent work has shown the importance of adaptation of broad-coverage contextualised embedding models on the domain of the target task of interest. Current self-supervised adaptation methods are simplistic, as the training signal comes from a small percentage of \emph{randomly} masked-out tokens. In this paper, we show that careful masking strategies can bridge the knowledge gap of masked language models (MLMs) about the domains more effectively by allocating self-supervision where it is needed. Furthermore, we propose an effective training strategy by adversarially masking out those tokens which are harder to reconstruct by the underlying MLM. The adversarial objective leads to a challenging combinatorial optimisation problem over \emph{subsets} of tokens, which we tackle efficiently through relaxation to a variational lowerbound and dynamic programming. On six unsupervised domain adaptation tasks involving named entity recognition, our method strongly outperforms the random masking strategy and achieves up to +1.64 F1 score improvements.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.497,38939334 +main.3437,Neural Extractive Summarization with Hierarchical Attentive Heterogeneous Graph Network,Ruipeng Jia|Yanan Cao|Hengzhu Tang|Fang Fang|Cong Cao|Shi Wang,"Sentence-level extractive text summarization is substantially a node classification task of network mining, adhering to the informative components and concise representations. There are lots of redundant phrases between extracted sentences, but it is difficult to model them exactly by the general supervised methods. 
Previous sentence encoders, especially BERT, specialize in modeling the relationship between source sentences. While, they have no ability to consider the overlaps of the target selected summary, and there are inherent dependencies among target labels of sentences. In this paper, we propose HAHSum (as shorthand for Hierarchical Attentive Heterogeneous Graph for Text Summarization), which well models different levels of information, including words and sentences, and spotlights redundancy dependencies between sentences. Our approach iteratively refines the sentence representations with redundancy-aware graph and delivers the label dependencies by message passing. Experiments on large scale benchmark corpus (CNN/DM, NYT, and NEWSROOM) demonstrate that HAHSum yields ground-breaking performance and outperforms previous extractive summarizers.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.295,38939335 +main.3438,A State-independent and Time-evolving Network with Applications to Early Rumor Detection,Rui Xia|Kaizhou Xuan|Jianfei Yu,"In this paper, we study automatic rumor detection for in social media at the event level where an event consists of a sequence of posts organized according to the posting time. It is common that the state of an event is dynamically evolving. However, most of the existing methods to this task ignored this problem, and established a global representation based on all the posts in the event's life cycle. Such coarse-grained methods failed to capture the event's unique features in different states. To address this limitation, we propose a state-independent and time-evolving Network (STN) for rumor detection based on fine-grained event state detection and segmentation. Given an event composed of a sequence of posts, STN first predicts the corresponding sequence of states and segments the event into several state-independent sub-events. For each sub-event, STN independently trains an encoder to learn the feature representation for that sub-event and incrementally fuses the representation of the current sub-event with previous ones for rumor prediction. This framework can more accurately learn the representation of an event in the initial stage and enable early rumor detection. Experiments on two benchmark datasets show that STN can significantly improve the rumor detection accuracy in comparison with some strong baseline systems. We also design a new evaluation metric to measure the performance of early rumor detection, under which STN shows a higher advantage in comparison.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.727,38939336 +main.3441,Generating Diverse Translation from Model Distribution with Dropout,Xuanfu Wu|Yang Feng|Chenze Shao,"Despite the improvement of translation quality, neural machine translation (NMT) often suffers from the lack of diversity in its generation. In this paper, we propose to generate diverse translations by deriving a large number of possible models with Bayesian modelling and sampling models from them for inference. The possible models are obtained by applying concrete dropout to the NMT model and each of them has specific confidence for its prediction, which corresponds to a posterior model distribution under specific training data in the principle of Bayesian modeling. With variational inference, the posterior model distribution can be approximated with a variational distribution, from which the final models for inference are sampled. 
We conducted experiments on Chinese-English and English-German translation tasks and the results shows that our method makes a better trade-off between diversity and accuracy.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.82,38939337 +main.345,Cold-start Active Learning through Self-Supervised Language Modeling,Michelle Yuan|Hsuan-Tien Lin|Jordan Boyd-Graber,"Active learning strives to reduce annotation costs by choosing the most critical examples to label. Typically, the active learning strategy is contingent on the classification model. For instance, uncertainty sampling depends on poorly calibrated model confidence scores. In the cold-start setting, active learning is impractical because of model instability and data scarcity. Fortunately, modern NLP provides an additional source of information: pre-trained language models. The pre-training loss can find examples that surprise the model and should be labeled for efficient fine-tuning. Therefore, we treat the language modeling loss as a proxy for classification uncertainty. With BERT, we develop a simple strategy based on the masked language modeling loss that minimizes labeling costs for text classification. Compared to other baselines, our approach reaches higher accuracy within less sampling iterations and computation time.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.637,38938687 +main.3450,Social Chemistry 101: Learning to Reason about Social and Moral Norms,Maxwell Forbes|Jena D. Hwang|Vered Shwartz|Maarten Sap|Yejin Choi,"Social norms—the unspoken commonsense rules about acceptable social behavior—are crucial in understanding the underlying causes and intents of people’s actions in narratives. For example, underlying an action such as ""wanting to call cops on my neighbor"" are social norms that inform our conduct, such as ""It is expected that you report crimes."" We present SOCIAL CHEMISTRY, a new conceptual formalism to study people’s everyday social norms and moral judgments over a rich spectrum of real life situations described in natural language. We introduce SOCIAL-CHEM-101, a large-scale corpus that catalogs 292k rules-of-thumb such as “It is rude to run a blender at 5am” as the basic conceptual units. Each rule-of-thumb is further broken down with 12 different dimensions of people’s judgments, including social judgments of good and bad, moral foundations, expected cultural pressure, and assumed legality, which together amount to over 4.5 million annotations of categorical labels and free-text descriptions. Comprehensive empirical results based on state-of-the-art neural models demonstrate that computational modeling of social norms is a promising research direction. Our model framework, Neural Norm Transformer, learns and generalizes SOCIAL-CHEM-101 to successfully reason about previously unseen situations, generating relevant (and potentially novel) attribute-aware social rules-of-thumb.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.48,38939338 +main.3453,Design Challenges in Low-resource Cross-lingual Entity Linking,Xingyu Fu|Weijia Shi|Xiaodong Yu|Zian Zhao|Dan Roth,"Cross-lingual Entity Linking (XEL), the problem of grounding mentions of entities in a foreign language text into an English knowledge base such as Wikipedia, has seen a lot of research in recent years, with a range of promising techniques. 
However, current techniques do not rise to the challenges introduced by text in low-resource languages (LRL) and, surprisingly, fail to generalize to text not taken from Wikipedia, on which they are usually trained. This paper provides a thorough analysis of low-resource XEL techniques, focusing on the key step of identifying candidate English Wikipedia titles that correspond to a given foreign language mention. Our analysis indicates that current methods are limited by their reliance on Wikipedia’s interlanguage links and thus suffer when the foreign language’s Wikipedia is small. We conclude that the LRL setting requires the use of outside-Wikipedia cross-lingual resources and present a simple yet effective zero-shot XEL system, QuEL, that utilizes search engines query logs. With experiments on 25 languages, QuEL shows an average increase of 25% in gold candidate recall and of 13% in end-to-end linking accuracy over state-of-the-art baselines.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.521,38939339 +main.3454,An Empirical Study of Hyperbole,Li Kong|Chuanyi Li|Jidong Ge|Bin Luo|Vincent Ng,"While hyperbole is one of the most prevalent rhetorical devices, it is arguably one of the least studied devices in the figurative language processing community. We contribute to the study of hyperbole by (1) creating a corpus focusing on sentence-level hyperbole detection, (2) performing a statistical and manual analysis of our corpus, and (3) addressing the automatic hyperbole detection task.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.571,38939340 +main.3457,COD3S: Diverse Generation with Discrete Semantic Signatures,Nathaniel Weir|João Sedoc|Benjamin Van Durme,"We present COD3S, a novel method for generating semantically diverse sentences using neural sequence-to-sequence (seq2seq) models. Conditioned on an input, seq2seqs typically produce semantically and syntactically homogeneous sets of sentences and thus perform poorly on one-to-many sequence generation tasks. Our two-stage approach improves output diversity by conditioning generation on locality-sensitive hash (LSH)-based semantic sentence codes whose Hamming distances highly correlate with human judgments of semantic textual similarity. Though it is generally applicable, we apply \method{} to causal generation, the task of predicting a proposition's plausible causes or effects. We demonstrate through automatic and human evaluation that responses produced using our method exhibit improved diversity without degrading task performance.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.421,38939341 +main.3462,Deep Weighted MaxSAT for Aspect-based Opinion Extraction,Meixi Wu|Wenya Wang|Sinno Jialin Pan,"Though deep learning has achieved significant success in various NLP tasks, most deep learning models lack the capability of encoding explicit domain knowledge to model complex causal relationships among different types of variables. On the other hand, logic rules offer a compact expression to represent the causal relationships to guide the training process. Logic programs can be cast as a satisfiability problem which aims to find truth assignments to logic variables by maximizing the number of satisfiable clauses (MaxSAT). 
We adopt the MaxSAT semantics to model logic inference process and smoothly incorporate a weighted version of MaxSAT that connects deep neural networks and a graphical model in a joint framework. The joint model feeds deep learning outputs to a weighted MaxSAT layer to rectify the erroneous predictions and can be trained via end-to-end gradient descent. Our proposed model associates the benefits of high-level feature learning, knowledge reasoning, and structured learning with observable performance gain for the task of aspect-based opinion extraction.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.453,38939342 +main.3464,Learning to Fuse Sentences with Transformers for Summarization,Logan Lebanoff|Franck Dernoncourt|Doo Soon Kim|Lidan Wang|Walter Chang|Fei Liu,"The ability to fuse sentences is highly attractive for summarization systems because it is an essential step to produce succinct abstracts. However, to date, summarizers can fail on fusing sentences. They tend to produce few summary sentences by fusion or generate incorrect fusions that lead the summary to fail to retain the original meaning. In this paper, we explore the ability of Transformers to fuse sentences and propose novel algorithms to enhance their ability to perform sentence fusion by leveraging the knowledge of points of correspondence between sentences. Through extensive experiments, we investigate the effects of different design choices on Transformer's performance. Our findings highlight the importance of modeling points of correspondence between sentences for effective sentence fusion.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.338,38939343 +main.3470,Learning from Task Descriptions,Orion Weller|Nicholas Lourie|Matt Gardner|Matthew Peters,"Typically, machine learning systems solve new tasks by training on thousands of examples. In contrast, humans can solve new tasks by reading some instructions, with perhaps an example or two. To take a step toward closing this gap, we introduce a framework for developing NLP systems that solve new tasks after reading their descriptions, synthesizing prior work in this area. We instantiate this frame- work with a new English language dataset, ZEST, structured for task-oriented evaluation on unseen tasks. Formulating task descriptions as questions, we ensure each is general enough to apply to many possible inputs, thus comprehensively evaluating a model’s ability to solve each task. Moreover, the dataset’s structure tests specific types of systematic generalization. We find that the state-of-the-art T5 model achieves a score of 12% on ZEST, leaving a significant challenge for NLP researchers.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.105,38939344 +main.3483,PALM: Pre-training an Autoencoding&autoregressive Language Model for Context-conditioned Generation,Bin Bi|Chenliang Li|Chen Wu|Ming Yan|Wei Wang|Songfang Huang|Fei Huang|Luo Si,"Self-supervised pre-training, such as BERT, MASS and BART, has emerged as a powerful technique for natural language understanding and generation. Existing pre-training techniques employ autoencoding and/or autoregressive objectives to train Transformer-based models by recovering original word tokens from corrupted text with some masked tokens. 
The training goals of existing techniques are often inconsistent with the goals of many language generation tasks, such as generative question answering and conversational response generation, for producing new text given context. This work presents PALM with a novel scheme that jointly pre-trains an autoencoding and autoregressive language model on a large unlabeled corpus, specifically designed for generating new text conditioned on context. The new scheme alleviates the mismatch introduced by the existing denoising scheme between pre-training and fine-tuning where generation is more than reconstructing original text. An extensive set of experiments show that PALM achieves new state-of-the-art results on a variety of language generation benchmarks covering generative question answering (Rank 1 on the official MARCO leaderboard), abstractive summarization on CNN/DailyMail as well as Gigaword, question generation on SQuAD, and conversational response generation on Cornell Movie Dialogues.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.700,38939345 +main.3486,An Empirical Investigation of Contextualized Number Prediction,Taylor Berg-Kirkpatrick|Daniel Spokoyny,"We conduct a large scale empirical investigation of contextualized number prediction in running text. Specifically, we consider two tasks: (1) masked number prediction -- predicting a missing numerical value within a sentence, and (2) numerical anomaly detection -- detecting an errorful numeric value within a sentence. We experiment with novel combinations of contextual encoders and output distributions over the real number line. Specifically, we introduce a suite of output distribution parameterizations that incorporate latent variables to add expressivity and better fit the natural distribution of numeric values in running text, and combine them with both recurrent and transformer-based encoder architectures. We evaluate these models on two numeric datasets in the financial and scientific domain. Our findings show that output distributions that incorporate discrete latent variables and allow for multiple modes outperform simple flow-based counterparts on all datasets, yielding more accurate numerical prediction and anomaly detection. We also show that our models effectively utilize textual context and benefit from general-purpose unsupervised pretraining.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.385,38939346 +main.349,ChiTeSQL: A Large-Scale and Pragmatic Chinese Text-to-SQL Dataset,Lijie Wang|Ao Zhang|Kun Wu|Ke Sun|Zhenghua Li|Hua Wu|Min Zhang|Haifeng Wang,"Due to the lack of labeled data, previous research on text-to-SQL parsing mainly focuses on English. Representative English datasets include ATIS, WikiSQL, Spider, etc. This paper presents DuSQL, a large-scale and pragmatic Chinese dataset for the cross-domain text-to-SQL task, containing 200 databases, 813 tables, and 23,797 question/SQL pairs. Our new dataset has three major characteristics. First, by manually analyzing questions from several representative applications, we try to figure out the true distribution of SQL queries in real-life needs. Second, DuSQL contains a considerable proportion of SQL queries involving row or column calculations, motivated by our analysis on the SQL query distributions. Finally, we adopt an effective data construction framework via human-computer collaboration. The basic idea is automatically generating SQL queries based on the SQL grammar and constrained by the given database. 
This paper describes in detail the construction process and data statistics of DuSQL. Moreover, we present and compare performance of several open-source text-to-SQL parsers with minor modification to accommodate Chinese, including a simple yet effective extension to IRNet for handling calculation SQL queries.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.562,38938688 +main.3495,Neural Conversational QA: Learning to Reason vs Exploiting Patterns,Nikhil Verma|Abhishek Sharma|Dhiraj Madan|Danish Contractor|Harshit Kumar|Sachindra Joshi,"Neural Conversational QA tasks such as ShARC require systems to answer questions based on the contents of a given passage. On studying recent state-of-the-art models on the ShARC QA task, we found indications that the model(s) learn spurious clues/patterns in the data-set. Further, a heuristic-based program, built to exploit these patterns, had comparative performance to that of the neural models. In this paper we share our findings about the four types of patterns in the ShARC corpus and how the neural models exploit them. Motivated by the above findings, we create and share a modified data-set that has fewer spurious patterns than the original data-set, consequently allowing models to learn better.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.589,38939347 +main.3496,Form2Seq : A Framework for Higher-Order Form Structure Extraction,Milan Aggarwal|Hiresh Gupta|Mausoom Sarkar|Balaji Krishnamurthy,"Document structure extraction has been a widely researched area for decades with recent works performing it as a semantic segmentation task over document images using fully-convolution networks. Such methods are limited by image resolution due to which they fail to disambiguate structures in dense regions which appear commonly in forms. To mitigate this, we propose Form2Seq, a novel sequence-to-sequence (Seq2Seq) inspired framework for structure extraction using text, with a specific focus on forms, which leverages relative spatial arrangement of structures. We discuss two tasks; 1) Classification of low-level constituent elements (TextBlock and empty fillable Widget) into ten types such as field captions, list items, and others; 2) Grouping lower-level elements into higher-order constructs, such as Text Fields, ChoiceFields and ChoiceGroups, used as information collection mechanism in forms. To achieve this, we arrange the constituent elements linearly in natural reading order, feed their spatial and textual representations to Seq2Seq framework, which sequentially outputs prediction of each element depending on the final task. We modify Seq2Seq for grouping task and discuss improvements obtained through cascaded end-to-end training of two tasks versus training in isolation. Experimental results show the effectiveness of our text-based approach achieving an accuracy of 90% on classification task and an F1 of 75.82, 86.01, 61.63 on groups discussed above respectively, outperforming segmentation baselines. 
Further, we show our framework achieves state-of-the-art results for table structure recognition on the ICDAR 2013 dataset.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.314,38939348 +main.3497,A Predicate-Function-Argument Annotation of Natural Language for Open-Domain Information Expression,Mingming Sun|Wenyue Hua|Zoey Liu|Xin Wang|Kangjie Zheng|Ping Li,"Existing OIE (Open Information Extraction) algorithms are independent of each other such that there exist lots of redundant works; the featured strategies are not reusable and not adaptive to new tasks. This paper proposes a new pipeline to build OIE systems, where an Open-domain Information eXpression (OIX) task is proposed to provide a platform for all OIE strategies. The OIX is an OIE friendly expression of a sentence without information loss. The generation procedure of OIX contains shared works of OIE algorithms so that OIE strategies can be developed on the platform of OIX as inference operations focusing on more critical problems. Based on the same platform of OIX, the OIE strategies are reusable, and people can select a set of strategies to assemble their algorithm for a specific task so that the adaptability may be significantly increased. This paper focuses on the task of OIX and proposes a solution -- Open Information Annotation (OIA). OIA is a predicate-function-argument annotation for sentences. We label a data set of sentence-OIA pairs and propose a dependency-based rule system to generate OIA annotations from sentences. The evaluation results reveal that learning the OIA from a sentence is a challenge owing to the complexity of natural language sentences, and it is worthy of attracting more attention from the research community.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.167,38939349 +main.3504,Towards Understanding Sample Variance in Visually Grounded Language Generation: Evaluations and Observations,Wanrong Zhu|Xin Wang|Pradyumna Narayana|Kazoo Sone|Sugato Basu|William Yang Wang,"A major challenge in visually grounded language generation is to build robust benchmark datasets and models that can generalize well in real-world settings. To do this, it is critical to ensure that our evaluation protocols are correct, and benchmarks are reliable. In this work, we set forth to design a set of experiments to understand an important but often ignored problem in visually grounded language generation: given that humans have different utilities and visual attention, how will the sample variance in multi-reference datasets affect the models' performance? Empirically, we study several multi-reference datasets and corresponding vision-and-language tasks. We show that it is of paramount importance to report variance in experiments; that human-generated references could vary drastically in different datasets/tasks, revealing the nature of each task; that metric-wise, CIDEr has shown systematically larger variances than others. Our evaluations on reference-per-instance shed light on the design of reliable datasets in the future.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.708,38939350 +main.3506,AutoQA: From Databases to Q&A Semantic Parsers with Only Synthetic Training Data,Silei Xu|Sina Semnani|Giovanni Campagna|Monica Lam,"We propose AutoQA, a methodology and toolkit to generate semantic parsers that answer questions on databases, with no manual effort.
Given a database schema and its data, AutoQA automatically generates a large set of high-quality questions for training that covers different database operations. It uses automatic paraphrasing combined with template-based parsing to find alternative expressions of an attribute in different parts of speech. It also uses a novel filtered auto-paraphraser to generate correct paraphrases of entire sentences. We apply AutoQA to the Schema2QA dataset and obtain an average logical form accuracy of 62.9% when tested on natural questions, which is only 6.4% lower than a model trained with expert natural language annotations and paraphrase data collected from crowdworkers. To demonstrate the generality of AutoQA, we also apply it to the Overnight dataset. AutoQA achieves 69.8% answer accuracy, 16.4% higher than the state-of-the-art zero-shot models and only 5.2% lower than the same model trained with human data.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.31,38939351 +main.3507,Mention Extraction and Linking for SQL Query Generation,Jianqiang Ma|ZEYU YAN|Shuai Pang|Yang Zhang|Jianping Shen,"On the WikiSQL benchmark, state-of-the-art text-to-SQL systems typically take a slot- filling approach by building several dedicated models for each type of slots. Such modularized systems are not only complex but also of limited capacity for capturing inter-dependencies among SQL clauses. To solve these problems, this paper proposes a novel extraction-linking approach, where a unified extractor recognizes all types of slot mentions appearing in the question sentence before a linker maps the recognized columns to the table schema to generate executable SQL queries. Trained with automatically generated annotations, the proposed method achieves the first place on the WikiSQL benchmark.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.563,38939352 +main.3513,On the Weak Link between Importance and Prunability of Attention Heads,Aakriti Budhraja|Madhura Pande|Preksha Nema|Pratyush Kumar|Mitesh M. Khapra,"Given the success of Transformer-based models, two directions of study have emerged: interpreting role of individual attention heads and down-sizing the models for efficiency. Our work straddles these two streams: We analyse the importance of basing pruning strategies on the interpreted role of the attention heads. We evaluate this on Transformer and BERT models on multiple NLP tasks. Firstly, we find that a large fraction of the attention heads can be randomly pruned with limited effect on accuracy. Secondly, for Transformers, we find no advantage in pruning attention heads identified to be important based on existing studies that relate importance to the location of a head. On the BERT model too we find no preference for top or bottom layers, though the latter are reported to have higher importance. However, strategies that avoid pruning middle layers and consecutive layers perform better. Finally, during fine-tuning the compensation for pruned attention heads is roughly equally distributed across the un-pruned heads. Our results thus suggest that interpretation of attention heads does not strongly inform pruning.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.260,38939353 +main.3517,Efficient One-Pass End-to-End Entity Linking for Questions,Belinda Z. 
Li|Sewon Min|Srinivasan Iyer|Yashar Mehdad|Wen-tau Yih,"We present ELQ, a fast end-to-end entity linking model for questions, which uses a biencoder to jointly perform mention detection and linking in one pass. Evaluated on WebQSP and GraphQuestions with extended annotations that cover multiple entities per question, ELQ outperforms the previous state of the art by a large margin of +12.7% and +19.6% F1, respectively. With a very fast inference time (1.57 examples/s on a single CPU), ELQ can be useful for downstream question answering systems. In a proof-of-concept experiment, we demonstrate that using ELQ significantly improves the downstream QA performance of GraphRetriever.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.522,38939354 +main.3519,Recurrent Interaction Network for Jointly Extracting Entities and Classifying Relations,Kai Sun|Richong Zhang|Samuel Mensah|Yongyi Mao|xudong Liu,"The idea of using multi-task learning approaches to address the joint extraction of entity and relation is motivated by the relatedness between the entity recognition task and the relation classification task. Existing methods using multi-task learning techniques to address the problem learn interactions among the two tasks through a shared network, where the shared information is passed into the task-specific networks for prediction. However, such an approach hinders the model from learning explicit interactions between the two tasks to improve the performance on the individual tasks. As a solution, we design a multi-task learning model which we refer to as recurrent interaction network which allows the learning of interactions dynamically, to effectively model task-specific features for classification. Empirical studies on two real-world datasets confirm the superiority of the proposed model.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.304,38939355 +main.3529,Towards Interpreting BERT for Reading Comprehension Based QA,Sahana Ramnath|Preksha Nema|Deep Sahni|Mitesh M. Khapra,"BERT and its variants have achieved state-of-the-art performance in various NLP tasks. Since then, various works have been proposed to analyze the linguistic information being captured in BERT. However, the current works do not provide an insight into how BERT is able to achieve near human-level performance on the task of Reading Comprehension based Question Answering. In this work, we attempt to interpret BERT for RCQA. Since BERT layers do not have predefined roles, we define a layer's role or functionality using Integrated Gradients. Based on the defined roles, we perform a preliminary analysis across all layers. We observed that the initial layers focus on query-passage interaction, whereas later layers focus more on contextual understanding and enhancing the answer prediction. Specifically for quantifier questions (how much/how many), we notice that BERT focuses on confusing words (i.e., on other numerical quantities in the passage) in the later layers, but still manages to predict the answer correctly. 
The fine-tuning and analysis scripts will be publicly available at https://github.com/iitmnlp/BERT-Analysis-RCQA.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.261,38939356 +main.353,Type B Reflexivization as an Unambiguous Testbed for Multilingual Multi-Task Gender Bias,Ana Valeria González|Maria Barrett|Rasmus Hvingelby|Kellie Webster|Anders Søgaard,"The one-sided focus on English in previous studies of gender bias in NLP misses out on opportunities in other languages: English challenge datasets such as GAP and WinoGender highlight model preferences that are ""hallucinatory"", e.g., disambiguating gender-ambiguous occurrences of 'doctor' as male doctors. We show that for languages with type B reflexivization, e.g., Swedish and Russian, we can construct multi-task challenge datasets for detecting gender bias that lead to unambiguously wrong model predictions: In these languages, the direct translation of 'the doctor removed his mask' is not ambiguous between a coreferential reading and a disjoint reading. Instead, the coreferential reading requires a non-gendered pronoun, and the gendered, possessive pronouns are anti-reflexive. We present a multilingual, multi-task challenge dataset, which spans four languages and four NLP tasks and focuses only on this phenomenon. We find evidence for gender bias across all task-language combinations and correlate model bias with national labor market statistics.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.209,38938689 +main.3532,Can Emojis Convey Human Emotions? A Study to Understand the Association between Emojis and Emotions,Abu Awal Md Shoeb|Gerard de Melo,"Given the growing ubiquity of emojis in language, there is a need for methods and resources that shed light on their meaning and communicative role. One conspicuous aspect of emojis is their use to convey affect in ways that may otherwise be non-trivial to achieve. In this paper, we seek to explore the connection between emojis and emotions by means of a new dataset consisting of human-solicited association ratings. We additionally conduct experiments to assess to what extent such associations can be inferred from existing data in an unsupervised manner. Our experiments show that this succeeds when high-quality word-level information is available.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.720,38939357 +main.3540,Combining Self-Training and Self-Supervised Learning for Unsupervised Disfluency Detection,Shaolei Wang|Zhongyuan Wang|Wanxiang Che|Ting Liu,"Most existing approaches to disfluency detection heavily rely on human-annotated corpora, which is expensive to obtain in practice. There have been several proposals to alleviate this issue with, for instance, self-supervised learning techniques, but they still require human-annotated corpora. In this work, we explore the unsupervised learning paradigm which can potentially work with unlabeled text corpora that are cheaper and easier to obtain. Our model builds upon the recent work on Noisy Student Training, a semi-supervised learning approach that extends the idea of self-training. Experimental results on the commonly used English Switchboard test set show that our approach achieves competitive performance compared to the previous state-of-the-art supervised systems using contextualized word embeddings (e.g. 
BERT and ELECTRA).",,Speech and Multimodality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.142,38939358 +main.3541,Modularized Transfomer-based Ranking Framework,Luyu Gao|Zhuyun Dai|Jamie Callan,"Recent innovations in Transformer-based ranking models have advanced the state-of-the-art in information retrieval. However, these Transformers are computationally expensive, and their opaque hidden states make it hard to understand the ranking process. In this work, we modularize the Transformer ranker into separate modules for text representation and interaction. We show how this design enables substantially faster ranking using offline pre-computed representations and light-weight online interactions. The modular design is also easier to interpret and sheds light on the ranking process in Transformer rankers.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.342,38939359 +main.3543,Analyzing Redundancy in Pretrained Transformer Models,Fahim Dalvi|Hassan Sajjad|Nadir Durrani|Yonatan Belinkov,"Transformer-based deep NLP models are trained using hundreds of millions of parameters, limiting their applicability in computationally constrained environments. In this paper, we study the cause of these limitations by defining a notion of Redundancy, which we categorize into two classes: General Redundancy and Task-specific Redundancy. We dissect two popular pretrained models, BERT and XLNet, studying how much redundancy they exhibit at a representation-level and at a more fine-grained neuron-level. Our analysis reveals interesting insights, such as i) 85% of the neurons across the network are redundant and ii) at least 92% of them can be removed when optimizing towards a downstream task. Based on our analysis, we present an efficient feature-based transfer learning procedure, which maintains 97% performance while using at-most 10% of the original neurons.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.398,38939360 +main.3544,Semantic Evaluation for Text-to-SQL with Distilled Test Suite,Ruiqi Zhong|Tao Yu|Dan Klein,"We propose test suite accuracy to approximate semantic accuracy for Text-to-SQL models. Our method distills a small test suite of databases that achieves high code coverage for the gold query from a large number of randomly generated databases. At evaluation time, it computes the denotation accuracy of the predicted queries on the distilled test suite, hence calculating a tight upper-bound for semantic accuracy efficiently. We use our proposed method to evaluate 21 models submitted to the Spider leader board and manually verify that our method is always correct on 100 examples. In contrast, the current Spider metric leads to a 2.5% false negative rate on average and 8.1% in the worst case, indicating that test suite accuracy is needed. Our implementation, along with distilled test suites for eleven Text-to-SQL datasets, is publicly available.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.29,38939361 +main.355,VD-BERT: A Unified Vision and Dialog Transformer with BERT,Yue Wang|Shafiq Joty|Michael Lyu|Irwin King|Caiming Xiong|Steven C.H. Hoi,"Visual dialog is a challenging vision-language task, where a dialog agent needs to answer a series of questions through reasoning on the image content and dialog history. 
Prior work has mostly focused on various attention mechanisms to model such intricate interactions. By contrast, in this work, we propose VD-BERT, a simple yet effective framework of unified vision-dialog Transformer that leverages the pretrained BERT language models for Visual Dialog tasks. The model is unified in that (1) it captures all the interactions between the image and the multi-turn dialog using a single-stream Transformer encoder, and (2) it supports both answer ranking and answer generation seamlessly through the same architecture. More crucially, we adapt BERT for the effective fusion of vision and dialog contents via visually grounded training. Without the need of pretraining on external vision-language data, our model yields new state of the art, achieving the top position in both single-model and ensemble settings (74.54 and 75.35 NDCG scores) on the visual dialog leaderboard. Our code and pretrained models are released at https://github.com/salesforce/VD-BERT.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.269,38938690 +main.3550,RNNs Can Generate Bounded Hierarchical Languages with Optimal Memory,John Hewitt|Michael Hahn|Surya Ganguli|Percy Liang|Christopher D. Manning,"Recurrent neural networks empirically generate natural language with high syntactic fidelity. However, their success is not well-understood theoretically. We provide theoretical insight into this success, proving in a finite-precision setting that RNNs can efficiently generate bounded hierarchical languages that reflect the scaffolding of natural language syntax. We introduce Dyck-$(k,m)$, the language of well-nested brackets (of $k$ types) and $m$-bounded nesting depth, reflecting the bounded memory needs and long-distance dependencies of natural language syntax. The best known results use $O(k^{\frac{m}{2}})$ memory (hidden units) to generate these languages. We prove that an RNN with $O(m \log k)$ hidden units suffices, an exponential reduction in memory, by an explicit construction. Finally, we show that no algorithm, even with unbounded computation, can suffice with $o(m \log k)$ hidden units.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.156,38939362 +main.3551,Beyond Geolocation: Micro-Dialect Identification in Diaglossic and Code-Switched Environments,Muhammad Abdul-Mageed|Chiyu Zhang|AbdelRahim Elmadany|Lyle Ungar,"Although prediction of dialects is an important language processing task, with a wide range of applications, existing work is largely limited to coarse-grained varieties. Inspired by geolocation research, we propose the novel task of Micro-Dialect Identification (MDI) and introduce MARBERT, a new language model with striking abilities to predict a fine-grained variety (as small as that of a city) given a single, short message. For modeling, we offer a range of novel spatially and linguistically-motivated multi-task learning models. To showcase the utility of our models, we introduce a new, large-scale dataset of Arabic micro-varieties (low-resource) suited to our tasks. MARBERT predicts micro-dialects with 9.9% F1, 76x better than a majority class baseline.
Our new language model also establishes new state-of-the-art on several external tasks.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.472,38939363 +main.3552,Re-evaluating Evaluation in Text Summarization,Manik Bhandari|Pranav Narayan Gour|Atabak Ashfaq|Pengfei Liu|Graham Neubig,"Automated evaluation metrics as a stand-in for manual evaluation are an essential part of the development of text-generation tasks such as text summarization. However, while the field has progressed, our standard metrics have not -- for nearly 20 years ROUGE has been the standard evaluation in most summarization papers. In this paper, we make an attempt to re-evaluate the evaluation method for text summarization: assessing the reliability of automatic metrics using top-scoring system outputs, both abstractive and extractive, on recently popular datasets for both system-level and summary-level evaluation settings. We find that conclusions about evaluation metrics on older datasets do not necessarily hold on modern datasets and systems. We release a dataset of human judgments that are collected from 25 top-scoring neural summarization systems (14 abstractive and 11 extractive).",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.751,38939364 +main.3563,Measuring the Similarity of Grammatical Gender Systems by Comparing Partitions,Arya D. McCarthy|Adina Williams|Shijia Liu|David Yarowsky|Ryan Cotterell,"A grammatical gender system divides a lexicon into a small number of relatively fixed grammatical categories. How similar are these gender systems across languages? To quantify the similarity, we define gender systems extensionally, thereby reducing the problem of comparisons between languages' gender systems to cluster evaluation. We borrow a rich inventory of statistical tools for cluster evaluation from the field of community detection (Driver and Kroeber, 1932; Cattell, 1945), that enable us to craft novel information theoretic metrics for measuring similarity between gender systems. We first validate our metrics, then use them to measure gender system similarity in 20 languages. We then ask whether our gender system similarities alone are sufficient to reconstruct historical relationships between languages. Towards this end, we make phylogenetic predictions on the popular, but thorny, problem from historical linguistics of inducing a phylogenetic tree over extant Indo-European languages. Of particular interest, languages on the same branch of our phylogenetic tree are notably similar, whereas languages from separate branches are no more similar than chance.",,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.emnlp-main.456,38939365 +main.3566,Dats Wassup!!: Investigating African-American Vernacular English in Transformer-Based Text Generation,Sophie Groenwold|Lily Ou|Aesha Parekh|Samhita Honnavalli|Sharon Levy|Diba Mirza|William Yang Wang,"The growth of social media has encouraged the written use of African American Vernacular English (AAVE), which has traditionally been used only in oral contexts. However, NLP models have historically been developed using dominant English varieties, such as Standard American English (SAE), due to text corpora availability. We investigate the performance of GPT-2 on AAVE text by creating a dataset of intent-equivalent parallel AAVE/SAE tweet pairs, thereby isolating syntactic structure and AAVE- or SAE-specific language for each pair.
We evaluate each sample and its GPT-2 generated text with pretrained sentiment classifiers and find that while AAVE text results in more classifications of negative sentiment than SAE, the use of GPT-2 generally increases occurrences of positive sentiment for both. Additionally, we conduct human evaluation of AAVE and SAE text generated with GPT-2 to compare contextual rigor and overall quality.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.473,38939366 +main.3567,Multi-Stage Pre-training for Automated Chinese Essay Scoring,Wei Song|Kai Zhang|Ruiji Fu|Lizhen Liu|Ting Liu|Miaomiao Cheng,"This paper proposes a pre-training based automated Chinese essay scoring method. The method involves three components: weakly supervised pre-training, supervised cross-prompt fine-tuning and supervised target-prompt fine-tuning. An essay scorer is first pre-trained on a large essay dataset covering diverse topics and with coarse ratings, i.e., good and poor, which are used as a kind of weak supervision. The pre-trained essay scorer would be further fine-tuned on previously rated essays from existing prompts, which have the same score range with the target prompt and provide extra supervision. At last, the scorer is fine-tuned on the target-prompt training data. The evaluation on four prompts shows that this method can improve a state-of-the-art neural essay scorer in terms of effectiveness and domain adaptation ability, while in-depth analysis also reveals its limitations.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.546,38939367 +main.357,Semantically Inspired AMR Alignment for the Portuguese Language,Rafael Anchiêta|Thiago Pardo,"Abstract Meaning Representation (AMR) is a graph-based semantic formalism where the nodes are concepts and edges are relations among them. Most of AMR parsing methods require alignment between the nodes of the graph and the words of the sentence. However, this alignment is not provided by manual annotations and available automatic aligners focus only on the English language, not performing well for other languages. Aiming to fulfill this gap, we developed an alignment method for the Portuguese language based on a more semantically matched word-concept pair. We performed both intrinsic and extrinsic evaluations and showed that our alignment approach outperforms the alignment strategies developed for English, improving AMR parsers, and achieving competitive results with a parser designed for the Portuguese language.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.123,38938691 +main.3573,To Schedule or Not to Schedule: Extracting Task Specific Temporal Entities and Associated Negation Constraints,Barun Patra|Chala Fufa|Pamela Bhattacharya|Charles Lee,"State of the art research for date-time\footnote{We use date-time entities, date entities, time entities and temporal entities interchangeably to denote entities associated with dates and/or times.} entity extraction from text is task agnostic. Consequently, while the methods proposed in literature perform well for generic date-time extraction from texts, they don’t fare as well on task specific date-time entity extraction where only a subset of the date-time entities present in the text are pertinent to solving the task. Furthermore, some tasks require identifying negation constraints associated with the date-time entities to correctly reason over time.
We showcase a novel model for extracting task-specific date-time entities along with their negation constraints. We show the efficacy of our method on the task of date-time understanding in the context of scheduling meetings for an email-based digital AI scheduling assistant. Our method achieves an absolute gain of 19% f-score points compared to baseline methods in detecting the date-time entities relevant to scheduling meetings and a 4% improvement over baseline methods for detecting negation constraints over date-time entities.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.678,38939368 +main.3579,Unified Feature and Instance Based Domain Adaptation for End-to-End Aspect-based Sentiment Analysis,Chenggong Gong|Jianfei Yu|Rui Xia,"The supervised models for aspect-based sentiment analysis (ABSA) rely heavily on labeled data. However, fine-grained labeled data are scarce for the ABSA task. To alleviate the dependence on labeled data, prior works mainly focused on feature-based adaptation, which used the domain-shared knowledge to construct auxiliary tasks or domain adversarial learning to bridge the gap between domains, while ignored the attribute of instance-based adaptation. To resolve this limitation, we propose an end-to-end framework to jointly perform feature and instance based adaptation for the ABSA task in this paper. Based on BERT, we learn domain-invariant feature representations by using part-of-speech features and syntactic dependency relations to construct auxiliary tasks, and jointly perform word-level instance weighting in the framework of sequence labeling. Experiment results on four benchmarks show that the proposed method can achieve significant improvements in comparison with the state-of-the-arts in both tasks of cross-domain End2End ABSA and cross-domain aspect extraction.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.572,38939369 +main.3580,MovieChats: Chat like Humans in a Closed Domain,Hui Su|Xiaoyu Shen|Zhou Xiao|Zheng Zhang|Ernie Chang|Cheng Zhang|Cheng Niu|Jie Zhou,"Being able to perform in-depth chat with humans in a closed domain is a precondition before an open-domain chatbot can be ever claimed. In this work, we take a close look at the movie domain and present a large-scale high-quality corpus with fine-grained annotations in hope of pushing the limit of movie-domain chatbots. We propose a unified, readily scalable neural approach which reconciles all subtasks like intent prediction and knowledge retrieval. The model is first pretrained on the huge general-domain data, then finetuned on our corpus. We show this simple neural approach trained on high-quality data is able to outperform commercial systems replying on complex rules. On both the static and interactive tests, we find responses generated by our system exhibits remarkably good engagement and sensibleness close to human-written ones. We further analyze the limits of our work and point out potential directions for future work",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.535,38939370 +main.3581,Summarizing Text on Any Aspects: A Knowledge-Informed Weakly-Supervised Approach,Bowen Tan|Lianhui Qin|Eric Xing|Zhiting Hu,"Given a document and a target aspect (e.g., a topic of interest), aspect-based abstractive summarization attempts to generate a summary with respect to the aspect. 
Previous studies usually assume a small pre-defined set of aspects and fall short of summarizing on other diverse topics. In this work, we study summarizing on \emph{arbitrary} aspects relevant to the document, which significantly expands the application of the task in practice. Due to the lack of supervision data, we develop a new weak supervision construction method and an aspect modeling scheme, both of which integrate rich external knowledge sources such as ConceptNet and Wikipedia. Experiments show our approach achieves performance boosts on summarizing both real and synthetic documents given pre-defined or arbitrary aspects.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.510,38939371 +main.359,Gone at Last: Removing the Hypothesis-Only Bias in Natural Language Inference via Ensemble Adversarial Training,Joe Stacey|Pasquale Minervini|Haim Dubossarsky|Sebastian Riedel|Tim Rocktäschel,"Natural Language Inference (NLI) datasets contain annotation artefacts resulting in spurious correlations between the natural language utterances and their respective entailment classes. These artefacts are exploited by neural networks even when only considering the hypothesis and ignoring the premise, leading to unwanted biases. Belinkov et al. (2019b) proposed tackling this problem via adversarial training, but this can lead to learned sentence representations that still suffer from the same biases. We show that the bias can be reduced in the sentence representations by using an ensemble of adversaries, encouraging the model to jointly decrease the accuracy of these different adversaries while fitting the data. This approach produces more robust NLI models, outperforming previous de-biasing efforts when generalised to 12 other NLI datasets (Belinkov et al., 2019a; Mahabadi et al., 2020). In addition, we find that the optimal number of adversarial classifiers depends on the dimensionality of the sentence representations, with larger sentence representations being more difficult to de-bias while benefiting from using a greater number of adversaries.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.665,38938692 +main.3593,Conundrums in Entity Reference Resolution,Jing Lu|Vincent Ng,"Despite the significant progress on entity coreference resolution observed in recent years, there is a general lack of understanding of what has been improved. We present an empirical analysis of state-of-the-art resolvers with the goal of providing the general NLP audience with a better understanding of the state of the art and coreference researchers with directions for future research.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.536,38939372 +main.3594,End-to-End Emotion-Cause Pair Extraction Based on Sliding Window Multi-Label Learning,Zixiang Ding|Rui Xia|Jianfei Yu,"Emotion-cause pair extraction (ECPE) is a new task that aims to extract the potential pairs of emotions and their corresponding causes in a document. The existing methods first perform emotion extraction and cause extraction independently, and then perform emotion-cause pairing and filtering. However, the above methods ignore the fact that the cause and the emotion it triggers are inseparable, and the extraction of the cause without specifying the emotion is pathological, which greatly limits the performance of the above methods in the first step. 
To tackle these shortcomings, we propose two joint frameworks for ECPE: 1) multi-label learning for the extraction of the cause clauses corresponding to the specified emotion clause (CMLL) and 2) multi-label learning for the extraction of the emotion clauses corresponding to the specified cause clause (EMLL). The window of multi-label learning is centered on the specified emotion clause or cause clause and slides as their positions move. Finally, CMLL and EMLL are integrated to obtain the final result. We evaluate our model on a benchmark emotion cause corpus, the results show that our approach achieves the best performance among all compared systems on the ECPE task.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.290,38939373 +main.3597,Localizing Q&A Semantic Parsers for Any Language in a Day,Mehrad Moradshahi|Giovanni Campagna|Sina Semnani|Silei Xu|Monica Lam,"We propose Semantic Parser Localizer (SPL), a toolkit that leverages Neural Machine Translation (NMT) systems to localize a semantic parser for a new language. Our methodology is to (1) generate training data automatically in the target language by augmenting machine-translated datasets with local entities scraped from public websites, (2) add a few-shot boost of human-translated sentences and train a novel XLMR-LSTM semantic parser, and (3) test the model on natural utterances curated using human translators. We assess the effectiveness of our approach by extending the current capabilities of Schema2QA, a system for English Question Answering (QA) on the open web, to 10 new languages for the restaurants and hotels domains. Our model achieves an overall test accuracy ranging between 61% and 69% for the hotels domain and between 64% and 78% for restaurants domain, which compares favorably to 69% and 80% obtained for English parser trained on gold English data and a few examples from validation set. We show our approach outperforms the previous state-of-the-art methodology by more than 30% for hotels and 40% for restaurants with localized ontologies for the subset of languages tested. Our methodology enables any software developer to add a new language capability to a QA system for a new domain, leveraging machine translation, in less than 24 hours. Our code is released open-source.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.481,38939374 +main.360,I Was Just Being Sarcastic! Reactive Supervision: A New Method for Collecting Sarcasm Data,Boaz Shmueli|Lun-Wei Ku|Soumya Ray,"Sarcasm detection is an important task in affective computing, requiring large amounts of labeled data. We introduce reactive supervision, a novel data collection method that utilizes the dynamics of online conversations to overcome the limitations of existing data collection techniques. We use the new method to create and release a first-of-its-kind large dataset of tweets with sarcasm perspective labels and new contextual features. The dataset is expected to advance sarcasm detection research. 
Our method can be adapted to other affective computing domains, thus opening up new research opportunities.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.201,38938693 +main.3609,An Unsupervised Sentence Embedding Method by Mutual Information Maximization,Yan Zhang|Ruidan He|ZUOZHU LIU|Kwan Hui Lim|Lidong Bing,"BERT is inefficient for sentence-pair tasks such as clustering or semantic search as it needs to evaluate combinatorially many sentence pairs which is very time-consuming. Sentence BERT (SBERT) attempted to solve this challenge by learning semantically meaningful representations of single sentences, such that similarity comparison can be easily accessed. However, SBERT is trained on corpus with high-quality labeled sentence pairs, which limits its application to tasks where labeled data is extremely scarce. In this paper, we propose a lightweight extension on top of BERT and a novel self-supervised learning objective based on mutual information maximization strategies to derive meaningful sentence embeddings in an unsupervised manner. Unlike SBERT, our method is not restricted by the availability of labeled data, such that it can be applied on different domain-specific corpus. Experimental results show that the proposed method significantly outperforms other unsupervised sentence embedding baselines on common semantic textual similarity (STS) tasks and downstream supervised tasks. It also outperforms SBERT in a setting where in-domain labeled data is not available, and achieves performance competitive with supervised methods on various tasks.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.124,38939375 +main.3617,Temporal Knowledge Base Completion: New Algorithms and Evaluation Protocols,Prachi Jain|Sushant Rathi|Mausam|Soumen Chakrabarti,"Research on temporal knowledge bases, which associate a relational fact (s,r,o) with a validity time period (or time instant), is in its early days. Our work considers predicting missing entities (link prediction) and missing time intervals (time prediction) as joint Temporal Knowledge Base Completion (TKBC) tasks, and presents TIMEPLEX, a novel TKBC method, in which entities, relations and, time are all embedded in a uniform, compatible space. TIMEPLEX exploits the recurrent nature of some facts/events and temporal interactions between pairs of relations, yielding state-of-the-art results on both prediction tasks. We also find that existing TKBC models heavily overestimate link prediction performance due to imperfect evaluation mechanisms. In response, we propose improved TKBC evaluation protocols for both link and time prediction tasks, dealing with subtle issues that arise from the partial overlap of time intervals in gold instances and system predictions.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.305,38939376 +main.362,Multi-Task Learning for Logically Dependent Tasks from the Perspective of Causal Inference,Wenqing Chen|Jidong Tian|Liqiang Xiao|Hao He|Yaohui Jin,"Previous studies have shown that hierarchical multi-task learning (MTL) can utilize task dependencies by stacking encoders and outperform democratic MTL. However, stacking encoders only considers the dependencies of feature representations and ignores the label dependencies in logically dependent tasks. Furthermore, how to properly utilize the labels remains an issue due to the cascading errors between tasks. 
In this paper, we view logically dependent MTL from the perspective of causal inference and suggest a mediation assumption instead of the confounding assumption in conventional MTL models. We propose a model including two key mechanisms: label transfer (LT) for each task to utilize the labels of all its lower-level tasks, and Gumbel sampling (GS) to deal with cascading errors. In the field of causal inference, GS in our model is essentially a counterfactual reasoning process, trying to estimate the causal effect between tasks and utilize it to improve MTL. We conduct experiments on two English datasets and one Chinese dataset. Experiment results show that our model achieves state-of-the-art on six out of seven subtasks and improves predictions' consistency.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.173,38938694 +main.3621,SlotRefine: A Fast Non-Autoregressive Model for Joint Intent Detection and Slot Filling,Di Wu|Liang Ding|Fan Lu|Jian Xie,"Slot filling and intent detection are two main tasks in spoken language understanding (SLU) system. In this paper, we propose a novel non-autoregressive model named SlotRefine for joint intent detection and slot filling. Besides, we design a novel two-pass iteration mechanism to handle the uncoordinated slots problem caused by conditional independence of non-autoregressive model. Experiments demonstrate that our model significantly outperforms previous models in slot filling task, while considerably speeding up the decoding (up to x10.77). In-depth analysis show that 1) pretraining schemes could further enhance our model; 2) two-pass mechanism indeed remedy the uncoordinated slots.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.152,38939377 +main.3635,On the Sentence Embeddings from BERT for Semantic Textual Similarity,Bohan Li|Hao Zhou|Junxian He|Mingxuan Wang|Yiming Yang|Lei Li,"Pre-trained contextual representations like BERT have achieved great success in natural language processing. However, the sentence embeddings from the pre-trained language models without fine-tuning have been found to poorly capture semantic meaning of sentences. In this paper, we argue that the semantic information in the BERT embeddings is not fully exploited. We first reveal the theoretical connection between the masked language model pre-training objective and the semantic similarity task theoretically, and then analyze the BERT sentence embeddings empirically. We find that BERT always induces a non-smooth anisotropic semantic space of sentences, which harms its performance of semantic similarity. To address this issue, we propose to transform the anisotropic sentence embedding distribution to a smooth and isotropic Gaussian distribution through normalizing flows that are learned with an unsupervised objective. Experimental results show that our proposed BERT-flow method obtains significant performance gains over the state-of-the-art sentence embeddings on a variety of semantic textual similarity tasks. The code is available at \url{https://github.com/bohanli/BERT-flow}.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.733,38939378 +main.3644,Weakly Supervised Learning of Nuanced Frames for Analyzing Polarization in News Media,Shamik Roy|Dan Goldwasser,"In this paper, we suggest a minimally supervised approach for identifying nuanced frames in news article coverage of politically divisive topics. 
We suggest to break the broad policy frames suggested by Boydstun et al., 2014 into fine-grained subframes which can capture differences in political ideology in a better way. We evaluate the suggested subframes and their embedding, learned using minimal supervision, over three topics, namely, immigration, gun-control, and abortion. We demonstrate the ability of the subframes to capture ideological differences and analyze political discourse in news media.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.620,38939379 +main.3646,Constrained Iterative Labeling for Open Information Extraction,Keshav Kolluru|Vaibhav Adlakha|Samarth Aggarwal|Mausam|Soumen Chakrabarti,"A recent state-of-the-art neural open information extraction (OpenIE) system generates extractions iteratively, requiring repeated encoding of partial outputs. This comes at a significant computational cost. On the other hand,sequence labeling approaches for OpenIE are much faster, but worse in extraction quality. In this paper, we bridge this trade-off by presenting an iterative labeling-based system that establishes a new state of the art for OpenIE, while extracting 10x faster. This is achieved through a novel Iterative Grid Labeling (IGL) architecture, which treats OpenIE as a 2-D grid labeling task. We improve its performance further by applying coverage (soft) constraints on the grid at training time. Moreover, on observing that the best OpenIE systems falter at handling coordination structures, our OpenIE system also incorporates a new coordination analyzer built with the same IGL architecture. This IGL based coordination analyzer helps our OpenIE system handle complicated coordination structures, while also establishing a new state of the art on the task of coordination analysis, with a 12.3 pts improvement in F1 over previous analyzers. Our OpenIE system - OpenIE6 - beats the previous systems by as much as 4 pts in F1, while being much faster.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.306,38939380 +main.3647,Pre-training of Mention Representations in Coreference Models,Yuval Varkel|Amir Globerson,"Collecting labeled data for coreference resolution is a challenging task, requiring skilled annotators. It is thus desirable to develop coreference resolution models that can make use of unlabeled data. Here we provide such an approach for the powerful class of neural coreference models. These models rely on representations of mentions, and we show these representations can be learned in a self-supervised manner towards improving resolution accuracy. We propose two self-supervised tasks that are closely related to coreference resolution and thus improve mention representation. Applying this approach to the GAP dataset results in new state of the arts results.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.687,38939381 +main.3648,Interpretable Multi-dataset Evaluation for Named Entity Recognition,Jinlan Fu|Pengfei Liu|Graham Neubig,"With the proliferation of models for natural language processing tasks, it is even harder to understand the differences between models and their relative merits. Simply looking at differences between holistic metrics such as accuracy, BLEU, or F1 does not tell us why or how particular methods perform differently and how diverse datasets influence the model design choices. 
In this paper, we present a general methodology for interpretable evaluation for the named entity recognition (NER) task. The proposed evaluation method enables us to interpret the differences in models and datasets, as well as the interplay between them, identifying the strengths and weaknesses of current systems. By making our analysis tool available, we make it easy for future researchers to run similar analyses and drive progress in this area: https://github.com/neulab/InterpretEval",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.489,38939382 +main.3651,A Knowledge-driven Generative Model for Multi-implication Chinese Medical Procedure Entity Normalization,Jinghui Yan|Yining Wang|Lu Xiang|Yu Zhou|Chengqing Zong,"Medical entity normalization, which links medical mentions in the text to entities in knowledge bases, is an important research topic in medical natural language processing. In this paper, we focus on Chinese medical procedure entity normalization. However, nonstandard Chinese expressions and combined procedures present challenges in our problem. The existing strategies relying on the discriminative model are poorly to cope with normalizing combined procedure mentions. We propose a sequence generative framework to directly generate all the corresponding medical procedure entities. we adopt two strategies: category-based constraint decoding and category-based model refining to avoid unrealistic results. The method is capable of linking entities when a mention contains multiple procedure concepts and our comprehensive experiments demonstrate that the proposed model can achieve remarkable improvements over existing baselines, particularly significant in the case of multi-implication Chinese medical procedures.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.116,38939383 +main.3656,Is Chinese Word Segmentation a Solved Task? Rethinking Neural Chinese Word Segmentation,Jinlan Fu|Pengfei Liu|Qi Zhang|Xuanjing Huang,"The performance of the Chinese Word Segmentation (CWS) systems has gradually reached a plateau with the rapid development of deep neural networks, especially the successful use of large pre-trained models. In this paper, we take stock of what we have achieved and rethink what's left in the CWS task. Methodologically, we propose a fine-grained evaluation for existing CWS systems, which not only allows us to diagnose the strengths and weaknesses of existing models (under the in-dataset setting), but enables us to quantify the discrepancy between different criterion and alleviate the negative transfer problem when doing multi-criteria learning. Strategically, despite not aiming to propose a novel model in this paper, our comprehensive experiments on eight models and seven datasets, as well as thorough analysis, could search for some promising direction for future research. We make all codes publicly available and release an interface that can quickly evaluate and diagnose user's models: https://github.com/neulab/InterpretEval",,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.emnlp-main.457,38939384 +main.3672,Few-shot Complex Knowledge Base Question Answering via Meta Reinforcement Learning,Yuncheng Hua|Yuan-Fang Li|Gholamreza Haffari|Guilin Qi|Tongtong Wu,"Complex question-answering (CQA) involves answering complex natural-language questions on a knowledge base (KB). 
However, the conventional neural program induction (NPI) approach exhibits uneven performance when the questions have different types, harboring inherently different characteristics, e.g., difficulty level. This paper proposes a meta-reinforcement learning approach to program induction in CQA to tackle the potential distributional bias in questions. Our method quickly and effectively adapts the meta-learned programmer to new questions based on the most similar questions retrieved from the training data. The meta-learned policy is then used to learn a good programming policy, utilizing the trial trajectories and their rewards for similar questions in the support set. Our method achieves state-of-the-art performance on the CQA dataset (Saha et al., 2018) while using only five trial trajectories for the top-5 retrieved questions in each support set, and meta-training on tasks constructed from only 1% of the training set. We have released our code at https://github.com/DevinJake/MRL-CQA.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.469,38939385 +main.3676,A Dual-generator Network for Text Style Transfer Applications,Xiao Li|Guanyi Chen|Chenghua Lin|Ruizhe Li,"We propose DGST, a novel and simple Dual-Generator network architecture for text Style Transfer. Our model employs two generators only, and does not rely on any discriminators or parallel corpus for training. Both quantitative and qualitative experiments on the Yelp and IMDb datasets show that our model gives competitive performance compared to several strong baselines with more complicated architecture designs.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.578,38939386 +main.3682,Re-examining the Role of Schema Linking in Text-to-SQL,Wenqiang Lei|Weixin Wang|Zhixin MA|Tian Gan|Wei Lu|Min-Yen Kan|Tat-Seng Chua,"In existing sophisticated text-to-SQL models, schema linking is often considered as a simple, minor component, belying its importance. By providing a schema linking corpus based on the Spider text-to-SQL dataset, we systematically study the role of schema linking. We also build a simple BERT-based baseline, called Schema-Linking SQL (SLSQL) to perform a data-driven study. We find when schema linking is done well, SLSQL demonstrates good performance on Spider despite its structural simplicity. Many remaining errors are attributable to corpus noise. This suggests schema linking is the crux for the current text-to-SQL task. Our analytic studies provide insights on the characteristics of schema linking for future developments of text-to-SQL tasks.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.564,38939387 +main.3688,Pre-training Multilingual Neural Machine Translation by Leveraging Alignment Information,Zehui Lin|Xiao Pan|Mingxuan Wang|Xipeng Qiu|Jiangtao Feng|Hao Zhou|Lei Li,"We investigate the following question for machine translation (MT): can we develop a single universal MT model to serve as the common seed and obtain derivative and improved models on arbitrary language pairs? We propose mRASP, an approach to pre-train a universal multilingual neural machine translation model. Our key idea in mRASP is its novel technique of random aligned substitution, which brings words and phrases with similar meanings across multiple languages closer in the representation space. We pre-train a mRASP model on 32 language pairs jointly with only public datasets. 
The model is then fine-tuned on downstream language pairs to obtain specialized MT models. We carry out extensive experiments on 42 translation directions across diverse settings, including low, medium, and rich resource, as well as transferring to exotic language pairs. Experimental results demonstrate that mRASP achieves significant performance improvement compared to directly training on those target pairs. It is the first time to verify that multiple low-resource language pairs can be utilized to improve rich-resource MT. Surprisingly, mRASP is even able to improve the translation quality on exotic languages that never occur in the pretraining corpus. Code, data, and pre-trained models are available at https://github.com/linzehui/mRASP.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.210,38939388 +main.371,BAE: BERT-based Adversarial Examples for Text Classification,Siddhant Garg|Goutham Ramakrishnan,"Modern text classification models are susceptible to adversarial examples, perturbed versions of the original text indiscernible by humans which get misclassified by the model. Recent works in NLP use rule-based synonym replacement strategies to generate adversarial examples. These strategies can lead to out-of-context and unnaturally complex token replacements, which are easily identifiable by humans. We present BAE, a black box attack for generating adversarial examples using contextual perturbations from a BERT masked language model. BAE replaces and inserts tokens in the original text by masking a portion of the text and leveraging the BERT-MLM to generate alternatives for the masked tokens. Through automatic and human evaluations, we show that BAE performs a stronger attack, in addition to generating adversarial examples with improved grammaticality and semantic coherence as compared to prior work.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.498,38938695 +main.373,Generating Image Descriptions via Sequential Cross-Modal Alignment Guided by Human Gaze,Ece Takmaz|Sandro Pezzelle|Lisa Beinborn|Raquel Fernández,"When speakers describe an image, they tend to look at objects before mentioning them. In this paper, we investigate such sequential cross-modal alignment by modelling the image description generation process computationally. We take as our starting point a state-of-the-art image captioning system and develop several model variants that exploit information from human gaze patterns recorded during language production. In particular, we propose the first approach to image description generation where visual processing is modelled sequentially. Our experiments and analyses confirm that better descriptions can be obtained by exploiting gaze-driven attention and shed light on human cognitive processes by comparing different ways of aligning the gaze modality with language production.
We find that processing gaze data sequentially leads to descriptions that are better aligned to those produced by speakers, more diverse, and more natural---particularly when gaze is encoded with a dedicated recurrent component.",,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Long,https://www.aclweb.org/anthology/2020.emnlp-main.377,38938696 +main.376,Multimodal Routing: Improving Local and Global Interpretability of Multimodal Language Analysis,Yao-Hung Hubert Tsai|Martin Ma|Muqiao Yang|Ruslan Salakhutdinov|Louis-Philippe Morency,"The human language can be expressed through multiple sources of information known as modalities, including tones of voice, facial gestures, and spoken language. Recent multimodal learning with strong performances on human-centric tasks such as sentiment analysis and emotion recognition are often black-box, with very limited interpretability. In this paper we propose Multimodal Routing, which dynamically adjusts weights between input modalities and output representations differently for each input sample. Multimodal routing can identify relative importance of both individual modalities and cross-modality factors. Moreover, the weight assignment by routing allows us to interpret modality-prediction relationships not only globally (i.e. general trends over the whole dataset), but also locally for each single input sample, meanwhile keeping competitive performance compared to state-of-the-art methods.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.143,38938697 +main.384,Relation-aware Graph Attention Networks with Relational Position Encodings for Emotion Recognition in Conversations,Taichi Ishiwatari|Yuki Yasuda|Taro Miyazaki|Jun Goto,"Interest in emotion recognition in conversations (ERC) has been increasing in various fields, because it can be used to analyze user behaviors and detect fake news. Many recent ERC methods use graph-based neural networks to take the relationships between the utterances of the speakers into account. In particular, the state-of-the-art method considers self- and inter-speaker dependencies in conversations by using relational graph attention networks (RGAT). However, graph-based neural networks do not take sequential information into account. In this paper, we propose relational position encodings that provide RGAT with sequential information reflecting the relational graph structure. Accordingly, our RGAT model can capture both the speaker dependency and the sequential information. Experiments on four ERC datasets show that our model is beneficial to recognizing emotions expressed in conversations. In addition, our approach empirically outperforms the state-of-the-art on all of the benchmark datasets.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.597,38938698 +main.387,Counterfactual Generator: A Weakly-Supervised Method for Named Entity Recognition,Xiangji Zeng|Yunliang Li|Yuchen Zhai|Yin Zhang,"Past progress on neural models has proven that named entity recognition is no longer a problem if we have enough labeled data. However, collecting enough data and annotating them are labor-intensive, time-consuming, and expensive. In this paper, we decompose the sentence into two parts: entity and context, and rethink the relationship between them and model performance from a causal perspective. 
Based on this, we propose the Counterfactual Generator, which generates counterfactual examples by the interventions on the existing observational examples to enhance the original dataset. Experiments across three datasets show that our method improves the generalization ability of models under limited observational examples. Besides, we provide a theoretical foundation by using a structural causal model to explore the spurious correlations between input features and output labels. We investigate the causal effects of entity or context on model performance under both conditions: the non-augmented and the augmented. Interestingly, we find that the non-spurious correlations are more located in entity representation rather than context representation. As a result, our method eliminates part of the spurious correlations between context representation and output labels. The code is available at https://github.com/xijiz/cfgen.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.590,38938699 +main.390,Scene Restoring for Narrative Machine Reading Comprehension,Zhixing Tian|Yuanzhe Zhang|Kang Liu|Jun Zhao|Yantao Jia|Zhicheng Sheng,"This paper focuses on machine reading comprehension for narrative passages. Narrative passages usually describe a chain of events. When reading this kind of passage, humans tend to restore a scene according to the text with their prior knowledge, which helps them understand the passage comprehensively. Inspired by this behavior of humans, we propose a method to let the machine imagine a scene during reading narrative for better comprehension. Specifically, we build a scene graph by utilizing Atomic as the external knowledge and propose a novel Graph Dimensional-Iteration Network (GDIN) to encode the graph. We conduct experiments on the ROCStories, a dataset of Story Cloze Test (SCT), and CosmosQA, a dataset of multiple choice. Our method achieves state-of-the-art.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.247,38938700 +main.392,Multi-modal Multi-label Emotion Detection with Modality and Label Dependence,Dong Zhang|Xincheng Ju|Junhui Li|Shoushan Li|Qiaoming Zhu|Guodong Zhou,"As an important research issue in the natural language processing community, multi-label emotion detection has been drawing more and more attention in the last few years. However, almost all existing studies focus on one modality (e.g., textual modality). In this paper, we focus on multi-label emotion detection in a multi-modal scenario. In this scenario, we need to consider both the dependence among different labels (label dependence) and the dependence between each predicting label and different modalities (modality dependence). Particularly, we propose a multi-modal sequence-to-set approach to effectively model both kinds of dependence in multi-modal multi-label emotion detection. The detailed evaluation demonstrates the effectiveness of our approach.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.291,38938701 +main.400,Learn to Cross-lingual Transfer with Meta Graph Learning across Heterogeneous Languages,Zheng Li|Mukul Kumar|William Headden|Bing Yin|Ying Wei|Yu Zhang|Qiang Yang,"Recent emergence of multilingual pre-training language model (mPLM) has enabled breakthroughs on various downstream cross-lingual transfer (CLT) tasks. 
However, mPLM-based methods usually involve two problems: (1) simply fine-tuning may not adapt general-purpose multilingual representations to be task-aware on low-resource languages; (2) ignore how cross-lingual adaptation happens for downstream tasks. To address the issues, we propose a meta graph learning (MGL) method. Unlike prior works that transfer from scratch, MGL can learn to cross-lingual transfer by extracting meta-knowledge from historical CLT experiences (tasks), making mPLM insensitive to low-resource languages. Besides, for each CLT task, MGL formulates its transfer process as information propagation over a dynamic graph, where the geometric structure can automatically capture intrinsic language relationships to explicitly guide cross-lingual transfer. Empirically, extensive experiments on both public and real-world datasets demonstrate the effectiveness of the MGL method.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.179,38938702 +main.407,Cross-lingual Spoken Language Understanding with Regularized Representation Alignment,Zihan Liu|Genta Indra Winata|Peng Xu|Zhaojiang Lin|Pascale Fung,"Despite the promising results of current cross-lingual models for spoken language understanding systems, they still suffer from imperfect cross-lingual representation alignments between the source and target languages, which makes the performance sub-optimal. To cope with this issue, we propose a regularization approach to further align word-level and sentence-level representations across languages without any external resource. First, we regularize the representation of user utterances based on their corresponding labels. Second, we regularize the latent variable model (Liu et al., 2019) by leveraging adversarial training to disentangle the latent variables. Experiments on the cross-lingual spoken language understanding task show that our model outperforms current state-of-the-art methods in both few-shot and zero-shot scenarios, and our model, trained on a few-shot setting with only 3\% of the target language training data, achieves comparable performance to the supervised training with all the training data.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.587,38938703 +main.41,Unsupervised Commonsense Question Answering with Self-Talk,Vered Shwartz|Peter West|Ronan Le Bras|Chandra Bhagavatula|Yejin Choi,"Natural language understanding involves reading between the lines with implicit background knowledge. Current systems either rely on pre-trained language models as the sole implicit source of world knowledge, or resort to external knowledge bases (KBs) to incorporate additional relevant knowledge. We propose an unsupervised framework based on self-talk as a novel alternative to multiple-choice commonsense tasks. Inspired by inquiry-based discovery learning (Bruner, 1961), our approach inquires language models with a number of information seeking questions such as ""what is the definition of..."" to discover additional background knowledge. Empirical results demonstrate that the self-talk procedure substantially improves the performance of zero-shot language model baselines on four out of six commonsense benchmarks, and competes with models that obtain knowledge from external KBs. 
While our approach improves performance on several benchmarks, the self-talk induced knowledge even when leading to correct answers is not always seen as helpful by human judges, raising interesting questions about the inner-workings of pre-trained language models for commonsense reasoning.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.373,38938641 +main.410,A Simple Approach to Learning Unsupervised Multilingual Embeddings,Pratik Jawanpuria|Mayank Meghwanshi|Bamdev Mishra,"Recent progress on unsupervised cross-lingual embeddings in the bilingual setting has given the impetus to learning a shared embedding space for several languages. A popular framework to solve the latter problem is to solve the following two sub-problems jointly: 1) learning unsupervised word alignment between several language pairs, and 2) learning how to map the monolingual embeddings of every language to shared multilingual space. In contrast, we propose a simple approach by decoupling the above two sub-problems and solving them separately, one after another, using existing techniques. We show that this proposed approach obtains surprisingly good performance in tasks such as bilingual lexicon induction, cross-lingual word similarity, multilingual document classification, and multilingual dependency parsing. When distant languages are involved, the proposed approach shows robust behavior and outperforms existing unsupervised multilingual word embedding approaches.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.240,38938704 +main.419,Will I Sound like Me? Improving Persona Consistency in Dialogues through Pragmatic Self-Consciousness,Hyunwoo Kim|Byeongchang Kim|Gunhee Kim,"We explore the task of improving persona consistency of dialogue agents. Recent models tackling consistency often train with additional Natural Language Inference (NLI) labels or attach trained extra modules to the generative agent for maintaining consistency. However, such additional labels and training can be demanding. Also, we find even the best-performing persona-based agents are insensitive to contradictory words. Inspired by social cognition and pragmatics, we endow existing dialogue agents with public self-consciousness on the fly through an imaginary listener. Our approach, based on the Rational Speech Acts framework (Frank and Goodman, 2012), can enforce dialogue agents to refrain from uttering contradiction. We further extend the framework by learning the distractor selection, which has been usually done manually or randomly. Results on Dialogue NLI (Welleck et al., 2019) and PersonaChat (Zhang et al., 2018) dataset show that our approach reduces contradiction and improves consistency of existing dialogue models. Moreover, we show that it can be generalized to improve context-consistency beyond persona in dialogues.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.65,38938705 +main.426,Adversarial Self-Supervised Data Free Distillation for Text Classification,Xinyin Ma|Yongliang Shen|Gongfan Fang|Chen Chen|Chenghao Jia|Weiming Lu,"Large pre-trained transformer-based language models have achieved impressive results on a wide range of NLP tasks. In the past few years, Knowledge Distillation(KD) has become a popular paradigm to compress a computationally expensive model to a resource-efficient lightweight model. 
However, most KD algorithms, especially in NLP, rely on the accessibility of the original training dataset, which may be unavailable due to privacy issues. To tackle this problem, we propose a novel two-stage data-free distillation method, named Adversarial self-Supervised Data-Free Distillation (AS-DFD), which is designed for compressing large-scale transformer-based models (e.g., BERT). To avoid text generation in discrete space, we introduce a Plug & Play Embedding Guessing method to craft pseudo embeddings from the teacher's hidden knowledge. Meanwhile, with a self-supervised module to quantify the student's ability, we adapt the difficulty of pseudo embeddings in an adversarial training manner. To the best of our knowledge, our framework is the first data-free distillation framework designed for NLP tasks. We verify the effectiveness of our method on several text classification datasets.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.499,38938706 +main.438,Detecting Attackable Sentences in Arguments,Yohan Jo|Seojin Bang|Emaad Manzoor|Eduard Hovy|Chris Reed,"Finding attackable sentences in an argument is the first step toward successful refutation in argumentation. We present a first large-scale analysis of sentence attackability in online arguments. We analyze driving reasons for attacks in argumentation and identify relevant characteristics of sentences. We demonstrate that a sentence's attackability is associated with many of these characteristics regarding the sentence's content, proposition types, and tone, and that an external knowledge source can provide useful information about attackability. Building on these findings, we demonstrate that machine learning models can automatically detect attackable sentences in arguments, significantly better than several baselines and comparably well to laypeople.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.1,38938707 +main.440,Extracting Implicitly Asserted Propositions in Argumentation,Yohan Jo|Jacky Visser|Chris Reed|Eduard Hovy,"Argumentation accommodates various rhetorical devices, such as questions, reported speech, and imperatives. These rhetorical tools usually assert argumentatively relevant propositions rather implicitly, so understanding their true meaning is key to understanding certain arguments properly. However, most argument mining systems and computational linguistics research have paid little attention to implicitly asserted propositions in argumentation. In this paper, we examine a wide range of computational methods for extracting propositions that are implicitly asserted in questions, reported speech, and imperatives in argumentation. By evaluating the models on a corpus of 2016 U.S. presidential debates and online commentary, we demonstrate the effectiveness and limitations of the computational models. Our study may inform future research on argument mining and the semantics of these rhetorical devices in argumentation.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.2,38938708 +main.445,Beyond Instructional Videos: Probing for More Diverse Visual-Textual Grounding on YouTube,Jack Hessel|Zhenhai Zhu|Bo Pang|Radu Soricut,"Pretraining from unlabelled web videos has quickly become the de-facto means of achieving high performance on many video understanding tasks. 
Features are learned via prediction of grounded relationships between visual content and automatic speech recognition (ASR) tokens. However, prior pretraining work has been limited to only instructional videos; a priori, we expect this domain to be relatively ""easy:"" speakers in instructional videos will often reference the literal objects/actions being depicted. We ask: can similar models be trained on more diverse video corpora? And, if so, what types of videos are ""grounded"" and what types are not? We fit a representative pretraining model to the diverse YouTube8M dataset, and study its success and failure cases. We find that visual-textual grounding is indeed possible across previously unexplored video categories, and that pretraining on a more diverse set results in representations that generalize to both non-instructional and instructional domains.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.709,38938709 +main.447,Some Languages Seem Easier to Parse Because Their Treebanks Leak,Anders Søgaard,"Cross-language differences in (universal) dependency parsing performance are mostly attributed to treebank size, average sentence length, average dependency length, morphological complexity, and domain differences. We point at a factor not previously discussed: If we abstract away from words and dependency labels, how many graphs in the test data were seen in the training data? We compute graph isomorphisms, and show that, treebank size aside, overlap between training and test graphs explain more of the observed variation than standard explanations such as the above.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.220,38938710 +main.449,A Simple and Effective Model for Answering Multi-span Questions,Elad Segal|Avia Efrat|Mor Shoham|Amir Globerson|Jonathan Berant,"Models for reading comprehension (RC) commonly restrict their output space to the set of all single contiguous spans from the input, in order to alleviate the learning problem and avoid the need for a model that generates text explicitly. However, forcing an answer to be a single span can be restrictive, and some recent datasets also include multi-span questions, i.e., questions whose answer is a set of non-contiguous spans in the text. Naturally, models that return single spans cannot answer these questions. In this work, we propose a simple architecture for answering multi-span questions by casting the task as a sequence tagging problem, namely, predicting for each input token whether it should be part of the output or not. Our model substantially improves performance on span extraction questions from DROP and Quoref by 9.9 and 5.5 EM points respectively.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.248,38938711 +main.450,Towards More Accurate Uncertainty Estimation in Text Classification,Jianfeng He|Xuchao Zhang|Shuo Lei|Zhiqian Chen|Fanglan Chen|Abdulaziz Alhamadani|Bei Xiao|ChangTien Lu,"The uncertainty measurement of classified results is especially important in areas requiring limited human resources for higher accuracy. For instance, data-driven algorithms diagnosing diseases need accurate uncertainty score to decide whether additional but limited quantity of experts are needed for rectification. However, few uncertainty models focus on improving the performance of text classification where human resources are involved. 
To achieve this, we aim at generating accurate uncertainty score by improving the confidence of winning scores. Thus, a model called MSD, which includes three independent components as ``mix-up"", ``self-ensembling"", ``distinctiveness score"", is proposed to improve the accuracy of uncertainty score by reducing the effect of overconfidence of winning score and considering the impact of different categories of uncertainty simultaneously. MSD can be applied with different Deep Neural Networks. Extensive experiments with ablation setting are conducted on four real-world datasets, on which, competitive results are obtained.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.671,38938712 +main.453,Non-Autoregressive Machine Translation with Latent Alignments,Chitwan Saharia|William Chan|Saurabh Saxena|Mohammad Norouzi,"This paper presents two strong methods, CTC and Imputer, for non-autoregressive machine translation that model latent alignments with dynamic programming. We revisit CTC for machine translation and demonstrate that a simple CTC model can achieve state-of-the-art for single-step non-autoregressive machine translation, contrary to what prior work indicates. In addition, we adapt the Imputer model for non-autoregressive machine translation and demonstrate that Imputer with just 4 generation steps can match the performance of an autoregressive Transformer baseline. Our latent alignment models are simpler than many existing non-autoregressive translation baselines; for example, we do not require target length prediction or re-scoring with an autoregressive model. On the competitive WMT'14 En$\rightarrow$De task, our CTC model achieves 25.7 BLEU with a single generation step, while Imputer achieves 27.5 BLEU with 2 generation steps, and 28.0 BLEU with 4 generation steps. This compares favourably to the autoregressive Transformer baseline at 27.8 BLEU.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.83,38938713 +main.457,Detecting Independent Pronoun Bias with Partially-Synthetic Data Generation,Robert Munro|Alex (Carmen) Morrison,"We report that state-of-the-art parsers consistently failed to identify “hers” and “theirs” as pronouns but identified the masculine equivalent “his”. We find that the same biases exist in recent language models like BERT. While some of the bias comes from known sources, like training data with gender imbalances, we find that the bias is _amplified_ in the language models and that linguistic differences between English pronouns that are not inherently biased can become biases in some machine learning models. We introduce a new technique for measuring bias in models, using Bayesian approximations to generate partially-synthetic data from the model itself.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.157,38938714 +main.47,BERT-ATTACK: Adversarial Attack against BERT Using BERT,Linyang Li|Ruotian Ma|Qipeng Guo|Xiangyang Xue|Xipeng Qiu,"Adversarial attacks for discrete data (such as texts) have been proved significantly more challenging than continuous data (such as images) since it is difficult to generate adversarial samples with gradient-based methods. 
Current successful attack methods for texts usually adopt heuristic replacement strategies on the character or word level, which remains challenging to find the optimal solution in the massive space of possible combinations of replacements while preserving semantic consistency and language fluency. In this paper, we propose \textbf{BERT-Attack}, a high-quality and effective method to generate adversarial samples using pre-trained masked language models exemplified by BERT. We turn BERT against its fine-tuned models and other deep neural models in downstream tasks so that we can successfully mislead the target models to predict incorrectly. Our method outperforms state-of-the-art attack strategies in both success rate and perturb percentage, while the generated adversarial samples are fluent and semantically preserved. Also, the cost of calculation is low, thus possible for large-scale generations. The code is available at \url{https://github.com/LinyangLee/BERT-Attack}.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.500,38938642 +main.470,Competence-Level Prediction and Resume-Job_Description Matching Using Context-Aware Transformer Models,Changmao Li|Elaine Fisher|Rebecca Thomas|Steve Pittard|Vicki Hertzberg|Jinho D. Choi,"This paper presents a comprehensive study on resume classification to reduce the time and labor needed to screen an overwhelming number of applications significantly, while improving the selection of suitable candidates. A total of 6,492 resumes are extracted from 24,933 job applications for 252 positions designated into four levels of experience for Clinical Research Coordinators (CRC). Each resume is manually annotated to its most appropriate CRC position by experts through several rounds of triple annotation to establish guidelines. As a result, a high Kappa score of 61% is achieved for inter-annotator agreement. Given this dataset, novel transformer-based classification models are developed for two tasks: the first task takes a resume and classifies it to a CRC level (T1), and the second task takes both a resume and a job description to apply and predicts if the application is suited to the job (T2). Our best models using section encoding and a multi-head attention decoding give results of 73.3% to T1 and 79.2% to T2. Our analysis shows that the prediction errors are mostly made among adjacent CRC levels, which are hard for even experts to distinguish, implying the practical value of our models in real HR platforms.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.679,38938715 +main.471,Q-learning with Language Model for Edit-based Unsupervised Summarization,Ryosuke Kohita|Akifumi Wachi|Yang Zhao|Ryuki Tachibana,"Unsupervised methods are promising for abstractive text summarization in that parallel corpora are not required. However, their performance is still far from satisfactory; therefore, research on promising solutions is ongoing. In this paper, we propose a new approach based on Q-learning with an edit-based summarization. The method combines two key modules to form an Editorial Agent and Language Model converter (EALM). The agent predicts edit actions (i.e., delete, keep, and replace), and then the LM converter deterministically generates a summary on the basis of the action signals. Q-learning is leveraged to train the agent to produce proper edit actions. 
Experimental results show that EALM delivered competitive performance compared with the previous encoder-decoder-based methods, even with truly zero paired data (i.e., no validation set). Defining the task as Q-learning enables us not only to develop a competitive method but also to make the latest techniques in reinforcement learning available for unsupervised summarization. We also conduct qualitative analysis, providing insights into future study on unsupervised summarizers.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.34,38938716 +main.476,MIME: MIMicking Emotions for Empathetic Response Generation,Navonil Majumder|Pengfei Hong|Shanshan Peng|Jiankun Lu|Deepanway Ghosal|Alexander Gelbukh|Rada Mihalcea|Soujanya Poria,"Current approaches to empathetic response generation view the set of emotions expressed in the input text as a flat structure, where all the emotions are treated uniformly. We argue that empathetic responses often mimic the emotion of the user to a varying degree, depending on its positivity or negativity and content. We show that the consideration of these polarity-based emotion clusters and emotional mimicry results in improved empathy and contextual relevance of the response as compared to the state-of-the-art. Also, we introduce stochasticity into the emotion mixture that yields emotionally more varied empathetic responses than the previous work. We demonstrate the importance of these factors to empathetic response generation using both automatic- and human-based evaluations. The implementation of MIME is publicly available at https://github.com/declare-lab/MIME.",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.721,38938717 +main.478,Dialogue Distillation: Open-domain Dialogue Augmentation Using Unpaired Data,Rongsheng Zhang|Yinhe Zheng|Jianzhi Shao|Xiaoxi Mao|Yadong Xi|Minlie Huang,"Recent advances in open-domain dialogue systems rely on the success of neural models that are trained on large-scale data. However, collecting large-scale dialogue data is usually time-consuming and labor-intensive. To address this data dilemma, we propose a novel data augmentation method for training open-domain dialogue models by utilizing unpaired data. Specifically, a data-level distillation process is first proposed to construct augmented dialogues where both post and response are retrieved from the unpaired data. A ranking module is employed to filter out low-quality dialogues. Further, a model-level distillation process is employed to distill a teacher model trained on high-quality paired data to augmented dialogue pairs, thereby preventing dialogue models from being affected by the noise in the augmented data. Automatic and manual evaluation indicates that our method can produce high-quality dialogue pairs with diverse contents, and the proposed data-level and model-level dialogue distillation can improve the performance of competitive baselines.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.277,38938718 +main.485,Improving Detection and Categorization of Task-relevant Utterances through Integration of Discourse Structure and Ontological Knowledge,Sopan Khosla|Shikhar Vashishth|Jill Fain Lehman|Carolyn Rose,"Information extraction from conversational data is particularly challenging because the task-centric nature of conversation allows for effective communication of implicit information by humans, but is challenging for machines. 
The challenges may differ between utterances depending on the role of the speaker within the conversation, especially when relevant expertise is distributed asymmetrically across roles. Further, the challenges may also increase over the conversation as more shared context is built up through information communicated implicitly earlier in the dialogue. In this paper, we propose the novel modeling approach MedFilter, which addresses these insights in order to increase performance at identifying and categorizing task-relevant utterances, and in so doing, positively impacts performance at a downstream information extraction task. We evaluate this approach on a corpus of nearly 7,000 doctor-patient conversations where MedFilter is used to identify medically relevant contributions to the discussion (achieving a 10% improvement over SOTA baselines in terms of area under the PR curve). Identifying task-relevant utterances benefits downstream medical processing, achieving improvements of 15%, 105%, and 23% respectively for the extraction of symptoms, medications, and complaints.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.626,38938719 +main.486,Discontinuous Constituent Parsing as Sequence Labeling,David Vilares|Carlos Gómez-Rodríguez,"This paper reduces discontinuous parsing to sequence labeling. It first shows that existing reductions for constituent parsing as labeling do not support discontinuities. Second, it fills this gap and proposes to encode tree discontinuities as nearly ordered permutations of the input sequence. Third, it studies whether such discontinuous representations are learnable. The experiments show that despite the architectural simplicity, under the right representation, the models are fast and accurate.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.221,38938720 +main.493,Active Learning for BERT: An Empirical Study,Liat Ein-Dor|Alon Halfon|Ariel Gera|Eyal Shnarch|Lena Dankin|Leshem Choshen|Marina Danilevsky|Ranit Aharonov|Yoav Katz|Noam Slonim,"Real world scenarios present a challenge for text classification, since labels are usually expensive and the data is often characterized by class imbalance. Active Learning (AL) is a ubiquitous paradigm to cope with data scarcity. Recently, pre-trained NLP models, and BERT in particular, are receiving massive attention due to their outstanding performance in various NLP tasks. However, the use of AL with deep pre-trained models has so far received little consideration. Here, we present a large-scale empirical study on active learning techniques for BERT-based classification, addressing a diverse set of AL strategies and datasets. We focus on practical scenarios of binary text classification, where the annotation budget is very small, and the data is often skewed. Our results demonstrate that AL can boost BERT performance, especially in the most realistic scenario in which the initial set of labeled examples is created using keyword-based queries, resulting in a biased sample of the minority class. We release our research framework, aiming to facilitate future research along the lines explored here.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.638,38938721 +main.498,A Matter of Framing: The Impact of Linguistic Formalism on Probing Results,Ilia Kuznetsov|Iryna Gurevych,"Deep pre-trained contextualized encoders like BERT demonstrate remarkable performance on a range of downstream tasks. 
A recent line of research in probing investigates the linguistic knowledge implicitly learned by these models during pre-training. While most work in probing operates on the task level, linguistic tasks are rarely uniform and can be represented in a variety of formalisms. Any linguistics-based probing study thereby inevitably commits to the formalism used to annotate the underlying data. Can the choice of formalism affect probing results? To investigate, we conduct an in-depth cross-formalism layer probing study in role semantics. We find linguistically meaningful differences in the encoding of semantic role- and proto-role information by BERT depending on the formalism and demonstrate that layer probing can detect subtle differences between the implementations of the same linguistic formalism. Our results suggest that linguistic formalism is an important dimension in probing studies, along with the commonly used cross-task and cross-lingual experimental settings.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.13,38938722 +main.504,MLSUM: The Multilingual Summarization Corpus,Thomas Scialom|Paul-Alexis Dray|Sylvain Lamprier|Benjamin Piwowarski|Jacopo Staiano,"We present MLSUM, the first large-scale MultiLingual SUMmarization dataset. Obtained from online newspapers, it contains 1.5M+ article/summary pairs in five different languages -- namely, French, German, Spanish, Russian, Turkish. Together with English news articles from the popular CNN/Daily mail dataset, the collected data form a large scale multilingual dataset which can enable new research directions for the text summarization community. We report cross-lingual comparative analyses based on state-of-the-art systems. These highlight existing biases which motivate the use of a multi-lingual dataset.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.647,38938723 +main.517,The Thieves on Sesame Street Are Polyglots --- Extracting Multilingual Models from Monolingual APIs,Nitish Shirish Keskar|Bryan McCann|Caiming Xiong|Richard Socher,"Pre-training in natural language processing makes it easier for an adversary with only query access to a victim model to reconstruct a local copy of the victim by training with gibberish input data paired with the victim's labels for that data. We discover that this extraction process extends to local copies initialized from a pre-trained, multilingual model while the victim remains monolingual. The extracted model learns the task from the monolingual victim, but it generalizes far better than the victim to several other languages. This is done without ever showing the multilingual, extracted model a well-formed input in any of the languages for the target task. We also demonstrate that a few real examples can greatly improve performance, and we analyze how these results shed light on how such extraction methods succeed.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.501,38938724 +main.522,Language Model Prior for Low-Resource Neural Machine Translation,Christos Baziotis|Barry Haddow|Alexandra Birch,"The scarcity of large parallel corpora is an important obstacle for neural machine translation. A common solution is to exploit the knowledge of language models (LM) trained on abundant monolingual data. In this work, we propose a novel approach to incorporate a LM as prior in a neural translation model (TM). 
Specifically, we add a regularization term, which pushes the output distributions of the TM to be probable under the LM prior, while avoiding wrong predictions when the TM ""disagrees"" with the LM. This objective relates to knowledge distillation, where the LM can be viewed as teaching the TM about the target language. The proposed approach does not compromise decoding speed, because the LM is used only at training time, unlike previous work that requires it during inference. We present an analysis of the effects that different methods have on the distributions of the TM. Results on two low-resource machine translation datasets show clear improvements even with limited monolingual data.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.615,38938725 +main.527,Information Seeking in the Spirit of Learning: A Dataset for Conversational Curiosity,Pedro Rodriguez|Paul Crook|Seungwhan Moon|Zhiguang Wang,"Open-ended human learning and information-seeking are increasingly mediated by digital assistants. However, such systems often ignore the user's pre-existing knowledge. Assuming a correlation between engagement and user responses such as ``liking'' messages or asking followup questions, we design a Wizard-of-Oz dialog task that tests the hypothesis that engagement increases when users are presented with facts related to what they know. Through crowd-sourcing of this experiment, we collect and release 14K dialogs (181K utterances) where users and assistants converse about geographic topics like geopolitical entities and locations. This dataset is annotated with pre-existing user knowledge, message-level dialog acts, grounding to Wikipedia, and user reactions to messages. Responses using a user's prior knowledge increase engagement. We incorporate this knowledge into a multi-task model that reproduces human assistant policies and improves over a \textsc{bert} content model by 13 mean reciprocal rank points.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.655,38938726 +main.531,Distilling Structured Knowledge for Text-Based Relational Reasoning,Jin Dong|Marc-Antoine Rondeau|William L. Hamilton,"There is an increasing interest in developing text-based relational reasoning systems, which are capable of systematically reasoning about the relationships between entities mentioned in a text. However, there remains a substantial performance gap between NLP models for relational reasoning and models based on graph neural networks (GNNs), which have access to an underlying symbolic representation of the text. In this work, we investigate how the structured knowledge of a GNN can be distilled into various NLP models in order to improve their performance. We first pre-train a GNN on a reasoning task using structured inputs and then incorporate its knowledge into an NLP model (e.g., an LSTM) via knowledge distillation. To overcome the difficulty of cross-modal knowledge transfer, we also employ a contrastive learning based module to align the latent representations of NLP models and the GNN. 
We test our approach with two state-of-the-art NLP models on 13 different inductive reasoning datasets from the CLUTRR benchmark and obtain significant improvements.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.551,38938727 +main.540,Multi-XScience: A Large-scale Dataset for Extreme Multi-document Summarization of Scientific Articles,Yao Lu|Yue Dong|Laurent Charlin,"Multi-document summarization is a challenging task for which there exists little large-scale datasets. We propose Multi-XScience, a large-scale multi-document summarization dataset created from scientific articles. Multi-XScience introduces a challenging multi-document summarization task: writing the related-work section of a paper based on its abstract and the articles it references. Our work is inspired by extreme summarization, a dataset construction protocol that favours abstractive modeling approaches. Descriptive statistics and empirical results---using several state-of-the-art models trained on the Multi-XScience dataset---reveal that Multi-XScience is well suited for abstractive models.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.648,38938728 +main.548,Conditional Causal Relationships between Emotions and Causes in Texts,Xinhong Chen|Qing Li|Jianping Wang,"The causal relationships between emotions and causes in text have recently received a lot of attention. Most of the existing works focus on the extraction of the causally related clauses from documents. However, none of these works has considered the possibility that the causal relationships among the extracted emotion and cause clauses may only be valid under a specific context, without which the extracted clauses may not be causally related. To address such an issue, we propose a new task of determining whether or not an input pair of emotion and cause has a valid causal relationship under different contexts, and construct a corresponding dataset via manual annotation and negative sampling based on an existing benchmark dataset. Furthermore, we propose a prediction aggregation module with low computational overhead to fine-tune the prediction results based on the characteristics of the input clauses. Experiments demonstrate the effectiveness and generality of our aggregation module.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.252,38938729 +main.55,CheXbert: Combining Automatic Labelers and Expert Annotations for Accurate Radiology Report Labeling Using BERT,Akshay Smit|Saahil Jain|Pranav Rajpurkar|Anuj Pareek|Andrew Ng|Matthew Lungren,"The extraction of labels from radiology text reports enables large-scale training of medical imaging models. Existing approaches to report labeling typically rely either on sophisticated feature engineering based on medical domain knowledge or manual annotations by experts. In this work, we introduce a BERT-based approach to medical image report labeling that exploits both the scale of available rule-based systems and the quality of expert annotations. We demonstrate superior performance of a biomedically pretrained BERT model first trained on annotations of a rule-based labeler and then finetuned on a small set of expert annotations augmented with automated backtranslation. 
We find that our final model, CheXbert, is able to outperform the previous best rules-based labeler with statistical significance, setting a new SOTA for report labeling on one of the largest datasets of chest x-rays.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.117,38938643 +main.557,Attention Is All You Need for Chinese Word Segmentation,Sufeng Duan|Hai Zhao,"Taking greedy decoding algorithm as it should be, this work focuses on further strengthening the model itself for Chinese word segmentation (CWS), which results in an even more fast and more accurate CWS model. Our model consists of an attention only stacked encoder and a light enough decoder for the greedy segmentation plus two highway connections for smoother training, in which the encoder is composed of a newly proposed Transformer variant, Gaussian-masked Directional (GD) Transformer, and a biaffine attention scorer. With the effective encoder design, our model only needs to take unigram features for scoring. Our model is evaluated on SIGHAN Bakeoff benchmark datasets. The experimental results show that with the highest segmentation speed, the proposed model achieves new state-of-the-art or comparable performance against strong baselines in terms of strict closed test setting.",,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.emnlp-main.317,38938730 +main.574,SRLGRN: Semantic Role Labeling Graph Reasoning Network,Chen Zheng|Parisa Kordjamshidi,"This work deals with the challenge of learning and reasoning over multi-hop question answering (QA). We propose a graph reasoning network based on the semantic structure of the sentences to learn cross paragraph reasoning paths and find the supporting facts and the answer jointly. The proposed graph is a heterogeneous document-level graph that contains nodes of type sentence (question, title, and other sentences), and semantic role labeling sub-graphs per sentence that contain arguments as nodes and predicates as edges. Incorporating the argument types, the argument phrases, and the semantics of the edges originated from SRL predicates into the graph encoder helps in finding and also the explainability of the reasoning paths. Our proposed approach shows competitive performance on the HotpotQA distractor setting benchmark compared to the recent state-of-the-art models.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.714,38938731 +main.585,Interactive Refinement of Cross-Lingual Word Embeddings,Michelle Yuan|Mozhi Zhang|Benjamin Van Durme|Leah Findlater|Jordan Boyd-Graber,"Cross-lingual word embeddings transfer knowledge between languages: models trained on high-resource languages can predict in low-resource languages. We introduce CLIME, an interactive system to quickly refine cross-lingual word embeddings for a given classification problem. First, CLIME ranks words by their salience to the downstream task. Then, users mark similarity between keywords and their nearest neighbors in the embedding space. Finally, CLIME updates the embeddings using the annotations. We evaluate CLIME on identifying health-related text in four low-resource languages: Ilocano, Sinhalese, Tigrinya, and Uyghur. Embeddings refined by CLIME capture more nuanced word semantics and have higher test accuracy than the original embeddings. 
CLIME often improves accuracy faster than an active learning baseline and can be easily combined with active learning to improve results.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.482,38938732 +main.593,The Grammar of Emergent Languages,Oskar van der Wal|Silvan de Boer|Elia Bruni|Dieuwke Hupkes,"In this paper, we consider the syntactic properties of languages emerged in referential games, using unsupervised grammar induction (UGI) techniques originally designed to analyse natural language. We show that the considered UGI techniques are appropriate to analyse emergent languages and we then study if the languages that emerge in a typical referential game setup exhibit syntactic structure, and to what extent this depends on the maximum message length and number of symbols that the agents are allowed to use. Our experiments demonstrate that a certain message length and vocabulary size are required for structure to emerge, but they also illustrate that more sophisticated game scenarios are required to obtain syntactic properties more akin to those observed in human language. We argue that UGI techniques should be part of the standard toolkit for analysing emergent languages and release a comprehensive library to facilitate such analysis for future researchers.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.270,38938733 +main.595,SubjQA: A Dataset for Subjectivity and Review Comprehension,Johannes Bjerva|Nikita Bhutani|Behzad Golshan|Wang-Chiew Tan|Isabelle Augenstein,"Subjectivity is the expression of internal opinions or beliefs which cannot be objectively observed or verified, and has been shown to be important for sentiment analysis and word-sense disambiguation. Furthermore, subjectivity is an important aspect of user-generated data. In spite of this, subjectivity has not been investigated in contexts where such data is widespread, such as in question answering (QA). We develop a new dataset which allows us to investigate this relationship. We find that subjectivity is an important feature in the case of QA, albeit with more intricate interactions between subjectivity and QA performance than found in previous work on sentiment analysis. For instance, a subjective question may or may not be associated with a subjective answer. We release an English QA dataset (SubjQA) based on customer reviews, containing subjectivity annotations for questions and answer spans across 6 domains.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.442,38938734 +main.598,Exploring Semantic Capacity of Terms,Jie Huang|Zilong Wang|Kevin Chang|Wen-mei Hwu|JinJun Xiong,"We introduce and study semantic capacity of terms. For example, the semantic capacity of artificial intelligence is higher than that of linear regression since artificial intelligence possesses a broader meaning scope. Understanding semantic capacity of terms will help many downstream tasks in natural language processing. For this purpose, we propose a two-step model to investigate semantic capacity of terms, which takes a large text corpus as input and can evaluate semantic capacity of terms if the text corpus can provide enough co-occurrence information of terms. 
Extensive experiments in three fields demonstrate the effectiveness and rationality of our model compared with well-designed baselines and human-level evaluations.",,Semantics: Lexical Semantics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.684,38938735 +main.60,Imitation Attacks and Defenses for Black-box Machine Translation Systems,Eric Wallace|Mitchell Stern|Dawn Song,"Adversaries may look to steal or attack black-box NLP systems, either for financial gain or to exploit model errors. One setting of particular interest is machine translation (MT), where models have high commercial value and errors can be costly. We investigate possible exploitations of black-box MT systems and explore a preliminary defense against such threats. We first show that MT systems can be stolen by querying them with monolingual sentences and training models to imitate their outputs. Using simulated experiments, we demonstrate that MT model stealing is possible even when imitation models have different input data or architectures than their target models. Applying these ideas, we train imitation models that reach within 0.6 BLEU of three production MT systems on both high-resource and low-resource language pairs. We then leverage the similarity of our imitation models to transfer adversarial examples to the production systems. We use gradient-based attacks that expose inputs which lead to semantically-incorrect translations, dropped content, and vulgar model outputs. To mitigate these vulnerabilities, we propose a defense that modifies translation outputs in order to misdirect the optimization of imitation models. This defense degrades the adversary's BLEU score and attack success rate at some cost in the defender's BLEU and inference speed.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.446,38938644 +main.605,Understanding Procedural Text Using Interactive Entity Networks,Jizhi Tang|Yansong Feng|Dongyan Zhao,"The task of procedural text comprehension aims to understand the dynamic nature of entities/objects in a process. Here, the key is to track how the entities interact with each other and how their states are changing along the procedure. Recent efforts have made great progress to track multiple entities in a procedural text, but usually treat each entity separately and ignore the fact that there are often multiple entities interacting with each other during one process, some of which are even explicitly mentioned. In this paper, we propose a novel Interactive Entity Network (IEN), which is a recurrent network with memory equipped cells for state tracking. In each IEN cell, we maintain different attention matrices through specific memories to model different types of entity interactions. Importantly, we can update these memories in a sequential manner so as to explore the causal relationship between entity actions and subsequent state changes. 
We evaluate our model on a benchmark dataset, and the results show that IEN outperforms state-of-the-art models by precisely capturing the interactions of multiple entities and explicitly leverage the relationship between entity interactions and subsequent state changes.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.591,38938736 +main.607,"Reasoning about Goals, Steps, and Temporal Ordering with WikiHow",Li Zhang|Qing Lyu|Chris Callison-Burch,"We propose a suite of reasoning tasks on two types of relations between procedural events: goal-step relations (“learn poses” is a step in the larger goal of “doing yoga”) and step-step temporal relations (“buy a yoga mat” typically precedes “learn poses”). We introduce a dataset targeting these two relations based on wikiHow, a website of instructional how-to articles. Our human-validated test set serves as a reliable benchmark for common-sense inference, with a gap of about 10% to 20% between the performance of state-of-the-art transformer models and human performance. Our automatically-generated training set allows models to effectively transfer to out-of-domain tasks requiring knowledge of procedural events, with greatly improved performances on SWAG, Snips, and Story Cloze Test in zero- and few-shot settings.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.374,38938737 +main.616,Attention Is Not Only a Weight: Analyzing Transformers with Vector Norms,Goro Kobayashi|Tatsuki Kuribayashi|Sho Yokoi|Kentaro Inui,"Attention is a key component of Transformers, which have recently achieved considerable success in natural language processing. Hence, attention is being extensively studied to investigate various linguistic capabilities of Transformers, focusing on analyzing the parallels between attention weights and specific linguistic phenomena. This paper shows that attention weights alone are only one of the two factors that determine the output of attention and proposes a norm-based analysis that incorporates the second factor, the norm of the transformed input vectors. The findings of our norm-based analyses of BERT and a Transformer-based neural machine translation system include the following: (i) contrary to previous studies, BERT pays poor attention to special tokens, and (ii) reasonable word alignment can be extracted from attention mechanisms of Transformer. These findings provide insights into the inner workings of Transformers.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.574,38938738 +main.618,Losing Heads in the Lottery: Pruning Transformer Attention in Neural Machine Translation,Maximiliana Behnke|Kenneth Heafield,"The attention mechanism is the crucial component of the transformer architecture. Recent research shows that most attention heads are not confident in their decisions and can be pruned. However, removing them before training a model results in lower quality. In this paper, we apply the lottery ticket hypothesis to prune heads in the early stages of training. Our experiments on machine translation show that it is possible to remove up to three-quarters of attention heads from transformer-big during early training with an average -0.1 change in BLEU for Turkish→English. The pruned model is 1.5 times as fast at inference, albeit at the cost of longer training. 
Our method is complementary to other approaches, such as teacher-student, with English→German student model gaining an additional 10% speed-up with 75% encoder attention removed and 0.2 BLEU loss.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.211,38938739 +main.619,Translationese in Machine Translation Evaluation,Yvette Graham|Barry Haddow|Philipp Koehn,"The term translationese has been used to describe features of translated text, and in this paper, we provide detailed analysis of potential adverse effects of translationese on machine translation evaluation. Our analysis shows differences in conclusions drawn from evaluations that include translationese in test data compared to experiments that tested only with text originally composed in that language. For this reason we recommend that reverse-created test data be omitted from future machine translation test sets. In addition, we provide a re-evaluation of a past machine translation evaluation claiming human-parity of MT. One important issue not previously considered is statistical power of significance tests applied to comparison of human and machine translation. Since the very aim of past evaluations was investigation of ties between human and MT systems, power analysis is of particular importance, to avoid, for example, claims of human parity simply corresponding to Type II error resulting from the application of a low powered test. We provide detailed analysis of tests used in such evaluations to provide an indication of a suitable minimum sample size for future studies.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.6,38938740 +main.623,XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning,Edoardo Maria Ponti|Goran Glavaš|Olga Majewska|Qianchu Liu|Ivan Vulić|Anna Korhonen,"In order to simulate human language capacity, natural language processing systems must be able to reason about the dynamics of everyday situations, including their possible causes and effects. Moreover, they should be able to generalise the acquired world knowledge to new languages, modulo cultural differences. Advances in machine reasoning and cross-lingual transfer depend on the availability of challenging evaluation benchmarks. Motivated by both demands, we introduce Cross-lingual Choice of Plausible Alternatives (XCOPA), a typologically diverse multilingual dataset for causal commonsense reasoning in 11 languages, which includes resource-poor languages like Eastern Apurímac Quechua and Haitian Creole. We evaluate a range of state-of-the-art models on this novel dataset, revealing that the performance of current methods based on multilingual pretraining and zero-shot fine-tuning falls short compared to translation-based transfer. Finally, we propose strategies to adapt multilingual models to out-of-sample resource-lean languages where only a small corpus or a bilingual dictionary is available, and report substantial improvements over the random baseline. The XCOPA dataset is freely available at github.com/cambridgeltl/xcopa.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.185,38938741 +main.628,Multistage Fusion with Forget Gate for Multimodal Summarization in Open-Domain Videos,Nayu Liu|Xian Sun|Hongfeng Yu|Wenkai Zhang|Guangluan Xu,"Multimodal summarization for open-domain videos is an emerging task, aiming to generate a summary from multisource information (video, audio, transcript). 
Despite the success of recent multiencoder-decoder frameworks on this task, existing methods lack fine-grained multimodality interactions of multisource inputs. Besides, unlike other multimodal tasks, this task has longer multimodal sequences with more redundancy and noise. To address these two issues, we propose a multistage fusion network with the fusion forget gate module, which builds upon this approach by modeling fine-grained interactions between the modalities through a multistep fusion schema and controlling the flow of redundant information between multimodal long sequences via a forgetting module. Experimental results on the How2 dataset show that our proposed model achieves a new state-of-the-art performance. Comprehensive analysis empirically verifies the effectiveness of our fusion schema and forgetting module on multiple encoder-decoder architectures. Specially, when using high noise ASR transcripts (WER>30%), our model still achieves performance close to the ground-truth transcript model, which reduces manual annotation cost.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.144,38938742 +main.635,Detecting Cross-Modal Inconsistency to Defend against Neural Fake News,Reuben Tan|Bryan Plummer|Kate Saenko,"Large-scale dissemination of disinformation online intended to mislead or deceive the general population is a major societal problem. Rapid progression in image, video, and natural language generative models has only exacerbated this situation and intensified our need for an effective defense mechanism. While existing approaches have been proposed to defend against neural fake news, they are generally constrained to the very limited setting where articles only have text and metadata such as the title and authors. In this paper, we introduce the more realistic and challenging task of defending against machine-generated news that also includes images and captions. To identify the possible weaknesses that adversaries can exploit, we create a NeuralNews dataset which is comprised of 4 different types of generated articles as well as conduct a series of human user study experiments based on this dataset. Coupled with providing a relatively effective approach based on detecting visual-semantic inconsistencies, the valuable insights gleaned from our user study experiments and, consequently, this paper will serve as an effective first line of defense and a valuable reference for future work in defending against machine-generated disinformation.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.163,38938743 +main.638,An Information Theoretic View on Selecting Linguistic Probes,Zining Zhu|Frank Rudzicz,"There is increasing interest in assessing the linguistic knowledge encoded in neural representations. A popular approach is to attach a diagnostic classifier -- or ''probe'' -- to perform supervised classification from internal representations. However, how to select a good probe is in debate. Hewitt and Liang (2019) showed that a high performance on diagnostic classification itself is insufficient, because it can be attributed to either ''the representation being rich in knowledge'', or ''the probe learning the task'', which Pimentel et al. (2020) challenged. We show this dichotomy is valid information-theoretically. 
In addition, we find that the ''good probe'' criteria proposed by the two papers, *selectivity* (Hewitt and Liang, 2019) and *information gain* (Pimentel et al., 2020), are equivalent -- the errors of their approaches are identical (modulo irrelevant terms). Empirically, these two selection criteria lead to results that highly agree with each other.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.744,38938744 +main.639,Exploiting Sentence Order in Document Alignment,Brian Thompson|Philipp Koehn,"We present a simple document alignment method that incorporates sentence order information in both candidate generation and candidate re-scoring. Our method results in 61% relative reduction in error compared to the best previously published result on the WMT16 document alignment shared task. Our method improves downstream MT performance on web-scraped Sinhala–English documents from ParaCrawl, outperforming the document alignment method used in the most recent ParaCrawl release. It also outperforms a comparable corpora method which uses the same multilingual embeddings, demonstrating that exploiting sentence order is beneficial even if the end goal is sentence-level bitext.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.483,38938745 +main.644,Speakers Fill Semantic Gaps with Context,Tiago Pimentel|Rowan Hall Maudslay|Damian Blasi|Ryan Cotterell,"Lexical ambiguity is widespread in language, allowing for the reuse of economical word forms and therefore making language more efficient. If ambiguous words cannot be disambiguated from context, however, this gain in efficiency might make language less clear---resulting in frequent miscommunication. For a language to be clear and efficiently encoded, we posit that the lexical ambiguity of a word type should correlate with how much information context provides about it, on average. To investigate whether this is the case, we operationalise the lexical ambiguity of a word as the entropy of meanings it can take, and provide two ways to estimate this---one which requires human annotation (using WordNet), and one which does not (using BERT), making it readily applicable to a large number of languages. We validate these measures by showing that, on six high-resource languages, there are significant Pearson correlations between our BERT-based estimate of ambiguity and the number of synonyms a word has in WordNet (e.g. $\rho = 0.40$ in English). We then test our main hypothesis---that a word's lexical ambiguity should negatively correlate with its contextual uncertainty---and find significant correlations on all 18 typologically diverse languages we analyse. This suggests that, in the presence of ambiguity, speakers compensate by making contexts more informative.",,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Long,https://www.aclweb.org/anthology/2020.emnlp-main.328,38938746 +main.645,Multi-View Sequence-to-Sequence Models with Conversational Structure for Abstractive Dialogue Summarization,Jiaao Chen|Diyi Yang,"Text summarization is one of the most challenging and interesting problems in NLP. Although much attention has been paid to summarizing structured text like news reports or encyclopedia articles, summarizing conversations---an essential part of human-human/machine interaction where most important pieces of information are scattered across various utterances of different speakers---remains relatively under-investigated. 
This work proposes a multi-view sequence-to-sequence model by first extracting conversational structures of unstructured daily chats from different views to represent conversations and then utilizing a multi-view decoder to incorporate different views to generate dialogue summaries. Experiments on a large-scale dialogue summarization corpus demonstrated that our methods significantly outperformed previous state-of-the-art models via both automatic evaluations and human judgment. We also discussed specific challenges that current approaches faced with this task. We have publicly released our code at https://github.com/GT-SALT/Multi-View-Seq2Seq.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.336,38938747 +main.647,Where Are You? Localization from Embodied Dialog,Meera Hahn|Jacob Krantz|Dhruv Batra|Devi Parikh|James Rehg|Stefan Lee|Peter Anderson,"We present WHERE ARE YOU? (WAY), a dataset of ~6k dialogs in which two humans -- an Observer and a Locator -- complete a cooperative localization task. The Observer is spawned at random in a 3D environment and can navigate from first-person views while answering questions from the Locator. The Locator must localize the Observer in a detailed top-down map by asking questions and giving instructions. Based on this dataset, we define three challenging tasks: Localization from Embodied Dialog or LED (localizing the Observer from dialog history), Embodied Visual Dialog (modeling the Observer), and Cooperative Localization (modeling both agents). In this paper, we focus on the LED task -- providing a strong baseline model with detailed ablations characterizing both dataset biases and the importance of various modeling choices. Our best model achieves 32.7% success at identifying the Observer's location within 3m in unseen buildings, vs. 70.4% for human Locators.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.59,38938748 +main.648,Sparse Text Generation,Pedro Henrique Martins|Zita Marinho|André F. T. Martins,"Current state-of-the-art text generators build on powerful language models such as GPT-2, achieving impressive performance. However, to avoid degenerate text, they require sampling from a modified softmax, via temperature parameters or ad-hoc truncation techniques, as in top-$k$ or nucleus sampling. This creates a mismatch between training and testing conditions. In this paper, we use the recently introduced entmax transformation to train and sample from a natively sparse language model, avoiding this mismatch. The result is a text generator with favorable performance in terms of fluency and consistency, fewer repetitions, and n-gram diversity closer to human text. In order to evaluate our model, we propose three new metrics for comparing sparse or truncated distributions: $\epsilon$-perplexity, sparsemax score, and Jensen-Shannon divergence. Human-evaluated experiments in story completion and dialogue generation show that entmax sampling leads to more engaging and coherent stories and conversations.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.348,38938749 +main.652,Incorporating Multimodal Information in Open-Domain Web Keyphrase Extraction,Yansen Wang|Zhen Fan|Carolyn Rose,"Open-domain Keyphrase extraction (KPE) on the Web is a fundamental yet complex NLP task with a wide range of practical applications within the field of Information Retrieval. 
In contrast to other document types, web page designs are intended for easy navigation and information finding. Effective designs encode within the layout and formatting signals that point to where the important information can be found. In this work, we propose a modeling approach that leverages these multi-modal signals to aid in the KPE task. In particular, we leverage both lexical and visual features (e.g., size, font, position) at the micro-level to enable effective strategy induction and meta-level features that describe pages at a macro-level to aid in strategy selection. Our evaluation demonstrates that a combination of effective strategy induction and strategy selection within this approach for the KPE task outperforms state-of-the-art models. A qualitative post-hoc analysis illustrates how these features function within the model.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.140,38938750 +main.658,DyERNIE: Dynamic Evolution of Riemannian Manifold Embeddings for Temporal Knowledge Graph Completion,Zhen Han|Peng Chen|Yunpu Ma|Volker Tresp,"There has recently been increasing interest in learning representations of temporal knowledge graphs (KGs), which record the dynamic relationships between entities over time. Temporal KGs often exhibit multiple simultaneous non-Euclidean structures, such as hierarchical and cyclic structures. However, existing embedding approaches for temporal KGs typically learn entity representations and their dynamic evolution in the Euclidean space, which might not capture such intrinsic structures very well. To this end, we propose DyERNIE, a non-Euclidean embedding approach that learns evolving entity representations in a product of Riemannian manifolds, where the composed spaces are estimated from the sectional curvatures of underlying data. Product manifolds enable our approach to better reflect a wide variety of geometric structures on temporal KGs. Besides, to capture the evolutionary dynamics of temporal KGs, we let the entity representations evolve according to a velocity vector defined in the tangent space at each timestamp. We analyze in detail the contribution of geometric spaces to representation learning of temporal KGs and evaluate our model on temporal knowledge graph completion tasks. Extensive experiments on three real-world datasets demonstrate significantly improved performance, indicating that the dynamics of multi-relational graph data can be more properly modeled by the evolution of embeddings on Riemannian manifolds.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.593,38938751 +main.664,Task-Completion Dialogue Policy Learning via Monte Carlo Tree Search with Dueling Network,Sihan Wang|kaijie zhou|Kunfeng Lai|Jianping Shen,"We introduce a framework of Monte Carlo Tree Search with Double-q Dueling network (MCTS-DDU) for task-completion dialogue policy learning. Different from the previous deep model-based reinforcement learning methods, which uses background planning and may suffer from low-quality simulated experiences, MCTS-DDU performs decision-time planning based on dialogue state search trees built by Monte Carlo simulations and is robust to the simulation errors. Such idea arises naturally in human behaviors, e.g. predicting others' responses and then deciding our own actions. In the simulated movie-ticket booking task, our method outperforms the background planning approaches significantly. 
We demonstrate the effectiveness of MCTS and the dueling network in detailed ablation studies, and also compare the performance upper bounds of these two planning methods.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.278,38938752 +main.666,Exploiting Structured Knowledge in Text via Graph-Guided Representation Learning,Tao Shen|Yi Mao|Pengcheng He|Guodong Long|Adam Trischler|Weizhu Chen,"In this work, we aim at equipping pre-trained language models with structured knowledge. We present two self-supervised tasks learning over raw text with the guidance from knowledge graphs. Building upon entity-level masked language models, our first contribution is an entity masking scheme that exploits relational knowledge underlying the text. This is fulfilled by using a linked knowledge graph to select informative entities and then masking their mentions. In addition, we use knowledge graphs to obtain distractors for the masked entities, and propose a novel distractor-suppressed ranking objective that is optimized jointly with masked language model. In contrast to existing paradigms, our approach uses knowledge graphs implicitly, only during pre-training, to inject language models with structured knowledge via learning from raw text. It is more efficient than retrieval-based methods that perform entity linking and integration during finetuning and inference, and generalizes more effectively than the methods that directly learn from concatenated graph triples. Experiments show that our proposed model achieves improved performance on five benchmarks, including question answering and knowledge base completion.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.722,38938753 +main.668,Multi-view Story Characterization from Movie Plot Synopses and Reviews,Sudipta Kar|Gustavo Aguilar|Mirella Lapata|Thamar Solorio,"This paper considers the problem of characterizing stories by inferring properties such as theme and style using written synopses and reviews of movies. We experiment with a multi-label dataset of movie synopses and a tagset representing various attributes of stories (e.g., genre, type of events). Our proposed multi-view model encodes the synopses and reviews using hierarchical attention and shows improvement over methods that only use synopses. Finally, we demonstrate how we can take advantage of such a model to extract a complementary set of story-attributes from reviews without direct supervision. We have made our dataset and source code publicly available at https://ritual.uh.edu/multiview-tag-2020.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.454,38938754 +main.675,Intrinsic Evaluation of Summarization Datasets,Rishi Bommasani|Claire Cardie,"High quality data forms the bedrock for building meaningful statistical models in NLP. Consequently, data quality must be evaluated either during dataset construction or *post hoc*. Almost all popular summarization datasets are drawn from natural sources and do not come with inherent quality assurance guarantees. In spite of this, data quality has gone largely unquestioned for many of these recent datasets. We perform the first large-scale evaluation of summarization datasets by introducing 5 intrinsic metrics and applying them to 10 popular datasets. We find that data usage in recent summarization research is sometimes inconsistent with the underlying properties of the data. 
Further, we discover that our metrics can serve the additional purpose of being inexpensive heuristics for detecting generically low quality examples.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.649,38938755 +main.684,Dynamic Anticipation and Completion for Multi-Hop Reasoning over Sparse Knowledge Graph,Xin Lv|Xu Han|Lei Hou|Juanzi Li|Zhiyuan Liu|Wei Zhang|YICHI ZHANG|Hao Kong|Suhui Wu,"Multi-hop reasoning has been widely studied in recent years to seek an effective and interpretable method for knowledge graph (KG) completion. Most previous reasoning methods are designed for dense KGs with enough paths between entities, but cannot work well on those sparse KGs that only contain sparse paths for reasoning. On the one hand, sparse KGs contain less information, which makes it difficult for the model to choose correct paths. On the other hand, the lack of evidential paths to target entities also makes the reasoning process difficult. To solve these problems, we propose a multi-hop reasoning model over sparse KGs, by applying novel dynamic anticipation and completion strategies: (1) The anticipation strategy utilizes the latent prediction of embedding-based models to make our model perform more potential path search over sparse KGs. (2) Based on the anticipation information, the completion strategy dynamically adds edges as additional actions during the path search, which further alleviates the sparseness problem of KGs. The experimental results on five datasets sampled from Freebase, NELL and Wikidata show that our method outperforms state-of-the-art baselines. Our codes and datasets can be obtained from https://github.com/THU-KEG/DacKGR.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.459,38938756 +main.689,Semantic Role Labeling Guided Multi-turn Dialogue ReWriter,Kun Xu|Haochen Tan|Linfeng Song|Han Wu|Haisong Zhang|Linqi Song|Dong Yu,"For multi-turn dialogue rewriting, the capacity of effectively modeling the linguistic knowledge in dialog context and getting ride of the noises is essential to improve its performance. Existing attentive models attend to all words without prior focus, which results in inaccurate concentration on some dispensable words. In this paper, we propose to use semantic role labeling (SRL), which highlights the core semantic information of who did what to whom, to provide additional guidance for the rewriter model. Experiments show that this information significantly improves a RoBERTa-based model that already outperforms previous state-of-the-art systems.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.537,38938757 +main.693,Coarse-to-Fine Query Focused Multi-Document Summarization,Yumo Xu|Mirella Lapata,"We consider the problem of better modeling query-cluster interactions to facilitate query focused multi-document summarization. Due to the lack of training data, existing work relies heavily on retrieval-style methods for assembling query relevant summaries. We propose a coarse-to-fine modeling framework which employs progressively more accurate modules for estimating whether text segments are relevant, likely to contain an answer, and central. The modules can be independently developed and leverage training data if available. 
We present an instantiation of this framework with a trained evidence estimator which relies on distant supervision from question answering (where various resources exist) to identify segments which are likely to answer the query and should be included in the summary. Our framework is robust across domains and query types (i.e., long vs short) and outperforms strong comparison systems on benchmark datasets.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.296,38938758 +main.699,Learning a Simple and Effective Model for Multi-turn Response Generation with Auxiliary Tasks,YUFAN ZHAO|Can Xu|wei wu,"We study multi-turn response generation for open-domain dialogues. The existing state-of-the-art addresses the problem with deep neural architectures. While these models improved response quality, their complexity also hinders the application of the models in real systems. In this work, we pursue a model that has a simple structure yet can effectively leverage conversation contexts for response generation. To this end, we propose four auxiliary tasks including word order recovery, utterance order recovery, masked word recovery, and masked utterance recovery, and optimize the objectives of these tasks together with maximizing the likelihood of generation. By this means, the auxiliary tasks that relate to context understanding can guide the learning of the generation model to achieve a better local optimum. Empirical studies with three benchmarks indicate that our model can significantly outperform state-of-the-art generation models in terms of response quality on both automatic evaluation and human judgment, and at the same time enjoys a much faster decoding process.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.279,38938759 +main.701,Towards Enhancing Faithfulness for Neural Machine Translation,Rongxiang Weng|Heng Yu|Xiangpeng Wei|Weihua Luo,"Neural machine translation (NMT) has achieved great success due to the ability to generate high-quality sentences. Compared with human translations, one of the drawbacks of current NMT is that translations are not usually faithful to the input, e.g., omitting information or generating unrelated fragments, which inevitably decreases the overall quality, especially for human readers. In this paper, we propose a novel training strategy with a multi-task learning paradigm to build a faithfulness enhanced NMT model (named \textsc{FEnmt}). During the NMT training process, we sample a subset from the training set and translate them to get fragments that have been mistranslated. Afterward, the proposed multi-task learning paradigm is employed on both encoder and decoder to guide NMT to correctly translate these fragments. Both automatic and human evaluations verify that our \textsc{FEnmt} could improve translation quality by effectively reducing unfaithful translations.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.212,38938760 +main.702,VMSMO: Learning to Generate Multimodal Summary for Video-based News Articles,Mingzhe Li|Xiuying Chen|Shen Gao|Zhangming Chan|Dongyan Zhao|Rui Yan,"A popular multimedia news format nowadays is providing users with a lively video and a corresponding news article, which is employed by influential news media including CNN, BBC, and social media including Twitter and Weibo. 
In such a case, automatically choosing a proper cover frame of the video and generating an appropriate textual summary of the article can help editors save time, and readers make the decision more effectively. Hence, in this paper, we propose the task of Video-based Multimodal Summarization with Multimodal Output (VMSMO) to tackle such a problem. The main challenge in this task is to jointly model the temporal dependency of video with semantic meaning of article. To this end, we propose a Dual-Interaction-based Multimodal Summarizer (DIMS), consisting of a dual interaction module and multimodal generator. In the dual interaction module, we propose a conditional self-attention mechanism that captures local semantic information within video and a global-attention mechanism that handles the semantic relationship between news text and video from a high level. Extensive experiments conducted on a large-scale real-world VMSMO dataset show that DIMS achieves the state-of-the-art performance in terms of both automatic metrics and human evaluations.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.752,38938761 +main.714,Pre-training for Abstractive Document Summarization by Reinstating Source Text,Yanyan Zou|Xingxing Zhang|Wei Lu|Furu Wei|Ming Zhou,"Abstractive document summarization is usually modeled as a sequence-to-sequence (SEQ2SEQ) learning problem. Unfortunately, training large SEQ2SEQ based summarization models on limited supervised summarization data is challenging. This paper presents three sequence-to-sequence pre-training (in shorthand, STEP) objectives which allow us to pre-train a SEQ2SEQ based abstractive summarization model on unlabeled text. The main idea is that, given an input text artificially constructed from a document, a model is pre-trained to reinstate the original document. These objectives include sentence reordering, next sentence generation and masked document generation, which have close relations with the abstractive document summarization task. Experiments on two benchmark summarization datasets (i.e., CNN/DailyMail and New York Times) show that all three objectives can improve performance upon baselines. Compared to models pre-trained on large-scale data (larger than 160GB), our method, with only 19GB text for pre-training, achieves comparable results, which demonstrates its effectiveness.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.297,38938762 +main.730,Gradient-guided Unsupervised Lexically Constrained Text Generation,Lei Sha,"Lexically constrained generation requires the target sentence to satisfy some lexical constraints, such as containing some specific words or being the paraphrase to a given sentence, which is very important in many real-world natural language generation applications. Previous works usually apply beam-search-based methods or stochastic searching methods to lexically-constrained generation. However, when the search space is too large, beam-search-based methods always fail to find the constrained optimal solution. At the same time, stochastic search methods always cost too many steps to find the correct optimization direction. In this paper, we propose a novel method G2LC to solve the lexically-constrained generation as an unsupervised gradient-guided optimization problem. We propose a differentiable objective function and use the gradient to help determine which position in the sequence should be changed (deleted or inserted/replaced by another word). 
The word updating process of the inserted/replaced word also benefits from the guidance of gradient. Besides, our method is free of parallel data training, which is flexible to be used in the inference stage of any pre-trained generation model. We apply G2LC to two generation tasks: keyword-to-sentence generation and unsupervised paraphrase generation. The experiment results show that our method achieves state-of-the-art compared to previous lexically-constrained methods.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.701,38938763 +main.733,"Tasty Burgers, Soggy Fries: Probing Aspect Robustness in Aspect-Based Sentiment Analysis",Xiaoyu Xing|Zhijing Jin|Di Jin|Bingning Wang|Qi Zhang|Xuanjing Huang,"Aspect-based sentiment analysis (ABSA) aims to predict the sentiment towards a specific aspect in the text. However, existing ABSA test sets cannot be used to probe whether a model can distinguish the sentiment of the target aspect from the non-target aspects. To solve this problem, we develop a simple but effective approach to enrich ABSA test sets. Specifically, we generate new examples to disentangle the confounding sentiments of the non-target aspects from the target aspect's sentiment. Based on the SemEval 2014 dataset, we construct the Aspect Robustness Test Set (ARTS) as a comprehensive probe of the aspect robustness of ABSA models. Over 92% data of ARTS show high fluency and desired sentiment on all aspects by human evaluation. Using ARTS, we analyze the robustness of nine ABSA models, and observe, surprisingly, that their accuracy drops by up to 69.73%. We explore several ways to improve aspect robustness, and find that adversarial training can improve models' performance on ARTS by up to 32.85%. Our code and new test set are available at https://github.com/zhijing-jin/ARTS_TestSet",,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",Long,https://www.aclweb.org/anthology/2020.emnlp-main.292,38938764 +main.74,Zero-Shot Cross-Lingual Transfer with Meta Learning,Farhad Nooralahzadeh|Giannis Bekoulis|Johannes Bjerva|Isabelle Augenstein,"Learning what to share between tasks has become a topic of great importance, as strategic sharing of knowledge has been shown to improve downstream task performance. This is particularly important for multilingual applications, as most languages in the world are under-resourced. Here, we consider the setting of training models on multiple different languages at the same time, when little or no data is available for languages other than English. We show that this challenging setup can be approached using meta-learning: in addition to training a source language model, another model learns to select which training instances are the most beneficial to the first. We experiment using standard supervised, zero-shot cross-lingual, as well as few-shot cross-lingual settings for different natural language understanding tasks (natural language inference, question answering). Our extensive experimental setup demonstrates the consistent effectiveness of meta-learning for a total of 15 languages. We improve upon the state-of-the-art for zero-shot and few-shot NLI (on MultiNLI and XNLI) and QA (on the MLQA dataset). 
A comprehensive error analysis indicates that the correlation of typological features between languages can partly explain when parameter sharing learned via meta-learning is beneficial.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.368,38938645 +main.744,Adversarial Semantic Decoupling for Recognizing Open-Vocabulary Slots,Yuanmeng Yan|Keqing He|Hong Xu|Sihong Liu|Fanyu Meng|Min Hu|Weiran XU,"Open-vocabulary slots, such as file name, album name, or schedule title, significantly degrade the performance of neural-based slot filling models since these slots can take on values from a virtually unlimited set and have no semantic restriction nor a length limit. In this paper, we propose a robust adversarial model-agnostic slot filling method that explicitly decouples local semantics inherent in open-vocabulary slot words from the global context. We aim to depart entangled contextual semantics and focus more on the holistic context at the level of the whole sentence. Experiments on two public datasets show that our method consistently outperforms other methods with a statistically significant margin on all the open-vocabulary slots without deteriorating the performance of normal slots.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.490,38938765 +main.745,Transformer Based Multi-Source Domain Adaptation,Dustin Wright|Isabelle Augenstein,"In practical machine learning settings, the data on which a model must make predictions often come from a different distribution than the data it was trained on. Here, we investigate the problem of unsupervised multi-source domain adaptation, where a model is trained on labelled data from multiple source domains and must make predictions on a domain for which no labelled data has been seen. Prior work with CNNs and RNNs has demonstrated the benefit of mixture of experts, where the predictions of multiple domain expert classifiers are combined; as well as domain adversarial training, to induce a domain agnostic representation space. Inspired by this, we investigate how such methods can be effectively applied to large pretrained transformer models. We find that domain adversarial training has an effect on the learned representations of these models while having little effect on their performance, suggesting that large transformer-based models are already relatively robust across domains. Additionally, we show that mixture of experts leads to significant performance improvements by comparing several variants of mixing functions, including one novel metric based on attention. Finally, we demonstrate that the predictions of large pretrained transformer based domain experts are highly homogenous, making it challenging to learn effective metrics for mixing their predictions.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.639,38938766 +main.748,COMETA: A Corpus for Medical Entity Linking in the Social Media,Marco Basaldella|Fangyu Liu|Ehsan Shareghi|Nigel Collier,"Whilst there has been growing progress in Entity Linking (EL) for general language, existing datasets fail to address the complex nature of health terminology in layman's language. Meanwhile, there is a growing need for applications that can understand the public's voice in the health domain. 
To address this we introduce a new corpus called COMETA, consisting of 20k English biomedical entity mentions from Reddit expert-annotated with links to SNOMED CT, a widely-used medical knowledge graph. Our corpus satisfies a combination of desirable properties, from scale and coverage to diversity and quality, that to the best of our knowledge has not been met by any of the existing resources in the field. Through benchmark experiments on 20 EL baselines from string- to neural-based models we shed light on the ability of these systems to perform complex inference on entities and concepts under 2 challenging evaluation scenarios. Our experimental results on COMETA illustrate that no golden bullet exists and even the best mainstream techniques still have a significant performance gap to fill, while the best solution relies on combining different views of data.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.253,38938767 +main.750,UDapter: Language Adaptation for Truly Universal Dependency Parsing,Ahmet Üstün|Arianna Bisazza|Gosse Bouma|Gertjan van Noord,"Recent advances in multilingual dependency parsing have brought the idea of a truly universal parser closer to reality. However, cross-language interference and restrained model capacity remain major obstacles. To address this, we propose a novel multilingual task adaptation approach based on contextual parameter generation and adapter modules. This approach enables to learn adapters via language embeddings while sharing model parameters across languages. It also allows for an easy but effective integration of existing linguistic typology features into the parsing network. The resulting parser, UDapter, outperforms strong monolingual and multilingual baselines on the majority of both high-resource and low-resource (zero-shot) languages, showing the success of the proposed adaptation approach. Our in-depth analyses show that soft parameter sharing via typological features is key to this success.",,"Syntax: Tagging, Chunking, and Parsing",Long,https://www.aclweb.org/anthology/2020.emnlp-main.180,38938768 +main.754,Compositional Phrase Alignment and beyond,Yuki Arase|Jun'ichi Tsujii,"Phrase alignment is the basis for modelling sentence pair interactions, such as paraphrase and textual entailment recognition. Most phrase alignments are compositional processes such that an alignment of a phrase pair is constructed based on the alignments of their child phrases. Nonetheless, studies have revealed that non-compositional alignments involving long-distance phrase reordering are prevalent in practice. We address the phrase alignment problem by combining an unordered tree mapping algorithm and phrase representation modelling that explicitly embeds the similarity distribution in the sentences onto powerful contextualized representations. Experimental results demonstrate that our method effectively handles compositional and non-compositional global phrase alignments. Our method significantly outperforms that used in a previous study and achieves a performance competitive with that of experienced human annotators.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.125,38938769 +main.76,ALICE: Active Learning with Contrastive Natural Language Explanations,Weixin Liang|James Zou|Zhou Yu,"Training a supervised neural network classifier typically requires many annotated training samples. 
Collecting and annotating a large number of data points are costly and sometimes even infeasible. Traditional annotation process uses a low-bandwidth human-machine communication interface: classification labels, each of which only provides a few bits of information. We propose Active Learning with Contrastive Explanations (ALICE), an expert-in-the-loop training framework that utilizes contrastive natural language explanations to improve data efficiency in learning. ALICE learns to first use active learning to select the most informative pairs of label classes to elicit contrastive natural language explanations from experts. Then it extracts knowledge from these explanations using a semantic parser. Finally, it incorporates the extracted knowledge through dynamically changing the learning model’s structure. We applied ALICE in two visual recognition tasks, bird species classification and social relationship classification. We found by incorporating contrastive explanations, our models outperform baseline models that are trained with 40-100% more training data. We found that adding 1 explanation leads to similar performance gain as adding 13-30 labeled training data points.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.355,38938646 +main.763,Bootstrapped Q-learning with Context Relevant Observation Pruning to Generalize in Text-based Games,Subhajit Chaudhury|Daiki Kimura|Kartik Talamadupula|Michiaki Tatsubori|Asim Munawar|Ryuki Tachibana,"We show that Reinforcement Learning (RL) methods for solving Text-Based Games (TBGs) often fail to generalize on unseen games, especially in small data regimes. To address this issue, we propose Context Relevant Episodic State Truncation (CREST) for irrelevant token removal in observation text for improved generalization. Our method first trains a base model using Q-learning, which typically overfits the training games. The base model's action token distribution is used to perform observation pruning that removes irrelevant tokens. A second bootstrapped model is then retrained on the pruned observation text. Our bootstrapped agent shows improved generalization in solving unseen TextWorld games, using 10x-20x fewer training games compared to previous state-of-the-art (SOTA) methods despite requiring fewer number of training episodes.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.241,38938770 +main.767,Grammatical Error Correction in Low Error Density Domains: A New Benchmark and Analyses,Simon Flachs|Ophélie Lacroix|Helen Yannakoudakis|Marek Rei|Anders Søgaard,"Evaluation of grammatical error correction (GEC) systems has primarily focused on essays written by non-native learners of English, which however is only part of the full spectrum of GEC applications. We aim to broaden the target domain of GEC and release CWEB, a new benchmark for GEC consisting of website text generated by English speakers of varying levels of proficiency. Website data is a common and important domain that contains far fewer grammatical errors than learner essays, which we show presents a challenge to state-of-the-art GEC systems. We demonstrate that a factor behind this is the inability of systems to rely on a strong internal language model in low error density domains.
We hope this work shall facilitate the development of open-domain GEC models that generalize to different topics and genres.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.680,38938771 +main.782,Is Graph Structure Necessary for Multi-hop Question Answering?,Nan Shao|Yiming Cui|Ting Liu|Shijin Wang|Guoping Hu,"Recently, attempting to model texts as graph structure and introducing graph neural networks to deal with it has become a trend in many NLP research areas. In this paper, we investigate whether the graph structure is necessary for textual multi-hop reasoning. Our analysis is centered on HotpotQA. We construct a strong baseline model to establish that, with the proper use of pre-trained models, graph structure may not be necessary for textual multi-hop reasoning. We point out that both graph structure and adjacency matrix are task-related prior knowledge, and graph-attention can be considered as a special case of self-attention. Experiments demonstrate that graph-attention or the entire graph structure can be replaced by self-attention or Transformers.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.583,38938772 +main.787,AttnIO: Knowledge Graph Exploration with In-and-Out Attention Flow for Knowledge-Grounded Dialogue,Jaehun Jung|Bokyung Son|Sungwon Lyu,"Retrieving the proper knowledge relevant to conversational context is an important challenge in dialogue systems, to engage users with more informative response. Several recent works propose to formulate this knowledge selection problem as a path traversal over an external knowledge graph (KG), but show only a limited utilization of KG structure, leaving rooms of improvement in performance. To this effect, we present AttnIO, a new dialog-conditioned path traversal model that makes a full use of rich structural information in KG based on two directions of attention flows. Through the attention flows, AttnIO is not only capable of exploring a broad range of multi-hop knowledge paths, but also learns to flexibly adjust the varying range of plausible nodes and edges to attend depending on the dialog context. Empirical evaluations present a marked performance improvement of AttnIO compared to all baselines in OpenDialKG dataset. Also, we find that our model can be trained to generate an adequate knowledge path even when the paths are not available and only the destination nodes are given as label, making it more applicable to real-world dialogue systems.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.280,38938773 +main.789,On the Reliability and Validity of Detecting Approval of Political Actors in Tweets,Indira Sen|Fabian Flöck|Claudia Wagner,"Social media sites like Twitter possess the potential to complement surveys that measure political opinions and, more specifically, political actors' approval. However, new challenges related to the reliability and validity of social-media-based estimates arise. Various sentiment analysis and stance detection methods have been developed and used in previous research to measure users' political opinions based on their content on social media. In this work, we attempt to gauge the efficacy of untargeted sentiment, targeted sentiment, and stance detection methods in labeling various political actors' approval by benchmarking them across several datasets. 
We also contrast the performance of these pretrained methods that can be used in an off-the-shelf (OTS) manner against a set of models trained on minimal custom data. We find that OTS methods have low generalizability on unseen and familiar targets, while low-resource custom models are more robust. Our work sheds light on the strengths and limitations of existing methods proposed for understanding politicians' approval from tweets.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.110,38938774 +main.802,Multi-Dimensional Gender Bias Classification,Emily Dinan|Angela Fan|Ledell Wu|Jason Weston|Douwe Kiela|Adina Williams,"Machine learning models are trained to find patterns in data. NLP models can inadvertently learn socially undesirable patterns when training on gender biased text. In this work, we propose a novel, general framework that decomposes gender bias in text along several pragmatic and semantic dimensions: bias from the gender of the person being spoken about, bias from the gender of the person being spoken to, and bias from the gender of the speaker. Using this fine-grained framework, we automatically annotate eight large scale datasets with gender information. In addition, we collect a new, crowdsourced evaluation benchmark. Distinguishing between gender bias along multiple dimensions enables us to train better and more fine-grained gender bias classifiers. We show our classifiers are valuable for a variety of applications, like controlling for gender bias in generative models, detecting gender bias in arbitrary text, and classifying text as offensive based on its genderedness.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.23,38938775 +main.809,Modeling the Music Genre Perception across Language-Bound Cultures,Elena V. Epure|Guillaume Salha|Manuel Moussallam|Romain Hennequin,"The music genre perception expressed through human annotations of artists or albums varies significantly across language-bound cultures. These variations cannot be modeled as mere translations since we also need to account for cultural differences in the music genre perception. In this work, we study the feasibility of obtaining relevant cross-lingual, culture-specific music genre annotations based only on language-specific semantic representations, namely distributed concept embeddings and ontologies. Our study, focused on six languages, shows that unsupervised cross-lingual music genre annotation is feasible with high accuracy, especially when combining both types of representations. This approach of studying music genres is the most extensive to date and has many implications in musicology and music information retrieval. Besides, we introduce a new, domain-dependent cross-lingual corpus to benchmark state of the art multilingual pre-trained embedding models.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.386,38938776 +main.820,Word Class Flexibility: A Deep Contextualized Approach,Bai Li|Guillaume Thomas|Yang Xu|Frank Rudzicz,"Word class flexibility refers to the phenomenon whereby a single word form is used across different grammatical categories. Extensive work in linguistic typology has sought to characterize word class flexibility across languages, but quantifying this phenomenon accurately and at scale has been fraught with difficulties. We propose a principled methodology to explore regularity in word class flexibility. 
Our method builds on recent work in contextualized word embeddings to quantify semantic shift between word classes (e.g., noun-to-verb, verb-to-noun), and we apply this method to 37 languages. We find that contextualized embeddings not only capture human judgment of class variation within words in English, but also uncover shared tendencies in class flexibility across languages. Specifically, we find greater semantic variation when flexible lemmas are used in their dominant word class, supporting the view that word class flexibility is a directional process. Our work highlights the utility of deep contextualized models in linguistic typology.",,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",Long,https://www.aclweb.org/anthology/2020.emnlp-main.71,38938777 +main.821,Vector-Vector-Matrix Architecture: A Novel Hardware-Aware Framework for Low-Latency Inference in NLP Applications,Matthew Khoury|Rumen Dangovski|Longwu Ou|Preslav Nakov|Yichen Shen|Li Jing,"Deep neural networks have become the standard approach to building reliable Natural Language Processing (NLP) applications, ranging from Neural Machine Translation (NMT) to dialogue systems. However, improving accuracy by increasing the model size requires a large number of hardware computations, which can slow down NLP applications significantly at inference time. To address this issue, we propose a novel vector-vector-matrix architecture (VVMA), which greatly reduces the latency at inference time for NMT. This architecture takes advantage of specialized hardware that has low-latency vector-vector operations and higher-latency vector-matrix operations. It also reduces the number of parameters and FLOPs for virtually all models that rely on efficient matrix multipliers without significantly impacting accuracy. We present empirical results suggesting that our framework can reduce the latency of sequence-to-sequence and Transformer models used for NMT by a factor of four. Finally, we show evidence suggesting that our VVMA extends to other domains, and we discuss novel hardware for its efficient use.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.640,38938778 +main.825,Multilevel Text Alignment with Cross-Document Attention,Xuhui Zhou|Nikolaos Pappas|Noah A. Smith,"Text alignment finds application in tasks such as citation recommendation and plagiarism detection. Existing alignment methods operate at a single, predefined level and cannot learn to align texts at, for example, sentence \emph{and} document levels. We propose a new learning approach that equips previously established hierarchical attention encoders for representing documents with a cross-document attention component, enabling structural comparisons across different levels (document-to-document and sentence-to-document). Our component is weakly supervised from document pairs and can align at multiple levels. 
Our evaluation on predicting document-to-document relationships and sentence-to-document relationships on the tasks of citation recommendation and plagiarism detection shows that our approach outperforms previously established hierarchical, attention encoders based on recurrent and transformer contextualization that are unaware of structural correspondence between documents.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.407,38938779 +main.834,Queens Are Powerful Too: Mitigating Gender Bias in Dialogue Generation,Emily Dinan|Angela Fan|Adina Williams|Jack Urbanek|Douwe Kiela|Jason Weston,"Social biases present in data are often directly reflected in the predictions of models trained on that data. We analyze gender bias in dialogue data, and examine how this bias is not only replicated, but is also amplified in subsequent generative chit-chat dialogue models. We measure gender bias in six existing dialogue datasets before selecting the most biased one, the multi-player text-based fantasy adventure dataset LIGHT, as a testbed for bias mitigation techniques. We consider three techniques to mitigate gender bias: counterfactual data augmentation, targeted data collection, and bias controlled training. We show that our proposed techniques mitigate gender bias by balancing the genderedness of generated dialogue utterances, and find that they are particularly effective in combination. We evaluate model performance with a variety of quantitative methods---including the quantity of gendered words, a dialogue safety classifier, and human assessments---all of which show that our models generate less gendered, but equally engaging chit-chat responses.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.656,38938780 +main.835,COMET: A Neural Framework for MT Evaluation,Ricardo Rei|Craig Stewart|Ana C Farinha|Alon Lavie,"We present COMET, a neural framework for training multilingual machine translation evaluation models which obtains new state-of-the-art levels of correlation with human judgements. Our framework leverages recent breakthroughs in cross-lingual pretrained language modeling resulting in highly multilingual and adaptable MT evaluation models that exploit information from both the source input and a target-language reference translation in order to more accurately predict MT quality. To showcase our framework, we train three models with different types of human judgements: Direct Assessments, Human-mediated Translation Edit Rate and Multidimensional Quality Metric. Our models achieve new state-of-the-art performance on the WMT 2019 Metrics shared task and demonstrate robustness to high-performing systems.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.213,38938781 +main.838,Unsupervised Discovery of Implicit Gender Bias,Anjalie Field|Yulia Tsvetkov,"Despite their prevalence in society, social biases are difficult to identify, primarily because human judgements in this domain can be unreliable. We take an unsupervised approach to identifying gender bias against women at a comment level and present a model that can surface text likely to contain bias. Our main challenge is forcing the model to focus on signs of implicit bias, rather than other artifacts in the data. Thus, our methodology involves reducing the influence of confounds through propensity matching and adversarial learning.
Our analysis shows how biased comments directed towards female politicians contain mixed criticisms, while comments directed towards other female public figures focus on appearance and sexualization. Ultimately, our work offers a way to capture subtle biases in various domains without relying on subjective human judgements.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.44,38938782 +main.84,BLEU Might Be Guilty but References Are Not Innocent,Markus Freitag|David Grangier|Isaac Caswell,"The quality of automatic metrics for machine translation has been increasingly called into question, especially for high-quality systems. This paper demonstrates that, while choice of metric is important, the nature of the references is also critical. We study different methods to collect references and compare their value in automated evaluation by reporting correlation with human evaluation for a variety of systems and metrics. Motivated by the finding that typical references exhibit poor diversity, concentrating around translationese language, we develop a paraphrasing task for linguists to perform on existing reference translations, which counteracts this bias. Our method yields higher correlation with human judgment not only for the submissions of WMT 2019 English to German, but also for Back-translation and APE augmented MT output, which have been shown to have low correlation with automatic metrics using standard references. We demonstrate that our methodology improves correlation with all modern evaluation metrics we look at, including embedding-based methods. To complete this picture, we reveal that multi-reference BLEU does not improve the correlation for high quality output, and present an alternative multi-reference formulation that is more effective.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.5,38938647 +main.850,Low-Resource Domain Adaptation for Compositional Task-Oriented Semantic Parsing,Xilun Chen|Asish Ghoshal|Yashar Mehdad|Luke Zettlemoyer|Sonal Gupta,"Task-oriented semantic parsing is a critical component of virtual assistants, which is responsible for understanding the user’s intents (set reminder, play music, etc.). Recent advances in deep learning have enabled several approaches to successfully parse more complex queries (Gupta et al., 2018; Rongali et al., 2020), but these models require a large amount of annotated training data to parse queries on new domains (e.g. reminder, music). In this paper, we focus on adapting task-oriented semantic parsers to low-resource domains, and propose a novel method that outperforms a supervised neural model at a 10-fold data reduction. In particular, we identify two fundamental factors for low-resource domain adaptation: better representation learning and better training techniques. Our representation learning uses BART (Lewis et al., 2019) to initialize our model which outperforms encoder-only pre-trained representations used in previous work. Furthermore, we train with optimization-based meta-learning (Finn et al., 2017) to improve generalization to low-resource domains.
This approach significantly outperforms all baseline methods in the experiments on a newly collected multi-domain task-oriented semantic parsing dataset (TOPv2), which we release to the public.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.413,38938783 +main.851,Joint Estimation and Analysis of Risk Behavior Ratings in Movie Scripts,Victor Martinez|Krishna Somandepalli|Yalda Tehranian-Uhls|Shrikanth Narayanan,"Exposure to violent, sexual, or substance-abuse content in media increases the willingness of children and adolescents to imitate similar behaviors. Computational methods that identify portrayals of risk behaviors from audio-visual cues are limited in their applicability to films in post-production, where modifications might be prohibitively expensive. To address this limitation, we propose a model that estimates content ratings based on the language use in movie scripts, making our solution available at the earlier stages of creative production. Our model significantly improves the state-of-the-art by adapting novel techniques to learn better movie representations from the semantic and sentiment aspects of a character’s language use, and by leveraging the co-occurrence of risk behaviors, following a multi-task approach. Additionally, we show how this approach can be useful to learn novel insights on the joint portrayal of these behaviors, and on the subtleties that filmmakers may otherwise not pick up on.",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.387,38938784 +main.852,Reusing a Pretrained Language Model on Languages with Limited Corpora for Unsupervised NMT,Alexandra Chronopoulou|Dario Stojanovski|Alexander Fraser,"Using a language model (LM) pretrained on two languages with large monolingual data in order to initialize an unsupervised neural machine translation (UNMT) system yields state-of-the-art results. When limited data is available for one language, however, this method leads to poor translations. We present an effective approach that reuses an LM that is pretrained only on the high-resource language. The monolingual LM is fine-tuned on both languages and is then used to initialize a UNMT model. To reuse the pretrained LM, we have to modify its predefined vocabulary, to account for the new language. We therefore propose a novel vocabulary extension method. Our approach, RE-LM, outperforms a competitive cross-lingual pretraining model (XLM) in English-Macedonian (En-Mk) and English-Albanian (En-Sq), yielding more than +8.3 BLEU points for all four translation directions.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.214,38938785 +main.856,Simulated Multiple Reference Training Improves Low-Resource Machine Translation,Huda Khayrallah|Brian Thompson|Matt Post|Philipp Koehn,"Many valid translations exist for a given sentence, yet machine translation (MT) is trained with a single reference translation, exacerbating data sparsity in low-resource settings. We introduce Simulated Multiple Reference Training (SMRT), a novel MT training method that approximates the full space of possible translations by sampling a paraphrase of the reference sentence from a paraphraser and training the MT model to predict the paraphraser’s distribution over possible tokens. We demonstrate the effectiveness of SMRT in low-resource settings when translating to English, with improvements of 1.2 to 7.0 BLEU. 
We also find SMRT is complementary to back-translation.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.7,38938786 +main.858,On the Evaluation of Contextual Embeddings for Zero-Shot Cross-Lingual Transfer Learning,Phillip Keung|Yichao Lu|Julian Salazar|Vikas Bhardwaj,"Multilingual contextual embeddings have demonstrated state-of-the-art performance in zero-shot cross-lingual transfer learning, where multilingual BERT is fine-tuned on one source language and evaluated on a different target language. However, published results for mBERT zero-shot accuracy vary as much as 17 points on the MLDoc classification task across four papers. We show that the standard practice of using English dev accuracy for model selection in the zero-shot setting makes it difficult to obtain reproducible results on the MLDoc and XNLI tasks. English dev accuracy is often uncorrelated (or even anti-correlated) with target language accuracy, and zero-shot performance varies greatly at different points in the same fine-tuning run and between different fine-tuning runs. These reproducibility issues are also present for other tasks with different pre-trained embeddings (e.g., MLQA with XLM-R). We recommend providing oracle scores alongside zero-shot results: still fine-tune using English data, but choose a checkpoint with the target dev set. Reporting this upper bound makes results more consistent by avoiding arbitrarily bad checkpoints.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.40,38938787 +main.861,Severing the Edge between before and after: Neural Architectures for Temporal Ordering of Events,Miguel Ballesteros|Rishita Anubhai|Shuai Wang|Nima Pourdamghani|Yogarshi Vyas|Jie Ma|Parminder Bhatia|Kathleen McKeown|Yaser Al-Onaizan,"In this paper, we propose a neural architecture and a set of training methods for ordering events by predicting temporal relations. Our proposed models receive a pair of events within a span of text as input and they identify temporal relations (Before, After, Equal, Vague) between them. Given that a key challenge with this task is the scarcity of annotated data, our models rely on either pretrained representations (i.e. RoBERTa, BERT or ELMo), transfer and multi-task learning (by leveraging complementary datasets), and self-training techniques. Experiments on the MATRES dataset of English documents establish a new state-of-the-art on this task.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.436,38938788 +main.865,LNMap: Departures from Isomorphic Assumption in Bilingual Lexicon Induction through Non-Linear Mapping in Latent Space,Tasnim Mohiuddin|M Saiful Bari|Shafiq Joty,"Most of the successful and predominant methods for Bilingual Lexicon Induction (BLI) are mapping-based, where a linear mapping function is learned with the assumption that the word embedding spaces of different languages exhibit similar geometric structures (i.e. approximately isomorphic). However, several recent studies have criticized this simplified assumption showing that it does not hold in general even for closely related languages. In this work, we propose a novel semi-supervised method to learn cross-lingual word embeddings for BLI. Our model is independent of the isomorphic assumption and uses non-linear mapping in the latent space of two independently pre-trained autoencoders. 
Through extensive experiments on fifteen (15) different language pairs (in both directions) comprising resource-rich and low-resource languages from two different datasets, we demonstrate that our method outperforms existing models by a good margin. Ablation studies show the importance of different model components and the necessity of non-linear mapping.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.215,38938789 +main.868,Writing Strategies for Science Communication: Data and Computational Analysis,Tal August|Lauren Kim|Katharina Reinecke|Noah A. Smith,"Communicating complex scientific ideas without misleading or overwhelming the public is challenging. While science communication guides exist, they rarely offer empirical evidence for how their strategies are used in practice. Writing strategies that can be automatically recognized could greatly support science communication efforts by enabling tools to detect and suggest strategies for writers. We compile a set of writing strategies drawn from a wide range of prescriptive sources and develop an annotation scheme allowing humans to recognize them. We collect a corpus of 128k science writing documents in English and annotate a subset of this corpus. We use the annotations to train transformer-based classifiers and measure the strategies' use in the larger corpus. We find that the use of strategies, such as storytelling and emphasizing the most important findings, varies significantly across publications with different reader audiences.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.429,38938790 +main.87,How Do Decisions Emerge across Layers in Neural Models? Interpretation with Differentiable Masking,Nicola De Cao|Michael Sejr Schlichtkrull|Wilker Aziz|Ivan Titov,"Attribution methods assess the contribution of inputs to the model prediction. One way to do so is erasure: a subset of inputs is considered irrelevant if it can be removed without affecting the prediction. Though conceptually simple, erasure's objective is intractable and approximate search remains expensive with modern deep NLP models. Erasure is also susceptible to the hindsight bias: the fact that an input can be dropped does not mean that the model `knows' it can be dropped. The resulting pruning is over-aggressive and does not reflect how the model arrives at the prediction. To deal with these challenges, we introduce Differentiable Masking. DiffMask learns to mask-out subsets of the input while maintaining differentiability. The decision to include or disregard an input token is made with a simple model based on intermediate hidden layers of the analyzed model. First, this makes the approach efficient because we predict rather than search. Second, as with probing classifiers, this reveals what the network `knows' at the corresponding layers. This lets us not only plot attribution heatmaps but also analyze how decisions are formed across network layers. We use DiffMask to study BERT models on sentiment classification and question answering.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.262,38938648 +main.870,Multilingual AMR-to-Text Generation,Angela Fan|Claire Gardent,"Generating text from structured data is challenging because it requires bridging the gap between (i) structure and natural language (NL) and (ii) semantically underspecified input and fully specified NL output. 
Multilingual generation brings in an additional challenge: that of generating into languages with varied word order and morphological properties. In this work, we focus on Abstract Meaning Representations (AMRs) as structured input, where previous research has overwhelmingly focused on generating only into English. We leverage advances in cross-lingual embeddings, pretraining, and multilingual models to create multilingual AMR-to-text models that generate in twenty one different languages. Our multilingual models surpass baselines that generate into one language in eighteen languages, based on automatic metrics. We analyze the ability of our multilingual models to accurately capture morphology and word order using human evaluation, and find that native speakers judge our generations to be fluent.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.231,38938791 +main.871,"XGLUE: A New Benchmark Dataset for Cross-lingual Pre-training, Understanding and Generation",Yaobo Liang|Nan Duan|Yeyun Gong|Ning Wu|Fenfei Guo|Weizhen Qi|Ming Gong|Linjun Shou|Daxin Jiang|Guihong Cao|Xiaodong Fan|Ruofei Zhang|Rahul Agrawal|Edward Cui|Sining Wei|Taroon Bharti|Ying Qiao|Jiun-Hung Chen|Winnie Wu|Shuguang Liu|Fan Yang|Daniel Campos|Rangan Majumder|Ming Zhou,"In this paper, we introduce XGLUE, a new benchmark dataset to train large-scale cross-lingual pre-trained models using multilingual and bilingual corpora, and evaluate their performance across a diverse set of cross-lingual tasks. Comparing to GLUE (Wang et al., 2019), which is labeled in English and includes natural language understanding tasks only, XGLUE has three main advantages: (1) it provides two corpora with different sizes for cross-lingual pre-training; (2) it provides 11 diversified tasks that cover both natural language understanding and generation scenarios; (3) for each task, it provides labeled data in multiple languages. We extend a recent cross-lingual pre-trained model Unicoder (Huang et al., 2019) to cover both understanding and generation tasks, which is evaluated on XGLUE as a strong baseline. We also evaluate the base versions (12-layer) of Multilingual BERT, XLM and XLM-R for comparison.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.484,38938792 +main.872,Where Are the Facts? Searching for Fact-checked Information to Alleviate the Spread of Fake News,Nguyen Vo|Kyumin Lee,"Although many fact-checking systems have been developed in academia and industry, fake news is still proliferating on social media. These systems mostly focus on fact-checking but usually neglect online users who are the main drivers of the spread of misinformation. How can we use fact-checked information to improve users’ consciousness of fake news to which they are exposed? How can we stop users from spreading fake news? To tackle these questions, we propose a novel framework to search for fact-checking articles, which address the content of an original tweet (that may contain misinformation) posted by online users. The search can directly warn fake news posters and online users (e.g. the posters' followers) about misinformation, discourage them from spreading fake news, and scale up verified content on social media. Our framework uses both text and images to search for fact-checking articles, and achieves promising results on real-world datasets.
Our code and datasets are released at https://github.com/nguyenvo09/EMNLP2020.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.621,38938793 +main.875,The Multilingual Amazon Reviews Corpus,Phillip Keung|Yichao Lu|György Szarvas|Noah A. Smith,"We present the Multilingual Amazon Reviews Corpus (MARC), a large-scale collection of Amazon reviews for multilingual text classification. The corpus contains reviews in English, Japanese, German, French, Spanish, and Chinese, which were collected between 2015 and 2019. Each record in the dataset contains the review text, the review title, the star rating, an anonymized reviewer ID, an anonymized product ID, and the coarse-grained product category (e.g., 'books', 'appliances', etc.) The corpus is balanced across the 5 possible star ratings, so each rating constitutes 20% of the reviews in each language. For each language, there are 200,000, 5,000, and 5,000 reviews in the training, development, and test sets, respectively. We report baseline results for supervised text classification and zero-shot cross-lingual transfer learning by fine-tuning a multilingual BERT model on reviews data. We propose the use of mean absolute error (MAE) instead of classification accuracy for this task, since MAE accounts for the ordinal nature of the ratings.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.369,38938794 +main.876,Sound Natural: Content Rephrasing in Dialog Systems,Arash Einolghozati|Anchit Gupta|Keith Diedrick|Sonal Gupta,"We introduce a new task of rephrasing for a more natural virtual assistant. Currently, virtual assistants work in the paradigm of intent-slot tagging and the slot values are directly passed as-is to the execution engine. However, this setup fails in some scenarios such as messaging when the query given by the user needs to be changed before repeating it or sending it to another user. For example, for queries like 'ask my wife if she can pick up the kids' or 'remind me to take my pills', we need to rephrase the content to 'can you pick up the kids' and 'take your pills'. In this paper, we study the problem of rephrasing with messaging as a use case and release a dataset of 3000 pairs of original query and rephrased query. We show that BART, a pre-trained transformers-based masked language model, is a strong baseline for the task, and show improvements by adding a copy-pointer and copy loss to it. We analyze different trade-offs of BART-based and LSTM-based seq2seq models, and propose a distilled LSTM-based seq2seq as the best practical model",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.414,38938795 +main.877,Modularized Syntactic Neural Networks for Sentence Classification,Haiyan Wu|Ying Liu|Shaoyun Shi,"This paper focuses on tree-based modeling for the sentence classification task. In existing works, aggregating on a syntax tree usually considers local information of sub-trees. In contrast, in addition to the local information, our proposed Modularized Syntactic Neural Network (MSNN) utilizes the syntax category labels and takes advantage of the global context while modeling sub-trees. In MSNN, each node of a syntax tree is modeled by a label-related syntax module. Each syntax module aggregates the outputs of lower-level modules, and finally, the root module provides the sentence representation. We design a tree-parallel mini-batch strategy for efficient training and predicting. 
Experimental results on four benchmark datasets show that our MSNN significantly outperforms previous state-of-the-art tree-based methods on the sentence classification task.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.222,38938796 +main.883,Incremental Neural Coreference Resolution in Constant Memory,Patrick Xia|João Sedoc|Benjamin Van Durme,"We investigate modeling coreference resolution under a fixed memory constraint by extending an incremental clustering algorithm to utilize contextualized encoders and neural components. Given a new sentence, our end-to-end algorithm proposes and scores each mention span against explicit entity representations created from the earlier document context (if any). These spans are then used to update the entity's representations before being forgotten; we only retain a fixed set of salient entities throughout the document. In this work, we successfully convert a high-performing model (Joshi et al., 2020), asymptotically reducing its memory usage to constant space with only a 0.3% relative loss in F1 on OntoNotes 5.0.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.695,38938797 +main.888,Automatic Machine Translation Evaluation in Many Languages via Zero-Shot Paraphrasing,Brian Thompson|Matt Post,"We frame the task of machine translation evaluation as one of scoring machine translation output with a sequence-to-sequence paraphraser, conditioned on a human reference. We propose training the paraphraser as a multilingual NMT system, treating paraphrasing as a zero-shot translation task (e.g., Czech to Czech). This results in the paraphraser’s output mode being centered around a copy of the input sequence, which represents the best case scenario where the MT system output matches a human reference. Our method is simple and intuitive, and does not require human judgements for training. Our single model (trained in 39 languages) outperforms or statistically ties with all prior metrics on the WMT 2019 segment-level shared metrics task in all languages (excluding Gujarati where the model had no training data). We also explore using our model for the task of quality estimation as a metric—conditioning on the source instead of the reference—and find that it significantly outperforms every submission to the WMT 2019 shared task on quality estimation in every language pair.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.8,38938798 +main.891,Uncertainty-Aware Semantic Augmentation for Neural Machine Translation,Xiangpeng Wei|Heng Yu|Yue Hu|Rongxiang Weng|Luxi Xing|Weihua Luo,"As a sequence-to-sequence generation task, neural machine translation (NMT) naturally contains intrinsic uncertainty, where a single sentence in one language has multiple valid counterparts in the other. However, the dominant methods for NMT only observe one of them from the parallel corpora for the model training but have to deal with adequate variations under the same meaning at inference. This leads to a discrepancy of the data distribution between the training and the inference phases. To address this problem, we propose uncertainty-aware semantic augmentation, which explicitly captures the universal semantic information among multiple semantically-equivalent source sentences and enhances the hidden representations with this information for better translations. 
Extensive experiments on various translation tasks reveal that our approach significantly outperforms the strong baselines and the existing methods.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.216,38938799 +main.894,Can Automatic Post-Editing Improve NMT?,Shamil Chollampatt|Raymond Hendy Susanto|Liling Tan|Ewa Szymanska,"Automatic post-editing (APE) aims to improve machine translations, thereby reducing human post-editing effort. APE has had notable success when used with statistical machine translation (SMT) systems but has not been as successful over neural machine translation (NMT) systems. This has raised questions on the relevance of APE task in the current scenario. However, the training of APE models has been heavily reliant on large-scale artificial corpora combined with only limited human post-edited data. We hypothesize that APE models have been underperforming in improving NMT translations due to the lack of adequate supervision. To ascertain our hypothesis, we compile a larger corpus of human post-edits of English to German NMT. We empirically show that a state-of-art neural APE model trained on this corpus can significantly improve a strong in-domain NMT system, challenging the current understanding in the field. We further investigate the effects of varying training data sizes, using artificial training data, and domain specificity for the APE task. We release this new corpus under CC BY-NC-SA 4.0 license at https://github.com/shamilcm/pedra.",,Machine Translation and Multilinguality,Long,https://www.aclweb.org/anthology/2020.emnlp-main.217,38938800 +main.903,Repulsive Attention: Rethinking Multi-head Attention as Bayesian Inference,Bang An|Jie Lyu|Zhenyi Wang|Chunyuan Li|Changwei Hu|Fei Tan|Ruiyi Zhang|Yifan Hu|Changyou Chen,"The neural attention mechanism plays an important role in many natural language processing applications. In particular, multi-head attention extends single-head attention by allowing a model to jointly attend information from different perspectives. However, without explicit constraining, multi-head attention may suffer from attention collapse, an issue that makes different heads extract similar attentive features, thus limiting the model's representation power. In this paper, for the first time, we provide a novel understanding of multi-head attention from a Bayesian perspective. Based on the recently developed particle-optimization sampling techniques, we propose a non-parametric approach that explicitly improves the repulsiveness in multi-head attention and consequently strengthens model's expressiveness. Remarkably, our Bayesian interpretation provides theoretical inspirations on the not-well-understood questions: why and how one uses multi-head attention. Extensive experiments on various attention models and applications demonstrate that the proposed repulsive attention can improve the learned feature diversity, leading to more informative representations with consistent performance improvement on multiple tasks.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.17,38938801 +main.910,TeaForN: Teacher-Forcing with N-grams,Sebastian Goodman|Nan Ding|Radu Soricut,"Sequence generation models trained with teacher-forcing suffer from issues related to exposure bias and lack of differentiability across timesteps. 
Our proposed method, Teacher-Forcing with N-grams (TeaForN), addresses both these problems directly, through the use of a stack of N decoders trained to decode along a secondary time axis that allows model-parameter updates based on N prediction steps. TeaForN can be used with a wide class of decoder architectures and requires minimal modifications from a standard teacher-forcing setup. Empirically, we show that TeaForN boosts generation quality on one Machine Translation benchmark, WMT 2014 English-French, and two News Summarization benchmarks, CNN/Dailymail and Gigaword.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.702,38938802 +main.911,LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention,Ikuya Yamada|Akari Asai|Hiroyuki Shindo|Hideaki Takeda|Yuji Matsumoto,"Entity representations are useful in natural language tasks involving entities. In this paper, we propose new pretrained contextualized representations of words and entities based on the bidirectional transformer. The proposed model treats words and entities in a given text as independent tokens, and outputs contextualized representations of them. Our model is trained using a new pretraining task based on the masked language model of BERT. The task involves predicting randomly masked words and entities in a large entity-annotated corpus retrieved from Wikipedia. We also propose an entity-aware self-attention mechanism that is an extension of the self-attention mechanism of the transformer, and considers the types of tokens (words or entities) when computing attention scores. The proposed model achieves impressive empirical performance on a wide range of entity-related tasks. In particular, it obtains state-of-the-art results on five well-known datasets: Open Entity (entity typing), TACRED (relation classification), CoNLL-2003 (named entity recognition), ReCoRD (cloze-style question answering), and SQuAD 1.1 (extractive question answering). Our source code and pretrained representations are available at https://github.com/studio-ousia/luke.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",Long,https://www.aclweb.org/anthology/2020.emnlp-main.523,38938803 +main.916,"Continuity of Topic, Interaction, and Query: Learning to Quote in Online Conversations",Lingzhi Wang|Jing Li|Xingshan Zeng|Haisong Zhang|Kam-Fai Wong,"Quotations are crucial for successful explanations and persuasions in interpersonal communications. However, finding what to quote in a conversation is challenging for both humans and machines. This work studies automatic quotation generation in an online conversation and explores how language consistency affects whether a quotation fits the given context. Here, we capture the contextual consistency of a quotation in terms of latent topics, interactions with the dialogue history, and coherence to the query turn's existing contents. Further, an encoder-decoder neural framework is employed to continue the context with a quotation via language generation. Experiment results on two large-scale datasets in English and Chinese demonstrate that our quotation generation model outperforms the state-of-the-art models. 
Further analysis shows that topic, interaction, and query consistency are all helpful to learn how to quote in online conversations.",,Computational Social Science and Social Media,Long,https://www.aclweb.org/anthology/2020.emnlp-main.538,38938804 +main.920,Acrostic Poem Generation,Rajat Agarwal|Katharina Kann,"We propose a new task in the area of computational creativity: acrostic poem generation in English. Acrostic poems are poems that contain a hidden message; typically, the first letter of each line spells out a word or short phrase. We define the task as a generation task with multiple constraints: given an input word, 1) the initial letters of each line should spell out the provided word, 2) the poem's semantics should also relate to it, and 3) the poem should conform to a rhyming scheme. We further provide a baseline model for the task, which consists of a conditional neural language model in combination with a neural rhyming model. Since no dedicated datasets for acrostic poem generation exist, we create training data for our task by first training a separate topic prediction model on a small set of topic-annotated poems and then predicting topics for additional poems. Our experiments show that the acrostic poems generated by our baseline are received well by humans and do not lose much quality due to the additional constraints. Last, we confirm that poems generated by our model are indeed closely related to the provided prompts, and that pretraining on Wikipedia can boost performance.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.94,38938805 +main.923,Language Generation with Multi-hop Reasoning on Commonsense Knowledge Graph,Haozhe Ji|Pei Ke|Shaohan Huang|Furu Wei|Xiaoyan Zhu|Minlie Huang,"Despite the success of generative pre-trained language models on a series of text generation tasks, they still suffer in cases where reasoning over underlying commonsense knowledge is required during generation. Existing approaches that integrate commonsense knowledge into generative pre-trained language models simply transfer relational knowledge by post-training on individual knowledge triples while ignoring rich connections within the knowledge graph. We argue that exploiting both the structural and semantic information of the knowledge graph facilitates commonsense-aware text generation. In this paper, we propose Generation with Multi-Hop Reasoning Flow (GRF) that enables pre-trained models with dynamic multi-hop reasoning on multi-relational paths extracted from the external commonsense knowledge graph. We empirically show that our model outperforms existing baselines on three text generation tasks that require reasoning over commonsense knowledge. We also demonstrate the effectiveness of the dynamic multi-hop reasoning module with reasoning paths inferred by the model that provide rationale to the generation.",,Language Generation,Long,https://www.aclweb.org/anthology/2020.emnlp-main.54,38938806 +main.928,TORQUE: A Reading Comprehension Dataset of Temporal Ordering Questions,Qiang Ning|Hao Wu|Rujun Han|Nanyun Peng|Matt Gardner|Dan Roth,"A critical part of reading is being able to understand the temporal relationships between events described in a passage of text, even when those relationships are not explicitly stated. 
However, current machine reading comprehension benchmarks have practically no questions that test temporal phenomena, so systems trained on these benchmarks have no capacity to answer questions such as ``what happened before/after [some event]?'' We introduce TORQUE, a new English reading comprehension benchmark built on 3.2k news snippets with 21k human-generated questions querying temporal relationships. Results show that RoBERTa-large achieves an exact-match score of 51% on the test set of TORQUE, about 30% behind human performance.",,Question Answering,Long,https://www.aclweb.org/anthology/2020.emnlp-main.88,38938807 +main.930,A Joint Multiple Criteria Model in Transfer Learning for Cross-domain Chinese Word Segmentation,Kaiyu Huang|Degen Huang|Zhuang Liu|Fengran Mo,"Word-level information is important in natural language processing (NLP), especially for the Chinese language due to its high linguistic complexity. Chinese word segmentation (CWS) is an essential task for Chinese downstream NLP tasks. Existing methods have already achieved a competitive performance for CWS on large-scale annotated corpora. However, the accuracy of the method will drop dramatically when it handles an unsegmented text with lots of out-of-vocabulary (OOV) words. In addition, there are many different segmentation criteria for addressing different requirements of downstream NLP tasks. Excessive amounts of models with saving different criteria will generate the explosive growth of the total parameters. To this end, we propose a joint multiple criteria model that shares all parameters to integrate different segmentation criteria into one model. Besides, we utilize a transfer learning method to improve the performance of OOV words. Our proposed method is evaluated by designing comprehensive experiments on multiple benchmark datasets (e.g., Bakeoff 2005, Bakeoff 2008 and SIGHAN 2010). Our method achieves the state-of-the-art performances on all datasets. Importantly, our method also shows a competitive practicability and generalization ability for the CWS task.",,"Phonology, Morphology and Word Segmentation",Long,https://www.aclweb.org/anthology/2020.emnlp-main.318,38938808 +main.947,Information-Theoretic Probing with Minimum Description Length,Elena Voita|Ivan Titov,"To measure how well pretrained representations encode some linguistic property, it is common to use accuracy of a probe, i.e. a classifier trained to predict the property from the representations. Despite widespread adoption of probes, differences in their accuracy fail to adequately reflect differences in representations. For example, they do not substantially favour pretrained representations over randomly initialized ones. Analogously, their accuracy can be similar when probing for genuine linguistic labels and probing for random synthetic tasks. To see reasonable differences in accuracy with respect to these random baselines, previous work had to constrain either the amount of probe training data or its model size. Instead, we propose an alternative to the standard probes, information-theoretic probing with minimum description length (MDL). With MDL probing, training a probe to predict labels is recast as teaching it to effectively transmit the data. Therefore, the measure of interest changes from probe accuracy to the description length of labels given representations. In addition to probe quality, the description length evaluates ""the amount of effort"" needed to achieve the quality. 
This amount of effort characterizes either (i) size of a probing model, or (ii) the amount of data needed to achieve the high quality. We consider two methods for estimating MDL which can be easily implemented on top of the standard probing pipelines: variational coding and online coding. We show that these methods agree in results and are more informative and stable than the standard probes.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.14,38938809 +main.954,Amalgamating Knowledge from Two Teachers for Task-oriented Dialogue System with Adversarial Training,Wanwei He|Min Yang|Rui Yan|Chengming Li|Ying Shen|Ruifeng Xu,"The challenge of both achieving task completion by querying the knowledge base and generating human-like responses for task-oriented dialogue systems is attracting increasing research attention. In this paper, we propose a “Two-Teacher One-Student” learning framework (TTOS) for task-oriented dialogue, with the goal of retrieving accurate KB entities and generating human-like responses simultaneously. TTOS amalgamates knowledge from two teacher networks that together provide comprehensive guidance to build a high-quality task-oriented dialogue system (student network). Each teacher network is trained via reinforcement learning with a goal-specific reward, which can be viewed as an expert towards the goal and transfers the professional characteristic to the student network. Instead of adopting the classic student-teacher learning of forcing the output of a student network to exactly mimic the soft targets produced by the teacher networks, we introduce two discriminators as in generative adversarial network (GAN) to transfer knowledge from two teachers to the student. The usage of discriminators relaxes the rigid coupling between the student and teachers. Extensive experiments on two benchmark datasets (i.e., CamRest and In-Car Assistant) demonstrate that TTOS significantly outperforms baseline methods.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.281,38938810 +main.955,Discourse Self-Attention for Discourse Element Identification in Argumentative Student Essays,Wei Song|Ziyao Song|Ruiji Fu|Lizhen Liu|Miaomiao Cheng|Ting Liu,"This paper proposes to adapt self-attention to discourse level for modeling discourse elements in argumentative student essays. Specifically, we focus on two issues. First, we propose structural sentence positional encodings to explicitly represent sentence positions. Second, we propose to use inter-sentence attentions to capture sentence interactions and enhance sentence representation. We conduct experiments on two datasets: a Chinese dataset and an English dataset. We find that (i) sentence positional encoding can lead to a large improvement for identifying discourse elements; (ii) a structural relative positional encoding of sentences shows to be most effective; (iii) inter-sentence attention vectors are useful as a kind of sentence representations for identifying discourse elements.",,Discourse and Pragmatics,Long,https://www.aclweb.org/anthology/2020.emnlp-main.225,38938811 +main.956,BERT-EMD: Many-to-Many Layer Mapping for BERT Compression with Earth Mover's Distance,jianquan li|Xiaokang Liu|Honghong Zhao|Ruifeng Xu|Min Yang|yaohong jin,"Pre-trained language models (e.g., BERT) have achieved significant success in various natural language processing (NLP) tasks. 
However, high storage and computational costs obstruct pre-trained language models to be effectively deployed on resource-constrained devices. In this paper, we propose a novel BERT distillation method based on many-to-many layer mapping, which allows each intermediate student layer to learn from any intermediate teacher layers. In this way, our model can learn from different teacher layers adaptively for different NLP tasks. In addition, we leverage Earth Mover's Distance (EMD) to compute the minimum cumulative cost that must be paid to transform knowledge from teacher network to student network. EMD enables effective matching for the many-to-many layer mapping. Furthermore, we propose a cost attention mechanism to learn the layer weights used in EMD automatically, which is supposed to further improve the model's performance and accelerate convergence time. Extensive experiments on GLUE benchmark demonstrate that our model achieves competitive performance compared to strong competitors in terms of both accuracy and model compression",,NLP Applications,Long,https://www.aclweb.org/anthology/2020.emnlp-main.242,38938812 +main.958,A Diagnostic Study of Explainability Techniques for Text Classification,Pepa Atanasova|Jakob Grue Simonsen|Christina Lioma|Isabelle Augenstein,"Recent developments in machine learning have introduced models that approach human performance at the cost of increased architectural complexity. Efforts to make the rationales behind the models' predictions transparent have inspired an abundance of new explainability techniques. Provided with an already trained model, they compute saliency scores for the words of an input instance. However, there exists no definitive guide on (i) how to choose such a technique given a particular application task and model architecture, and (ii) the benefits and drawbacks of using each such technique. In this paper, we develop a comprehensive list of diagnostic properties for evaluating existing explainability techniques. We then employ the proposed list to compare a set of diverse explainability techniques on downstream text classification tasks and neural network architectures. We also compare the saliency scores assigned by the explainability techniques with human annotations of salient input regions to find relations between a model's performance and the agreement of its rationales with human ones. Overall, we find that the gradient-based explanations perform best across tasks and model architectures, and we present further insights into the properties of the reviewed explainability techniques.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.263,38938813 +main.959,F1 Is Not Enough! Models and Evaluation towards User-Centered Explainable Question Answering,Hendrik Schuff|Heike Adel|Ngoc Thang Vu,"Explainable question answering systems predict an answer together with an explanation showing why the answer has been selected. The goal is to enable users to assess the correctness of the system and understand its reasoning process. However, we show that current models and evaluation settings have shortcomings regarding the coupling of answer and explanation which might cause serious issues in user experience. As a remedy, we propose a hierarchical model and a new regularization term to strengthen the answer-explanation coupling as well as two evaluation scores to quantify the coupling. We conduct experiments on the HOTPOTQA benchmark data set and perform a user study. 
The user study shows that our models increase the ability of the users to judge the correctness of the system and that scores like F1 are not enough to estimate the usefulness of a model in a practical setting with human users. Our scores are better aligned with user experience, making them promising candidates for model selection.",,Interpretability and Analysis of Models for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.575,38938814 +main.96,Event Extraction by Answering (Almost) Natural Questions,Xinya Du|Claire Cardie,"The problem of event extraction requires detecting the event trigger and extracting its corresponding arguments. Existing work in event argument extraction typically relies heavily on entity recognition as a preprocessing/concurrent step, causing the well-known problem of error propagation. To avoid this issue, we introduce a new paradigm for event extraction by formulating it as a question answering (QA) task that extracts the event arguments in an end-to-end manner. Empirical results demonstrate that our framework outperforms prior methods substantially; in addition, it is capable of extracting event arguments for roles not seen at training time (i.e., in a zero-shot learning setting).",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.49,38938649 +main.965,What Have We Achieved on Text Summarization?,Dandan Huang|Leyang Cui|Sen Yang|Guangsheng Bao|Kun Wang|Jun Xie|Yue Zhang,"Deep learning has led to significant improvement in text summarization with various methods investigated and improved ROUGE scores reported over the years. However, gaps still exist between summaries produced by automatic summarizers and human professionals. Aiming to gain more understanding of summarization systems with respect to their strengths and limits on a fine-grained syntactic and semantic level, we consult the Multidimensional Quality Metric (MQM) and quantify 8 major sources of errors on 10 representative summarization models manually. Primarily, we find that 1) under similar settings, extractive summarizers are in general better than their abstractive counterparts thanks to strength in faithfulness and factual-consistency; 2) milestone techniques such as copy, coverage and hybrid extractive/abstractive methods do bring specific improvements but also demonstrate limitations; 3) pre-training techniques, and in particular sequence-to-sequence pre-training, are highly effective for improving text summarization, with BART giving the best results.",,Summarization,Long,https://www.aclweb.org/anthology/2020.emnlp-main.33,38938815 +main.973,Embedding Words in Non-Vector Space with Unsupervised Graph Learning,Max Ryabinin|Sergei Popov|Liudmila Prokhorenkova|Elena Voita,"It has become a de-facto standard to represent words as elements of a vector space (word2vec, GloVe). While this approach is convenient, it is unnatural for language: words form a graph with a latent hierarchical structure, and this structure has to be revealed and encoded by word embeddings. We introduce GraphGlove: unsupervised graph word representations which are learned end-to-end. In our setting, each word is a node in a weighted graph and the distance between words is the shortest path distance between the corresponding nodes. We adopt a recent method learning a representation of data in the form of a differentiable weighted graph and use it to modify the GloVe training algorithm. 
We show that our graph-based representations substantially outperform vector-based methods on word similarity and analogy tasks. Our analysis reveals that the structure of the learned graphs is hierarchical and similar to that of WordNet, the geometry is highly non-trivial and contains subgraphs with different local topology.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.594,38938816 +main.977,Slot Attention with Value Normalization for Multi-domain Dialogue State Tracking,Yexiang Wang|Yi Guo|Siqi Zhu,"Incompleteness of domain ontology and unavailability of some values are two inevitable problems of dialogue state tracking (DST). Existing approaches generally fall into two extremes: choosing models without ontology or embedding ontology in models leading to over-dependence. In this paper, we propose a new architecture to cleverly exploit ontology, which consists of Slot Attention (SA) and Value Normalization (VN), referred to as SAVN. Moreover, we supplement the annotation of supporting span for MultiWOZ 2.1, which is the shortest span in utterances to support the labeled value. SA shares knowledge between slots and utterances and only needs a simple structure to predict the supporting span. VN is designed specifically for the use of ontology, which can convert supporting spans to the values. Empirical results demonstrate that SAVN achieves the state-of-the-art joint accuracy of 54.52% on MultiWOZ 2.0 and 54.86% on MultiWOZ 2.1. Besides, we evaluate VN with incomplete ontology. The results show that even if only 30% ontology is used, VN can also contribute to our model.",,Machine Learning for NLP,Long,https://www.aclweb.org/anthology/2020.emnlp-main.243,38938817 +main.983,Enhancing Aspect Term Extraction with Soft Prototypes,Zhuang Chen|Tieyun Qian,"Aspect term extraction (ATE) aims to extract aspect terms from a review sentence that users have expressed opinions on. Existing studies mostly focus on designing neural sequence taggers to extract linguistic features from the token level. However, since the aspect terms and context words usually exhibit long-tail distributions, these taggers often converge to an inferior state without enough sample exposure. In this paper, we propose to tackle this problem by correlating words with each other through soft prototypes. These prototypes, generated by a soft retrieval process, can introduce global knowledge from internal or external data and serve as the supporting evidence for discovering the aspect terms. Our proposed model is a general framework and can be combined with almost all sequence taggers. Experiments on four SemEval datasets show that our model boosts the performance of three typical ATE methods by a large margin.",,Information Extraction,Long,https://www.aclweb.org/anthology/2020.emnlp-main.164,38938818 +main.989,Named Entity Recognition Only from Word Embeddings,Ying Luo|Hai Zhao|Junlang Zhan,"Deep neural network models have helped named entity recognition achieve amazing performance without handcrafting features. However, existing systems require large amounts of human annotated training data. Efforts have been made to replace human annotations with external knowledge (e.g., NE dictionary, part-of-speech tags), while it is another challenge to obtain such effective resources. 
In this work, we propose a fully unsupervised NE recognition model which only needs to take informative clues from pre-trained word embeddings. We first apply Gaussian Hidden Markov Model and Deep Autoencoding Gaussian Mixture Model on word embeddings for entity span detection and type prediction, and then further design an instance selector based on reinforcement learning to distinguish positive sentences from noisy sentences and then refine these coarse-grained annotations through neural networks. Extensive experiments on two CoNLL benchmark NER datasets (CoNLL-2003 English dataset and CoNLL-2002 Spanish dataset) demonstrate that our proposed light NE recognition model achieves remarkable performance without using any annotated lexicon or corpus.",,Information Retrieval and Text Mining,Long,https://www.aclweb.org/anthology/2020.emnlp-main.723,38938819 +main.995,Sub-Instruction Aware Vision-and-Language Navigation,Yicong Hong|Cristian Rodriguez|Qi Wu|Stephen Gould,"Vision-and-language navigation requires an agent to navigate through a real 3D environment following natural language instructions. Despite significant advances, few previous works are able to fully utilize the strong correspondence between the visual and textual sequences. Meanwhile, due to the lack of intermediate supervision, the agent's performance at following each part of the instruction cannot be assessed during navigation. In this work, we focus on the granularity of the visual and language sequences as well as the traceability of agents through the completion of an instruction. We provide agents with fine-grained annotations during training and find that they are able to follow the instruction better and have a higher chance of reaching the target at test time. We enrich the benchmark dataset Room-to-Room (R2R) with sub-instructions and their corresponding paths. To make use of this data, we propose effective sub-instruction attention and shifting modules that select and attend to a single sub-instruction at each time-step. We implement our sub-instruction modules in four state-of-the-art agents, compare with their baseline models, and show that our proposed method improves the performance of all four agents. We release the Fine-Grained R2R dataset (FGR2R) and the code at https://github.com/YicongHong/Fine-Grained-R2R.",,"Language Grounding to Vision, Robotics and Beyond",Long,https://www.aclweb.org/anthology/2020.emnlp-main.271,38938820 +main.999,Profile Consistency Identification for Open-domain Dialogue Agents,Haoyu Song|Yan Wang|Wei-Nan Zhang|Zhengyu Zhao|Ting Liu|Xiaojiang Liu,"Maintaining a consistent attribute profile is crucial for dialogue agents to naturally converse with humans. Existing studies on improving attribute consistency mainly explored how to incorporate attribute information in the responses, but few efforts have been made to identify the consistency relations between response and attribute profile. To facilitate the study of profile consistency identification, we create a large-scale human-annotated dataset with over 110K single-turn conversations and their key-value attribute profiles. Explicit relation between response and profile is manually labeled. We also propose a key-value structure information enriched BERT model to identify the profile consistency, and it gained improvements over strong baselines.
Further evaluations on downstream tasks demonstrate that the profile consistency identification model is conducive for improving dialogue consistency.",,Dialog and Interactive Systems,Long,https://www.aclweb.org/anthology/2020.emnlp-main.539,38938821 +CL.1,Tractable Lexical-Functional Grammar,Jürgen Wedekind|Ronald M. Kaplan,"The formalism for Lexical-Functional Grammar (LFG) was introduced in the 1980’s as one of the first constraint-based grammatical formalisms for natural language. It has led to substantial contributions to the linguistic literature and to the construction of large-scale descriptions of particular languages. Investigations of its mathematical properties have shown that, without further restrictions, the recognition, emptiness, and generation problems are undecidable, and that they are intractable in the worst case even with commonly applied restrictions. However, grammars of real languages appear not to invoke the full expressive power of the formalism, as indicated by the fact that algorithms and implementations for recognition and generation have been developed that run—even for broad-coverage grammars—in typically polynomial time. This paper formalizes some restrictions on the notation and its interpretation that are compatible with conventions and principles that have been implicit or informally stated in linguistic theory. We show that LFG grammars that respect these restrictions, although still suitable for the description of natural languages, are equivalent to linear context-free rewriting systems and allow for tractable computation.",,"Syntax: Tagging, Chunking, and Parsing",CL,,38939389 +CL.2,Semantic Drift in Multilingual Representations,Lisa Beinborn|Rochelle Choenni,"Multilingual representations have mostly been evaluated based on their performance on specific tasks. In this article, we look beyond engineering goals and analyze the relations between languages in computational representations. We introduce a methodology for comparing languages based on their organization of semantic concepts. We propose to conduct an adapted version of representational similarity analysis of a selected set of concepts in computational multilingual representations. Using this analysis method, we can reconstruct a phylogenetic tree that closely resembles those assumed by linguistic experts. These results indicate that multilingual distributional representations that are only trained on monolingual text and bilingual dictionaries preserve relations between languages without the need for any etymological information. In addition, we propose a measure to identify semantic drift between language families.We perform experiments on word-based and sentence-based multilingual models and provide both quantitative results and qualitative examples. Analyses of semantic drift in multilingual representations can serve two purposes: They can indicate unwanted characteristics of the computational models and they provide a quantitative means to study linguistic phenomena across languages.",,Machine Translation and Multilinguality,CL,,38939390 +CL.3,Predicting In-game Actions from Interviews of NBA Players,Nadav Oved|Amir Feder|Roi Reichart,"Sports competitions are widely researched in computer and social science, with the goal of understanding how players act under uncertainty. Although there is an abundance of computational work on player metrics prediction based on past performance, very few attempts to incorporate out-of-game signals have been made. 
Specifically, it was previously unclear whether linguistic signals gathered from players’ interviews can add information that does not appear in performance metrics. To bridge that gap, we define text classification tasks of predicting deviations from mean in NBA players’ in-game actions, which are associated with strategic choices, player behavior, and risk, using their choice of language prior to the game. We collected a data set of transcripts from key NBA players’ pre-game interviews and their in-game performance metrics, totalling 5,226 interview-metric pairs. We design neural models for players’ action prediction based on increasingly more complex aspects of the language signals in their open-ended interviews. Our models can make their predictions based on the textual signal alone, or on a combination of that signal with signals from past-performance metrics. Our text-based models outperform strong baselines trained on performance metrics only, demonstrating the importance of language usage for action prediction. Moreover, the models that utilize both textual input and past-performance metrics produced the best results. Finally, as neural networks are notoriously difficult to interpret, we propose a method for gaining further insight into what our models have learned. Particularly, we present a latent Dirichlet allocation–based analysis, where we interpret model predictions in terms of correlated topics. We find that our best performing textual model is most associated with topics that are intuitively related to each prediction task and that better models yield higher correlation with more informative topics.",,NLP Applications,CL,,38939391 +CL.4,Sparse Transcription,Steven Bird,"The transcription bottleneck is often cited as a major obstacle for efforts to document the world’s endangered languages and supply them with language technologies. One solution is to extend methods from automatic speech recognition and machine translation, and recruit linguists to provide narrow phonetic transcriptions and sentence-aligned translations. However, I believe that these approaches are not a good fit with the available data and skills, or with long-established practices that are essentially word based. In seeking a more effective approach, I consider a century of transcription practice and a wide range of computational approaches, before proposing a computational model based on spoken term detection which I call “sparse transcription.” This represents a shift away from current assumptions that we transcribe phones, transcribe fully, and transcribe first. Instead, sparse transcription combines the older practice of word-level transcription with interpretive, iterative, and interactive processes which are amenable to wider participation and which open the way to new methods for processing oral languages.",,Speech and Multimodality,CL,,38939392 +CL.5,Efficient Outside Computation,Daniel Gildea,"Weighted deduction systems provide a framework for describing parsing algorithms that can be used with a variety of operations for combining the values of partial derivations. For some operations, inside values can be computed efficiently, but outside values cannot. We view outside values as functions from inside values to the total value of all derivations, and we analyze outside computation in terms of function composition. 
This viewpoint helps explain why efficient outside computation is possible in many settings, despite the lack of a general outside algorithm for semiring operations.",,"Syntax: Tagging, Chunking, and Parsing",CL,,38939393 +TACL.1936,Consistent Unsupervised Estimators for Anchored PCFGs,Alexander Clark|Nathanaël Fijalkow,"Learning probabilistic context-free grammars from strings is a classic problem in computational linguistics since Horning (1969). Here we present an algorithm based on distributional learning that is a consistent estimator for a large class of PCFGs that satisfy certain natural conditions including being anchored (Stratos et al., 2016). We proceed via a reparameterisation of (top-down) PCFGs which we call a bottom-up weighted context-free grammar. We show that if the grammar is anchored and satisfies additional restrictions on its ambiguity, then the parameters can be directly related to distributional properties of the anchoring strings; we show the asymptotic correctness of a naive estimator and present some simulations using synthetic data that show that algorithms based on this approach have good finite sample behaviour.",,"Syntax: Tagging, Chunking, and Parsing",TACL,,38939394 +TACL.1943,Reproducible and Efficient Benchmarks for Hyperparameter Optimization of Neural Machine Translation Systems,Xuan Zhang|Kevin Duh,"Hyperparameter selection is a crucial part of building neural machine translation (NMT) systems across both academia and industry. Fine-grained adjustments to a model's architecture or training recipe can mean the difference between a positive and negative research result or between a state-of-the-art and under-performing system. While recent literature has proposed methods for automatic hyperparameter optimization (HPO), there has been limited work on applying these methods to NMT, due in part to the high costs associated with experiments that train large numbers of model variants. To facilitate research in this space, we introduce a lookup-based approach that uses a library of pre-trained models for fast, low cost HPO experimentation. Our contributions include (1) the release of a large collection of trained NMT models covering a wide range of hyperparameters, (2) the proposal of targeted metrics for evaluating HPO methods on NMT, and (3) a reproducible benchmark of several HPO methods against our model library, including novel graph-based and multi-objective methods.",,Machine Translation and Multilinguality,TACL,,38939395 +TACL.1983,How Can We Know What Language Models Know,Zhengbao Jiang|Frank F. Xu|Jun Araki|Graham Neubig,"Recent work has presented intriguing results examining the knowledge contained in language models (LM) by having the LM fill in the blanks of prompts such as ""Obama is a _ by profession"". These prompts are usually manually created, and quite possibly sub-optimal; another prompt such as ""Obama worked as a _"" may result in more accurately predicting the correct profession. Because of this, given an inappropriate prompt, we might fail to retrieve facts that the LM does know, and thus any given prompt only provides a lower bound estimate of the knowledge contained in an LM. In this paper, we attempt to more accurately estimate the knowledge contained in LMs by automatically discovering better prompts to use in this querying process. Specifically, we propose mining-based and paraphrasing-based methods to automatically generate high-quality and diverse prompts, as well as ensemble methods to combine answers from different prompts. 
Extensive experiments on the LAMA benchmark for extracting relational knowledge from LMs demonstrate that our methods can improve accuracy from 31.1% to 39.6%, providing a tighter lower bound on what LMs know. We have released the code and the resulting LM Prompt And Query Archive (LPAQA) at https://github.com/jzbjyb/LPAQA.",,Language Generation,TACL,,38939396 +TACL.1997,Unsupervised Quality Estimation for Neural Machine Translation,Marina Fomicheva|Shuo Sun|Lisa Yankovskaya|Frédéric Blain|Francisco Guzmán|Mark Fishel|Nikolaos Aletras|Vishrav Chaudhary|Lucia Specia,"Quality Estimation (QE) is an important component in making Machine Translation (MT) useful in real-world applications, as it is aimed to inform the user on the quality of the MT output at test time. Existing approaches require large amounts of expert annotated data, computation and time for training. As an alternative, we devise an unsupervised approach to QE where no training or access to additional resources besides the MT system itself is required. Different from most of the current work that treats the MT system as a black box, we explore useful information that can be extracted from the MT system as a by-product of translation. By employing methods for uncertainty quantification, we achieve very good correlation with human judgments of quality, rivalling state-of-the-art supervised QE models. To evaluate our approach we collect the first dataset that enables work on both black-box and glass-box approaches to QE.",,Machine Translation and Multilinguality,TACL,,38939397 +TACL.2011,Nurse is Closer to Woman than Surgeon? Mitigating Gender-Biased Proximities in Word Embeddings,Vaibhav Kumar|Tenzin Bhotia|Vaibhav Kumar|Tanmoy Chakraborty,"Word embeddings are the standard model for semantic and syntactic representations of words. Unfortunately, these models have been shown to exhibit undesirable word associations resulting from gender, racial, and religious biases. Existing post-processing methods for debiasing word embeddings are unable to mitigate gender bias hidden in the spatial arrangement of word vectors. In this paper, we propose RAN-Debias, a novel gender debiasing methodology which not only eliminates the bias present in a word vector but also alters the spatial distribution of its neighbouring vectors, achieving a bias-free setting while maintaining minimal semantic offset. We also propose a new bias evaluation metric - Gender-based Illicit Proximity Estimate (GIPE), which measures the extent of undue proximity in word vectors resulting from the presence of gender-based predilections. Experiments based on a suite of evaluation metrics show that RAN-Debias significantly outperforms the state-of-the-art in reducing proximity bias (GIPE) by at least 42.02%. It also reduces direct bias, adding minimal semantic disturbance, and achieves the best performance in a downstream application task (coreference resolution).",,Semantics: Lexical Semantics,TACL,,38939398 +TACL.2013,BLiMP: The Benchmark of Linguistic Minimal Pairs for English,Alex Warstadt|Alicia Parrish|Haokun Liu|Anhad Monananey|Wei Peng|Sheng-Fu Wang|Samuel Bowman,"We introduce The Benchmark of Linguistic Minimal Pairs (BLiMP), a challenge set for evaluating the linguistic knowledge of language models (LMs) on major grammatical phenomena in English. BLiMP consists of 67 individual datasets, each containing 1,000 minimal pairs, i.e. 
pairs of minimally different sentences that contrast in grammatical acceptability and isolate specific phenomenon in syntax, morphology, or semantics. We generate the data according to linguist-crafted grammar templates, and human aggregate agreement with the labels is 96.4%. We evaluate n-gram, LSTM, and Transformer (GPT-2 and TransformerXL) LMs by observing whether they assign a higher probability to the acceptable sentence in each minimal pair. We find that state-of-the-art models identify morphological contrasts related to agreement reliably, but they struggle with some subtle semantic and syntactic phenomena, such as negative polarity items and extraction islands.",,"Linguistic Theories, Cognitive Modeling and Psycholinguistics",TACL,,38939399 +TACL.2041,oLMpics - On what Language Model Pre-training Captures,Alon Talmor|Yanai Elazar|Yoav Goldberg|Jonathan Berant,"Recent success of pre-trained language models (LMs) has spurred widespread interest in the language capabilities that they possess. However, efforts to understand whether LM representations are useful for symbolic reasoning tasks have been limited and scattered. In this work, we propose eight reasoning tasks, which conceptually require operations such as comparison, conjunction, and composition. A fundamental challenge is to understand whether the performance of a LM on a task should be attributed to the pre-trained representations or to the process of fine-tuning on the task data. To address this, we propose an evaluation protocol that includes both zero-shot evaluation (no fine-tuning), as well as comparing the learning curve of a fine-tuned LM to the learning curve of multiple controls, which paints a rich picture of the LM capabilities. Our main findings are that: (a) different LMs exhibit qualitatively different reasoning abilities, e.g., RoBERTa succeeds in reasoning tasks where BERT fails completely; (b) LMs do not reason in an abstract manner and are context-dependent, e.g., while RoBERTa can compare ages, it can do so only when the ages are in the typical range of human ages; (c) On half of our reasoning tasks all models fail completely. Our findings and infrastructure can help future work on designing new datasets, models and objective functions for pre-training.",,Question Answering,TACL,,38939400 +TACL.2047,Data Weighted Training Strategies for Grammatical Error Correction,Jared Lichtarge|Chris Alberti|Shankar Kumar,"Recent progress in the task of Grammatical Error Correction (GEC) has been driven by addressing data sparsity, both through new methods for generating large and noisy pretraining data and through the publication of small and higher-quality finetuning data in the BEA-2019 shared task. Building upon recent work in Neural Machine Translation (NMT), we make use of both kinds of data by deriving example-level scores on our large pretraining data based on a smaller, higher-quality dataset. In this work, we perform an empirical study to discover how to best incorporate delta-log-perplexity, a type of example scoring, into a training schedule for GEC. In doing so, we perform experiments that shed light on the function and applicability of delta-log-perplexity. Models trained on scored data achieve state-of-the-art results on common GEC test sets.",,NLP Applications,TACL,,38939401 +TACL.2049,What Does My QA Model Know? 
Devising Controlled Probes using Expert,Kyle Richardson|Ashish Sabharwal,"Open-domain question answering (QA) involves many knowledge and reasoning challenges, but are successful QA models actually learning such knowledge when trained on benchmark QA tasks? We investigate this via several new diagnostic tasks probing whether multiple-choice QA models know definitions and taxonomic reasoning—two skills widespread in existing benchmarks and fundamental to more complex reasoning. We introduce a methodology for automatically building probe datasets from expert knowledge sources, allowing for systematic control and a comprehensive evaluation. We include ways to carefully control for artifacts that may arise during this process. Our evaluation confirms that transformer-based multiple-choice QA models are already predisposed to recognize certain types of structural linguistic knowledge. However, it also reveals a more nuanced picture: their performance notably degrades even with a slight increase in the number of “hops” in the underlying taxonomic hierarchy, and with more challenging distractor candidates. Further, existing models are far from perfect when assessed at the level of clusters of semantically connected probes, such as all hypernym questions about a single concept.",,Question Answering,TACL,,38939402 +TACL.2055,An Empirical Study on Robustness to Spurious Correlations using Pre-trained Language Models,Lifu Tu|Garima Lalwani|Spandana Gella|He He,"Recent work has shown that pre-trained language models such as BERT improve robustness to spurious correlations in the dataset. Intrigued by these results, we find that the key to their success is generalization from a small amount of counterexamples where the spurious correlations do not hold. When such minority examples are scarce, pre-trained models perform as poorly as models trained from scratch. In the case of extreme minority, we propose to use multi-task learning (MTL) to improve generalization. Our experiments on natural language inference and paraphrase identification show that MTL with the right auxiliary tasks significantly improves performance on challenging examples without hurting the in-distribution performance. Further, we show that the gain from MTL mainly comes from improved generalization from the minority examples. Our results highlight the importance of data diversity for overcoming spurious correlations.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",TACL,,38939403 +TACL.2083,A Neural Generative Model for Joint Learning Topics and Topic-Specific Word Embeddings,Lixing Zhu|Deyu Zhou|Yulan He,"We propose a novel generative model to explore both local and global context for joint learning topics and topic-specific word embeddings. In particular, we assume that global latent topics are shared across documents; a word is generated by a hidden semantic vector encoding its contextual semantic meaning; and its context words are generated conditional on both the hidden semantic vector and global latent topics. Topics are trained jointly with the word embeddings. The trained model maps words to topic-dependent embeddings, which naturally addresses the issue of word polysemy. Experimental results show that the proposed model outperforms the word-level embedding methods in both word similarity evaluation and word sense disambiguation. Furthermore, the model also extracts more coherent topics compared to existing neural topic models or other models for joint learning of topics and word embeddings. 
Finally, the model can be easily integrated with existing deep contextualized word embedding learning methods to further improve the performance of downstream tasks such as sentiment classification.",,Machine Learning for NLP,TACL,,38939404 +TACL.2093,Topic Modeling in Embedding Spaces,Adji Bousso Dieng|Francisco Ruiz|David Blei,"Topic modeling analyzes documents to learn meaningful patterns of words. However, existing topic models fail to learn interpretable topics when working with large and heavy-tailed vocabularies. To this end, we develop the embedded topic model (ETM), a generative model of documents that marries traditional topic models with word embeddings. More specifically, the ETM models each word with a categorical distribution whose natural parameter is the inner product between the word's embedding and an embedding of its assigned topic. To fit the ETM, we develop an efficient amortized variational inference algorithm. The ETM discovers interpretable topics even with large vocabularies that include rare words and stop words. It outperforms existing document models, such as latent Dirichlet allocation (LDA), in terms of both topic quality and predictive performance.",,Machine Learning for NLP,TACL,,38939405 +TACL.2095,Interactive Text Ranking with Bayesian Optimisation: A Case Study on Community QA and Summarisation,Edwin Simpson|Yang Gao|Iryna Gurevych,"For many NLP applications, such as question answering and summarisation, the goal is to select the best solution from a large space of candidates to meet a particular user’s needs. To address the lack of user or task-specific training data, we propose an interactive text ranking approach that actively selects pairs of candidates, from which the user selects the best. Unlike previous strategies, which attempt to learn a ranking across the whole candidate space, our method employs Bayesian optimisation to focus the user’s labelling effort on high quality candidates and integrate prior knowledge to cope better with small data scenarios. We apply our method to community question answering (cQA) and extractive multi-document summarisation, finding that it significantly outperforms existing interactive approaches. We also show that the ranking function learned by our method is an effective reward function for reinforcement learning, which improves the state of the art for interactive summarisation.",,Machine Learning for NLP,TACL,,38939406 +TACL.2103,Nested Named Entity Recognition via Second-best Sequence Learning and Decoding,Takashi Shibuya|Eduard Hovy,"When an entity name contains other names within it, the identification of all combinations of names can become difficult and expensive. We propose a new method to recognize not only outermost named entities but also inner nested ones. We design an objective function for training a neural model that treats the tag sequence for nested entities as the second best path within the span of their parent entity. In addition, we provide the decoding method for inference that extracts entities iteratively from outermost ones to inner ones in an outside-to-inside way. Our method has no additional hyperparameters to the conditional random field based model widely used for flat named entity recognition tasks. 
Experiments demonstrate that our method performs better than or at least as well as existing methods capable of handling nested entities, achieving the F1-scores of 85.82%, 84.34%, and 77.36% on ACE-2004, ACE-2005, and GENIA datasets, respectively",,Information Extraction,TACL,,38939407 +TACL.2107,Multilingual Denoising Pre-training for Neural Machine Translation,Jiatao Gu|Yinhan Liu|Naman Goyal|Xian Li|Sergey Edunov|Marjan Ghazvininejad|Mike Lewis|Luke Zettlemoyer,"This paper demonstrates that multilingual denoising pre-training produces significant performance gains across a wide variety of machine translation (MT) tasks. We present mBART -- a sequence-to-sequence denoising auto-encoder pre-trained on large-scale monolingual corpora in many languages using the BART objective. mBART is the first method for pre-training a complete sequence-to-sequence model by denoising full texts in multiple languages, while previous approaches have focused only on the encoder, decoder, or reconstructing parts of the text. Pre-training a complete model allows it to be directly fine-tuned for supervised (both sentence-level and document-level) and unsupervised machine translation, with no task-specific modifications. We demonstrate that adding mBART initialization produces performance gains in all but the highest-resource settings, including up to 12 BLEU points for low resource MT and over 5 BLEU points for many document-level and unsupervised models. We also show it enables transfer to language pairs with no bi-text or that were not in the pre-training corpus, and present extensive analysis of which factors contribute the most to effective pre-training.",,Machine Translation and Multilinguality,TACL,,38939408 +TACL.2121,Modeling Global and Local Node Contexts for Text Generation from Knowledge Graphs,Leonardo F. R. Ribeiro|Yue Zhang|Claire Gardent|Iryna Gurevych,"Recent graph-to-text models generate text from graph-based data using either global or local aggregation to learn node representations. Global node encoding allows explicit communication between two distant nodes, thereby neglecting graph topology as all nodes are directly connected. In contrast, local node encoding considers the relations between neighbor nodes capturing the graph structure, but it can fail to capture long-range relations. In this work, we gather both encoding strategies, proposing novel neural models which encode an input graph combining both global and local node contexts, in order to learn better contextualized node embeddings. In our experiments, we demonstrate that our approaches lead to significant improvements on two graph-to-text datasets achieving BLEU scores of 18.01 on AGENDA dataset, and 63.69 on the WebNLG dataset for seen categories, outperforming state-of-the-art models by 3.7 and 3.1 points, respectively.",,Language Generation,TACL,,38939409 +TACL.2129,Beat the AI: Investigating Adversarial Human Annotation for Reading Comprehension,Max Bartolo|Alastair Roberts|Johannes Welbl|Sebastian Riedel|Pontus Stenetorp,"Innovations in annotation methodology have been a catalyst for Reading Comprehension (RC) datasets and models. One recent trend to challenge current RC models is to involve a model in the annotation process: humans create questions adversarially, such that the model fails to answer them correctly. In this work we investigate this annotation methodology and apply it in three different settings, collecting a total of 36,000 samples with progressively stronger models in the annotation loop. 
This allows us to explore questions such as the reproducibility of the adversarial effect, transfer from data collected with varying model-in-the-loop strengths, and generalisation to data collected without a model. We find that training on adversarially collected samples leads to strong generalisation to non-adversarially collected datasets, yet with progressive performance deterioration with increasingly stronger models-in-the-loop. Furthermore, we find that stronger models can still learn from datasets collected with substantially weaker models-in-the-loop. When trained on data collected with a BiDAF model in the loop, RoBERTa achieves 39.9F1 on questions that it cannot answer when trained on SQuAD - only marginally lower than when trained on data collected using RoBERTa itself (41.0F1).",,Question Answering,TACL,,38939410 +TACL.2135,Sketch-Driven Regular Expression Generation from Natural Language and Examples,Xi Ye|Qiaochu Chen|Xinyu Wang|Isil Dillig|Greg Durrett,"Recent systems for converting natural language descriptions into regular expressions (regexes) have achieved some success, but typically deal with short, formulaic text and can only produce simple regexes. Realworld regexes are complex, hard to describe with brief sentences, and sometimes require examples to fully convey the user’s intent. We present a framework for regex synthesis in this setting where both natural language (NL) and examples are available. First, a semantic parser (either grammar-based or neural) maps the natural language description into an intermediate sketch, which is an incomplete regex containing holes to denote missing components. Then a program synthesizer searches over the regex space defined by the sketch and finds a regex that is consistent with the given string examples. Our semantic parser can be trained purely from weak supervision based on correctness of the synthesized regex, or it can leverage heuristically-derived sketches. We evaluate on two prior datasets (Kushman and Barzilay, 2013; Locascio et al., 2016) and a real-world dataset from Stack Overflow. Our system achieves state-of-the-art performance on the prior datasets and solves 57% of the real-world dataset, which existing neural systems completely fail on.",,"Semantics: Sentence-level Semantics, Textual Inference and Other areas",TACL,,38939411 +TACL.2141,The Return of Lexical Dependencies: Neural Lexicalized PCFGs,Hao Zhu|Yonatan Bisk|Graham Neubig,"In this paper we demonstrate that context free grammar (CFG) based methods for grammar induction benefit from modeling lexical dependencies. This contrasts to the most popular current methods for grammar induction, which focus on discovering either constituents or dependencies. Previous approaches to marry these two disparate syntactic formalisms (e.g. lexicalized PCFGs) have been plagued by sparsity, making them unsuitable for unsupervised grammar induction. However, in this work, we present novel neural models of lexicalized PCFGs which allow us to overcome sparsity problems and effectively induce both constituents and dependencies within a single model. Experiments demonstrate that this unified framework results in stronger results on both representations than achieved when modeling either formalism alone. 
Code is available at https://github.com/neulab/neural-lpcfg.",,"Syntax: Tagging, Chunking, and Parsing",TACL,,38939412 +TACL.2143,Task-Oriented Dialogue as Dataflow Synthesis,Jacob Andreas|John Bufe|David Burkett|Charles Chen|Josh Clausman|Jean Crawford|Kate Crim|Jordan DeLoach|Leah Dorner|Jason Eisner|Hao Fang|Alan Guo|David Hall|Kristin Hayes|Kellie Hill|Diana Ho|Wendy Iwaszuk|Smriti Jha|Dan Klein|Jayant Krishnamurthy|Theo Lanman|Percy Liang|Christopher Lin|Ilya Lintsbakh|Andy McGovern|Alexander Nisnevich|Adam Pauls|Brent Read|Dan Roth|Subhro Roy|Beth Short|Div Slomin|Ben Snyder|Stephon Striplin|Yu Su|Zachary Tellman|Sam Thomson|Andrei Vorobev|Izabela Witoszko|Jason Wolfe|Abby Wray|Yuchen Zhang|Alexander Zotov|Jesse Rusak|Dmitrij Petters,"We describe an approach to task-oriented dialogue in which dialogue state is represented as a dataflow graph. A dialogue agent maps each user utterance to a program that extends this graph. Programs include metacomputation operators for reference and revision that reuse dataflow fragments from previous turns. Our graph-based state enables the expression and manipulation of complex user intents, and explicit metacomputation makes these intents easier for learned models to predict. We introduce a new dataset, SMCalFlow, featuring complex dialogues about events, weather, places, and people. Experiments show that dataflow graphs and metacomputation substantially improve representability and predictability in these natural dialogues. Additional experiments on the MultiWOZ dataset show that our dataflow representation enables an otherwise off-the-shelf sequence-to-sequence model to match the best existing task-specific state tracking model. The SMCalFlow dataset and code for replicating experiments are available at https://www.microsoft.com/en-us/research/project/dataflow-based-dialogue-semantic-machines. ",,Dialog and Interactive Systems,TACL,,38939413 +TACL.2169,A* Beam Search,Clara Meister|Ryan Cotterell|Tim Vieira,"Decoding for many NLP tasks requires an effective heuristic algorithm for approximating exact search since the problem of searching the full output space is often intractable, or impractical in many settings. The default algorithm for this job is beam search--a pruned version of breadth-first search. Quite surprisingly, beam search often returns better results than exact inference due to beneficial search bias for NLP tasks. In this work, we show that the standard implementation of beam search can be made up to 10x faster in practice. Our method assumes that the scoring function is monotonic in the sequence length, which allows us to safely prune hypotheses that cannot be in the final set of hypotheses early on. We devise effective monotonic approximations to popular nonmonontic scoring functions, including length normalization and mutual information decoding. Lastly, we propose a memory-reduced variant of Best-First Beam Search, which has a similar beneficial search bias in terms of downstream performance, but runs in a fraction of the time.",,Language Generation,TACL,,38939414 +TACL.2221,Consistent Transcription and Translation of Speech,Matthias Sperber|Hendra Setiawan|Christian Gollan|Udhay Nallasamy|Matthias Paulik,"The conventional paradigm in speech translation starts with a speech recognition step to generate transcripts, followed by a translation step with the automatic transcripts as input. To address various shortcomings of this paradigm, recent work explores end-to-end trainable direct models that translate without transcribing. 
However, transcripts can be an indispensable output in practical applications, which often display transcripts alongside the translations to users. We make this common requirement explicit and explore the task of jointly transcribing and translating speech. While high accuracy of transcript and translation are crucial, even highly accurate systems can suffer from inconsistencies between both outputs that degrade the user experience. We introduce a methodology to evaluate consistency and compare several modeling approaches, including the traditional cascaded approach and end-to-end models. We find that direct models are poorly suited to the joint transcription/translation task, but that end-to-end models that feature a coupled inference procedure are able to achieve strong consistency. We further introduce simple techniques for directly optimizing for consistency, and analyze the resulting trade-offs between consistency, transcription accuracy, and translation accuracy.",,Speech and Multimodality,TACL,,38939415 +TACL.2255,PERL: Pivot-based Domain Adaptation for Pre-trained Deep Contextualized Embedding Models,Roi Reichart|Eyal Ben David|Carmel Rabinovitz,"Pivot-based neural representation models have led to significant progress in domain adaptation for NLP. However, previous works that follow this approach utilize only labeled data from the source domain and unlabeled data from the source and target domains, but neglect to incorporate massive unlabeled corpora that are not necessarily drawn from these domains. To alleviate this, we propose PERL: A representation learning model that extends contextualized word embedding models such as BERT (Devlin et al., 2019) with pivot-based fine-tuning. PERL outperforms strong baselines across 22 sentiment classification domain adaptation setups, improves in-domain model performance, yields effective reduced-size models and increases model stability.",,Machine Learning for NLP,TACL,,38939416 +TACL.2389,Improving Dialog Evaluation with a Multi-reference Adversarial Dataset and Large Scale Pretraining,Ananya Sai|Akash Mohan Kumar|Siddhartha Arora|Mitesh Khapra,"There is an increasing focus on model-based dialog evaluation metrics such as ADEM, RUBER, and the more recent BERT-based metrics. These models aim to assign a high score to all relevant responses and a low score to all irrelevant responses. Ideally, such models should be trained using multiple relevant and irrelevant responses for any given context. However, no such data is publicly available, and hence existing models are usually trained using a single relevant response and multiple randomly selected responses from other contexts (random negatives). To allow for better training and robust evaluation of model-based metrics, we introduce the DailyDialog++ dataset, consisting of (i) five relevant responses for each context and (ii) five adversarially crafted irrelevant responses for each context. Using this dataset, we first show that even in the presence of multiple correct references, n-gram based metrics and embedding based metrics do not perform well at separating relevant responses from even random negatives. While model-based metrics perform better than n-gram and embedding based metrics on random negatives, their performance drops substantially when evaluated on adversarial examples. To check if large scale pretraining could help, we propose a new BERT-based evaluation metric called DEB, which is pretrained on 727M Reddit conversations and then finetuned on our dataset. 
DEB significantly outperforms existing models, showing better correlation with human judgements and better performance on random negatives (88.27% accuracy). However, its performance again drops substantially, when evaluated on adversarial responses, thereby highlighting that even large-scale pretrained evaluation models are not robust to the adversarial examples in our dataset. The dataset and code are publicly available. (Dataset: https://iitmnlp.github.io/DailyDialog-plusplus/ and Code: https://github.com/iitmnlp/Dialogue-Evaluation-with-BERT).",,Dialog and Interactive Systems,TACL,,38939417 +TACL.2411,Syntactic Structure Distillation Pretraining for Bidirectional Encoders,Adhiguna Kuncoro|Lingpeng Kong|Daniel Fried|Dani Yogatama|Laura Rimell|Chris Dyer|Phil Blunsom,"Textual representation learners trained on large amounts of data have achieved notable success on downstream tasks; intriguingly, they have also performed well on challenging tests of syntactic competence. Hence, it remains an open question whether scalable learners like BERT can become fully proficient in the syntax of natural language by virtue of data scale alone, or whether they still benefit from more explicit syntactic biases. To answer this question, we introduce a knowledge distillation strategy for injecting syntactic biases into BERT pretraining, by distilling the syntactically informative predictions of a hierarchical---albeit harder to scale---syntactic language model. Since BERT models masked words in bidirectional context, we propose to distill the approximate marginal distribution over words in context from the syntactic LM. Our approach reduces relative error by 2-21% on a diverse set of structured prediction tasks, although we obtain mixed results on the GLUE benchmark. 
Our findings demonstrate the benefits of syntactic biases, even for representation learners that exploit large amounts of data, and contribute to a better understanding of where syntactic biases are helpful in benchmarks of natural language understanding.",,"Syntax: Tagging, Chunking, and Parsing",TACL,,38939418 diff --git a/sitedata/paper_recs.json b/sitedata/paper_recs.json index dc25c4b..57675e7 100644 --- a/sitedata/paper_recs.json +++ b/sitedata/paper_recs.json @@ -1 +1 @@ -{"cl.1482": ["cl.1482", "main.682", "main.493", "cl.1552", "main.422"], "cl.1508": ["cl.1508", "main.8", "main.305", "main.635", "demo.100"], "cl.1543": ["cl.1543", "main.329", "demo.116", "main.260", "main.95"], "cl.1547": ["cl.1547", "main.420", "demo.115", "main.140", "main.383"], "cl.1550": ["cl.1550", "srw.105", "main.590", "main.540", "main.688"], "cl.1552": ["cl.1552", "main.311", "main.422", "main.687", "main.385"], "cl.1554": ["cl.1554", "main.46", "main.354", "srw.144", "main.260"], "demo.24": ["demo.24", "main.105", "demo.44", "demo.115", "demo.69"], "demo.28": ["demo.28", "main.538", "demo.69", "demo.101", "demo.93"], "demo.31": ["demo.31", "main.719", "main.282", "demo.104", "main.432"], "demo.32": ["demo.32", "main.120", "main.456", "cl.1508", "main.650"], "demo.33": ["demo.33", "main.155", "main.215", "main.254", "main.400"], "demo.35": ["demo.35", "demo.100", "main.331", "main.77", "main.478"], "demo.37": ["demo.37", "main.638", "main.166", "main.568", "main.60"], "demo.39": ["demo.39", "main.464", "demo.93", "main.699", "main.702"], "demo.41": ["demo.41", "main.729", "demo.104", "main.229", "main.566"], "demo.44": ["demo.44", "demo.69", "demo.93", "demo.115", "demo.24"], "demo.45": ["demo.45", "demo.69", "demo.115", "main.100", "demo.93"], "demo.46": ["demo.46", "main.195", "main.250", "main.776", "main.705"], "demo.47": ["demo.47", "main.705", "main.195", "main.383", "main.222"], "demo.48": ["demo.48", "main.413", "main.652", "main.626", "main.662"], "demo.49": ["demo.49", "main.54", "tacl.1901", "main.218", "demo.102"], "demo.54": ["demo.54", "main.756", "main.417", "srw.106", "main.200"], "demo.58": ["demo.58", "demo.100", "main.331", "main.478", "main.77"], "demo.59": ["demo.59", "main.375", "main.746", "main.303", "main.96"], "demo.61": ["demo.61", "demo.91", "main.273", "main.402", "main.401"], "demo.66": ["demo.66", "main.124", "main.149", "demo.44", "main.4"], "demo.67": ["demo.67", "main.347", "demo.115", "demo.69", "main.391"], "demo.69": ["demo.69", "main.369", "demo.44", "main.425", "demo.115"], "demo.79": ["demo.79", "main.98", "main.60", "main.126", "main.64"], "demo.84": ["demo.84", "main.419", "srw.115", "main.687", "main.432"], "demo.86": ["demo.86", "main.655", "main.246", "main.443", "main.586"], "demo.87": ["demo.87", "srw.123", "main.128", "main.426", "main.771"], "demo.89": ["demo.89", "srw.116", "main.138", "main.322", "demo.87"], "demo.90": ["demo.90", "main.149", "demo.104", "main.158", "main.358"], "demo.91": ["demo.91", "main.204", "main.100", "demo.67", "main.769"], "demo.93": ["demo.93", "demo.44", "demo.69", "main.100", "main.290"], "demo.94": ["demo.94", "main.230", "main.489", "main.449", "demo.44"], "demo.96": ["demo.96", "srw.104", "main.456", "main.123"], "demo.100": ["demo.100", "demo.58", "main.331", "main.478", "main.77"], "demo.101": ["demo.101", "demo.67", "main.135", "main.449", "demo.91"], "demo.102": ["demo.102", "demo.49", "demo.39", "main.72", "demo.79"], "demo.104": ["demo.104", "demo.31", "demo.91", "main.96", "main.410"], "demo.107": ["demo.107", 
"main.750", "demo.86", "main.719", "main.246"], "demo.115": ["demo.115", "demo.69", "demo.67", "main.443", "demo.44"], "demo.116": ["demo.116", "main.156", "main.421", "main.260", "main.329"], "demo.120": ["demo.120", "demo.69", "demo.115", "demo.93", "main.76"], "demo.124": ["demo.124", "main.67", "main.605", "tacl.1805", "main.167"], "demo.130": ["demo.130", "main.677", "demo.69", "main.769", "main.74"], "demo.139": ["demo.139", "main.759", "main.310", "srw.15", "main.206"], "main.1": ["main.1", "main.176", "main.633", "main.351", "srw.116"], "main.2": ["main.2", "main.694", "main.437", "main.270", "cl.1552"], "main.3": ["main.3", "main.5", "main.563", "main.10", "main.567"], "main.4": ["main.4", "main.124", "main.19", "main.220", "demo.115"], "main.5": ["main.5", "main.563", "main.567", "main.3", "main.53"], "main.6": ["main.6", "main.515", "main.58", "main.635", "main.744"], "main.7": ["main.7", "main.516", "main.8", "main.131", "main.55"], "main.8": ["main.8", "main.516", "main.635", "demo.37", "cl.1508"], "main.9": ["main.9", "main.52", "main.226", "main.638", "main.694"], "main.10": ["main.10", "main.5", "main.563", "main.3", "main.567"], "main.11": ["main.11", "tacl.1853", "main.505", "main.370", "main.585"], "main.12": ["main.12", "main.524", "main.148", "main.636", "main.625"], "main.13": ["main.13", "tacl.1811", "main.376", "main.301", "main.451"], "main.14": ["main.14", "main.480", "main.230", "main.526", "main.241"], "main.15": ["main.15", "main.171", "main.277", "main.350", "main.6"], "main.16": ["main.16", "main.583", "main.664", "main.93", "main.469"], "main.17": ["main.17", "main.168", "main.546", "main.332", "main.101"], "main.18": ["main.18", "main.538", "main.436", "main.135", "main.522"], "main.19": ["main.19", "main.413", "main.652", "main.410", "main.450"], "main.20": ["main.20", "main.413", "main.500", "main.652", "main.772"], "main.21": ["main.21", "main.69", "main.355", "main.498", "srw.122"], "main.22": ["main.22", "main.535", "main.545", "tacl.1967", "main.28"], "main.23": ["main.23", "main.235", "main.708", "main.705", "main.354"], "main.24": ["main.24", "main.705", "main.200", "main.36", "main.542"], "main.25": ["main.25", "main.173", "main.223", "main.646", "main.561"], "main.26": ["main.26", "main.631", "main.175", "main.461", "main.21"], "main.27": ["main.27", "main.282", "main.749", "main.572", "main.449"], "main.28": ["main.28", "main.535", "main.707", "main.654", "srw.19"], "main.29": ["main.29", "main.603", "main.277", "tacl.1876", "main.170"], "main.30": ["main.30", "main.346", "main.618", "main.431", "main.272"], "main.31": ["main.31", "main.199", "main.553", "main.207", "main.366"], "main.32": ["main.32", "main.548", "main.73", "srw.5", "main.724"], "main.33": ["main.33", "main.197", "main.451", "tacl.1849", "main.374"], "main.34": ["main.34", "main.41", "main.40", "srw.54", "main.532"], "main.35": ["main.35", "srw.55", "main.491", "main.387", "main.448"], "main.36": ["main.36", "main.687", "main.38", "tacl.1849", "main.609"], "main.37": ["main.37", "main.672", "main.147", "main.38", "cl.1552"], "main.38": ["main.38", "main.250", "main.687", "main.686", "main.504"], "main.39": ["main.39", "main.432", "main.419", "main.387", "main.312"], "main.40": ["main.40", "main.41", "main.34", "main.38", "srw.54"], "main.41": ["main.41", "main.34", "main.40", "main.620", "main.693"], "main.42": ["main.42", "main.254", "main.36", "srw.2", "main.325"], "main.43": ["main.43", "tacl.1709", "tacl.1815", "main.428", "srw.127"], "main.44": ["main.44", "main.459", 
"main.176", "main.747", "main.313"], "main.45": ["main.45", "main.200", "main.247", "main.590", "main.76"], "main.46": ["main.46", "main.234", "main.684", "cl.1554", "main.160"], "main.47": ["main.47", "main.389", "main.176", "main.415", "demo.84"], "main.48": ["main.48", "main.771", "main.494", "main.352", "main.97"], "main.49": ["main.49", "main.32", "srw.5", "main.338", "main.358"], "main.50": ["main.50", "main.352", "main.723", "main.509", "main.305"], "main.51": ["main.51", "srw.39", "main.337", "main.276", "main.260"], "main.52": ["main.52", "main.185", "main.55", "main.288", "main.638"], "main.53": ["main.53", "main.5", "main.636", "main.563", "main.637"], "main.54": ["main.54", "demo.49", "main.221", "main.638", "main.62"], "main.55": ["main.55", "main.568", "main.638", "main.185", "main.221"], "main.56": ["main.56", "main.185", "main.52", "main.568", "main.55"], "main.57": ["main.57", "main.60", "demo.79", "main.98", "main.64"], "main.58": ["main.58", "main.5", "main.734", "main.735", "main.574"], "main.59": ["main.59", "main.566", "main.166", "main.129", "main.98"], "main.60": ["main.60", "demo.79", "main.98", "main.126", "main.166"], "main.61": ["main.61", "main.481", "main.141", "main.568", "main.670"], "main.62": ["main.62", "main.566", "main.637", "main.166", "tacl.1901"], "main.63": ["main.63", "main.163", "main.708", "main.542", "main.290"], "main.64": ["main.64", "demo.79", "main.126", "main.60", "main.98"], "main.65": ["main.65", "main.544", "main.235", "main.710", "main.694"], "main.66": ["main.66", "main.428", "main.564", "main.491", "main.432"], "main.67": ["main.67", "tacl.1805", "main.640", "main.167", "main.241"], "main.68": ["main.68", "main.9", "main.185", "main.708", "main.358"], "main.69": ["main.69", "main.545", "main.21", "tacl.1845", "main.498"], "main.70": ["main.70", "main.366", "main.548", "main.291", "main.227"], "main.71": ["main.71", "main.235", "main.364", "main.694", "main.753"], "main.72": ["main.72", "main.143", "main.248", "main.201", "srw.5"], "main.73": ["main.73", "main.32", "main.630", "main.548", "srw.5"], "main.74": ["main.74", "main.498", "main.414", "main.652", "main.600"], "main.75": ["main.75", "demo.96", "srw.104", "main.118"], "main.76": ["main.76", "main.204", "main.195", "main.247", "main.250"], "main.77": ["main.77", "main.331", "main.392", "main.478", "demo.100"], "main.78": ["main.78", "main.194", "main.279", "main.191", "tacl.1801"], "main.79": ["main.79", "main.494", "main.492", "main.771", "main.286"], "main.80": ["main.80", "main.510", "main.554", "main.329", "main.645"], "main.81": ["main.81", "main.67", "main.547", "main.291", "main.339"], "main.82": ["main.82", "main.315", "main.705", "main.214", "main.370"], "main.83": ["main.83", "main.435", "main.247", "main.666", "main.87"], "main.84": ["main.84", "main.496", "main.91", "main.742", "main.398"], "main.85": ["main.85", "main.537", "main.325", "main.617", "main.37"], "main.86": ["main.86", "main.222", "main.388", "main.689", "main.183"], "main.87": ["main.87", "main.421", "main.747", "main.554", "main.252"], "main.88": ["main.88", "main.135", "main.586", "main.100", "main.105"], "main.89": ["main.89", "main.247", "main.197", "main.357", "main.195"], "main.90": ["main.90", "main.498", "srw.122", "main.772", "main.21"], "main.91": ["main.91", "main.74", "main.730", "main.69", "main.67"], "main.92": ["main.92", "main.362", "main.610", "main.27", "tacl.1727"], "main.93": ["main.93", "main.664", "main.674", "main.4", "main.704"], "main.94": ["main.94", "main.318", "demo.116", 
"main.766", "main.156"], "main.95": ["main.95", "main.369", "main.255", "main.423", "main.595"], "main.96": ["main.96", "main.663", "main.764", "demo.104", "cl.1552"], "main.97": ["main.97", "main.549", "main.655", "main.761", "main.361"], "main.98": ["main.98", "demo.79", "main.60", "main.166", "main.64"], "main.99": ["main.99", "main.102", "main.279", "main.336", "main.12"], "main.100": ["main.100", "main.115", "main.169", "demo.69", "main.105"], "main.101": ["main.101", "main.454", "main.712", "main.355", "main.456"], "main.102": ["main.102", "main.128", "main.481", "main.58", "main.99"], "main.103": ["main.103", "main.710", "main.105", "main.724", "main.617"], "main.104": ["main.104", "main.58", "main.27", "main.283", "main.128"], "main.105": ["main.105", "demo.24", "main.100", "main.538", "demo.69"], "main.106": ["main.106", "main.67", "main.322", "main.571", "demo.116"], "main.107": ["main.107", "main.329", "main.443", "main.716", "main.108"], "main.108": ["main.108", "main.767", "main.449", "main.290", "main.447"], "main.109": ["main.109", "main.554", "main.658", "main.421", "srw.137"], "main.110": ["main.110", "main.185", "main.19", "main.173", "main.52"], "main.111": ["main.111", "demo.91", "main.542", "main.192", "main.421"], "main.112": ["main.112", "main.706", "demo.69", "demo.115", "main.290"], "main.113": ["main.113", "main.151", "srw.58", "main.93", "main.253"], "main.114": ["main.114", "main.558", "main.400", "main.306", "main.214"], "main.115": ["main.115", "main.100", "main.586", "main.408", "main.420"], "main.116": ["main.116", "main.173", "main.453", "main.467", "main.705"], "main.117": ["main.117", "main.19", "main.652", "main.410", "main.662"], "main.118": ["main.118", "main.711", "main.349", "main.401", "main.75"], "main.119": ["main.119", "main.397", "tacl.1805", "main.167", "main.67"], "main.120": ["main.120", "demo.58", "main.478", "main.331", "main.555"], "main.121": ["main.121", "main.125", "main.554", "main.153", "main.581"], "main.122": ["main.122", "main.268", "tacl.1853", "main.480", "main.710"], "main.123": ["main.123", "main.456", "main.454", "main.458", "main.173"], "main.124": ["main.124", "main.445", "main.4", "main.173", "demo.69"], "main.125": ["main.125", "main.121", "main.411", "main.521", "main.687"], "main.126": ["main.126", "demo.79", "main.60", "main.64", "main.98"], "main.127": ["main.127", "main.568", "main.635", "main.55", "main.517"], "main.128": ["main.128", "main.436", "main.426", "demo.87", "main.148"], "main.129": ["main.129", "demo.79", "main.59", "main.166", "main.60"], "main.130": ["main.130", "demo.37", "main.135", "main.210", "main.586"], "main.131": ["main.131", "main.516", "main.7", "main.219", "main.218"], "main.132": ["main.132", "main.650", "main.357", "main.622", "srw.79"], "main.133": ["main.133", "main.218", "main.637", "main.638", "srw.22"], "main.134": ["main.134", "main.302", "main.362", "main.301", "main.298"], "main.135": ["main.135", "main.141", "main.670", "main.586", "main.88"], "main.136": ["main.136", "main.247", "main.574", "main.197", "main.230"], "main.137": ["main.137", "main.670", "main.714", "main.713", "main.521"], "main.138": ["main.138", "demo.89", "srw.116", "main.322", "main.777"], "main.139": ["main.139", "main.581", "main.520", "main.519", "main.752"], "main.140": ["main.140", "main.383", "cl.1547", "cl.1482", "main.158"], "main.141": ["main.141", "main.670", "main.135", "main.207", "main.321"], "main.142": ["main.142", "main.527", "main.444", "main.140", "main.159"], "main.143": ["main.143", 
"main.201", "srw.137", "main.324", "main.148"], "main.144": ["main.144", "tacl.1756", "main.463", "main.692", "main.253"], "main.145": ["main.145", "srw.9", "main.270", "main.687", "main.38"], "main.146": ["main.146", "main.578", "main.747", "srw.2", "main.40"], "main.147": ["main.147", "main.252", "main.687", "main.37", "main.336"], "main.148": ["main.148", "main.252", "main.150", "main.324", "main.12"], "main.149": ["main.149", "main.320", "demo.66", "main.105", "main.18"], "main.150": ["main.150", "main.252", "main.148", "main.304", "main.324"], "main.151": ["main.151", "main.554", "main.421", "main.113", "main.658"], "main.152": ["main.152", "main.756", "srw.54", "main.147", "srw.137"], "main.153": ["main.153", "main.554", "main.269", "demo.116", "main.121"], "main.154": ["main.154", "main.468", "main.690", "main.702", "main.332"], "main.155": ["main.155", "demo.33", "main.254", "main.215", "main.400"], "main.156": ["main.156", "main.421", "demo.116", "main.260", "main.329"], "main.157": ["main.157", "main.291", "main.509", "main.565", "main.441"], "main.158": ["main.158", "tacl.1892", "main.303", "main.179", "srw.28"], "main.159": ["main.159", "main.177", "main.698", "main.597", "main.142"], "main.160": ["main.160", "main.463", "main.375", "main.387", "main.641"], "main.161": ["main.161", "main.178", "main.623", "tacl.1886", "main.162"], "main.162": ["main.162", "main.473", "main.507", "main.668", "main.86"], "main.163": ["main.163", "main.63", "main.559", "main.708", "main.542"], "main.164": ["main.164", "main.173", "main.516", "main.219", "main.419"], "main.165": ["main.165", "main.565", "main.370", "main.595", "main.150"], "main.166": ["main.166", "main.59", "main.60", "main.98", "demo.79"], "main.167": ["main.167", "tacl.1805", "main.67", "main.397", "main.119"], "main.168": ["main.168", "main.17", "main.27", "main.568", "main.4"], "main.169": ["main.169", "main.639", "main.100", "srw.17", "main.673"], "main.170": ["main.170", "main.275", "main.29", "main.755", "srw.2"], "main.171": ["main.171", "main.15", "main.251", "main.753", "main.235"], "main.172": ["main.172", "main.228", "main.705", "main.556", "main.124"], "main.173": ["main.173", "main.454", "main.457", "main.458", "main.124"], "main.174": ["main.174", "main.453", "main.451", "main.173", "main.556"], "main.175": ["main.175", "main.461", "main.513", "main.457", "main.555"], "main.176": ["main.176", "main.1", "main.8", "main.587", "srw.116"], "main.177": ["main.177", "main.543", "main.159", "cl.1552", "main.561"], "main.178": ["main.178", "main.161", "tacl.1886", "main.233", "main.481"], "main.179": ["main.179", "main.158", "main.173", "main.309", "main.263"], "main.180": ["main.180", "srw.16", "main.244", "main.97", "main.493"], "main.181": ["main.181", "main.406", "main.179", "main.384", "main.584"], "main.182": ["main.182", "demo.79", "main.98", "main.57", "main.60"], "main.183": ["main.183", "main.769", "main.222", "main.86", "main.689"], "main.184": ["main.184", "main.717", "main.748", "main.635", "main.470"], "main.185": ["main.185", "main.568", "main.55", "main.52", "main.19"], "main.186": ["main.186", "main.3", "main.126", "main.5", "demo.79"], "main.187": ["main.187", "main.100", "main.18", "main.708", "main.677"], "main.188": ["main.188", "main.242", "main.278", "main.438", "main.575"], "main.189": ["main.189", "main.390", "main.62", "main.215", "demo.87"], "main.190": ["main.190", "main.212", "main.195", "main.768", "main.771"], "main.191": ["main.191", "srw.105", "main.263", "main.590", "main.200"], "main.192": 
["main.192", "main.111", "tacl.1853", "main.341", "srw.123"], "main.193": ["main.193", "main.581", "main.627", "main.3", "main.136"], "main.194": ["main.194", "main.522", "main.538", "main.449", "main.78"], "main.195": ["main.195", "main.204", "main.250", "main.247", "main.76"], "main.196": ["main.196", "srw.55", "main.437", "tacl.1727", "main.753"], "main.197": ["main.197", "main.249", "main.314", "main.357", "main.214"], "main.198": ["main.198", "srw.39", "main.264", "main.773", "main.257"], "main.199": ["main.199", "main.640", "main.362", "main.31", "main.224"], "main.200": ["main.200", "main.45", "main.705", "main.24", "main.76"], "main.201": ["main.201", "main.143", "main.618", "tacl.1766", "main.766"], "main.202": ["main.202", "main.528", "main.250", "main.89", "main.195"], "main.203": ["main.203", "srw.105", "main.540", "main.380", "main.245"], "main.204": ["main.204", "main.195", "main.411", "main.76", "main.537"], "main.205": ["main.205", "main.104", "main.208", "main.426", "main.103"], "main.206": ["main.206", "main.351", "main.216", "main.505", "main.1"], "main.207": ["main.207", "main.670", "main.141", "main.693", "main.321"], "main.208": ["main.208", "main.614", "main.707", "main.205", "main.677"], "main.209": ["main.209", "main.649", "main.515", "main.546", "main.717"], "main.210": ["main.210", "main.115", "main.130", "main.441", "main.586"], "main.211": ["main.211", "main.361", "main.413", "main.321", "main.662"], "main.212": ["main.212", "main.429", "main.190", "main.303", "main.309"], "main.213": ["main.213", "main.351", "main.473", "main.438", "main.1"], "main.214": ["main.214", "srw.79", "main.197", "main.314", "main.357"], "main.215": ["main.215", "main.400", "main.155", "demo.33", "main.189"], "main.216": ["main.216", "main.273", "main.38", "main.683", "main.505"], "main.217": ["main.217", "main.661", "main.614", "main.336", "main.345"], "main.218": ["main.218", "main.133", "main.131", "main.220", "demo.37"], "main.219": ["main.219", "main.131", "main.93", "main.164", "main.664"], "main.220": ["main.220", "main.4", "srw.58", "demo.37", "main.638"], "main.221": ["main.221", "main.638", "main.568", "main.55", "main.54"], "main.222": ["main.222", "main.86", "main.402", "main.388", "main.183"], "main.223": ["main.223", "main.535", "main.25", "srw.17", "main.354"], "main.224": ["main.224", "main.67", "main.640", "main.712", "main.539"], "main.225": ["main.225", "main.11", "tacl.1886", "tacl.1853", "main.25"], "main.226": ["main.226", "main.705", "main.9", "main.666", "main.515"], "main.227": ["main.227", "main.52", "main.705", "main.516", "main.354"], "main.228": ["main.228", "main.172", "tacl.1906", "main.17", "main.707"], "main.229": ["main.229", "main.769", "main.194", "main.18", "main.88"], "main.230": ["main.230", "demo.94", "main.14", "main.136", "main.683"], "main.231": ["main.231", "main.482", "srw.28", "main.707", "main.29"], "main.232": ["main.232", "main.729", "main.463", "main.427", "main.340"], "main.233": ["main.233", "main.481", "main.643", "main.683", "main.61"], "main.234": ["main.234", "main.409", "main.495", "main.463", "main.46"], "main.235": ["main.235", "main.316", "main.753", "main.694", "main.71"], "main.236": ["main.236", "main.431", "main.434", "main.156", "main.405"], "main.237": ["main.237", "main.187", "srw.122", "demo.61", "demo.31"], "main.238": ["main.238", "main.412", "main.617", "main.572", "main.526"], "main.239": ["main.239", "main.688", "main.528", "main.336", "main.750"], "main.240": ["main.240", "main.391", "main.18", "demo.69", 
"demo.67"], "main.241": ["main.241", "main.640", "main.67", "tacl.1805", "main.553"], "main.242": ["main.242", "main.188", "main.278", "main.503", "main.235"], "main.243": ["main.243", "main.314", "main.544", "main.18", "main.354"], "main.244": ["main.244", "main.769", "main.770", "main.245", "main.247"], "main.245": ["main.245", "main.540", "srw.105", "main.244", "main.317"], "main.246": ["main.246", "main.391", "main.764", "demo.86", "main.4"], "main.247": ["main.247", "tacl.1853", "main.705", "tacl.1849", "main.195"], "main.248": ["main.248", "main.453", "main.710", "main.709", "main.767"], "main.249": ["main.249", "main.197", "main.357", "main.540", "main.314"], "main.250": ["main.250", "main.195", "main.247", "main.38", "main.204"], "main.251": ["main.251", "main.171", "main.277", "main.36", "main.593"], "main.252": ["main.252", "main.148", "srw.54", "srw.137", "main.150"], "main.253": ["main.253", "main.691", "main.532", "main.320", "main.113"], "main.254": ["main.254", "main.42", "main.155", "demo.33", "main.400"], "main.255": ["main.255", "main.156", "main.95", "main.369", "demo.116"], "main.256": ["main.256", "srw.48", "main.96", "main.85", "main.188"], "main.257": ["main.257", "main.250", "main.198", "main.683", "main.200"], "main.258": ["main.258", "main.625", "main.766", "main.675", "main.653"], "main.259": ["main.259", "tacl.1912", "main.775", "tacl.1720", "main.58"], "main.260": ["main.260", "main.468", "main.264", "main.421", "main.493"], "main.261": ["main.261", "main.506", "main.332", "demo.104", "main.699"], "main.262": ["main.262", "main.773", "main.468", "main.264", "main.690"], "main.263": ["main.263", "main.383", "main.357", "main.197", "main.191"], "main.264": ["main.264", "main.468", "main.326", "main.773", "main.262"], "main.265": ["main.265", "main.690", "main.326", "main.468", "main.484"], "main.266": ["main.266", "main.694", "main.316", "main.753", "main.437"], "main.267": ["main.267", "main.283", "main.517", "main.431", "main.90"], "main.268": ["main.268", "main.581", "main.297", "main.122", "main.283"], "main.269": ["main.269", "main.153", "cl.1552", "main.311", "tacl.1815"], "main.270": ["main.270", "main.145", "srw.9", "tacl.1815", "main.411"], "main.271": ["main.271", "main.625", "main.304", "main.593", "main.517"], "main.272": ["main.272", "tacl.1801", "main.581", "main.148", "main.128"], "main.273": ["main.273", "main.570", "main.401", "main.402", "main.556"], "main.274": ["main.274", "main.547", "main.758", "main.578", "main.600"], "main.275": ["main.275", "main.170", "main.29", "main.245", "main.615"], "main.276": ["main.276", "main.324", "main.766", "main.318", "srw.137"], "main.277": ["main.277", "main.29", "main.251", "main.641", "main.15"], "main.278": ["main.278", "main.188", "main.242", "main.532", "main.575"], "main.279": ["main.279", "main.78", "main.62", "main.390", "main.99"], "main.280": ["main.280", "main.199", "main.393", "main.48", "main.466"], "main.281": ["main.281", "tacl.1886", "main.352", "main.601"], "main.282": ["main.282", "main.364", "main.27", "main.449", "main.538"], "main.283": ["main.283", "srw.123", "main.31", "main.426", "main.104"], "main.284": ["main.284", "main.434", "tacl.1849", "main.518", "main.451"], "main.285": ["main.285", "demo.93", "demo.69", "main.184", "demo.115"], "main.286": ["main.286", "main.494", "main.27", "main.719", "main.48"], "main.287": ["main.287", "main.639", "srw.17", "main.331", "main.673"], "main.288": ["main.288", "main.289", "main.342", "main.372", "main.52"], "main.289": ["main.289", 
"main.288", "main.342", "main.580", "main.61"], "main.290": ["main.290", "main.343", "main.391", "main.100", "main.374"], "main.291": ["main.291", "main.588", "main.157", "main.565", "main.58"], "main.292": ["main.292", "main.717", "main.370", "main.748", "main.565"], "main.293": ["main.293", "main.296", "main.631", "main.338", "main.340"], "main.294": ["main.294", "main.639", "main.169", "srw.17", "main.354"], "main.295": ["main.295", "main.293", "main.588", "main.631", "main.296"], "main.296": ["main.296", "main.293", "main.631", "main.340", "main.582"], "main.297": ["main.297", "main.268", "main.591", "main.247", "main.629"], "main.298": ["main.298", "main.352", "main.14", "main.749", "main.134"], "main.299": ["main.299", "tacl.1853", "main.301", "main.622", "main.571"], "main.300": ["main.300", "tacl.1811", "main.600", "main.301", "main.764"], "main.301": ["main.301", "main.299", "main.197", "tacl.1853", "main.119"], "main.302": ["main.302", "main.358", "main.290", "tacl.1876", "main.135"], "main.303": ["main.303", "tacl.1892", "main.158", "main.212", "main.591"], "main.304": ["main.304", "main.150", "main.252", "main.148", "main.324"], "main.305": ["main.305", "cl.1508", "main.390", "main.8", "main.392"], "main.306": ["main.306", "main.664", "main.731", "main.643", "main.683"], "main.307": ["main.307", "main.331", "demo.100", "demo.58", "main.77"], "main.308": ["main.308", "main.332", "main.404", "srw.18", "demo.35"], "main.309": ["main.309", "main.212", "main.179", "main.769", "srw.105"], "main.310": ["main.310", "main.429", "main.467", "main.195", "main.263"], "main.311": ["main.311", "main.429", "srw.115", "main.197", "main.687"], "main.312": ["main.312", "main.432", "main.385", "main.419", "main.687"], "main.313": ["main.313", "main.526", "main.643", "main.481", "main.385"], "main.314": ["main.314", "main.197", "main.357", "main.214", "main.249"], "main.315": ["main.315", "tacl.1876", "main.528", "main.311", "main.82"], "main.316": ["main.316", "main.235", "main.753", "main.694", "main.36"], "main.317": ["main.317", "main.540", "main.244", "main.245", "main.197"], "main.318": ["main.318", "main.94", "main.766", "main.156", "main.618"], "main.319": ["main.319", "srw.105", "main.540", "main.326", "main.590"], "main.320": ["main.320", "main.149", "main.124", "main.253", "main.522"], "main.321": ["main.321", "main.207", "main.141", "main.322", "main.670"], "main.322": ["main.322", "main.321", "main.693", "srw.116", "main.273"], "main.323": ["main.323", "main.504", "main.593", "main.686", "main.325"], "main.324": ["main.324", "srw.137", "main.148", "main.150", "main.421"], "main.325": ["main.325", "main.593", "main.504", "main.744", "main.537"], "main.326": ["main.326", "main.264", "main.468", "main.262", "main.690"], "main.327": ["main.327", "main.153", "main.4", "srw.58", "main.554"], "main.328": ["main.328", "demo.79", "main.98", "main.126", "main.57"], "main.329": ["main.329", "main.716", "main.421", "main.348", "main.747"], "main.330": ["main.330", "main.651", "main.222", "main.388", "main.600"], "main.331": ["main.331", "main.77", "main.392", "demo.58", "demo.100"], "main.332": ["main.332", "main.308", "main.761", "main.17", "main.655"], "main.333": ["main.333", "main.4", "main.173", "main.708", "main.568"], "main.334": ["main.334", "tacl.1903", "main.340", "main.199", "main.336"], "main.335": ["main.335", "main.574", "main.577", "main.571", "main.520"], "main.336": ["main.336", "tacl.1906", "main.252", "main.523", "main.722"], "main.337": ["main.337", "main.51", "main.725", 
"main.71", "main.575"], "main.338": ["main.338", "main.293", "main.374", "main.295", "main.341"], "main.339": ["main.339", "main.296", "main.295", "main.527", "main.81"], "main.340": ["main.340", "main.296", "main.293", "main.631", "main.295"], "main.341": ["main.341", "main.374", "main.195", "main.204", "main.338"], "main.342": ["main.342", "main.288", "main.289", "main.52", "main.67"], "main.343": ["main.343", "main.570", "main.374", "main.290", "main.273"], "main.344": ["main.344", "main.347", "main.370", "main.247", "main.89"], "main.345": ["main.345", "main.661", "main.217", "main.395", "main.1"], "main.346": ["main.346", "tacl.1801", "main.197", "main.347", "main.89"], "main.347": ["main.347", "demo.67", "main.374", "main.522", "main.391"], "main.348": ["main.348", "main.329", "main.716", "main.336", "main.747"], "main.349": ["main.349", "main.118", "main.711", "main.306", "main.114"], "main.350": ["main.350", "main.344", "main.36", "main.217", "main.147"], "main.351": ["main.351", "main.206", "main.344", "main.215", "main.1"], "main.352": ["main.352", "main.509", "main.50", "main.48", "main.298"], "main.353": ["main.353", "tacl.1834", "demo.96", "main.390", "main.427"], "main.354": ["main.354", "main.639", "main.169", "main.673", "main.705"], "main.355": ["main.355", "main.21", "main.18", "main.538", "main.708"], "main.356": ["main.356", "main.368", "main.544", "main.37", "main.65"], "main.357": ["main.357", "main.197", "main.249", "main.314", "main.214"], "main.358": ["main.358", "main.347", "main.204", "main.302", "main.290"], "main.359": ["main.359", "main.692", "main.252", "main.217", "main.151"], "main.360": ["main.360", "main.593", "main.325", "main.250", "main.76"], "main.361": ["main.361", "main.211", "main.87", "main.97", "main.414"], "main.362": ["main.362", "main.67", "main.640", "main.588", "main.224"], "main.363": ["main.363", "main.704", "main.483", "main.248", "main.510"], "main.364": ["main.364", "main.282", "main.238", "main.449", "main.431"], "main.365": ["main.365", "main.493", "main.300", "main.630", "main.168"], "main.366": ["main.366", "main.31", "main.67", "main.588", "main.362"], "main.367": ["main.367", "main.694", "main.593", "main.341", "main.753"], "main.368": ["main.368", "main.431", "main.705", "main.76", "main.195"], "main.369": ["main.369", "demo.69", "main.95", "main.204", "main.423"], "main.370": ["main.370", "main.565", "main.595", "main.681", "main.247"], "main.371": ["main.371", "main.667", "main.453", "main.298", "srw.5"], "main.372": ["main.372", "main.471", "main.288", "main.681", "main.52"], "main.373": ["main.373", "srw.22", "main.213", "main.172", "main.673"], "main.374": ["main.374", "main.343", "main.347", "main.341", "main.338"], "main.375": ["main.375", "tacl.1892", "main.109", "cl.1482", "main.303"], "main.376": ["main.376", "main.629", "tacl.1805", "main.13", "main.777"], "main.377": ["main.377", "main.67", "main.224", "main.605", "main.640"], "main.378": ["main.378", "main.326", "main.773", "main.629", "main.488"], "main.379": ["main.379", "main.607", "main.482", "main.142", "main.571"], "main.380": ["main.380", "main.483", "main.264", "main.485", "main.769"], "main.381": ["main.381", "main.422", "main.269", "demo.84", "main.365"], "main.382": ["main.382", "main.771", "srw.105", "main.463", "main.494"], "main.383": ["main.383", "main.420", "main.263", "main.431", "main.140"], "main.384": ["main.384", "main.430", "srw.127", "main.158", "main.375"], "main.385": ["main.385", "srw.115", "main.687", "main.432", "main.312"], "main.386": 
["main.386", "main.454", "main.771", "main.101", "main.409"], "main.387": ["main.387", "main.432", "main.687", "main.419", "main.385"], "main.388": ["main.388", "main.402", "main.273", "main.222", "main.86"], "main.389": ["main.389", "main.594", "srw.54", "main.596", "main.650"], "main.390": ["main.390", "main.305", "main.189", "main.59", "main.86"], "main.391": ["main.391", "main.240", "main.290", "tacl.1876", "main.610"], "main.392": ["main.392", "main.331", "main.77", "demo.58", "demo.100"], "main.393": ["main.393", "main.765", "main.27", "main.280", "main.281"], "main.394": ["main.394", "main.401", "main.372", "main.288", "main.336"], "main.395": ["main.395", "main.729", "main.345", "srw.98", "main.625"], "main.396": ["main.396", "main.530", "srw.122", "main.762", "main.515"], "main.397": ["main.397", "main.119", "main.167", "main.67", "tacl.1805"], "main.398": ["main.398", "main.745", "main.608", "main.247", "main.708"], "main.399": ["main.399", "tacl.1843", "main.139", "main.370", "main.757"], "main.400": ["main.400", "main.731", "main.114", "main.343", "main.215"], "main.401": ["main.401", "main.402", "main.273", "main.570", "main.343"], "main.402": ["main.402", "main.401", "main.570", "main.273", "main.388"], "main.403": ["main.403", "main.509", "main.475", "main.702", "main.157"], "main.404": ["main.404", "main.326", "main.769", "main.468", "main.773"], "main.405": ["main.405", "main.488", "main.431", "main.487", "main.484"], "main.406": ["main.406", "main.656", "main.181", "main.546", "main.362"], "main.407": ["main.407", "srw.28", "main.229", "main.177", "main.644"], "main.408": ["main.408", "main.409", "main.764", "main.115", "main.387"], "main.409": ["main.409", "main.408", "main.771", "main.449", "main.387"], "main.410": ["main.410", "main.19", "main.507", "main.600", "main.652"], "main.411": ["main.411", "main.204", "main.687", "tacl.1815", "main.385"], "main.412": ["main.412", "main.238", "main.617", "main.572", "main.653"], "main.413": ["main.413", "main.20", "main.600", "main.19", "main.652"], "main.414": ["main.414", "main.413", "main.600", "main.20", "main.772"], "main.415": ["main.415", "main.336", "main.421", "main.627", "srw.137"], "main.416": ["main.416", "main.569", "main.162", "main.397", "main.268"], "main.417": ["main.417", "main.756", "main.709", "srw.137", "srw.106"], "main.418": ["main.418", "main.488", "main.484", "main.690", "main.773"], "main.419": ["main.419", "main.432", "main.312", "main.385", "main.387"], "main.420": ["main.420", "cl.1547", "main.383", "main.408", "main.115"], "main.421": ["main.421", "main.747", "main.329", "main.493", "main.554"], "main.422": ["main.422", "cl.1552", "cl.1482", "demo.84", "main.431"], "main.423": ["main.423", "main.369", "main.197", "main.346", "main.95"], "main.424": ["main.424", "main.709", "main.707", "main.173", "main.448"], "main.425": ["main.425", "demo.69", "main.444", "main.100", "main.469"], "main.426": ["main.426", "main.128", "srw.123", "demo.87", "main.52"], "main.427": ["main.427", "main.232", "main.187", "main.219", "main.644"], "main.428": ["main.428", "main.173", "main.66", "main.499", "main.768"], "main.429": ["main.429", "main.311", "main.383", "main.743", "main.212"], "main.430": ["main.430", "srw.127", "main.384", "main.490", "main.257"], "main.431": ["main.431", "main.405", "main.236", "main.488", "main.260"], "main.432": ["main.432", "main.312", "main.419", "main.387", "main.385"], "main.433": ["main.433", "main.497", "srw.114", "main.432", "main.495"], "main.434": ["main.434", "main.236", 
"main.612", "main.431", "main.156"], "main.435": ["main.435", "main.730", "main.664", "main.83", "main.570"], "main.436": ["main.436", "main.18", "main.128", "main.731", "main.12"], "main.437": ["main.437", "main.694", "main.753", "main.336", "main.2"], "main.438": ["main.438", "main.744", "main.325", "main.188", "main.200"], "main.439": ["main.439", "main.247", "main.250", "main.451", "tacl.1853"], "main.440": ["main.440", "main.729", "main.644", "main.229", "demo.41"], "main.441": ["main.441", "main.197", "main.115", "main.477", "main.770"], "main.442": ["main.442", "srw.55", "main.543", "tacl.1743", "main.506"], "main.443": ["main.443", "main.538", "main.449", "demo.115", "main.290"], "main.444": ["main.444", "main.98", "demo.79", "main.60", "main.126"], "main.445": ["main.445", "main.552", "main.124", "main.453", "main.100"], "main.446": ["main.446", "main.52", "main.638", "main.634", "main.568"], "main.447": ["main.447", "demo.48", "demo.93", "main.464", "demo.46"], "main.448": ["main.448", "main.173", "main.333", "main.113", "main.455"], "main.449": ["main.449", "main.538", "main.443", "main.135", "main.552"], "main.450": ["main.450", "main.454", "main.413", "main.19", "main.20"], "main.451": ["main.451", "main.555", "main.552", "main.141", "main.553"], "main.452": ["main.452", "main.460", "main.458", "main.453", "main.173"], "main.453": ["main.453", "main.445", "main.174", "main.452", "main.173"], "main.454": ["main.454", "main.173", "main.450", "main.458", "main.386"], "main.455": ["main.455", "main.454", "main.124", "main.450", "main.333"], "main.456": ["main.456", "main.123", "demo.96", "main.639"], "main.457": ["main.457", "main.173", "main.555", "main.175", "main.454"], "main.458": ["main.458", "main.454", "main.173", "main.450", "main.452"], "main.459": ["main.459", "main.44", "main.552", "main.556", "main.454"], "main.460": ["main.460", "main.452", "main.175", "main.454", "main.458"], "main.461": ["main.461", "main.175", "main.513", "main.555", "main.173"], "main.462": ["main.462", "main.463", "main.700", "main.405", "main.485"], "main.463": ["main.463", "main.698", "main.382", "main.340", "tacl.1756"], "main.464": ["main.464", "demo.39", "main.550", "main.699", "main.702"], "main.465": ["main.465", "main.197", "main.89", "main.212", "main.62"], "main.466": ["main.466", "main.358", "main.764", "main.387", "main.420"], "main.467": ["main.467", "main.740", "main.207", "main.383", "main.497"], "main.468": ["main.468", "main.264", "main.260", "main.773", "main.262"], "main.469": ["main.469", "main.311", "main.664", "main.425", "main.583"], "main.470": ["main.470", "main.184", "main.635", "main.568", "main.183"], "main.471": ["main.471", "main.372", "main.288", "main.342", "demo.91"], "main.472": ["main.472", "main.419", "main.432", "main.385", "main.387"], "main.473": ["main.473", "main.162", "main.387", "main.213", "main.679"], "main.474": ["main.474", "main.486", "main.305", "main.485", "main.468"], "main.475": ["main.475", "main.403", "main.352", "main.404", "demo.120"], "main.476": ["main.476", "main.463", "main.633", "main.679", "main.724"], "main.477": ["main.477", "main.486", "main.441", "main.100", "main.157"], "main.478": ["main.478", "demo.100", "main.77", "demo.58", "main.331"], "main.479": ["main.479", "main.768", "main.543", "tacl.1780", "srw.135"], "main.480": ["main.480", "main.14", "main.695", "srw.22", "main.667"], "main.481": ["main.481", "main.233", "main.61", "main.643", "main.53"], "main.482": ["main.482", "main.300", "main.231", "main.671", "main.272"], 
"main.483": ["main.483", "main.769", "main.485", "main.380", "main.770"], "main.484": ["main.484", "main.690", "main.773", "main.619", "main.488"], "main.485": ["main.485", "main.468", "srw.18", "main.486", "main.260"], "main.486": ["main.486", "srw.18", "main.488", "main.405", "main.485"], "main.487": ["main.487", "main.405", "main.488", "main.486", "main.418"], "main.488": ["main.488", "main.405", "main.484", "main.431", "main.486"], "main.489": ["main.489", "main.664", "main.572", "main.412", "main.449"], "main.490": ["main.490", "main.493", "main.329", "main.421", "main.304"], "main.491": ["main.491", "main.771", "main.494", "main.382", "main.35"], "main.492": ["main.492", "main.534", "main.79", "main.467", "main.96"], "main.493": ["main.493", "main.421", "main.536", "main.260", "main.747"], "main.494": ["main.494", "main.771", "main.491", "main.382", "main.79"], "main.495": ["main.495", "main.497", "main.409", "main.387", "main.539"], "main.496": ["main.496", "main.84", "main.593", "main.747", "main.299"], "main.497": ["main.497", "main.495", "main.467", "main.507", "srw.28"], "main.498": ["main.498", "main.74", "main.21", "main.19", "main.501"], "main.499": ["main.499", "main.601", "main.600", "main.247", "main.19"], "main.500": ["main.500", "main.20", "main.413", "main.450", "main.772"], "main.501": ["main.501", "main.498", "main.604", "main.19", "main.247"], "main.502": ["main.502", "tacl.1882", "main.226", "tacl.1929", "main.499"], "main.503": ["main.503", "main.413", "main.652", "main.19", "main.20"], "main.504": ["main.504", "main.593", "main.38", "main.325", "main.537"], "main.505": ["main.505", "tacl.1853", "main.247", "main.216", "main.637"], "main.506": ["main.506", "main.485", "main.700", "main.261", "main.764"], "main.507": ["main.507", "tacl.1882", "main.410", "main.247", "main.701"], "main.508": ["main.508", "tacl.1882", "main.115", "main.387", "main.769"], "main.509": ["main.509", "main.157", "main.352", "main.403", "main.287"], "main.510": ["main.510", "main.421", "main.658", "main.554", "srw.79"], "main.511": ["main.511", "main.371", "main.667", "main.738", "main.116"], "main.512": ["main.512", "main.571", "main.588", "main.67", "main.362"], "main.513": ["main.513", "main.461", "main.582", "main.175", "main.296"], "main.514": ["main.514", "main.663", "demo.101", "main.364", "cl.1547"], "main.515": ["main.515", "main.6", "main.637", "main.546"], "main.516": ["main.516", "main.7", "main.131", "main.8", "main.568"], "main.517": ["main.517", "main.638", "main.197", "main.542", "main.127"], "main.518": ["main.518", "main.197", "tacl.1849", "main.214", "demo.37"], "main.519": ["main.519", "main.577", "main.525", "main.571", "main.528"], "main.520": ["main.520", "main.519", "main.139", "main.528", "main.752"], "main.521": ["main.521", "main.125", "tacl.1815", "main.710", "main.137"], "main.522": ["main.522", "main.194", "main.769", "main.347", "main.586"], "main.523": ["main.523", "main.519", "main.336", "srw.123", "demo.87"], "main.524": ["main.524", "main.12", "main.572", "main.578", "main.421"], "main.525": ["main.525", "main.519", "main.528", "main.571", "main.612"], "main.526": ["main.526", "main.572", "main.238", "main.617", "main.241"], "main.527": ["main.527", "main.579", "main.572", "main.142", "main.526"], "main.528": ["main.528", "main.519", "main.611", "main.525", "main.315"], "main.529": ["main.529", "main.41", "main.263", "srw.2", "main.40"], "main.530": ["main.530", "main.396", "main.322", "main.693", "tacl.1915"], "main.531": ["main.531", "main.757", 
"srw.54", "main.641"], "main.532": ["main.532", "main.253", "main.252", "main.691", "srw.54"], "main.533": ["main.533", "main.36", "main.316", "main.531", "srw.42"], "main.534": ["main.534", "main.492", "main.31", "tacl.2001", "main.207"], "main.535": ["main.535", "main.22", "main.28", "main.545", "srw.84"], "main.536": ["main.536", "main.421", "main.493", "main.260", "main.329"], "main.537": ["main.537", "main.593", "main.204", "main.195", "main.197"], "main.538": ["main.538", "main.449", "main.443", "main.18", "main.135"], "main.539": ["main.539", "main.549", "main.605", "main.224", "main.67"], "main.540": ["main.540", "srw.105", "main.245", "main.319", "main.249"], "main.541": ["main.541", "main.587", "main.168", "main.18", "main.644"], "main.542": ["main.542", "main.63", "main.517", "main.111", "main.197"], "main.543": ["main.543", "main.177", "main.158", "main.768", "main.479"], "main.544": ["main.544", "main.549", "main.65", "srw.42", "main.97"], "main.545": ["main.545", "main.69", "main.535", "main.22", "main.500"], "main.546": ["main.546", "main.515", "main.572", "main.17", "main.209"], "main.547": ["main.547", "main.274", "main.224", "main.315", "main.712"], "main.548": ["main.548", "main.32", "srw.5", "main.694", "main.73"], "main.549": ["main.549", "main.655", "main.761", "main.97", "main.539"], "main.550": ["main.550", "main.464", "main.702", "main.354", "main.26"], "main.551": ["main.551", "main.457", "main.445", "main.451", "main.513"], "main.552": ["main.552", "main.445", "main.451", "main.449", "main.124"], "main.553": ["main.553", "main.640", "main.241", "main.552", "main.555"], "main.554": ["main.554", "main.421", "main.658", "main.109", "main.153"], "main.555": ["main.555", "main.556", "main.457", "main.451", "main.175"], "main.556": ["main.556", "main.555", "main.273", "main.445", "main.458"], "main.557": ["main.557", "main.301", "main.376", "main.364", "main.441"], "main.558": ["main.558", "main.114", "main.321", "main.113", "main.253"], "main.559": ["main.559", "main.163", "main.542", "main.63", "main.683"], "main.560": ["main.560", "main.464", "main.485", "main.150", "demo.39"], "main.561": ["main.561", "main.177", "main.660", "main.384", "main.463"], "main.562": ["main.562", "demo.39", "demo.44", "main.622", "main.116"], "main.563": ["main.563", "main.5", "main.567", "main.53", "main.3"], "main.564": ["main.564", "demo.87", "srw.58", "main.516", "main.173"], "main.565": ["main.565", "main.370", "main.165", "main.681", "main.18"], "main.566": ["main.566", "main.62", "main.59", "main.166", "tacl.1901"], "main.567": ["main.567", "main.563", "main.5", "main.637", "main.10"], "main.568": ["main.568", "main.55", "main.638", "main.185", "main.221"], "main.569": ["main.569", "main.416", "srw.9", "main.301", "main.609"], "main.570": ["main.570", "main.273", "main.343", "main.402", "main.401"], "main.571": ["main.571", "main.577", "main.519", "main.512", "main.67"], "main.572": ["main.572", "main.412", "main.238", "main.655", "main.526"], "main.573": ["main.573", "main.481", "main.61", "main.583", "main.719"], "main.574": ["main.574", "main.58", "main.722", "main.612", "main.725"], "main.575": ["main.575", "main.278", "main.543", "srw.16", "tacl.1853"], "main.576": ["main.576", "main.719", "main.62", "main.282", "main.567"], "main.577": ["main.577", "main.519", "main.571", "main.525", "main.611"], "main.578": ["main.578", "main.524", "main.612", "main.572", "main.512"], "main.579": ["main.579", "main.527", "main.382", "main.771", "main.142"], "main.580": ["main.580", 
"main.289", "tacl.2001", "main.31", "main.670"], "main.581": ["main.581", "main.554", "main.519", "main.139", "main.528"], "main.582": ["main.582", "main.296", "main.513", "main.295", "main.340"], "main.583": ["main.583", "main.642", "main.664", "srw.22", "srw.53"], "main.584": ["main.584", "main.727", "main.643", "main.306", "main.436"], "main.585": ["main.585", "main.413", "main.500", "main.622", "tacl.1853"], "main.586": ["main.586", "main.135", "main.115", "main.522", "main.769"], "main.587": ["main.587", "main.40", "srw.16", "main.687", "main.311"], "main.588": ["main.588", "main.295", "main.67", "main.605", "main.224"], "main.589": ["main.589", "main.311", "main.147", "main.312", "main.517"], "main.590": ["main.590", "srw.105", "main.540", "cl.1550", "main.191"], "main.591": ["main.591", "main.303", "main.158", "main.297", "main.778"], "main.592": ["main.592", "main.197", "main.158", "main.140", "main.581"], "main.593": ["main.593", "main.537", "main.204", "main.504", "main.325"], "main.594": ["main.594", "main.389", "main.596", "main.22", "main.695"], "main.595": ["main.595", "main.370", "main.681", "main.165", "main.692"], "main.596": ["main.596", "main.598", "main.594", "main.389", "main.27"], "main.597": ["main.597", "main.619", "main.702", "main.418", "main.159"], "main.598": ["main.598", "main.596", "main.115", "main.733", "main.695"], "main.599": ["main.599", "main.604", "main.507", "main.247", "main.37"], "main.600": ["main.600", "main.413", "main.20", "main.652", "main.19"], "main.601": ["main.601", "main.499", "main.694", "main.412", "main.26"], "main.602": ["main.602", "main.507", "main.408", "main.410", "main.455"], "main.603": ["main.603", "main.247", "main.29", "main.361", "main.503"], "main.604": ["main.604", "main.599", "main.498", "main.247", "main.501"], "main.605": ["main.605", "main.67", "main.539", "main.607", "main.588"], "main.606": ["main.606", "main.605", "main.116", "main.669", "main.40"], "main.607": ["main.607", "main.605", "main.588", "tacl.1876", "main.397"], "main.608": ["main.608", "tacl.1801", "main.398", "main.545", "main.197"], "main.609": ["main.609", "tacl.1805", "main.36", "main.224", "main.67"], "main.610": ["main.610", "main.391", "main.194", "demo.69", "main.522"], "main.611": ["main.611", "main.528", "main.519", "main.571", "main.577"], "main.612": ["main.612", "main.722", "main.760", "main.238", "main.525"], "main.613": ["main.613", "demo.116", "main.153", "main.554", "demo.66"], "main.614": ["main.614", "main.217", "main.208", "main.762", "main.565"], "main.615": ["main.615", "main.646", "main.316", "main.128", "main.764"], "main.616": ["main.616", "main.385", "tacl.1815", "main.687", "main.411"], "main.617": ["main.617", "main.238", "main.412", "main.526", "main.241"], "main.618": ["main.618", "main.675", "main.766", "main.201", "main.318"], "main.619": ["main.619", "main.690", "main.484", "main.597", "main.262"], "main.620": ["main.620", "main.41", "main.542", "main.34", "main.36"], "main.621": ["main.621", "main.748", "main.717", "main.184", "main.760"], "main.622": ["main.622", "tacl.1853", "main.738", "main.585", "main.299"], "main.623": ["main.623", "main.161", "main.262", "srw.99", "main.392"], "main.624": ["main.624", "demo.69", "tacl.1906", "demo.24", "main.18"], "main.625": ["main.625", "main.12", "main.197", "main.249", "main.314"], "main.626": ["main.626", "main.413", "demo.48", "main.500", "main.450"], "main.627": ["main.627", "main.581", "main.336", "tacl.1906", "main.193"], "main.628": ["main.628", "main.654", "tacl.1849", 
"main.28", "main.247"], "main.629": ["main.629", "main.605", "main.67", "tacl.1876", "main.588"], "main.630": ["main.630", "main.705", "main.76", "main.247", "main.73"], "main.631": ["main.631", "main.293", "main.296", "main.340", "main.295"], "main.632": ["main.632", "main.463", "main.287", "main.700", "main.472"], "main.633": ["main.633", "main.1", "main.478", "main.506", "demo.58"], "main.634": ["main.634", "demo.37", "main.515", "main.635", "main.19"], "main.635": ["main.635", "main.8", "demo.37", "main.6", "main.127"], "main.636": ["main.636", "main.53", "main.5", "main.12", "main.129"], "main.637": ["main.637", "main.567", "main.62", "main.53", "main.638"], "main.638": ["main.638", "main.568", "main.55", "main.221", "demo.37"], "main.639": ["main.639", "srw.17", "main.169", "main.673", "main.354"], "main.640": ["main.640", "main.67", "tacl.1805", "main.241", "main.224"], "main.641": ["main.641", "main.432", "main.419", "main.385", "main.687"], "main.642": ["main.642", "main.583", "main.643", "main.640", "main.664"], "main.643": ["main.643", "main.306", "main.642", "main.683", "main.664"], "main.644": ["main.644", "main.729", "main.440", "main.229", "tacl.1843"], "main.645": ["main.645", "main.156", "main.80", "main.722", "srw.79"], "main.646": ["main.646", "main.506", "main.177", "main.615", "main.468"], "main.647": ["main.647", "main.326", "main.262", "main.404", "main.260"], "main.648": ["main.648", "main.315", "main.145", "main.737", "main.81"], "main.649": ["main.649", "main.209", "main.596", "main.587", "main.374"], "main.650": ["main.650", "main.132", "main.191", "srw.69", "main.389"], "main.651": ["main.651", "main.330", "main.21", "tacl.1845", "main.600"], "main.652": ["main.652", "main.413", "main.20", "main.19", "main.600"], "main.653": ["main.653", "main.413", "main.412", "main.652", "main.600"], "main.654": ["main.654", "main.336", "main.28", "main.507", "main.636"], "main.655": ["main.655", "main.549", "main.761", "main.97", "main.572"], "main.656": ["main.656", "main.761", "main.97", "main.771", "main.494"], "main.657": ["main.657", "main.224", "main.67", "main.549", "main.210"], "main.658": ["main.658", "main.554", "srw.137", "main.421", "main.510"], "main.659": ["main.659", "main.420", "main.383", "srw.90", "main.506"], "main.660": ["main.660", "main.561", "main.592", "main.766", "main.485"], "main.661": ["main.661", "main.217", "main.345", "main.614", "main.54"], "main.662": ["main.662", "main.413", "main.20", "main.600", "main.652"], "main.663": ["main.663", "main.96", "main.408", "main.386", "cl.1547"], "main.664": ["main.664", "main.731", "main.306", "main.583", "main.683"], "main.665": ["main.665", "main.163", "cl.1552", "cl.1547", "main.177"], "main.666": ["main.666", "main.226", "main.83", "main.488", "tacl.1849"], "main.667": ["main.667", "main.718", "main.371", "main.714", "main.82"], "main.668": ["main.668", "main.162", "main.507", "main.544", "main.678"], "main.669": ["main.669", "main.335", "main.722", "main.600", "demo.59"], "main.670": ["main.670", "main.141", "main.207", "main.135", "main.137"], "main.671": ["main.671", "main.357", "main.247", "main.508", "main.687"], "main.672": ["main.672", "main.37", "main.687", "main.250", "main.38"], "main.673": ["main.673", "main.639", "main.169", "srw.17", "main.354"], "main.674": ["main.674", "main.93", "main.664", "main.583", "main.642"], "main.675": ["main.675", "main.618", "main.421", "main.766", "main.143"], "main.676": ["main.676", "main.770", "main.138", "main.733", "main.754"], "main.677": ["main.677", 
"demo.130", "main.105", "main.449", "main.538"], "main.678": ["main.678", "main.680", "main.95", "main.423", "main.668"], "main.679": ["main.679", "main.769", "main.244", "main.690", "main.457"], "main.680": ["main.680", "main.722", "main.764", "main.645", "main.678"], "main.681": ["main.681", "main.370", "main.595", "main.565", "main.139"], "main.682": ["main.682", "cl.1482", "main.643", "main.62", "main.219"], "main.683": ["main.683", "main.306", "main.643", "main.664", "main.642"], "main.684": ["main.684", "main.382", "main.771", "main.566", "main.46"], "main.685": ["main.685", "main.328", "main.625", "main.729", "main.190"], "main.686": ["main.686", "main.204", "main.411", "main.38", "main.504"], "main.687": ["main.687", "main.385", "main.387", "srw.115", "tacl.1815"], "main.688": ["main.688", "main.260", "main.239", "main.536", "main.169"], "main.689": ["main.689", "main.692", "main.595", "main.165", "main.86"], "main.690": ["main.690", "main.484", "main.619", "main.262", "main.326"], "main.691": ["main.691", "main.253", "main.532", "main.773", "main.690"], "main.692": ["main.692", "main.689", "main.595", "main.165", "main.740"], "main.693": ["main.693", "main.207", "main.322", "main.321", "main.40"], "main.694": ["main.694", "main.753", "main.437", "main.235", "main.316"], "main.695": ["main.695", "main.480", "main.598", "main.116", "main.733"], "main.696": ["main.696", "main.81", "srw.19", "main.299", "srw.129"], "main.697": ["main.697", "main.759", "main.462", "main.447", "main.606"], "main.698": ["main.698", "main.463", "main.382", "main.159", "main.340"], "main.699": ["main.699", "main.464", "demo.39", "main.485", "main.702"], "main.700": ["main.700", "main.506", "main.462", "srw.36", "main.463"], "main.701": ["main.701", "main.507", "main.211", "main.247", "main.654"], "main.702": ["main.702", "main.464", "main.619", "main.597", "main.690"], "main.703": ["main.703", "main.740", "tacl.1853", "main.705", "main.247"], "main.704": ["main.704", "main.770", "main.93", "main.693", "srw.58"], "main.705": ["main.705", "main.247", "demo.47", "main.24", "main.195"], "main.706": ["main.706", "demo.69", "main.522", "main.708", "main.100"], "main.707": ["main.707", "main.28", "main.709", "main.460", "main.424"], "main.708": ["main.708", "main.63", "main.4", "main.769", "main.18"], "main.709": ["main.709", "main.424", "main.707", "srw.137", "main.417"], "main.710": ["main.710", "main.103", "main.105", "main.101", "main.300"], "main.711": ["main.711", "main.118", "main.349", "main.515", "tacl.1886"], "main.712": ["main.712", "tacl.1805", "main.67", "main.224", "main.640"], "main.713": ["main.713", "main.758", "main.421", "main.554", "main.137"], "main.714": ["main.714", "main.718", "main.141", "main.321", "main.230"], "main.715": ["main.715", "main.642", "main.527", "main.375", "main.591"], "main.716": ["main.716", "main.329", "main.348", "main.421", "main.747"], "main.717": ["main.717", "main.748", "main.184", "main.292", "main.760"], "main.718": ["main.718", "main.714", "main.667", "main.141", "main.207"], "main.719": ["main.719", "demo.31", "main.408", "main.576", "main.286"], "main.720": ["main.720", "main.490", "main.329", "main.421", "main.536"], "main.721": ["main.721", "main.148", "main.531", "main.128", "main.272"], "main.722": ["main.722", "tacl.1906", "main.612", "main.574", "main.680"], "main.723": ["main.723", "main.486", "main.49", "main.50", "main.488"], "main.724": ["main.724", "main.32", "main.103", "main.65", "main.104"], "main.725": ["main.725", "main.574", "main.527", 
"main.578", "main.612"], "main.726": ["main.726", "main.374", "main.341", "main.283", "main.338"], "main.727": ["main.727", "main.731", "main.306", "main.770", "main.586"], "main.728": ["main.728", "main.19", "main.463", "srw.53", "main.54"], "main.729": ["main.729", "main.644", "demo.41", "main.440", "main.232"], "main.730": ["main.730", "main.435", "main.135", "main.600", "main.643"], "main.731": ["main.731", "main.664", "main.306", "main.400", "main.683"], "main.732": ["main.732", "main.736", "main.603", "main.735", "main.645"], "main.733": ["main.733", "main.398", "main.598", "main.676", "main.436"], "main.734": ["main.734", "main.58", "main.315", "main.595", "main.528"], "main.735": ["main.735", "main.58", "tacl.1876", "main.732", "main.609"], "main.736": ["main.736", "main.732", "srw.98", "main.737", "main.240"], "main.737": ["main.737", "main.648", "main.736", "main.138", "main.81"], "main.738": ["main.738", "main.622", "main.418", "main.520", "main.193"], "main.739": ["main.739", "main.583", "main.93", "main.497", "main.626"], "main.740": ["main.740", "main.467", "main.703", "main.244", "main.370"], "main.741": ["main.741", "main.630", "main.612", "main.628", "main.422"], "main.742": ["main.742", "main.677", "tacl.1901", "main.718", "main.595"], "main.743": ["main.743", "main.429", "tacl.1852", "main.680", "main.142"], "main.744": ["main.744", "main.325", "main.593", "main.438", "main.6"], "main.745": ["main.745", "main.398", "main.677", "main.742", "main.195"], "main.746": ["main.746", "main.640", "main.607", "main.224", "main.67"], "main.747": ["main.747", "main.421", "main.329", "main.493", "srw.137"], "main.748": ["main.748", "main.717", "main.760", "main.184", "main.292"], "main.749": ["main.749", "main.27", "main.524", "main.241", "main.572"], "main.750": ["main.750", "main.165", "main.524", "main.681", "main.139"], "main.751": ["main.751", "main.199", "main.515", "main.26", "main.33"], "main.752": ["main.752", "main.519", "main.528", "main.139", "main.522"], "main.753": ["main.753", "main.694", "main.235", "main.437", "main.316"], "main.754": ["main.754", "main.720", "main.769", "main.750", "main.18"], "main.755": ["main.755", "main.540", "main.245", "main.317", "main.138"], "main.756": ["main.756", "main.417", "demo.54", "srw.137", "main.294"], "main.757": ["main.757", "tacl.1843", "main.504", "main.125", "main.753"], "main.758": ["main.758", "main.554", "main.329", "main.581", "main.274"], "main.759": ["main.759", "demo.139", "main.312", "main.697", "demo.84"], "main.760": ["main.760", "main.748", "main.612", "tacl.1906", "main.717"], "main.761": ["main.761", "main.549", "main.655", "main.97", "main.656"], "main.762": ["main.762", "main.306", "main.643", "main.731", "main.683"], "main.763": ["main.763", "main.478", "main.331", "main.260", "srw.18"], "main.764": ["main.764", "main.408", "main.204", "main.769", "main.420"], "main.765": ["main.765", "main.637", "main.568", "main.62", "main.54"], "main.766": ["main.766", "main.421", "main.260", "main.329", "main.618"], "main.767": ["main.767", "main.108", "main.248", "demo.96", "main.217"], "main.768": ["main.768", "main.479", "tacl.1780", "srw.135", "main.543"], "main.769": ["main.769", "main.770", "main.773", "main.522", "main.586"], "main.770": ["main.770", "main.769", "main.773", "main.690", "main.704"], "main.771": ["main.771", "main.494", "main.382", "main.491", "main.387"], "main.772": ["main.772", "main.413", "main.20", "main.19", "main.600"], "main.773": ["main.773", "main.468", "main.262", "main.264", "main.769"], 
"main.774": ["main.774", "main.768", "srw.135", "main.771", "tacl.1780"], "main.775": ["main.775", "main.587", "main.735", "main.591", "main.259"], "main.776": ["main.776", "demo.46", "main.195", "main.250", "main.89"], "main.777": ["main.777", "main.138", "main.519", "srw.123", "main.627"], "main.778": ["main.778", "main.591", "tacl.1766", "main.198", "main.690"], "srw.2": ["srw.2", "main.753", "main.40", "main.41", "main.693"], "srw.5": ["srw.5", "main.32", "main.548", "main.49", "main.73"], "srw.9": ["srw.9", "main.145", "main.207", "main.270", "main.693"], "srw.14": ["srw.14", "srw.2", "srw.54", "main.148", "main.149"], "srw.15": ["srw.15", "main.310", "main.391", "demo.139", "main.433"], "srw.16": ["srw.16", "main.432", "demo.84", "main.419", "main.385"], "srw.17": ["srw.17", "main.639", "main.169", "main.673", "main.294"], "srw.18": ["srw.18", "main.486", "main.485", "main.262", "main.468"], "srw.19": ["srw.19", "main.28", "main.274", "main.696", "main.654"], "srw.22": ["srw.22", "main.583", "main.133", "tacl.1915", "main.486"], "srw.28": ["srw.28", "tacl.1892", "main.158", "main.495", "main.231"], "srw.35": ["srw.35", "main.526", "main.588", "main.282", "main.241"], "srw.36": ["srw.36", "main.688", "main.467", "main.222", "demo.47"], "srw.39": ["srw.39", "main.51", "main.198", "main.434", "cl.1550"], "srw.42": ["srw.42", "main.544", "main.533", "main.230", "main.426"], "srw.46": ["srw.46", "main.218", "main.771", "main.472", "main.287"], "srw.48": ["srw.48", "main.252", "srw.54", "main.304", "main.336"], "srw.49": ["srw.49", "main.460", "main.707", "main.452", "main.36"], "srw.52": ["srw.52", "main.746", "main.682", "main.688", "main.260"], "srw.53": ["srw.53", "main.664", "main.583", "main.642", "main.435"], "srw.54": ["srw.54", "main.252", "main.532", "main.34", "main.40"], "srw.55": ["srw.55", "main.35", "main.387", "main.491", "main.771"], "srw.58": ["srw.58", "main.220", "main.55", "main.568", "main.4"], "srw.69": ["srw.69", "main.434", "main.383", "main.650", "main.180"], "srw.79": ["srw.79", "main.214", "main.247", "main.197", "main.747"], "srw.82": ["srw.82", "srw.144", "main.505", "main.140", "main.216"], "srw.84": ["srw.84", "main.535", "main.545", "main.28", "main.445"], "srw.85": ["srw.85", "main.157", "srw.79", "main.57", "main.773"], "srw.90": ["srw.90", "main.659", "main.596", "main.437", "tacl.1759"], "srw.95": ["srw.95", "main.421", "main.329", "main.12", "main.148"], "srw.98": ["srw.98", "main.736", "main.583", "main.93", "main.395"], "srw.99": ["srw.99", "main.100", "main.157", "main.57", "main.188"], "srw.104": ["srw.104", "demo.96", "main.75", "main.287"], "srw.105": ["srw.105", "main.540", "main.319", "main.590", "main.245"], "srw.106": ["srw.106", "main.756", "main.417", "main.578", "srw.137"], "srw.109": ["srw.109", "main.174", "main.173", "main.555", "main.453"], "srw.114": ["srw.114", "main.672", "cl.1482", "srw.127", "main.481"], "srw.115": ["srw.115", "main.385", "main.687", "tacl.1815", "main.432"], "srw.116": ["srw.116", "demo.89", "main.322", "main.138", "main.1"], "srw.122": ["srw.122", "main.21", "main.69", "main.396", "main.90"], "srw.123": ["srw.123", "demo.87", "main.426", "main.283", "main.523"], "srw.127": ["srw.127", "main.430", "main.490", "main.37", "main.384"], "srw.128": ["srw.128", "srw.137", "main.324", "main.148", "main.150"], "srw.129": ["srw.129", "main.82", "main.187", "main.696", "main.737"], "srw.131": ["srw.131", "main.113", "main.206", "main.188", "main.387"], "srw.135": ["srw.135", "main.768", "main.774", "main.479", "main.543"], 
"srw.137": ["srw.137", "main.252", "main.324", "main.421", "main.658"], "srw.144": ["srw.144", "main.150", "main.58", "main.744", "main.612"], "tacl.1709": ["tacl.1709", "main.43", "cl.1482", "srw.9", "srw.127"], "tacl.1720": ["tacl.1720", "main.236", "main.434", "main.552", "main.431"], "tacl.1727": ["tacl.1727", "main.554", "main.318", "main.196", "main.358"], "tacl.1743": ["tacl.1743", "main.179", "main.491", "main.442", "main.206"], "tacl.1756": ["tacl.1756", "main.463", "main.382", "main.144", "main.545"], "tacl.1759": ["tacl.1759", "main.198", "main.766", "main.262", "srw.90"], "tacl.1766": ["tacl.1766", "main.536", "main.250", "main.197", "main.38"], "tacl.1779": ["tacl.1779", "main.768", "main.540", "main.479", "main.251"], "tacl.1780": ["tacl.1780", "main.768", "tacl.1915", "main.220", "main.173"], "tacl.1801": ["tacl.1801", "main.346", "main.272", "main.370", "main.681"], "tacl.1805": ["tacl.1805", "main.67", "main.167", "main.119", "main.640"], "tacl.1811": ["tacl.1811", "main.13", "main.300", "main.460", "main.439"], "tacl.1815": ["tacl.1815", "main.687", "srw.115", "main.385", "main.411"], "tacl.1834": ["tacl.1834", "main.353", "main.405", "main.2", "main.419"], "tacl.1843": ["tacl.1843", "main.399", "main.757", "main.277", "main.291"], "tacl.1845": ["tacl.1845", "main.69", "main.498", "main.662", "main.21"], "tacl.1849": ["tacl.1849", "main.247", "main.197", "main.451", "main.518"], "tacl.1852": ["tacl.1852", "main.429", "main.247", "main.768", "main.370"], "tacl.1853": ["tacl.1853", "main.247", "main.11", "main.622", "main.299"], "tacl.1876": ["tacl.1876", "main.391", "main.315", "demo.69", "demo.67"], "tacl.1882": ["tacl.1882", "main.507", "main.508", "demo.69", "main.135"], "tacl.1886": ["tacl.1886", "main.178", "main.133", "main.515", "main.711"], "tacl.1892": ["tacl.1892", "main.158", "main.303", "srw.28", "main.375"], "tacl.1901": ["tacl.1901", "main.638", "main.62", "main.524", "main.566"], "tacl.1903": ["tacl.1903", "main.334", "main.336", "main.136", "main.367"], "tacl.1906": ["tacl.1906", "main.336", "main.722", "main.760", "main.252"], "tacl.1912": ["tacl.1912", "main.259", "main.507", "srw.28", "main.664"], "tacl.1915": ["tacl.1915", "srw.22", "tacl.1780", "main.530", "tacl.1811"], "tacl.1929": ["tacl.1929", "main.662", "main.652", "main.653", "main.500"], "tacl.1967": ["tacl.1967", "main.60", "main.22", "main.226", "main.27"], "tacl.2001": ["tacl.2001", "main.580", "main.207", "main.235", "main.321"]} +{"main.1004": ["main.1022", "main.959", "main.2228", "main.2380", "main.2117"], "main.1006": ["main.1201", "main.689", "main.1654", "main.1846", "main.128"], "main.1009": ["main.2839", "main.2927", "main.1113", "main.355", "main.1085"], "main.1010": ["main.782", "main.666", "TACL.2121", "main.179", "main.1648"], "main.1011": ["main.2181", "main.2877", "main.2406", "main.1787", "main.1460"], "main.1012": ["main.1702", "main.1201", "main.1846", "TACL.2143", "main.2209"], "main.1018": ["TACL.2011", "main.3093", "main.865", "main.802", "main.3143"], "main.1022": ["main.449", "main.2586", "main.1788", "main.3186", "main.1837"], "main.1023": ["main.965", "main.471", "main.3012", "main.3581", "main.2506"], "main.1024": ["main.3257", "main.3353", "main.1006", "main.1455", "main.1938"], "main.1030": ["main.2120", "main.319", "main.3398", "main.3054", "main.1123"], "main.1032": ["main.858", "main.74", "main.1803", "main.2167", "main.2943"], "main.1046": ["main.1923", "main.2733", "main.989", "main.426", "main.1458"], "main.1049": ["main.870", "main.835", "main.2476", 
"main.2590", "main.2635"], "main.1052": ["main.2122", "main.1159", "main.41", "main.1130", "main.2363"], "main.106": ["main.2430", "main.2349", "main.920", "main.1613", "main.699"], "main.1061": ["main.1379", "main.3597", "main.143", "main.2641", "main.3046"], "main.1070": ["main.3216", "main.2849", "main.1787", "main.298", "main.1706"], "main.1071": ["main.2702", "main.666", "main.531", "main.1052", "main.1231"], "main.108": ["main.1351", "main.2931", "main.3074", "main.1159", "main.1528"], "main.1085": ["main.2839", "main.2322", "main.1009", "main.284", "main.702"], "main.1086": ["main.2724", "main.1618", "main.204", "main.3398", "main.2253"], "main.1091": ["main.3088", "main.652", "main.714", "main.471", "main.1023"], "main.110": ["main.1631", "main.1923", "main.2630", "main.748", "main.1159"], "main.1100": ["main.1572", "main.2661", "main.3227", "main.856", "main.888"], "main.1103": ["main.3291", "main.1972", "main.2758", "main.3450", "main.1191"], "main.1107": ["main.850", "main.1495", "main.1179", "main.891", "main.1130"], "main.1113": ["main.995", "main.647", "main.355", "main.1402", "main.3360"], "main.1116": ["main.2972", "main.237", "main.1421", "main.2508", "main.1569"], "main.1123": ["main.2125", "main.1648", "main.2761", "main.693", "main.3054"], "main.1129": ["main.2367", "main.3389", "main.3010", "main.1049", "main.2650"], "main.1130": ["main.2363", "TACL.2041", "main.1892", "main.74", "main.852"], "main.1135": ["main.928", "main.605", "main.3617", "main.1421", "main.3486"], "main.1140": ["TACL.2143", "main.787", "main.215", "main.2141", "main.1846"], "main.1141": ["main.2635", "main.2476", "main.861", "main.3437", "main.574"], "main.1146": ["main.26", "TACL.2107", "main.701", "main.130", "main.2893"], "main.1159": ["main.2739", "main.1669", "main.2307", "main.2363", "main.1528"], "main.1179": ["main.1561", "main.1957", "TACL.2143", "main.876", "main.850"], "main.1180": ["main.1575", "main.1675", "main.1289", "main.2068", "main.2430"], "main.1187": ["main.2650", "main.1892", "main.989", "main.1159", "main.1488"], "main.1191": ["main.607", "main.2253", "main.2650", "main.3470", "main.1103"], "main.1196": ["main.3183", "TACL.2055", "main.387", "TACL.2129", "main.2258"], "main.1201": ["main.1654", "main.215", "main.1846", "main.128", "main.3393"], "main.1205": ["main.1787", "TACL.2095", "main.693", "main.300", "main.1949"], "main.1208": ["main.2838", "TACL.2041", "main.74", "main.1734", "main.1485"], "main.1210": ["TACL.2411", "main.2040", "TACL.2141", "main.2179", "main.1957"], "main.1217": ["main.3579", "main.3013", "main.3185", "main.1675", "main.3064"], "main.1219": ["main.2491", "main.471", "TACL.2041", "main.1986", "main.130"], "main.1220": ["main.3337", "main.3074", "main.247", "main.1299", "main.1960"], "main.1225": ["main.1615", "main.2636", "main.2790", "main.148", "main.1720"], "main.1227": ["main.2078", "main.2635", "main.1960", "main.1351", "main.858"], "main.1231": ["main.666", "main.1706", "TACL.2121", "main.531", "main.143"], "main.1248": ["main.3337", "main.1733", "main.1960", "main.2675", "main.1798"], "main.125": ["main.648", "main.2650", "main.1187", "main.870", "main.84"], "main.1250": ["main.1159", "main.1952", "main.3651", "main.2630", "main.911"], "main.1258": ["main.1494", "main.1957", "main.1943", "TACL.2141", "main.447"], "main.1262": ["main.3054", "main.1485", "main.319", "main.1030", "main.3140"], "main.1263": ["main.74", "main.1680", "main.2500", "main.858", "main.1379"], "main.1267": ["main.2763", "main.2553", "main.1970", "main.2739", 
"main.315"], "main.1271": ["main.110", "main.55", "main.1923", "main.1942", "main.1219"], "main.1275": ["main.2179", "main.1754", "CL.2", "main.1957", "main.143"], "main.128": ["main.1654", "main.3179", "main.1201", "main.2164", "main.215"], "main.1280": ["main.2424", "TACL.2041", "main.2363", "main.2414", "main.852"], "main.1282": ["main.3181", "main.3115", "main.457", "TACL.2013", "main.820"], "main.1287": ["main.916", "TACL.2093", "main.1675", "main.2261", "main.2996"], "main.1289": ["main.3286", "main.1675", "main.1766", "main.3375", "main.1550"], "main.1298": ["main.2746", "main.2278", "main.1379", "main.871", "main.3116"], "main.1299": ["main.1734", "main.2491", "main.247", "main.16", "main.2793"], "main.130": ["TACL.2107", "main.3227", "main.1485", "main.522", "main.1680"], "main.1305": ["main.3093", "main.1428", "main.585", "main.2596", "main.2251"], "main.1320": ["main.3566", "main.2847", "main.870", "main.701", "main.2349"], "main.1322": ["main.1952", "main.1485", "main.450", "main.2931", "main.965"], "main.1339": ["main.2635", "main.2412", "main.891", "main.214", "main.2915"], "main.1351": ["main.2491", "main.1960", "TACL.2047", "main.1631", "main.2078"], "main.1356": ["main.910", "main.2430", "main.891", "main.1960", "main.148"], "main.1377": ["TACL.2169", "main.2198", "main.648", "main.3183", "main.2307"], "main.1379": ["main.1061", "main.143", "main.1803", "main.2777", "main.871"], "main.1383": ["main.2490", "main.1986", "main.517", "main.3116", "TACL.2107"], "main.1388": ["main.2702", "main.3360", "TACL.2041", "main.3470", "main.2838"], "main.1389": ["main.1923", "main.345", "main.2068", "main.2739", "main.2733"], "main.1390": ["main.3495", "main.3327", "main.527", "main.916", "main.2931"], "main.1393": ["main.1938", "main.3093", "main.1957", "main.1135", "main.1935"], "main.1395": ["main.2251", "main.2363", "main.3292", "main.298", "main.143"], "main.1399": ["main.1508", "main.2721", "main.1923", "main.1305", "main.426"], "main.1402": ["main.888", "TACL.2221", "main.1572", "main.2702", "main.870"], "main.1408": ["main.1399", "main.1923", "main.345", "main.2886", "TACL.2055"], "main.1421": ["main.1116", "main.2508", "main.237", "main.96", "main.607"], "main.1428": ["main.1130", "main.1631", "main.1351", "main.2078", "main.2777"], "main.143": ["main.1379", "main.623", "main.2630", "main.1061", "main.2251"], "main.1432": ["main.453", "main.2430", "main.2661", "main.3348", "main.891"], "main.1445": ["main.3116", "main.852", "main.522", "main.2630", "main.3688"], "main.1446": ["main.2430", "main.1892", "main.1130", "main.2491", "main.3483"], "main.1455": ["main.2638", "main.3115", "main.2363", "main.1130", "main.1613"], "main.1456": ["main.1503", "main.3046", "main.1935", "main.1061", "main.852"], "main.1458": ["main.2959", "main.745", "main.1046", "main.3434", "main.426"], "main.1460": ["main.1787", "main.1493", "main.2406", "main.1508", "main.684"], "main.1465": ["main.3084", "main.3617", "main.2972", "main.237", "main.1116"], "main.1466": ["main.684", "main.1648", "main.607", "main.574", "main.787"], "main.148": ["main.2635", "main.1720", "main.74", "main.3216", "main.2790"], "main.1482": ["main.3184", "main.1631", "main.74", "main.714", "main.1647"], "main.1484": ["main.2238", "main.527", "main.2215", "main.2221", "main.1219"], "main.1485": ["main.130", "main.618", "main.3394", "main.1130", "main.1986"], "main.1488": ["main.151", "main.782", "main.2764", "TACL.2121", "main.3437"], "main.1490": ["main.2430", "main.1446", "main.3483", "main.2635", "main.910"], "main.1492": 
["main.3181", "main.246", "main.1613", "main.767", "main.3115"], "main.1493": ["main.1706", "main.666", "main.1787", "main.2406", "main.1460"], "main.1494": ["TACL.2013", "main.2684", "main.3181", "TACL.2141", "main.2064"], "main.1495": ["main.850", "TACL.2411", "main.1107", "main.3327", "main.471"], "main.1498": ["TACL.2093", "main.2792", "main.30", "main.3348", "main.2865"], "main.1503": ["main.1061", "main.639", "main.2641", "main.1694", "main.143"], "main.1504": ["main.1572", "main.3227", "main.246", "main.1770", "main.471"], "main.1508": ["main.1399", "main.2873", "main.3434", "main.426", "main.1460"], "main.151": ["main.1488", "main.782", "main.2367", "main.2995", "main.1669"], "main.1518": ["main.883", "main.3647", "main.315", "main.1621", "main.2661"], "main.1522": ["main.2209", "main.215", "main.128", "main.645", "main.699"], "main.1528": ["main.911", "main.1159", "main.666", "main.605", "main.2630"], "main.1540": ["main.1923", "main.1023", "main.1159", "main.3183", "main.748"], "main.1547": ["main.3540", "main.1159", "main.2640", "main.2793", "main.911"], "main.1550": ["main.3375", "main.1675", "main.1766", "main.1289", "main.1952"], "main.1551": ["main.2122", "main.498", "main.3093", "main.947", "main.1970"], "main.1552": ["main.1485", "main.3543", "main.2615", "main.618", "TACL.2041"], "main.1561": ["main.1179", "main.689", "TACL.2143", "main.2209", "main.977"], "main.1566": ["main.2367", "main.2078", "main.2476", "main.1339", "main.983"], "main.1569": ["main.1116", "main.1957", "main.2972", "main.315", "main.1561"], "main.1572": ["main.1100", "main.3227", "main.3688", "main.1680", "main.1504"], "main.1574": ["main.2783", "main.956", "main.3394", "main.2779", "main.1219"], "main.1575": ["main.493", "main.3174", "main.1180", "main.2068", "main.2958"], "main.1578": ["main.763", "main.2574", "main.699", "TACL.2143", "main.2410"], "main.158": ["main.327", "main.574", "main.2761", "main.1706", "main.1648"], "main.1580": ["main.319", "main.2586", "main.3186", "main.2587", "TACL.2049"], "main.1581": ["main.888", "main.2382", "main.1898", "main.3227", "main.2349"], "main.1594": ["main.1631", "main.1428", "main.748", "main.16", "main.110"], "main.16": ["main.2793", "main.1631", "main.3023", "main.2491", "main.400"], "main.1603": ["main.2261", "main.392", "main.376", "main.3532", "main.1322"], "main.1606": ["main.1702", "TACL.2143", "main.128", "main.1140", "main.2141"], "main.1611": ["main.2167", "main.76", "main.1159", "main.989", "main.2893"], "main.1612": ["main.1960", "main.2585", "main.2650", "main.2389", "main.3348"], "main.1613": ["main.2851", "main.3181", "main.1892", "main.3115", "main.852"], "main.1614": ["main.2914", "main.2895", "main.60", "main.47", "main.2313"], "main.1615": ["main.1225", "main.3348", "main.2636", "main.1432", "main.2198"], "main.1618": ["main.3337", "main.2635", "main.3278", "main.3398", "main.1798"], "main.1621": ["main.3647", "main.883", "main.3593", "main.1518", "main.2271"], "main.1622": ["main.485", "main.2444", "main.41", "main.527", "main.3010"], "main.1625": ["main.2419", "main.1957", "main.486", "main.447", "main.1938"], "main.1626": ["main.1997", "main.2396", "main.3566", "main.106", "main.3101"], "main.1631": ["main.1130", "main.16", "main.2491", "main.1482", "main.1351"], "main.1634": ["main.3353", "main.2590", "main.2511", "main.648", "main.1049"], "main.1647": ["main.1928", "main.1130", "main.1482", "main.2650", "main.3054"], "main.1648": ["main.923", "main.574", "main.2761", "main.531", "main.1123"], "main.1649": ["main.870", "main.2098", 
"main.1061", "main.357", "main.852"], "main.165": ["main.476", "main.2707", "main.3072", "main.2561", "main.3352"], "main.1654": ["main.1201", "main.128", "main.215", "main.2141", "main.1006"], "main.1658": ["main.920", "main.2758", "main.1581", "main.2349", "main.701"], "main.1669": ["main.1159", "main.471", "main.2476", "main.2739", "main.2068"], "main.1670": ["main.3183", "main.284", "main.376", "TACL.2041", "main.317"], "main.1675": ["main.3286", "main.3185", "main.3064", "main.1289", "main.3375"], "main.168": ["main.3072", "main.2766", "main.2561", "main.3352", "main.2058"], "main.1680": ["main.3688", "TACL.2107", "main.522", "main.852", "main.1986"], "main.1682": ["main.2167", "main.2793", "main.1032", "main.148", "main.1611"], "main.1687": ["main.1928", "main.2758", "main.2982", "main.605", "main.2382"], "main.1694": ["main.3688", "main.1503", "main.1061", "main.522", "main.3227"], "main.1700": ["main.3179", "main.317", "main.478", "main.2410", "main.2164"], "main.1702": ["main.1012", "main.1201", "TACL.2143", "main.1606", "main.1846"], "main.1706": ["main.1787", "main.1231", "main.158", "main.666", "main.531"], "main.1707": ["main.648", "main.3483", "main.730", "main.2382", "main.2511"], "main.1720": ["main.148", "main.2790", "main.2636", "main.1356", "main.1225"], "main.1733": ["main.3656", "TACL.2255", "main.2389", "main.2078", "main.930"], "main.1734": ["main.2838", "main.1299", "TACL.2041", "main.2491", "main.74"], "main.1738": ["main.2947", "main.3216", "main.1482", "main.2799", "main.989"], "main.1739": ["main.179", "TACL.2141", "main.2448", "main.2990", "main.1970"], "main.1749": ["main.2048", "main.3438", "main.1116", "main.2427", "main.3617"], "main.1750": ["main.2996", "main.916", "main.440", "main.955", "main.2688"], "main.1754": ["main.1957", "main.1061", "main.2179", "main.1179", "main.574"], "main.1755": ["TACL.2103", "main.2974", "main.911", "main.989", "main.2799"], "main.1766": ["main.1289", "main.3375", "main.1675", "main.1550", "main.2363"], "main.1770": ["main.334", "main.522", "main.3688", "main.1504", "main.1694"], "main.1782": ["main.871", "main.2630", "main.2278", "main.1803", "main.1379"], "main.1784": ["main.349", "main.1706", "TACL.2121", "main.3682", "main.1488"], "main.1787": ["main.1706", "main.2877", "main.3216", "main.2406", "main.2974"], "main.1788": ["main.1022", "main.1837", "main.2586", "main.959", "main.3183"], "main.179": ["main.1010", "main.1061", "main.2342", "main.1739", "main.1503"], "main.1797": ["main.419", "main.2839", "main.1972", "main.787", "main.128"], "main.1798": ["main.618", "main.130", "main.3337", "main.701", "main.1618"], "main.1803": ["main.74", "main.1379", "main.3688", "main.871", "main.2500"], "main.1817": ["main.3646", "main.1669", "main.1977", "main.3470", "TACL.1997"], "main.1832": ["main.2894", "main.3551", "main.2886", "main.2535", "main.2076"], "main.1834": ["main.148", "main.3348", "main.989", "main.3023", "main.3227"], "main.1835": ["main.2650", "main.2125", "main.2506", "main.471", "main.714"], "main.1837": ["main.1022", "main.3183", "main.3140", "main.319", "main.449"], "main.1846": ["TACL.2143", "main.1201", "main.478", "main.1012", "main.215"], "main.1857": ["main.1952", "main.151", "main.2078", "TACL.2255", "main.3298"], "main.1862": ["main.3517", "main.2268", "main.1159", "main.3648", "main.2739"], "main.1863": ["main.916", "main.645", "main.2050", "main.527", "main.2839"], "main.1866": ["main.349", "main.2590", "TACL.1983", "main.3682", "main.876"], "main.1877": ["main.2635", "main.3140", "main.3054", 
"main.2078", "main.2586"], "main.1892": ["main.2851", "main.407", "main.2635", "main.1130", "main.852"], "main.1898": ["main.1581", "main.2389", "main.247", "main.648", "main.2382"], "main.1901": ["main.2131", "main.865", "main.2891", "main.410", "main.447"], "main.1904": ["main.2510", "main.789", "main.851", "main.838", "main.3424"], "main.1906": ["main.3507", "main.607", "main.928", "main.2922", "main.2112"], "main.1908": ["main.1942", "main.1631", "main.1923", "main.2238", "main.3486"], "main.1923": ["main.74", "main.2739", "main.2721", "main.2491", "main.1540"], "main.1928": ["main.252", "main.1647", "main.2758", "main.3054", "main.1130"], "main.1935": ["main.3224", "main.2251", "main.143", "main.644", "main.2349"], "main.1938": ["main.1892", "main.1957", "main.2890", "main.891", "main.315"], "main.1942": ["main.1908", "main.110", "main.2989", "main.3648", "main.2307"], "main.1943": ["main.447", "TACL.2141", "main.1258", "main.1957", "main.1494"], "main.1949": ["main.3327", "main.3240", "main.1023", "main.2931", "main.693"], "main.1952": ["main.2650", "main.471", "main.143", "main.3437", "main.714"], "main.1957": ["main.1754", "main.1061", "main.1179", "main.2419", "TACL.2141"], "main.1960": ["TACL.1943", "main.522", "main.835", "main.1351", "main.2661"], "main.1970": ["main.2363", "main.143", "TACL.2411", "TACL.2013", "main.2179"], "main.1972": ["main.41", "main.1622", "main.32", "main.527", "main.1797"], "main.1974": ["main.1159", "main.387", "main.1923", "main.2943", "main.1032"], "main.1975": ["main.574", "main.2253", "main.782", "main.2761", "main.151"], "main.1977": ["main.96", "main.1159", "main.2608", "main.2048", "main.2739"], "main.1986": ["main.1680", "main.3688", "main.522", "main.852", "main.1263"], "main.1996": ["main.3348", "main.2040", "main.989", "main.2851", "main.345"], "main.1997": ["main.3046", "main.2131", "main.143", "main.3453", "main.1626"], "main.2005": ["main.3457", "main.744", "main.3327", "main.210", "main.371"], "main.2012": ["main.2072", "main.2962", "main.387", "main.638", "main.362"], "main.204": ["main.2890", "main.1618", "main.1210", "main.1339", "main.2635"], "main.2040": ["main.1613", "main.2851", "main.3181", "main.76", "main.1996"], "main.2042": ["main.916", "main.1863", "main.2281", "main.485", "main.834"], "main.2048": ["main.1749", "main.1977", "main.96", "main.2427", "main.2972"], "main.2050": ["main.1863", "main.645", "main.215", "main.1201", "main.916"], "main.2054": ["main.2415", "TACL.2049", "main.607", "main.3035", "main.2253"], "main.2055": ["main.522", "main.3688", "main.1572", "main.701", "main.888"], "main.2057": ["main.457", "main.504", "main.3116", "main.143", "main.2452"], "main.2058": ["main.2141", "main.689", "main.1522", "main.3157", "main.916"], "main.2061": ["main.888", "TACL.1997", "main.1572", "main.835", "main.856"], "main.2064": ["CL.1", "TACL.2013", "main.1494", "main.2847", "main.1258"], "main.2066": ["main.3093", "main.1611", "main.1159", "main.3292", "main.1935"], "main.2068": ["main.2733", "main.1923", "main.493", "main.471", "main.1351"], "main.207": ["main.1494", "main.1258", "main.447", "main.1957", "TACL.2141"], "main.2070": ["main.128", "main.3393", "main.355", "main.3318", "main.3179"], "main.2072": ["main.32", "main.84", "TACL.2411", "main.1581", "main.353"], "main.2075": ["main.3566", "main.1702", "main.2205", "main.1379", "main.2641"], "main.2076": ["main.2122", "main.210", "main.143", "main.84", "main.1997"], "main.2078": ["TACL.2255", "main.2476", "main.2733", "main.2087", "main.1482"], "main.2083": 
["main.2273", "main.3609", "main.148", "main.3360", "main.2702"], "main.2087": ["main.2078", "main.1923", "main.2491", "main.2500", "main.74"], "main.2094": ["main.3457", "main.210", "main.2076", "main.2122", "main.1970"], "main.2098": ["main.1649", "main.2635", "main.3227", "main.2491", "main.130"], "main.210": ["main.1923", "main.2076", "main.2943", "main.84", "main.876"], "main.2100": ["main.453", "TACL.2221", "main.1402", "main.1694", "main.1432"], "main.2112": ["main.2512", "main.3507", "main.3453", "main.3216", "main.1267"], "main.2114": ["main.457", "main.1935", "main.2891", "main.3224", "main.2363"], "main.2117": ["main.872", "main.3151", "main.2570", "main.1004", "main.2506"], "main.2120": ["main.1030", "main.319", "main.449", "TACL.2041", "main.1022"], "main.2122": ["main.498", "main.947", "main.2363", "main.1052", "TACL.2411"], "main.2125": ["main.2437", "main.2506", "main.1835", "main.2650", "main.965"], "main.2131": ["main.865", "main.2278", "main.2363", "CL.2", "main.1901"], "main.2133": ["main.471", "main.1159", "main.2688", "main.2476", "main.574"], "main.214": ["main.891", "main.1339", "main.3257", "main.2635", "main.689"], "main.2141": ["main.1654", "main.128", "TACL.2143", "main.1201", "main.1702"], "main.215": ["main.2209", "main.1201", "main.689", "main.1522", "main.128"], "main.2151": ["main.1625", "main.3593", "main.2112", "main.447", "main.2419"], "main.2163": ["main.2915", "TACL.2221", "TACL.2107", "main.106", "main.888"], "main.2164": ["main.128", "main.478", "main.1654", "main.215", "main.2141"], "main.2167": ["main.1682", "main.1611", "main.1032", "main.2289", "main.148"], "main.2179": ["TACL.2013", "main.1970", "main.143", "main.3457", "TACL.2141"], "main.2181": ["main.1011", "main.2873", "main.2761", "TACL.2121", "main.666"], "main.2198": ["main.3550", "main.1377", "main.648", "main.2430", "main.1445"], "main.2205": ["main.1892", "TACL.2411", "main.2075", "main.3257", "main.128"], "main.2208": ["main.2216", "main.2766", "main.3391", "main.2818", "main.1320"], "main.2209": ["main.215", "main.1522", "main.689", "main.1012", "main.128"], "main.2212": ["main.787", "main.1006", "main.1846", "main.3179", "main.1522"], "main.2215": ["main.763", "main.2999", "main.618", "TACL.2041", "main.317"], "main.2216": ["main.2208", "main.3391", "main.2818", "main.1739", "main.1901"], "main.2218": ["main.2915", "main.106", "TACL.2221", "main.2343", "main.2208"], "main.2221": ["main.2410", "main.2834", "main.286", "main.1734", "main.3672"], "main.2225": ["main.2520", "main.1180", "main.3486", "main.2784", "main.1287"], "main.2228": ["main.2258", "main.959", "main.3035", "TACL.2049", "main.2380"], "main.2238": ["main.84", "main.2268", "main.1540", "TACL.2055", "main.2122"], "main.2251": ["main.298", "main.143", "main.3224", "main.1935", "main.3093"], "main.2253": ["main.1975", "main.1191", "main.531", "main.607", "main.2415"], "main.2258": ["main.2586", "main.2228", "TACL.2049", "main.319", "main.2864"], "main.2261": ["main.3329", "main.3437", "main.1675", "main.1766", "main.3013"], "main.2268": ["main.84", "main.2238", "main.1159", "main.2739", "main.3486"], "main.2271": ["TACL.2013", "main.767", "main.2491", "main.143", "main.888"], "main.2273": ["main.2083", "main.2702", "main.143", "main.1052", "main.2342"], "main.2278": ["main.2630", "main.1803", "main.143", "main.3216", "main.1061"], "main.2281": ["main.1846", "main.1201", "main.645", "main.485", "main.419"], "main.2289": ["main.2167", "main.1023", "main.1159", "main.693", "main.3298"], "main.2298": ["main.522", "main.1379", 
"TACL.2107", "main.888", "main.852"], "main.2307": ["main.1159", "main.2650", "TACL.2411", "main.2470", "main.1023"], "main.2313": ["main.2914", "main.371", "main.2895", "main.426", "main.3434"], "main.2322": ["main.284", "main.1923", "main.1085", "main.3183", "main.317"], "main.233": ["main.478", "main.1702", "main.1846", "main.1201", "main.1522"], "main.2331": ["main.493", "main.345", "main.2078", "main.2793", "main.956"], "main.2337": ["main.2640", "main.2739", "main.1923", "main.1022", "main.2943"], "main.2342": ["main.3470", "main.3010", "main.1923", "main.3506", "main.1130"], "main.2343": ["main.2615", "main.3483", "main.2515", "main.247", "TACL.2107"], "main.2349": ["main.143", "main.3093", "main.1935", "main.2363", "main.2251"], "main.2357": ["TACL.2047", "main.2313", "main.3227", "main.2389", "main.701"], "main.2363": ["main.2630", "main.1970", "main.1130", "main.143", "TACL.2411"], "main.2367": ["main.1129", "main.151", "main.2476", "main.825", "main.693"], "main.237": ["main.1116", "main.574", "main.2761", "main.158", "main.666"], "main.2370": ["main.1503", "main.1935", "main.1901", "main.644", "main.973"], "main.2377": ["main.1159", "main.3010", "main.2650", "main.1892", "main.128"], "main.2380": ["main.2228", "TACL.2049", "main.319", "main.3186", "main.2943"], "main.2382": ["main.1130", "main.1892", "main.648", "main.1707", "main.3010"], "main.2383": ["main.2590", "main.1928", "main.876", "main.527", "main.1797"], "main.2389": ["main.648", "main.26", "main.247", "main.3353", "main.891"], "main.2391": ["TACL.2013", "main.143", "main.835", "main.1892", "main.623"], "main.2396": ["main.871", "TACL.2107", "main.1803", "main.143", "main.1263"], "main.2406": ["main.1787", "main.1706", "main.1648", "main.1493", "main.3517"], "main.2410": ["main.1700", "main.954", "main.2839", "main.2583", "main.1201"], "main.2412": ["main.1339", "main.1803", "main.835", "main.1263", "main.3688"], "main.2414": ["TACL.2041", "main.2851", "main.2363", "main.2696", "main.2893"], "main.2415": ["main.2054", "TACL.2013", "main.2470", "main.1622", "main.1972"], "main.2416": ["main.2415", "main.2253", "TACL.2013", "main.1970", "main.2040"], "main.2419": ["main.1625", "main.1957", "main.447", "main.2890", "TACL.2141"], "main.2422": ["main.870", "main.1339", "main.2795", "main.2098", "main.648"], "main.2424": ["main.1280", "TACL.2141", "main.2675", "TACL.2041", "main.1892"], "main.2426": ["main.3462", "main.300", "main.3646", "main.1669", "main.1159"], "main.2427": ["main.96", "main.1749", "main.1116", "main.2048", "main.2508"], "main.2430": ["main.1446", "main.3013", "main.2851", "main.3483", "main.1892"], "main.2437": ["main.2125", "main.2506", "main.714", "main.2650", "main.1835"], "main.2438": ["main.2087", "main.449", "main.3470", "TACL.2041", "main.2363"], "main.2444": ["main.916", "main.1622", "main.527", "main.128", "main.2141"], "main.2448": ["main.3299", "main.3457", "main.1892", "main.1503", "main.2098"], "main.2452": ["main.3115", "main.2638", "main.457", "main.1455", "main.3181"], "main.246": ["TACL.2107", "main.852", "main.3688", "main.888", "main.522"], "main.247": ["main.3483", "main.2389", "main.1299", "main.1898", "main.3023"], "main.2470": ["main.2650", "TACL.2411", "main.2307", "main.2506", "main.471"], "main.2476": ["main.2078", "main.835", "main.471", "main.2635", "main.1263"], "main.2490": ["main.1379", "main.2278", "main.143", "main.1680", "main.3688"], "main.2491": ["TACL.2041", "main.1351", "main.2635", "main.522", "main.1631"], "main.2493": ["main.2777", "main.2847", "main.517", 
"main.870", "main.1997"], "main.2500": ["main.858", "main.74", "main.1803", "main.1263", "main.852"], "main.2506": ["main.2125", "main.1835", "main.1023", "main.2437", "main.2470"], "main.2508": ["main.1116", "main.1421", "main.96", "main.1977", "main.1749"], "main.2510": ["main.1904", "main.3101", "main.2707", "main.851", "main.789"], "main.2511": ["main.1707", "main.3398", "main.1634", "main.1522", "main.648"], "main.2512": ["main.2922", "main.2141", "main.2476", "main.574", "main.2640"], "main.2515": ["main.2396", "main.1898", "main.3337", "main.2389", "main.247"], "main.252": ["main.1928", "main.1647", "main.2758", "main.2650", "main.2864"], "main.2520": ["main.2225", "main.1180", "main.3486", "main.1488", "main.1289"], "main.2529": ["main.3437", "main.2650", "main.2684", "TACL.2411", "main.1952"], "main.2533": ["main.3506", "main.210", "main.1970", "main.3470", "main.1159"], "main.2535": ["main.2307", "main.1446", "main.345", "main.3292", "main.87"], "main.2549": ["main.1675", "main.1550", "main.1611", "main.1654", "main.3329"], "main.2553": ["main.1061", "main.1379", "main.1970", "main.143", "main.2890"], "main.2561": ["main.3352", "main.3072", "main.2707", "main.476", "main.851"], "main.2570": ["main.2506", "main.2962", "main.2117", "main.1159", "main.3151"], "main.2574": ["main.1578", "main.2982", "main.390", "main.763", "main.645"], "main.2579": ["main.2476", "main.2931", "main.883", "main.2122", "main.2078"], "main.2581": ["main.648", "main.1707", "main.2590", "main.2382", "main.870"], "main.2583": ["main.2389", "main.74", "main.128", "main.1892", "main.2851"], "main.2585": ["main.958", "main.1612", "main.2958", "main.1159", "main.2040"], "main.2586": ["main.3140", "main.2258", "main.319", "main.3186", "main.2721"], "main.2587": ["main.2635", "main.693", "main.449", "main.1022", "main.2640"], "main.2590": ["main.3353", "main.876", "TACL.2135", "main.2763", "main.2641"], "main.2596": ["main.3093", "main.2792", "main.1305", "TACL.2093", "main.585"], "main.26": ["main.1146", "main.701", "main.2389", "main.1960", "main.835"], "main.2608": ["main.1977", "main.2972", "main.96", "main.3329", "main.3646"], "main.2612": ["main.527", "main.1797", "main.916", "main.787", "main.2444"], "main.2614": ["main.371", "main.2114", "main.2357", "main.2914", "main.635"], "main.2615": ["TACL.2041", "main.2491", "main.2893", "main.2834", "main.1552"], "main.2630": ["main.2363", "main.143", "main.2278", "main.3453", "main.623"], "main.2632": ["main.1734", "main.2430", "main.1208", "TACL.2041", "main.2838"], "main.2635": ["main.1339", "main.891", "main.1892", "main.2491", "main.148"], "main.2636": ["main.148", "main.2430", "main.850", "main.3348", "main.1615"], "main.2638": ["main.3115", "main.1455", "main.3181", "main.2452", "main.1613"], "main.2640": ["main.2635", "main.3140", "main.2337", "main.2476", "main.2587"], "main.2641": ["main.1061", "main.3046", "main.870", "main.1503", "main.852"], "main.2644": ["main.2886", "main.834", "main.353", "TACL.2011", "main.2893"], "main.2650": ["main.1835", "main.2470", "main.2125", "main.3437", "main.471"], "main.2651": ["main.789", "main.2784", "main.2996", "main.2596", "main.1675"], "main.2661": ["main.856", "main.1960", "main.891", "main.3337", "main.1100"], "main.267": ["main.410", "main.3688", "main.852", "main.870", "main.3116"], "main.2674": ["main.3227", "TACL.2047", "main.894", "main.522", "main.1960"], "main.2675": ["main.3656", "main.852", "main.143", "main.2298", "main.522"], "main.2684": ["main.1494", "main.143", "TACL.2411", "main.3540", 
"main.2851"], "main.2688": ["main.1023", "main.3389", "main.2506", "main.965", "main.1159"], "main.2696": ["TACL.2041", "main.2414", "main.1130", "main.2179", "TACL.2411"], "main.2702": ["main.1388", "main.3360", "main.2122", "main.2083", "TACL.2411"], "main.2705": ["main.1923", "main.2886", "main.3648", "main.387", "main.2535"], "main.2707": ["main.165", "main.3072", "main.2561", "main.476", "main.3352"], "main.2712": ["main.1952", "main.2792", "main.2996", "main.30", "main.1287"], "main.2718": ["CL.2", "main.3116", "main.750", "main.870", "main.2891"], "main.2721": ["main.1923", "main.2586", "main.3054", "main.2068", "main.3140"], "main.2724": ["main.782", "main.1086", "main.1010", "main.2506", "main.574"], "main.2733": ["main.2068", "main.2078", "main.345", "main.148", "main.3540"], "main.2739": ["main.1159", "main.1923", "main.2763", "main.3470", "main.3506"], "main.2746": ["main.1298", "main.1061", "main.3453", "main.2131", "main.3046"], "main.2750": ["main.1086", "main.2054", "main.2570", "main.3035", "main.2416"], "main.2756": ["main.30", "TACL.2093", "TACL.2083", "main.2430", "main.2931"], "main.2758": ["main.390", "main.2702", "main.252", "main.1928", "main.373"], "main.2761": ["main.782", "main.574", "main.1648", "TACL.2121", "main.158"], "main.2763": ["TACL.1983", "main.2630", "main.2590", "main.3506", "main.41"], "main.2764": ["main.782", "main.1488", "TACL.2121", "main.2761", "main.151"], "main.2766": ["main.920", "main.2208", "main.1658", "TACL.2013", "main.1625"], "main.2767": ["TACL.2169", "main.3236", "main.1960", "main.3074", "TACL.1943"], "main.2777": ["main.1379", "main.870", "main.3597", "main.1061", "main.522"], "main.2779": ["main.3327", "main.2943", "main.3541", "main.2382", "main.3517"], "main.2783": ["main.956", "main.3543", "main.1485", "main.1552", "main.3394"], "main.2784": ["main.3644", "main.635", "main.2651", "main.789", "main.2430"], "main.279": ["main.666", "main.2873", "main.684", "main.923", "main.1010"], "main.2790": ["main.148", "main.1720", "main.891", "main.1356", "main.3609"], "main.2792": ["TACL.2093", "main.2931", "main.2596", "main.30", "main.3093"], "main.2793": ["main.16", "main.74", "main.345", "main.1482", "main.2893"], "main.2795": ["main.2422", "main.1485", "TACL.2121", "main.1707", "main.471"], "main.2799": ["main.3217", "main.989", "TACL.2103", "main.1738", "main.2974"], "main.2809": ["main.2914", "main.2313", "main.1490", "main.3126", "main.426"], "main.2814": ["main.2307", "main.959", "TACL.2049", "main.3648", "main.2258"], "main.2818": ["main.1901", "main.3181", "main.2891", "main.865", "main.3115"], "main.2825": ["main.748", "main.1631", "main.2943", "main.1923", "main.1974"], "main.2834": ["main.2615", "main.2491", "TACL.2411", "main.345", "main.1923"], "main.2838": ["main.1734", "main.1208", "TACL.2041", "main.74", "main.3470"], "main.2839": ["main.1085", "main.645", "main.1797", "main.3179", "TACL.2143"], "main.284": ["main.2322", "TACL.2041", "main.3360", "main.2758", "main.3183"], "main.2847": ["TACL.2013", "main.870", "main.143", "main.2363", "main.1970"], "main.2849": ["main.911", "main.327", "main.3010", "main.1528", "main.3216"], "main.2851": ["TACL.2411", "main.1892", "main.1613", "main.2430", "main.76"], "main.2853": ["main.355", "TACL.2107", "main.1113", "main.3483", "main.628"], "main.286": ["main.965", "main.471", "main.714", "main.1023", "main.1504"], "main.2864": ["main.3186", "main.2973", "main.2258", "TACL.2049", "main.449"], "main.2865": ["TACL.2093", "main.2430", "main.1446", "main.30", "main.1498"], "main.287": 
["main.3216", "main.1528", "main.666", "main.327", "main.911"], "main.2873": ["main.2181", "main.1508", "main.658", "main.279", "TACL.2121"], "main.2877": ["main.1787", "main.2974", "main.1706", "main.300", "main.666"], "main.2886": ["main.802", "main.2076", "main.2644", "main.2122", "main.345"], "main.2890": ["main.1061", "main.891", "main.1957", "main.750", "main.1379"], "main.2891": ["main.3224", "main.1935", "main.1503", "main.2718", "main.639"], "main.2893": ["TACL.2041", "main.2851", "main.3023", "main.2793", "main.1146"], "main.2894": ["main.3357", "main.868", "main.151", "main.2996", "main.2764"], "main.2895": ["main.2914", "main.2313", "main.47", "main.1614", "main.60"], "main.2900": ["main.540", "main.286", "main.3012", "main.1123", "main.471"], "main.2914": ["main.2313", "main.2895", "main.47", "main.426", "main.3434"], "main.2915": ["TACL.2107", "main.1339", "TACL.2221", "main.856", "main.2661"], "main.2916": ["main.3353", "main.2389", "main.699", "main.1006", "main.2511"], "main.2920": ["main.2072", "main.3450", "main.32", "TACL.2011", "main.353"], "main.2922": ["main.2512", "main.574", "main.2739", "main.1569", "main.3057"], "main.2927": ["main.1009", "main.373", "main.1201", "main.647", "main.128"], "main.2931": ["main.2792", "TACL.2093", "main.1952", "main.2476", "main.3581"], "main.2938": ["main.426", "main.748", "main.1923", "main.2289", "main.387"], "main.2943": ["main.449", "main.3517", "main.1032", "main.210", "main.1022"], "main.2947": ["main.989", "main.1738", "main.3216", "main.2635", "main.2733"], "main.2958": ["main.2585", "main.3540", "main.1834", "main.1575", "main.76"], "main.2959": ["main.1458", "main.3023", "main.1834", "main.2793", "main.1046"], "main.2962": ["main.3151", "main.2570", "main.2117", "TACL.2049", "main.2506"], "main.2972": ["main.1116", "main.1159", "main.3462", "main.1569", "main.850"], "main.2973": ["main.449", "main.3186", "main.928", "main.2864", "main.3529"], "main.2974": ["main.3216", "main.300", "main.2877", "main.1787", "main.1755"], "main.2975": ["main.318", "main.1196", "main.664", "main.3179", "main.3495"], "main.298": ["main.2251", "main.3224", "main.1935", "main.2891", "main.1395"], "main.2982": ["main.390", "main.2650", "main.2758", "main.2511", "main.3010"], "main.2989": ["main.748", "main.110", "main.1528", "TACL.2049", "main.3151"], "main.2990": ["main.2590", "main.2382", "main.246", "main.648", "main.852"], "main.2991": ["main.2758", "main.284", "main.373", "main.2072", "main.2382"], "main.2994": ["main.2777", "main.3566", "main.1379", "main.870", "main.852"], "main.2995": ["main.151", "main.2733", "main.1023", "main.2476", "main.3540"], "main.2996": ["main.1750", "main.2688", "main.1287", "main.789", "main.2712"], "main.2999": ["main.210", "main.607", "TACL.2041", "main.3470", "TACL.2411"], "main.30": ["TACL.2093", "main.2756", "main.2792", "TACL.2083", "main.2430"], "main.300": ["main.2974", "main.2761", "main.1528", "main.666", "main.2426"], "main.3010": ["main.1892", "main.2377", "main.1023", "main.2382", "main.128"], "main.3012": ["main.965", "main.1023", "main.3552", "main.2125", "main.471"], "main.3013": ["main.2430", "main.989", "main.471", "main.930", "main.3656"], "main.3022": ["main.2635", "main.3287", "main.1803", "main.1503", "main.2877"], "main.3023": ["main.16", "main.2893", "main.852", "main.2491", "main.1631"], "main.3028": ["main.476", "main.3532", "main.2982", "main.668", "main.916"], "main.3032": ["main.605", "main.2849", "main.2974", "main.652", "main.1928"], "main.3035": ["TACL.2049", "main.2228", 
"main.1648", "main.2380", "main.2943"], "main.3046": ["main.2641", "main.143", "main.852", "main.407", "main.1061"], "main.3049": ["main.748", "main.1669", "main.1923", "main.2068", "main.387"], "main.3051": ["main.2389", "main.2198", "main.247", "main.3483", "main.1356"], "main.3054": ["main.3186", "main.2650", "main.2586", "main.3327", "main.1022"], "main.3057": ["main.1923", "main.2922", "main.210", "main.76", "main.3470"], "main.3064": ["main.1675", "main.3185", "main.3286", "main.1289", "main.1023"], "main.3065": ["main.2068", "main.2733", "main.1923", "main.1356", "main.493"], "main.3068": ["main.41", "main.485", "main.2763", "main.2590", "main.1201"], "main.3072": ["main.2561", "main.3352", "main.2707", "main.851", "main.165"], "main.3074": ["main.2491", "main.1130", "main.1960", "main.1351", "main.1446"], "main.3084": ["main.1465", "main.3617", "main.2972", "main.684", "main.300"], "main.3088": ["main.1091", "main.2261", "main.702", "main.1287", "main.3298"], "main.3093": ["main.3292", "main.3635", "main.1305", "main.2251", "main.2596"], "main.3101": ["main.3424", "main.384", "main.2996", "main.1626", "main.1287"], "main.3111": ["main.3398", "main.471", "main.1835", "main.965", "main.714"], "main.3115": ["main.3181", "main.1282", "main.1613", "main.2638", "TACL.2013"], "main.3116": ["main.870", "main.852", "main.143", "main.1445", "main.410"], "main.3126": ["main.426", "main.3434", "main.47", "main.2914", "TACL.2389"], "main.3136": ["main.2426", "main.1787", "main.1205", "main.3287", "main.2273"], "main.3140": ["main.2586", "main.319", "main.3183", "main.2635", "main.1837"], "main.3143": ["main.3635", "main.1485", "TACL.2411", "main.2635", "main.858"], "main.315": ["main.1970", "main.2476", "main.1892", "main.1130", "main.3647"], "main.3151": ["main.2962", "main.2117", "main.2570", "TACL.2049", "main.2506"], "main.3157": ["main.1522", "main.2209", "main.2058", "main.3179", "main.1006"], "main.317": ["main.699", "main.1700", "main.471", "main.3183", "main.3179"], "main.3174": ["main.3013", "main.1575", "main.493", "main.426", "main.1217"], "main.3179": ["main.128", "main.699", "main.215", "main.891", "main.1700"], "main.318": ["main.834", "main.1522", "main.3179", "main.3157", "main.128"], "main.3181": ["main.3115", "TACL.2013", "main.1613", "main.1282", "main.2363"], "main.3183": ["main.1196", "main.1837", "main.3140", "main.1022", "TACL.2041"], "main.3184": ["main.1482", "main.1923", "main.714", "main.3010", "main.2342"], "main.3185": ["main.1675", "main.3064", "main.3286", "main.1217", "main.1289"], "main.3186": ["main.2586", "main.2973", "main.3054", "main.41", "main.1022"], "main.319": ["main.2586", "main.449", "main.3140", "main.1580", "main.2258"], "main.32": ["main.1130", "main.2363", "main.3181", "main.1052", "main.2382"], "main.3205": ["main.865", "main.1379", "main.471", "main.3609", "main.410"], "main.3216": ["main.3453", "main.2278", "main.2974", "main.1803", "main.1061"], "main.3217": ["main.2799", "main.148", "main.1834", "main.2793", "main.1032"], "main.3224": ["main.1935", "main.2251", "main.298", "main.2891", "main.644"], "main.3227": ["main.888", "main.1572", "main.852", "main.3688", "TACL.2107"], "main.3231": ["main.2048", "main.1749", "main.3101", "main.2895", "main.3390"], "main.3236": ["main.2661", "main.1402", "main.1572", "main.2100", "main.3441"], "main.3239": ["main.376", "main.3360", "main.76", "main.891", "main.1834"], "main.3240": ["main.1949", "main.2733", "main.2083", "main.693", "main.471"], "main.3257": ["main.891", "main.1892", "main.143", 
"main.3353", "main.214"], "main.3259": ["main.1494", "main.1943", "main.3348", "main.2684", "main.2098"], "main.327": ["main.158", "main.1159", "main.911", "main.2849", "main.3216"], "main.3270": ["main.2415", "TACL.2411", "TACL.2041", "main.623", "main.2054"], "main.3272": ["main.3348", "main.1446", "main.2430", "main.2198", "main.1615"], "main.3278": ["main.2650", "TACL.2411", "main.1618", "main.1970", "main.1130"], "main.328": ["main.371", "main.2914", "main.3391", "main.60", "main.125"], "main.3282": ["main.1675", "main.3286", "main.983", "main.3375", "main.2261"], "main.3286": ["main.1675", "main.1289", "main.3185", "main.3375", "main.3282"], "main.3287": ["main.3022", "TACL.2103", "main.1787", "main.2974", "main.3216"], "main.3291": ["main.1103", "main.607", "main.928", "main.1191", "main.1455"], "main.3292": ["main.3093", "main.1130", "main.2349", "main.143", "main.3635"], "main.3298": ["main.989", "main.911", "main.1952", "main.2733", "main.3635"], "main.3299": ["main.2448", "main.767", "main.2357", "TACL.2047", "main.639"], "main.3304": ["TACL.2041", "main.2893", "main.2349", "main.3183", "main.2122"], "main.3318": ["main.2070", "main.3393", "main.1846", "main.1797", "main.148"], "main.3321": ["main.3609", "main.2790", "main.2076", "main.973", "main.471"], "main.3327": ["main.1949", "main.3054", "main.2931", "main.693", "main.2635"], "main.3329": ["main.3594", "main.2261", "main.548", "main.983", "main.3282"], "main.3336": ["main.527", "main.1287", "main.1390", "main.1052", "main.2996"], "main.3337": ["main.1960", "main.2661", "main.1618", "main.835", "main.891"], "main.334": ["main.1770", "main.648", "main.2650", "main.471", "main.2430"], "main.3344": ["main.2630", "main.76", "main.471", "main.41", "TACL.2041"], "main.3348": ["main.648", "main.1615", "main.345", "main.2430", "main.2068"], "main.3352": ["main.2561", "main.3072", "main.851", "main.3450", "main.838"], "main.3353": ["main.2590", "main.891", "main.3495", "main.2389", "main.1634"], "main.3357": ["main.1675", "main.1766", "main.1289", "main.789", "main.1287"], "main.3358": ["main.3635", "main.267", "main.865", "main.2851", "main.410"], "main.3360": ["main.407", "main.1892", "main.1130", "main.1388", "main.2083"], "main.3370": ["main.2493", "main.2777", "main.852", "main.2298", "main.1379"], "main.3375": ["main.1289", "main.1550", "main.1675", "main.1766", "main.983"], "main.3389": ["main.965", "main.2650", "main.2125", "main.1023", "main.2367"], "main.3390": ["main.237", "main.1488", "main.2724", "main.1766", "TACL.2121"], "main.3391": ["CL.4", "main.648", "main.246", "main.1613", "main.2448"], "main.3393": ["main.1201", "main.478", "main.128", "main.1846", "main.1702"], "main.3394": ["main.1485", "TACL.2411", "main.1892", "TACL.2041", "main.1208"], "main.3398": ["main.3111", "main.1835", "main.2511", "main.3437", "main.1618"], "main.3403": ["main.648", "main.2382", "main.2389", "main.1898", "main.1892"], "main.3408": ["main.1863", "TACL.2411", "main.2529", "main.2430", "main.1734"], "main.3419": ["main.748", "main.693", "main.574", "main.2962", "main.1123"], "main.3424": ["main.789", "main.2996", "main.3101", "main.2784", "main.3644"], "main.3431": ["main.702", "main.3088", "main.3136", "main.317", "main.471"], "main.3434": ["main.2914", "main.426", "main.745", "main.2313", "TACL.2389"], "main.3437": ["main.2650", "main.1835", "main.2761", "main.714", "main.1023"], "main.3438": ["main.1749", "main.1465", "main.2508", "main.1135", "main.1116"], "main.3441": ["main.891", "main.2661", "main.1960", "main.3227", 
"main.2389"], "main.345": ["main.2793", "main.2733", "main.3348", "main.76", "main.2068"], "main.3450": ["main.1103", "main.3352", "main.3072", "main.1972", "main.2179"], "main.3453": ["main.3216", "main.2630", "main.1061", "main.2278", "main.2363"], "main.3454": ["main.2766", "main.2452", "main.868", "main.1455", "main.3593"], "main.3457": ["main.2179", "main.2448", "main.2650", "TACL.2013", "main.2005"], "main.3462": ["main.3375", "main.2972", "main.2426", "main.471", "main.16"], "main.3464": ["main.2650", "main.1835", "main.2125", "main.965", "main.3437"], "main.3470": ["TACL.2041", "main.2342", "main.1923", "main.2838", "main.2491"], "main.3483": ["main.1707", "TACL.2107", "main.2430", "main.247", "main.1446"], "main.3486": ["main.3093", "main.2650", "main.1159", "TACL.2411", "main.2307"], "main.349": ["main.1866", "main.3682", "main.3544", "main.3507", "main.1784"], "main.3495": ["main.3353", "main.1960", "main.2763", "main.2389", "main.3470"], "main.3496": ["main.2367", "main.652", "main.1755", "main.1129", "main.1159"], "main.3497": ["main.1179", "main.3453", "main.2342", "main.3506", "main.3646"], "main.3504": ["main.1388", "main.3360", "main.1928", "main.2864", "main.1923"], "main.3506": ["main.3597", "main.2763", "main.888", "main.2586", "main.2590"], "main.3507": ["main.3682", "main.3517", "main.349", "main.3216", "main.1706"], "main.3513": ["main.618", "main.1552", "main.2696", "main.1485", "main.557"], "main.3517": ["main.2943", "main.3507", "main.449", "main.3216", "main.1862"], "main.3519": ["main.1116", "main.327", "main.861", "main.3287", "main.605"], "main.3529": ["main.1970", "main.449", "main.2973", "main.928", "main.3186"], "main.353": ["main.143", "main.2278", "main.1379", "main.623", "main.2363"], "main.3532": ["main.2261", "main.1287", "main.476", "main.384", "main.3028"], "main.3540": ["main.2733", "main.2684", "main.1146", "main.345", "main.471"], "main.3541": ["main.1949", "main.2943", "main.2931", "main.204", "main.3517"], "main.3543": ["main.1552", "main.2414", "main.1446", "TACL.2041", "main.2615"], "main.3544": ["main.349", "main.3507", "main.1866", "main.3682", "main.84"], "main.355": ["main.2070", "main.1113", "main.647", "main.2853", "TACL.2041"], "main.3550": ["main.2198", "main.2696", "TACL.2411", "main.2851", "main.1996"], "main.3551": ["main.517", "main.2675", "main.2076", "main.143", "main.2363"], "main.3552": ["main.3012", "main.965", "main.1023", "main.2125", "main.714"], "main.3563": ["CL.2", "main.3181", "TACL.2013", "main.2131", "main.2718"], "main.3566": ["main.870", "main.1320", "main.2994", "main.3181", "main.2075"], "main.3567": ["main.1299", "main.2635", "main.2947", "main.2793", "main.1032"], "main.357": ["main.1649", "main.2891", "main.1957", "CL.2", "main.1061"], "main.3573": ["main.3617", "main.1135", "main.1116", "main.3497", "main.2849"], "main.3579": ["main.1217", "main.1675", "TACL.2255", "main.3013", "main.471"], "main.3580": ["main.2839", "main.3179", "main.317", "main.1797", "main.699"], "main.3581": ["main.1023", "main.693", "main.2476", "main.3012", "main.983"], "main.359": ["main.2313", "TACL.2129", "main.2430", "main.3434", "main.1834"], "main.3593": ["main.1621", "main.3647", "main.883", "main.2416", "main.315"], "main.3594": ["main.3329", "main.2261", "main.3579", "main.392", "main.548"], "main.3597": ["main.1061", "main.3506", "main.143", "main.852", "main.2777"], "main.360": ["main.1287", "main.2712", "main.3532", "main.3101", "main.2261"], "main.3609": ["main.3635", "main.2083", "main.471", "main.3205", "main.2790"], 
"main.3617": ["main.3084", "main.1465", "main.3573", "main.607", "main.1116"], "main.362": ["main.387", "TACL.2411", "main.1103", "main.1191", "main.2307"], "main.3621": ["main.3257", "main.3051", "main.1225", "main.1432", "main.557"], "main.3635": ["main.3358", "main.3609", "TACL.2411", "main.3093", "main.2851"], "main.3644": ["main.2784", "main.2996", "main.2995", "main.2057", "main.789"], "main.3646": ["main.1669", "main.1159", "main.2426", "main.652", "main.3497"], "main.3647": ["main.883", "main.1621", "main.315", "main.1518", "main.2890"], "main.3648": ["main.1159", "main.1669", "main.3216", "main.387", "main.143"], "main.3651": ["main.911", "main.3216", "main.3298", "main.2947", "main.1952"], "main.3656": ["main.930", "main.557", "main.1733", "main.2349", "main.3013"], "main.3672": ["main.3054", "main.74", "main.319", "main.2838", "main.2586"], "main.3676": ["main.2476", "main.2367", "main.2412", "main.2422", "main.1581"], "main.3682": ["main.349", "main.3507", "main.1866", "main.1784", "main.3544"], "main.3688": ["main.1680", "main.522", "main.852", "main.267", "TACL.2107"], "main.371": ["main.2313", "main.2914", "main.47", "main.2357", "main.60"], "main.373": ["main.2758", "main.1402", "main.1113", "main.647", "main.2702"], "main.376": ["main.3239", "main.2994", "main.1670", "main.2430", "main.1402"], "main.384": ["main.1287", "main.916", "main.2261", "main.3532", "main.782"], "main.387": ["main.1159", "main.2506", "main.1923", "main.2739", "main.911"], "main.390": ["main.2758", "main.2982", "main.928", "main.1129", "main.605"], "main.392": ["main.2261", "main.3329", "main.376", "main.3594", "main.1603"], "main.400": ["main.74", "main.1803", "main.871", "main.3688", "main.2630"], "main.407": ["main.74", "main.3046", "main.1892", "main.522", "main.852"], "main.41": ["main.2763", "main.3186", "TACL.2049", "main.2630", "main.1052"], "main.410": ["main.267", "main.3116", "main.865", "CL.2", "main.852"], "main.419": ["main.1797", "main.1846", "main.128", "main.1522", "main.699"], "main.426": ["main.2914", "main.3434", "main.2313", "main.1485", "main.1046"], "main.438": ["main.440", "main.96", "main.2416", "main.3495", "main.2895"], "main.440": ["main.1750", "main.438", "main.2996", "main.955", "main.868"], "main.445": ["main.527", "main.2839", "main.2758", "main.3580", "main.1085"], "main.447": ["TACL.2141", "main.1943", "main.1957", "main.2890", "main.1901"], "main.449": ["main.319", "main.2973", "main.1022", "main.2943", "main.2586"], "main.450": ["TACL.1997", "main.958", "main.493", "main.345", "main.1159"], "main.453": ["main.2100", "main.1432", "TACL.2107", "main.1694", "main.522"], "main.457": ["main.3181", "main.3115", "main.2114", "main.32", "main.1282"], "main.47": ["main.2914", "main.2895", "main.60", "main.371", "main.2313"], "main.470": ["main.471", "main.2724", "main.977", "main.3617", "main.143"], "main.471": ["main.1023", "main.714", "main.1835", "main.2650", "main.965"], "main.476": ["main.165", "main.3028", "main.3532", "main.2561", "main.2707"], "main.478": ["main.1846", "main.3393", "main.2164", "TACL.2143", "main.1012"], "main.485": ["main.1622", "main.1179", "main.1654", "main.128", "main.689"], "main.486": ["main.1625", "main.1938", "main.1957", "main.214", "main.2419"], "main.493": ["main.2068", "main.345", "main.1575", "main.426", "main.1923"], "main.498": ["main.2122", "main.2363", "TACL.2411", "main.1970", "main.1551"], "main.504": ["main.875", "main.3116", "main.3257", "main.1049", "main.871"], "main.517": ["main.3116", "main.143", "main.1997", "main.2076", 
"main.2630"], "main.522": ["main.852", "main.3688", "main.1680", "main.74", "main.1960"], "main.527": ["main.2444", "main.485", "main.916", "main.1622", "main.41"], "main.531": ["main.666", "main.1648", "main.923", "main.1159", "main.1231"], "main.540": ["main.2900", "main.693", "main.1123", "main.3581", "main.714"], "main.548": ["main.3329", "main.1159", "main.387", "main.3532", "main.1289"], "main.55": ["main.1271", "main.1942", "main.1611", "main.2337", "main.2608"], "main.557": ["main.3656", "main.930", "main.1733", "main.618", "main.1952"], "main.574": ["main.1648", "main.2761", "main.782", "main.158", "main.237"], "main.585": ["main.1061", "main.3093", "main.1305", "main.1130", "main.2718"], "main.593": ["CL.1", "TACL.2141", "main.3115", "TACL.1936", "TACL.2013"], "main.595": ["main.1023", "main.3186", "main.1675", "main.2973", "main.1540"], "main.598": ["main.2179", "main.315", "main.1938", "main.1970", "main.1669"], "main.60": ["main.2914", "main.47", "main.2895", "main.1614", "main.2313"], "main.605": ["main.1528", "main.1159", "main.2377", "main.1116", "main.911"], "main.607": ["main.928", "main.3470", "main.1191", "main.2054", "main.1179"], "main.616": ["main.1618", "main.2696", "main.1798", "main.618", "main.2414"], "main.618": ["main.1485", "main.1798", "main.130", "main.3337", "main.1960"], "main.619": ["main.84", "main.894", "main.888", "main.2055", "TACL.2221"], "main.623": ["main.143", "main.2630", "main.1803", "main.1130", "main.1379"], "main.628": ["main.702", "main.3013", "main.2853", "main.3174", "main.471"], "main.635": ["main.2784", "main.47", "main.1614", "main.2914", "TACL.2129"], "main.638": ["main.947", "main.2122", "main.498", "main.3181", "main.1052"], "main.639": ["main.1503", "main.754", "main.825", "main.2891", "main.1061"], "main.644": ["main.1935", "main.3224", "main.2891", "main.2363", "main.2251"], "main.645": ["main.1522", "main.215", "main.1702", "main.699", "main.128"], "main.647": ["main.1113", "main.355", "main.373", "main.1797", "main.995"], "main.648": ["main.870", "main.1707", "main.2389", "main.2491", "main.3348"], "main.652": ["main.3646", "main.3496", "main.3453", "main.2688", "main.1669"], "main.658": ["main.2873", "main.2181", "main.2877", "main.3084", "main.973"], "main.664": ["main.1522", "main.1846", "TACL.2143", "main.215", "main.699"], "main.666": ["main.531", "main.1231", "main.1159", "TACL.2121", "main.923"], "main.668": ["main.645", "main.1129", "main.2650", "main.1928", "main.1952"], "main.675": ["main.3012", "main.3648", "main.1389", "main.3552", "main.2289"], "main.684": ["main.1648", "main.3084", "main.923", "main.300", "main.1466"], "main.689": ["main.215", "main.2209", "main.1561", "main.1006", "main.1201"], "main.693": ["main.3581", "main.1123", "main.2587", "TACL.2095", "main.1023"], "main.699": ["main.3179", "main.1846", "main.1522", "main.1201", "main.215"], "main.701": ["main.26", "main.894", "main.888", "main.891", "main.2055"], "main.702": ["main.628", "main.3088", "main.471", "main.1085", "main.3111"], "main.714": ["main.471", "main.2125", "main.1835", "main.2650", "main.1023"], "main.730": ["main.1707", "main.2590", "main.648", "main.471", "main.3010"], "main.733": ["main.1675", "main.2784", "main.3286", "main.789", "main.3579"], "main.74": ["main.858", "main.1803", "main.407", "main.400", "main.2500"], "main.744": ["main.2005", "main.891", "main.2587", "main.143", "main.2790"], "main.745": ["main.3434", "main.1458", "TACL.2389", "main.3023", "main.493"], "main.748": ["main.3049", "main.1540", "main.387", "main.1631", 
"main.3453"], "main.750": ["main.1803", "main.2718", "main.3116", "main.2278", "main.267"], "main.754": ["main.639", "main.1957", "main.1938", "main.825", "main.1503"], "main.76": ["main.2851", "main.345", "main.1611", "main.1923", "main.2040"], "main.763": ["main.1578", "main.664", "main.2389", "main.3672", "main.2583"], "main.767": ["TACL.2047", "TACL.2013", "main.3181", "main.2271", "main.3115"], "main.782": ["main.2761", "main.574", "main.1010", "TACL.2121", "main.1648"], "main.787": ["main.1140", "main.2212", "main.128", "main.485", "main.689"], "main.789": ["main.2651", "main.2784", "main.2996", "main.3424", "main.1287"], "main.802": ["main.2886", "main.838", "main.834", "main.1611", "TACL.2011"], "main.809": ["CL.2", "main.1613", "main.2718", "main.3116", "main.2596"], "main.820": ["main.3181", "main.2363", "main.2349", "main.644", "main.3224"], "main.821": ["main.1485", "main.2491", "main.1960", "main.3337", "main.522"], "main.825": ["main.639", "main.2382", "main.1694", "main.2367", "main.3010"], "main.834": ["main.318", "main.2444", "main.1700", "main.916", "main.2072"], "main.835": ["main.3688", "main.1960", "main.1803", "main.143", "main.870"], "main.838": ["TACL.2011", "main.802", "main.851", "main.3352", "main.789"], "main.84": ["main.888", "main.619", "main.210", "main.2076", "main.143"], "main.850": ["main.1130", "main.891", "main.1179", "main.1061", "main.148"], "main.851": ["main.3352", "main.3072", "main.2561", "main.838", "main.789"], "main.852": ["main.522", "main.3688", "TACL.2107", "main.1680", "main.267"], "main.856": ["main.888", "main.2661", "main.891", "main.522", "main.1572"], "main.858": ["main.74", "main.2500", "main.1803", "main.1032", "main.407"], "main.861": ["main.2579", "main.1669", "main.883", "main.1116", "main.1159"], "main.865": ["main.410", "main.2131", "main.3205", "main.3358", "main.852"], "main.868": ["main.1455", "main.498", "main.84", "main.2444", "main.3495"], "main.87": ["main.2307", "main.2834", "main.2535", "main.3183", "TACL.2041"], "main.870": ["main.3116", "main.648", "main.267", "main.2641", "main.1649"], "main.871": ["main.1803", "main.852", "main.143", "main.1379", "main.1680"], "main.872": ["main.2117", "main.789", "main.527", "main.3151", "main.1004"], "main.875": ["main.504", "main.2746", "main.1298", "main.1023", "main.3116"], "main.876": ["main.2590", "main.1179", "main.210", "main.2382", "TACL.2143"], "main.877": ["main.3375", "main.3013", "main.1952", "main.1957", "main.2040"], "main.883": ["main.1518", "main.3647", "main.1621", "main.2579", "main.891"], "main.888": ["main.856", "main.3688", "main.522", "main.3227", "TACL.2107"], "main.891": ["main.2635", "main.522", "main.856", "main.214", "main.1960"], "main.894": ["main.701", "main.888", "main.522", "TACL.2107", "main.3227"], "main.903": ["main.3513", "main.1734", "main.1670", "main.355", "main.2650"], "main.910": ["main.1356", "main.2389", "TACL.2255", "main.2430", "main.2078"], "main.911": ["main.1528", "main.1159", "main.2849", "main.989", "main.327"], "main.916": ["main.1863", "main.1522", "main.2444", "main.128", "main.645"], "main.920": ["main.2349", "main.106", "main.1658", "main.3093", "main.2766"], "main.923": ["main.1648", "main.666", "main.531", "main.1647", "main.1706"], "main.928": ["main.2973", "main.3186", "main.607", "main.449", "main.3529"], "main.930": ["main.3656", "main.557", "main.3013", "main.1733", "main.16"], "main.947": ["main.2122", "main.1551", "main.638", "main.3093", "main.2893"], "main.954": ["main.1846", "main.1201", "main.1522", "main.699", 
"main.128"], "main.955": ["main.128", "main.2141", "main.2377", "main.2512", "main.1750"], "main.956": ["main.1485", "main.3394", "main.2783", "main.1707", "TACL.2041"], "main.958": ["main.1159", "main.2307", "main.2585", "main.76", "main.2650"], "main.959": ["main.2228", "main.1022", "TACL.2049", "main.2258", "main.2380"], "main.96": ["main.1977", "main.1116", "main.2427", "main.1421", "main.2972"], "main.965": ["main.1023", "main.3012", "main.2125", "main.3389", "main.471"], "main.973": ["main.3093", "main.2877", "TACL.2121", "main.666", "main.782"], "main.977": ["main.689", "main.1702", "main.1179", "main.2209", "main.1561"], "main.983": ["main.3375", "main.1159", "main.3581", "main.1952", "main.989"], "main.989": ["main.1159", "main.2851", "main.2799", "main.911", "main.3013"], "main.995": ["main.1113", "main.355", "main.647", "main.1402", "TACL.2041"], "main.999": ["main.689", "main.2141", "main.1654", "main.485", "main.2209"], "CL.1": ["main.593", "main.2064", "TACL.2141", "TACL.2013", "main.2179"], "CL.2": ["main.2718", "main.143", "main.3116", "main.410", "main.3181"], "CL.3": ["main.1578", "main.2574", "main.527", "main.2410", "main.851"], "CL.4": ["TACL.2221", "main.648", "main.3046", "main.3257", "main.3391"], "CL.5": ["TACL.1936", "main.2253", "main.2419", "main.1625", "TACL.2141"], "TACL.1936": ["TACL.2141", "main.1494", "main.2179", "main.2702", "main.2122"], "TACL.1943": ["main.1960", "main.522", "main.3227", "TACL.1997", "main.2661"], "TACL.1983": ["main.2763", "main.2630", "TACL.2049", "main.41", "main.1866"], "TACL.1997": ["main.2061", "TACL.1943", "main.888", "main.894", "main.522"], "TACL.2011": ["main.1018", "main.3093", "main.838", "main.1399", "main.2596"], "TACL.2013": ["main.3181", "main.143", "TACL.2141", "main.2847", "main.2179"], "TACL.2041": ["main.2491", "main.1130", "main.74", "TACL.2411", "main.2838"], "TACL.2047": ["main.767", "main.1351", "main.3227", "main.1960", "main.2491"], "TACL.2049": ["main.3035", "main.2258", "main.41", "main.2228", "main.2380"], "TACL.2055": ["main.2491", "TACL.2041", "TACL.2047", "main.1631", "main.1196"], "TACL.2083": ["TACL.2093", "main.30", "main.2792", "main.3093", "main.3358"], "TACL.2093": ["main.2792", "main.30", "TACL.2083", "main.2931", "main.1498"], "TACL.2095": ["main.693", "main.1023", "main.471", "main.1949", "main.3012"], "TACL.2103": ["main.1755", "main.989", "main.3216", "main.2799", "main.1528"], "TACL.2107": ["main.1680", "main.852", "main.3688", "main.522", "main.888"], "TACL.2121": ["main.2761", "main.782", "main.666", "main.1231", "main.1010"], "TACL.2129": ["main.2313", "main.1923", "main.359", "main.3183", "TACL.2389"], "TACL.2135": ["main.2590", "main.1130", "main.1647", "main.2342", "main.648"], "TACL.2141": ["TACL.2013", "main.2363", "TACL.1936", "main.1957", "main.2179"], "TACL.2143": ["main.1846", "main.1702", "main.1140", "main.1201", "main.1012"], "TACL.2169": ["main.1377", "main.2767", "main.3348", "main.2198", "main.730"], "TACL.2221": ["CL.4", "main.2915", "main.1402", "main.1100", "main.888"], "TACL.2255": ["main.2078", "main.1733", "main.3292", "main.910", "main.1428"], "TACL.2389": ["main.3434", "main.1923", "main.2313", "main.2914", "TACL.2129"], "TACL.2411": ["main.2851", "main.3635", "main.2363", "main.1970", "TACL.2041"]} \ No newline at end of file diff --git a/sitedata/papers_projection.json b/sitedata/papers_projection.json index 3ad1677..d53810d 100644 --- a/sitedata/papers_projection.json +++ b/sitedata/papers_projection.json @@ -1 +1,5476 @@ -[{"id": "main.1", "pos": 
[13.263115882873535, 7.318037509918213]}, {"id": "main.2", "pos": [12.979305267333984, 8.162558555603027]}, {"id": "main.3", "pos": [7.694222450256348, 6.858680248260498]}, {"id": "main.4", "pos": [7.753469467163086, 8.29307746887207]}, {"id": "main.5", "pos": [7.063747882843018, 7.13237190246582]}, {"id": "main.6", "pos": [7.7937397956848145, 7.539342403411865]}, {"id": "main.7", "pos": [6.635162353515625, 8.836078643798828]}, {"id": "main.8", "pos": [6.765034198760986, 8.844942092895508]}, {"id": "main.9", "pos": [12.929691314697266, 5.384807586669922]}, {"id": "main.10", "pos": [8.347936630249023, 7.2001953125]}, {"id": "main.11", "pos": [7.481893539428711, 7.061415195465088]}, {"id": "main.12", "pos": [7.27938985824585, 6.789852142333984]}, {"id": "main.13", "pos": [10.96905517578125, 10.582162857055664]}, {"id": "main.14", "pos": [7.696529388427734, 7.591522693634033]}, {"id": "main.15", "pos": [8.67658519744873, 5.856150150299072]}, {"id": "main.16", "pos": [11.189714431762695, 7.602625370025635]}, {"id": "main.17", "pos": [11.350518226623535, 5.044659614562988]}, {"id": "main.18", "pos": [9.953343391418457, 6.400523662567139]}, {"id": "main.19", "pos": [13.366543769836426, 5.565090179443359]}, {"id": "main.20", "pos": [9.224658012390137, 7.649901390075684]}, {"id": "main.21", "pos": [10.87326431274414, 3.5501880645751953]}, {"id": "main.22", "pos": [12.729668617248535, 6.013164043426514]}, {"id": "main.23", "pos": [10.337454795837402, 7.196964740753174]}, {"id": "main.24", "pos": [10.0991849899292, 8.823253631591797]}, {"id": "main.25", "pos": [9.676671981811523, 6.149744510650635]}, {"id": "main.26", "pos": [7.0797576904296875, 6.598024845123291]}, {"id": "main.27", "pos": [9.928458213806152, 5.0753889083862305]}, {"id": "main.28", "pos": [11.586064338684082, 3.73309326171875]}, {"id": "main.29", "pos": [11.707176208496094, 6.63037633895874]}, {"id": "main.30", "pos": [9.75080680847168, 7.036556720733643]}, {"id": "main.31", "pos": [8.110586166381836, 2.431795835494995]}, {"id": "main.32", "pos": [10.381662368774414, 5.231573104858398]}, {"id": "main.33", "pos": [9.640515327453613, 7.584011077880859]}, {"id": "main.34", "pos": [11.173311233520508, 9.03012752532959]}, {"id": "main.35", "pos": [7.928827285766602, 7.1708760261535645]}, {"id": "main.36", "pos": [11.451472282409668, 7.716140270233154]}, {"id": "main.37", "pos": [11.43310832977295, 8.05740737915039]}, {"id": "main.38", "pos": [13.364579200744629, 4.163045406341553]}, {"id": "main.39", "pos": [9.035202980041504, 6.068936347961426]}, {"id": "main.40", "pos": [11.492344856262207, 8.474924087524414]}, {"id": "main.41", "pos": [9.88197135925293, 6.832515239715576]}, {"id": "main.42", "pos": [9.146060943603516, 8.348477363586426]}, {"id": "main.43", "pos": [11.19227409362793, 5.046916961669922]}, {"id": "main.44", "pos": [11.602458000183105, 8.73149585723877]}, {"id": "main.45", "pos": [10.187660217285156, 8.853711128234863]}, {"id": "main.46", "pos": [10.356061935424805, 6.080698013305664]}, {"id": "main.47", "pos": [10.76474666595459, 6.584508419036865]}, {"id": "main.48", "pos": [9.545328140258789, 5.439977645874023]}, {"id": "main.49", "pos": [6.3416032791137695, 8.132634162902832]}, {"id": "main.50", "pos": [6.505117893218994, 8.2391357421875]}, {"id": "main.51", "pos": [10.993486404418945, 7.624875545501709]}, {"id": "main.52", "pos": [7.02100133895874, 7.751898288726807]}, {"id": "main.53", "pos": [7.342716693878174, 7.261143207550049]}, {"id": "main.54", "pos": [7.1623382568359375, 7.931463718414307]}, {"id": 
"main.55", "pos": [8.474653244018555, 8.432367324829102]}, {"id": "main.56", "pos": [12.93094253540039, 7.291661739349365]}, {"id": "main.57", "pos": [8.224034309387207, 6.168436050415039]}, {"id": "main.58", "pos": [8.198409080505371, 6.823605537414551]}, {"id": "main.59", "pos": [6.494148254394531, 6.414926052093506]}, {"id": "main.60", "pos": [8.95240592956543, 5.9025559425354]}, {"id": "main.61", "pos": [13.94351577758789, 6.482197284698486]}, {"id": "main.62", "pos": [6.855108737945557, 6.753317356109619]}, {"id": "main.63", "pos": [11.924420356750488, 6.03358268737793]}, {"id": "main.64", "pos": [7.823593616485596, 8.810970306396484]}, {"id": "main.65", "pos": [10.80540657043457, 7.151523113250732]}, {"id": "main.66", "pos": [8.826099395751953, 6.196462154388428]}, {"id": "main.67", "pos": [8.101056098937988, 2.3836939334869385]}, {"id": "main.68", "pos": [9.868019104003906, 7.397243499755859]}, {"id": "main.69", "pos": [11.866643905639648, 7.126724720001221]}, {"id": "main.70", "pos": [9.734888076782227, 6.1840362548828125]}, {"id": "main.71", "pos": [7.948847770690918, 5.924601078033447]}, {"id": "main.72", "pos": [11.896080017089844, 3.362194776535034]}, {"id": "main.73", "pos": [9.020857810974121, 2.607741594314575]}, {"id": "main.74", "pos": [13.231080055236816, 5.346936225891113]}, {"id": "main.75", "pos": [5.137445449829102, 8.56356143951416]}, {"id": "main.76", "pos": [11.069307327270508, 3.825030565261841]}, {"id": "main.77", "pos": [7.675169467926025, 4.1777729988098145]}, {"id": "main.78", "pos": [10.817850112915039, 6.088648319244385]}, {"id": "main.79", "pos": [9.506555557250977, 3.912693500518799]}, {"id": "main.80", "pos": [10.462282180786133, 7.677069187164307]}, {"id": "main.81", "pos": [10.8082914352417, 10.456917762756348]}, {"id": "main.82", "pos": [10.469343185424805, 10.735462188720703]}, {"id": "main.83", "pos": [11.777871131896973, 6.419312953948975]}, {"id": "main.84", "pos": [10.744483947753906, 3.708177328109741]}, {"id": "main.85", "pos": [9.305346488952637, 7.777370452880859]}, {"id": "main.86", "pos": [11.546785354614258, 6.508510589599609]}, {"id": "main.87", "pos": [10.667158126831055, 6.930556297302246]}, {"id": "main.88", "pos": [13.604508399963379, 5.428911209106445]}, {"id": "main.89", "pos": [10.463150024414062, 6.465011119842529]}, {"id": "main.90", "pos": [7.112016677856445, 8.513391494750977]}, {"id": "main.91", "pos": [10.479144096374512, 3.1713802814483643]}, {"id": "main.92", "pos": [14.056475639343262, 5.7574567794799805]}, {"id": "main.93", "pos": [8.823145866394043, 9.126102447509766]}, {"id": "main.94", "pos": [12.251378059387207, 7.867340564727783]}, {"id": "main.95", "pos": [10.176331520080566, 8.741188049316406]}, {"id": "main.96", "pos": [12.203485488891602, 6.752135753631592]}, {"id": "main.97", "pos": [10.207179069519043, 5.57102632522583]}, {"id": "main.98", "pos": [7.679973125457764, 8.760200500488281]}, {"id": "main.99", "pos": [7.4234819412231445, 7.991095066070557]}, {"id": "main.100", "pos": [10.87442398071289, 7.73425817489624]}, {"id": "main.101", "pos": [10.84427547454834, 4.178957462310791]}, {"id": "main.102", "pos": [10.715149879455566, 4.361705780029297]}, {"id": "main.103", "pos": [10.114784240722656, 5.1392645835876465]}, {"id": "main.104", "pos": [9.81181812286377, 4.081568241119385]}, {"id": "main.105", "pos": [10.362443923950195, 5.040643692016602]}, {"id": "main.106", "pos": [7.989933967590332, 2.764394760131836]}, {"id": "main.107", "pos": [11.306662559509277, 3.331177234649658]}, {"id": "main.108", "pos": 
[10.61009693145752, 3.6805992126464844]}, {"id": "main.109", "pos": [12.40057373046875, 6.941251754760742]}, {"id": "main.110", "pos": [8.021334648132324, 8.023768424987793]}, {"id": "main.111", "pos": [11.028501510620117, 4.7890448570251465]}, {"id": "main.112", "pos": [9.997031211853027, 9.395360946655273]}, {"id": "main.113", "pos": [12.361186981201172, 7.21030855178833]}, {"id": "main.114", "pos": [12.912508010864258, 6.3852410316467285]}, {"id": "main.115", "pos": [10.20561408996582, 6.90299129486084]}, {"id": "main.116", "pos": [10.271190643310547, 4.668856620788574]}, {"id": "main.117", "pos": [6.596007347106934, 6.32750940322876]}, {"id": "main.118", "pos": [5.296194076538086, 8.580364227294922]}, {"id": "main.119", "pos": [10.430391311645508, 3.4803402423858643]}, {"id": "main.120", "pos": [8.162115097045898, 5.187887668609619]}, {"id": "main.121", "pos": [12.630633354187012, 7.733105182647705]}, {"id": "main.122", "pos": [11.468152046203613, 5.416236877441406]}, {"id": "main.123", "pos": [8.201881408691406, 8.500765800476074]}, {"id": "main.124", "pos": [8.992393493652344, 7.418107509613037]}, {"id": "main.125", "pos": [8.68150806427002, 4.811174392700195]}, {"id": "main.126", "pos": [7.60781717300415, 8.374829292297363]}, {"id": "main.127", "pos": [13.44111442565918, 6.612326622009277]}, {"id": "main.128", "pos": [8.7595796585083, 6.6773834228515625]}, {"id": "main.129", "pos": [6.436100006103516, 6.487366199493408]}, {"id": "main.130", "pos": [10.749787330627441, 8.847139358520508]}, {"id": "main.131", "pos": [6.745145320892334, 8.880165100097656]}, {"id": "main.132", "pos": [10.980207443237305, 5.576553821563721]}, {"id": "main.133", "pos": [7.31301736831665, 7.910783767700195]}, {"id": "main.134", "pos": [9.165569305419922, 2.2333004474639893]}, {"id": "main.135", "pos": [10.59253215789795, 3.319608211517334]}, {"id": "main.136", "pos": [10.916125297546387, 4.05092716217041]}, {"id": "main.137", "pos": [7.227620601654053, 5.272306442260742]}, {"id": "main.138", "pos": [8.676874160766602, 5.411282539367676]}, {"id": "main.139", "pos": [10.118120193481445, 7.307547092437744]}, {"id": "main.140", "pos": [10.682249069213867, 7.3213911056518555]}, {"id": "main.141", "pos": [10.38851547241211, 3.558516263961792]}, {"id": "main.142", "pos": [9.50507640838623, 6.772485733032227]}, {"id": "main.143", "pos": [11.484004974365234, 8.674071311950684]}, {"id": "main.144", "pos": [12.450221061706543, 6.12685489654541]}, {"id": "main.145", "pos": [12.605749130249023, 7.5125627517700195]}, {"id": "main.146", "pos": [9.973323822021484, 7.014316082000732]}, {"id": "main.147", "pos": [11.384467124938965, 7.92803955078125]}, {"id": "main.148", "pos": [10.423956871032715, 9.133901596069336]}, {"id": "main.149", "pos": [10.447667121887207, 7.240666389465332]}, {"id": "main.150", "pos": [10.722175598144531, 9.003800392150879]}, {"id": "main.151", "pos": [9.989221572875977, 7.921534061431885]}, {"id": "main.152", "pos": [12.37016773223877, 7.5929765701293945]}, {"id": "main.153", "pos": [11.289222717285156, 7.919968128204346]}, {"id": "main.154", "pos": [10.752182006835938, 8.486733436584473]}, {"id": "main.155", "pos": [11.24589729309082, 8.22245979309082]}, {"id": "main.156", "pos": [9.928868293762207, 8.069469451904297]}, {"id": "main.157", "pos": [12.728209495544434, 5.583381175994873]}, {"id": "main.158", "pos": [9.708089828491211, 6.437885284423828]}, {"id": "main.159", "pos": [11.4523344039917, 9.43838119506836]}, {"id": "main.160", "pos": [10.536825180053711, 6.120334148406982]}, {"id": 
"main.161", "pos": [8.177999496459961, 9.214251518249512]}, {"id": "main.162", "pos": [10.961050987243652, 4.909714698791504]}, {"id": "main.163", "pos": [11.900579452514648, 6.069886207580566]}, {"id": "main.164", "pos": [9.146941184997559, 9.58230209350586]}, {"id": "main.165", "pos": [8.982230186462402, 5.2464823722839355]}, {"id": "main.166", "pos": [9.400216102600098, 3.2338004112243652]}, {"id": "main.167", "pos": [8.476202011108398, 2.956631898880005]}, {"id": "main.168", "pos": [12.013733863830566, 4.549959659576416]}, {"id": "main.169", "pos": [11.224212646484375, 7.760255813598633]}, {"id": "main.170", "pos": [10.530781745910645, 8.179705619812012]}, {"id": "main.171", "pos": [9.139174461364746, 6.690412998199463]}, {"id": "main.172", "pos": [12.578242301940918, 7.27745246887207]}, {"id": "main.173", "pos": [9.200419425964355, 9.77425765991211]}, {"id": "main.174", "pos": [8.202590942382812, 9.025654792785645]}, {"id": "main.175", "pos": [8.047019004821777, 7.8064398765563965]}, {"id": "main.176", "pos": [11.15272331237793, 7.309240818023682]}, {"id": "main.177", "pos": [10.022022247314453, 6.607564449310303]}, {"id": "main.178", "pos": [8.525001525878906, 8.680000305175781]}, {"id": "main.179", "pos": [9.562023162841797, 9.604833602905273]}, {"id": "main.180", "pos": [9.45025634765625, 9.632636070251465]}, {"id": "main.181", "pos": [11.673154830932617, 8.88536262512207]}, {"id": "main.182", "pos": [8.769477844238281, 9.187456130981445]}, {"id": "main.183", "pos": [6.903238773345947, 7.618020534515381]}, {"id": "main.184", "pos": [8.939010620117188, 3.389683961868286]}, {"id": "main.185", "pos": [8.362714767456055, 7.111880779266357]}, {"id": "main.186", "pos": [10.113690376281738, 4.142124652862549]}, {"id": "main.187", "pos": [12.039929389953613, 5.936866283416748]}, {"id": "main.188", "pos": [7.377986907958984, 5.712579250335693]}, {"id": "main.189", "pos": [7.585129737854004, 6.503234386444092]}, {"id": "main.190", "pos": [13.003479957580566, 5.015559196472168]}, {"id": "main.191", "pos": [8.15282154083252, 5.649298667907715]}, {"id": "main.192", "pos": [12.276565551757812, 4.169219017028809]}, {"id": "main.193", "pos": [8.81017780303955, 5.080780982971191]}, {"id": "main.194", "pos": [11.799087524414062, 5.835400104522705]}, {"id": "main.195", "pos": [14.038272857666016, 7.09349250793457]}, {"id": "main.196", "pos": [8.757516860961914, 7.809950351715088]}, {"id": "main.197", "pos": [10.086677551269531, 7.529656410217285]}, {"id": "main.198", "pos": [8.552998542785645, 6.309616565704346]}, {"id": "main.199", "pos": [8.89868450164795, 2.6580581665039062]}, {"id": "main.200", "pos": [11.835712432861328, 6.626865863800049]}, {"id": "main.201", "pos": [11.686358451843262, 8.292208671569824]}, {"id": "main.202", "pos": [14.135177612304688, 6.775892734527588]}, {"id": "main.203", "pos": [9.385175704956055, 7.9402031898498535]}, {"id": "main.204", "pos": [8.035012245178223, 5.821848392486572]}, {"id": "main.205", "pos": [9.97223949432373, 4.462034702301025]}, {"id": "main.206", "pos": [13.365633010864258, 7.3938469886779785]}, {"id": "main.207", "pos": [10.031353950500488, 5.115469932556152]}, {"id": "main.208", "pos": [10.637826919555664, 4.172643184661865]}, {"id": "main.209", "pos": [11.223671913146973, 4.668673992156982]}, {"id": "main.210", "pos": [10.719264030456543, 4.267213344573975]}, {"id": "main.211", "pos": [10.701245307922363, 4.689622402191162]}, {"id": "main.212", "pos": [12.855121612548828, 8.784303665161133]}, {"id": "main.213", "pos": [13.348702430725098, 
8.58401870727539]}, {"id": "main.214", "pos": [13.011417388916016, 8.772167205810547]}, {"id": "main.215", "pos": [13.084343910217285, 7.542043209075928]}, {"id": "main.216", "pos": [12.995260238647461, 6.972325801849365]}, {"id": "main.217", "pos": [11.247899055480957, 8.663610458374023]}, {"id": "main.218", "pos": [7.06295919418335, 8.639495849609375]}, {"id": "main.219", "pos": [6.663082599639893, 9.114028930664062]}, {"id": "main.220", "pos": [7.230788230895996, 8.477697372436523]}, {"id": "main.221", "pos": [6.992349147796631, 7.610178470611572]}, {"id": "main.222", "pos": [12.49555778503418, 5.324370861053467]}, {"id": "main.223", "pos": [8.769874572753906, 3.543882369995117]}, {"id": "main.224", "pos": [8.374195098876953, 2.777996301651001]}, {"id": "main.225", "pos": [10.470731735229492, 5.0804033279418945]}, {"id": "main.226", "pos": [10.43917465209961, 5.74478006362915]}, {"id": "main.227", "pos": [11.5562744140625, 6.175942420959473]}, {"id": "main.228", "pos": [11.028038024902344, 4.873029708862305]}, {"id": "main.229", "pos": [11.723801612854004, 7.797658920288086]}, {"id": "main.230", "pos": [9.879657745361328, 4.511967658996582]}, {"id": "main.231", "pos": [7.049452304840088, 7.124056816101074]}, {"id": "main.232", "pos": [9.901208877563477, 4.1605424880981445]}, {"id": "main.233", "pos": [12.465960502624512, 5.8390069007873535]}, {"id": "main.234", "pos": [9.299976348876953, 7.196463108062744]}, {"id": "main.235", "pos": [8.720331192016602, 6.339437961578369]}, {"id": "main.236", "pos": [13.443236351013184, 8.166756629943848]}, {"id": "main.237", "pos": [13.443134307861328, 6.390030384063721]}, {"id": "main.238", "pos": [7.9119391441345215, 5.375680923461914]}, {"id": "main.239", "pos": [10.432051658630371, 7.4974236488342285]}, {"id": "main.240", "pos": [10.226912498474121, 9.135030746459961]}, {"id": "main.241", "pos": [9.026044845581055, 2.838334560394287]}, {"id": "main.242", "pos": [8.104366302490234, 6.932295799255371]}, {"id": "main.243", "pos": [9.539450645446777, 5.982561111450195]}, {"id": "main.244", "pos": [13.497660636901855, 4.046662330627441]}, {"id": "main.245", "pos": [9.332687377929688, 8.472442626953125]}, {"id": "main.246", "pos": [8.895983695983887, 6.0209574699401855]}, {"id": "main.247", "pos": [13.648921012878418, 7.867067337036133]}, {"id": "main.248", "pos": [10.02231502532959, 4.338033199310303]}, {"id": "main.249", "pos": [9.577642440795898, 6.294254779815674]}, {"id": "main.250", "pos": [13.365434646606445, 4.126458168029785]}, {"id": "main.251", "pos": [8.543496131896973, 5.785752773284912]}, {"id": "main.252", "pos": [10.453447341918945, 9.156721115112305]}, {"id": "main.253", "pos": [11.0520601272583, 8.490010261535645]}, {"id": "main.254", "pos": [11.005522727966309, 9.845903396606445]}, {"id": "main.255", "pos": [9.6477632522583, 6.553455352783203]}, {"id": "main.256", "pos": [10.982090950012207, 10.639860153198242]}, {"id": "main.257", "pos": [8.551817893981934, 5.843443393707275]}, {"id": "main.258", "pos": [11.298133850097656, 9.432403564453125]}, {"id": "main.259", "pos": [11.739585876464844, 4.767682075500488]}, {"id": "main.260", "pos": [9.8259859085083, 8.210556983947754]}, {"id": "main.261", "pos": [9.037456512451172, 7.015614986419678]}, {"id": "main.262", "pos": [8.046489715576172, 7.0077080726623535]}, {"id": "main.263", "pos": [10.445836067199707, 9.322041511535645]}, {"id": "main.264", "pos": [9.218891143798828, 8.81621265411377]}, {"id": "main.265", "pos": [9.383652687072754, 9.050378799438477]}, {"id": "main.266", "pos": 
[10.212726593017578, 4.607842445373535]}, {"id": "main.267", "pos": [9.138423919677734, 6.348575115203857]}, {"id": "main.268", "pos": [8.865700721740723, 5.730269908905029]}, {"id": "main.269", "pos": [9.519858360290527, 8.422082901000977]}, {"id": "main.270", "pos": [13.156181335449219, 4.259345531463623]}, {"id": "main.271", "pos": [9.4983549118042, 5.557781219482422]}, {"id": "main.272", "pos": [7.17155647277832, 6.626194953918457]}, {"id": "main.273", "pos": [8.088057518005371, 2.507852077484131]}, {"id": "main.274", "pos": [11.3242769241333, 4.542776584625244]}, {"id": "main.275", "pos": [12.65904426574707, 7.016883850097656]}, {"id": "main.276", "pos": [11.824210166931152, 7.768004894256592]}, {"id": "main.277", "pos": [10.570178031921387, 4.022219657897949]}, {"id": "main.278", "pos": [8.871918678283691, 9.310213088989258]}, {"id": "main.279", "pos": [10.306273460388184, 10.986652374267578]}, {"id": "main.280", "pos": [8.583846092224121, 3.6211392879486084]}, {"id": "main.281", "pos": [9.828761100769043, 3.9429709911346436]}, {"id": "main.282", "pos": [8.471776962280273, 2.572873592376709]}, {"id": "main.283", "pos": [7.638192176818848, 5.078570365905762]}, {"id": "main.284", "pos": [12.167074203491211, 4.571514129638672]}, {"id": "main.285", "pos": [11.34649658203125, 3.6791300773620605]}, {"id": "main.286", "pos": [6.669648170471191, 5.348261833190918]}, {"id": "main.287", "pos": [6.143494129180908, 8.301848411560059]}, {"id": "main.288", "pos": [8.22559928894043, 4.386773586273193]}, {"id": "main.289", "pos": [8.553728103637695, 4.044728755950928]}, {"id": "main.290", "pos": [10.761412620544434, 8.971331596374512]}, {"id": "main.291", "pos": [9.583314895629883, 3.967919111251831]}, {"id": "main.292", "pos": [9.394290924072266, 3.0020382404327393]}, {"id": "main.293", "pos": [8.518502235412598, 8.195881843566895]}, {"id": "main.294", "pos": [10.05667781829834, 7.070849418640137]}, {"id": "main.295", "pos": [8.332365036010742, 3.939206838607788]}, {"id": "main.296", "pos": [8.28499984741211, 4.418485164642334]}, {"id": "main.297", "pos": [10.172529220581055, 8.5625638961792]}, {"id": "main.298", "pos": [8.450501441955566, 3.3485352993011475]}, {"id": "main.299", "pos": [10.038338661193848, 2.2405333518981934]}, {"id": "main.300", "pos": [9.753461837768555, 7.698326587677002]}, {"id": "main.301", "pos": [10.248340606689453, 2.4745423793792725]}, {"id": "main.302", "pos": [7.450742721557617, 5.75454568862915]}, {"id": "main.303", "pos": [8.641230583190918, 3.420058012008667]}, {"id": "main.304", "pos": [14.184196472167969, 6.82049036026001]}, {"id": "main.305", "pos": [6.5098371505737305, 8.480692863464355]}, {"id": "main.306", "pos": [9.8494291305542, 4.900662422180176]}, {"id": "main.307", "pos": [7.725553035736084, 6.16603422164917]}, {"id": "main.308", "pos": [6.290891170501709, 8.416029930114746]}, {"id": "main.309", "pos": [12.258378028869629, 8.749794960021973]}, {"id": "main.310", "pos": [12.740138053894043, 8.598836898803711]}, {"id": "main.311", "pos": [10.736808776855469, 5.746691703796387]}, {"id": "main.312", "pos": [11.996946334838867, 5.132406711578369]}, {"id": "main.313", "pos": [8.274018287658691, 2.9036967754364014]}, {"id": "main.314", "pos": [10.662955284118652, 6.589808940887451]}, {"id": "main.315", "pos": [10.603713035583496, 10.607681274414062]}, {"id": "main.316", "pos": [8.895540237426758, 7.39412260055542]}, {"id": "main.317", "pos": [9.722296714782715, 8.7645845413208]}, {"id": "main.318", "pos": [8.238569259643555, 2.4940028190612793]}, {"id": 
"main.319", "pos": [8.605339050292969, 7.036749362945557]}, {"id": "main.320", "pos": [12.4979829788208, 7.285224914550781]}, {"id": "main.321", "pos": [12.221388816833496, 6.713691234588623]}, {"id": "main.322", "pos": [8.11262035369873, 7.409926414489746]}, {"id": "main.323", "pos": [9.06025505065918, 6.336519718170166]}, {"id": "main.324", "pos": [10.691485404968262, 9.234718322753906]}, {"id": "main.325", "pos": [11.15556812286377, 9.029519081115723]}, {"id": "main.326", "pos": [8.855342864990234, 6.525924205780029]}, {"id": "main.327", "pos": [13.281381607055664, 6.751157760620117]}, {"id": "main.328", "pos": [12.638391494750977, 4.501567840576172]}, {"id": "main.329", "pos": [12.272421836853027, 7.138627052307129]}, {"id": "main.330", "pos": [12.067197799682617, 5.176627159118652]}, {"id": "main.331", "pos": [7.6238579750061035, 4.397091388702393]}, {"id": "main.332", "pos": [6.474997043609619, 8.458571434020996]}, {"id": "main.333", "pos": [12.036152839660645, 5.89594841003418]}, {"id": "main.334", "pos": [8.948746681213379, 7.148634433746338]}, {"id": "main.335", "pos": [7.191190719604492, 5.236530303955078]}, {"id": "main.336", "pos": [10.027637481689453, 8.597223281860352]}, {"id": "main.337", "pos": [8.746716499328613, 6.760620594024658]}, {"id": "main.338", "pos": [8.907011032104492, 4.059457302093506]}, {"id": "main.339", "pos": [8.319506645202637, 6.924679279327393]}, {"id": "main.340", "pos": [8.217072486877441, 4.799678325653076]}, {"id": "main.341", "pos": [9.145668029785156, 5.076972484588623]}, {"id": "main.342", "pos": [8.98816967010498, 3.9710533618927]}, {"id": "main.343", "pos": [10.182784080505371, 10.934619903564453]}, {"id": "main.344", "pos": [13.362764358520508, 7.528354167938232]}, {"id": "main.345", "pos": [13.260198593139648, 7.734084129333496]}, {"id": "main.346", "pos": [13.380655288696289, 8.654288291931152]}, {"id": "main.347", "pos": [10.621338844299316, 7.0600996017456055]}, {"id": "main.348", "pos": [12.943349838256836, 7.539803981781006]}, {"id": "main.349", "pos": [5.340684413909912, 8.664170265197754]}, {"id": "main.350", "pos": [12.842763900756836, 7.422754764556885]}, {"id": "main.351", "pos": [13.222511291503906, 7.492109298706055]}, {"id": "main.352", "pos": [6.179460525512695, 8.532179832458496]}, {"id": "main.353", "pos": [7.153550148010254, 9.00600814819336]}, {"id": "main.354", "pos": [11.851652145385742, 5.344583511352539]}, {"id": "main.355", "pos": [10.592631340026855, 5.165359973907471]}, {"id": "main.356", "pos": [9.632793426513672, 4.634565353393555]}, {"id": "main.357", "pos": [10.187032699584961, 6.196957588195801]}, {"id": "main.358", "pos": [8.68856430053711, 3.4768307209014893]}, {"id": "main.359", "pos": [11.843554496765137, 7.980727672576904]}, {"id": "main.360", "pos": [7.412832736968994, 6.559839248657227]}, {"id": "main.361", "pos": [12.912467956542969, 8.350689888000488]}, {"id": "main.362", "pos": [8.055830001831055, 2.7939484119415283]}, {"id": "main.363", "pos": [8.591782569885254, 4.566919803619385]}, {"id": "main.364", "pos": [10.166158676147461, 6.522622585296631]}, {"id": "main.365", "pos": [10.473071098327637, 5.562744140625]}, {"id": "main.366", "pos": [9.677911758422852, 3.114779472351074]}, {"id": "main.367", "pos": [10.133905410766602, 5.885389804840088]}, {"id": "main.368", "pos": [12.995194435119629, 8.37137222290039]}, {"id": "main.369", "pos": [9.801557540893555, 8.194367408752441]}, {"id": "main.370", "pos": [8.041603088378906, 5.7738728523254395]}, {"id": "main.371", "pos": [7.757441997528076, 
7.691563129425049]}, {"id": "main.372", "pos": [12.7170991897583, 8.013726234436035]}, {"id": "main.373", "pos": [7.226235866546631, 7.582059860229492]}, {"id": "main.374", "pos": [9.130904197692871, 6.4337921142578125]}, {"id": "main.375", "pos": [9.703269004821777, 3.735032081604004]}, {"id": "main.376", "pos": [10.057088851928711, 2.288567066192627]}, {"id": "main.377", "pos": [8.780055046081543, 2.49955677986145]}, {"id": "main.378", "pos": [9.940106391906738, 8.110808372497559]}, {"id": "main.379", "pos": [10.974405288696289, 9.848718643188477]}, {"id": "main.380", "pos": [5.547213554382324, 8.15427303314209]}, {"id": "main.381", "pos": [12.940308570861816, 7.563150405883789]}, {"id": "main.382", "pos": [9.930314064025879, 6.1471357345581055]}, {"id": "main.383", "pos": [12.749495506286621, 8.894327163696289]}, {"id": "main.384", "pos": [12.081783294677734, 8.521017074584961]}, {"id": "main.385", "pos": [11.433406829833984, 5.840323448181152]}, {"id": "main.386", "pos": [8.673957824707031, 8.091368675231934]}, {"id": "main.387", "pos": [8.95852279663086, 9.20777416229248]}, {"id": "main.388", "pos": [8.659141540527344, 3.7746407985687256]}, {"id": "main.389", "pos": [12.803046226501465, 6.21539831161499]}, {"id": "main.390", "pos": [6.788656234741211, 6.683724403381348]}, {"id": "main.391", "pos": [12.991061210632324, 8.878556251525879]}, {"id": "main.392", "pos": [7.807403087615967, 4.077975749969482]}, {"id": "main.393", "pos": [9.433966636657715, 5.442081928253174]}, {"id": "main.394", "pos": [7.65778112411499, 7.572021007537842]}, {"id": "main.395", "pos": [12.723875045776367, 5.139193534851074]}, {"id": "main.396", "pos": [6.953934669494629, 8.244152069091797]}, {"id": "main.397", "pos": [8.768059730529785, 2.9818410873413086]}, {"id": "main.398", "pos": [10.964423179626465, 4.077826023101807]}, {"id": "main.399", "pos": [10.968173027038574, 4.901942729949951]}, {"id": "main.400", "pos": [11.431976318359375, 7.971721172332764]}, {"id": "main.401", "pos": [5.688068866729736, 8.545022010803223]}, {"id": "main.402", "pos": [6.637433052062988, 7.924453258514404]}, {"id": "main.403", "pos": [5.747597694396973, 8.56605052947998]}, {"id": "main.404", "pos": [10.031818389892578, 9.421916007995605]}, {"id": "main.405", "pos": [5.662964344024658, 7.759943962097168]}, {"id": "main.406", "pos": [10.739174842834473, 3.5707030296325684]}, {"id": "main.407", "pos": [11.099652290344238, 6.424555778503418]}, {"id": "main.408", "pos": [9.646990776062012, 6.492180824279785]}, {"id": "main.409", "pos": [8.992843627929688, 5.926518440246582]}, {"id": "main.410", "pos": [6.887719631195068, 5.568807601928711]}, {"id": "main.411", "pos": [10.538970947265625, 8.376290321350098]}, {"id": "main.412", "pos": [7.869327545166016, 4.987522602081299]}, {"id": "main.413", "pos": [9.499134063720703, 6.207531929016113]}, {"id": "main.414", "pos": [11.313365936279297, 4.374855995178223]}, {"id": "main.415", "pos": [10.804858207702637, 7.071407794952393]}, {"id": "main.416", "pos": [10.631298065185547, 6.9333086013793945]}, {"id": "main.417", "pos": [12.426278114318848, 3.5481183528900146]}, {"id": "main.418", "pos": [5.718380928039551, 7.893456935882568]}, {"id": "main.419", "pos": [12.042699813842773, 4.760252952575684]}, {"id": "main.420", "pos": [10.669290542602539, 7.2310791015625]}, {"id": "main.421", "pos": [10.66887378692627, 9.108880996704102]}, {"id": "main.422", "pos": [9.236201286315918, 5.172560214996338]}, {"id": "main.423", "pos": [9.561056137084961, 6.406502723693848]}, {"id": "main.424", "pos": 
[12.078865051269531, 7.680512428283691]}, {"id": "main.425", "pos": [12.165534019470215, 6.007342338562012]}, {"id": "main.426", "pos": [9.046521186828613, 5.38428258895874]}, {"id": "main.427", "pos": [13.369401931762695, 4.968611240386963]}, {"id": "main.428", "pos": [8.434767723083496, 8.64356803894043]}, {"id": "main.429", "pos": [12.636626243591309, 8.222853660583496]}, {"id": "main.430", "pos": [10.447725296020508, 6.3543195724487305]}, {"id": "main.431", "pos": [9.142370223999023, 8.449334144592285]}, {"id": "main.432", "pos": [9.059243202209473, 9.07210922241211]}, {"id": "main.433", "pos": [9.781474113464355, 5.631633281707764]}, {"id": "main.434", "pos": [8.812697410583496, 5.8249640464782715]}, {"id": "main.435", "pos": [12.717424392700195, 5.710191249847412]}, {"id": "main.436", "pos": [13.636631965637207, 6.261281490325928]}, {"id": "main.437", "pos": [9.737547874450684, 7.863495349884033]}, {"id": "main.438", "pos": [9.09320068359375, 6.641833305358887]}, {"id": "main.439", "pos": [11.04896068572998, 9.196978569030762]}, {"id": "main.440", "pos": [8.729166030883789, 5.020418167114258]}, {"id": "main.441", "pos": [9.246068000793457, 6.560153484344482]}, {"id": "main.442", "pos": [8.237793922424316, 7.641474723815918]}, {"id": "main.443", "pos": [10.839254379272461, 3.508315086364746]}, {"id": "main.444", "pos": [12.594725608825684, 5.234034538269043]}, {"id": "main.445", "pos": [11.371668815612793, 6.095509052276611]}, {"id": "main.446", "pos": [6.88587760925293, 7.961524486541748]}, {"id": "main.447", "pos": [11.351205825805664, 3.4980239868164062]}, {"id": "main.448", "pos": [12.51063346862793, 6.831550598144531]}, {"id": "main.449", "pos": [9.779081344604492, 4.580735683441162]}, {"id": "main.450", "pos": [8.805939674377441, 9.744343757629395]}, {"id": "main.451", "pos": [9.672991752624512, 3.577707529067993]}, {"id": "main.452", "pos": [9.301490783691406, 8.542367935180664]}, {"id": "main.453", "pos": [10.958416938781738, 4.937929630279541]}, {"id": "main.454", "pos": [9.016693115234375, 9.762568473815918]}, {"id": "main.455", "pos": [8.925909042358398, 9.777512550354004]}, {"id": "main.456", "pos": [8.416709899902344, 8.446757316589355]}, {"id": "main.457", "pos": [8.368658065795898, 4.731219291687012]}, {"id": "main.458", "pos": [6.789352893829346, 5.256067752838135]}, {"id": "main.459", "pos": [7.059981822967529, 7.345536231994629]}, {"id": "main.460", "pos": [11.115321159362793, 6.161398410797119]}, {"id": "main.461", "pos": [7.90329122543335, 8.217401504516602]}, {"id": "main.462", "pos": [12.385120391845703, 6.145631313323975]}, {"id": "main.463", "pos": [10.657671928405762, 6.181807041168213]}, {"id": "main.464", "pos": [11.556289672851562, 3.9177510738372803]}, {"id": "main.465", "pos": [9.534760475158691, 9.606386184692383]}, {"id": "main.466", "pos": [8.784799575805664, 3.662069797515869]}, {"id": "main.467", "pos": [12.988914489746094, 8.494260787963867]}, {"id": "main.468", "pos": [8.574097633361816, 7.590639591217041]}, {"id": "main.469", "pos": [12.414578437805176, 7.906676769256592]}, {"id": "main.470", "pos": [8.214627265930176, 8.518960952758789]}, {"id": "main.471", "pos": [6.318043231964111, 7.632128715515137]}, {"id": "main.472", "pos": [9.53538703918457, 9.961869239807129]}, {"id": "main.473", "pos": [12.35898208618164, 8.478652954101562]}, {"id": "main.474", "pos": [6.564672470092773, 7.996251106262207]}, {"id": "main.475", "pos": [7.666865825653076, 8.651363372802734]}, {"id": "main.476", "pos": [7.625100135803223, 8.781439781188965]}, {"id": 
"main.477", "pos": [10.417308807373047, 6.180263996124268]}, {"id": "main.478", "pos": [6.660398483276367, 9.271787643432617]}, {"id": "main.479", "pos": [12.137703895568848, 8.982030868530273]}, {"id": "main.480", "pos": [10.029389381408691, 4.063494682312012]}, {"id": "main.481", "pos": [8.564388275146484, 6.431995868682861]}, {"id": "main.482", "pos": [10.402088165283203, 9.552825927734375]}, {"id": "main.483", "pos": [5.491091728210449, 8.091733932495117]}, {"id": "main.484", "pos": [9.681902885437012, 8.518939018249512]}, {"id": "main.485", "pos": [5.7187180519104, 7.758017063140869]}, {"id": "main.486", "pos": [5.800465106964111, 8.05504035949707]}, {"id": "main.487", "pos": [9.212112426757812, 6.546424388885498]}, {"id": "main.488", "pos": [9.384963989257812, 8.323765754699707]}, {"id": "main.489", "pos": [9.413298606872559, 5.234959125518799]}, {"id": "main.490", "pos": [11.264781951904297, 9.068328857421875]}, {"id": "main.491", "pos": [12.911590576171875, 8.94714641571045]}, {"id": "main.492", "pos": [9.433480262756348, 6.810777187347412]}, {"id": "main.493", "pos": [9.491283416748047, 7.15557336807251]}, {"id": "main.494", "pos": [10.299535751342773, 6.030262470245361]}, {"id": "main.495", "pos": [9.790433883666992, 5.806128025054932]}, {"id": "main.496", "pos": [7.673625469207764, 6.7200117111206055]}, {"id": "main.497", "pos": [10.552101135253906, 6.7012200355529785]}, {"id": "main.498", "pos": [13.225557327270508, 5.62950325012207]}, {"id": "main.499", "pos": [9.426194190979004, 6.5565900802612305]}, {"id": "main.500", "pos": [13.099261283874512, 6.319324493408203]}, {"id": "main.501", "pos": [11.593767166137695, 4.422806262969971]}, {"id": "main.502", "pos": [14.151656150817871, 5.941990852355957]}, {"id": "main.503", "pos": [8.524253845214844, 6.588804721832275]}, {"id": "main.504", "pos": [7.9628729820251465, 6.149140357971191]}, {"id": "main.505", "pos": [12.893927574157715, 4.972548961639404]}, {"id": "main.506", "pos": [8.527600288391113, 7.4805684089660645]}, {"id": "main.507", "pos": [13.855064392089844, 5.601067066192627]}, {"id": "main.508", "pos": [8.680949211120605, 8.909927368164062]}, {"id": "main.509", "pos": [7.331207752227783, 8.058472633361816]}, {"id": "main.510", "pos": [9.73856258392334, 7.001952171325684]}, {"id": "main.511", "pos": [9.994190216064453, 8.61043930053711]}, {"id": "main.512", "pos": [8.263440132141113, 3.7038156986236572]}, {"id": "main.513", "pos": [7.784429550170898, 8.503284454345703]}, {"id": "main.514", "pos": [11.347110748291016, 6.576542854309082]}, {"id": "main.515", "pos": [9.931842803955078, 10.771614074707031]}, {"id": "main.516", "pos": [6.847744941711426, 8.808720588684082]}, {"id": "main.517", "pos": [7.244622707366943, 7.533353328704834]}, {"id": "main.518", "pos": [12.642578125, 5.664157867431641]}, {"id": "main.519", "pos": [10.820863723754883, 5.3830060958862305]}, {"id": "main.520", "pos": [7.409883499145508, 5.358395099639893]}, {"id": "main.521", "pos": [10.815528869628906, 5.691078186035156]}, {"id": "main.522", "pos": [7.906209468841553, 6.139864921569824]}, {"id": "main.523", "pos": [9.979179382324219, 9.130691528320312]}, {"id": "main.524", "pos": [8.619588851928711, 5.93536376953125]}, {"id": "main.525", "pos": [11.460334777832031, 5.604703903198242]}, {"id": "main.526", "pos": [7.672372817993164, 5.48850154876709]}, {"id": "main.527", "pos": [7.471530437469482, 7.5194244384765625]}, {"id": "main.528", "pos": [10.940255165100098, 10.741596221923828]}, {"id": "main.529", "pos": [11.182487487792969, 
9.164468765258789]}, {"id": "main.530", "pos": [13.58722972869873, 7.882524490356445]}, {"id": "main.531", "pos": [12.197942733764648, 6.3296709060668945]}, {"id": "main.532", "pos": [10.990128517150879, 8.230438232421875]}, {"id": "main.533", "pos": [13.166114807128906, 7.262630462646484]}, {"id": "main.534", "pos": [10.506264686584473, 4.920740127563477]}, {"id": "main.535", "pos": [11.311638832092285, 6.954549312591553]}, {"id": "main.536", "pos": [12.201130867004395, 6.877314567565918]}, {"id": "main.537", "pos": [12.083255767822266, 8.265973091125488]}, {"id": "main.538", "pos": [12.387605667114258, 3.6298270225524902]}, {"id": "main.539", "pos": [9.185389518737793, 2.6345837116241455]}, {"id": "main.540", "pos": [8.988078117370605, 6.670730113983154]}, {"id": "main.541", "pos": [10.367972373962402, 4.103416919708252]}, {"id": "main.542", "pos": [13.447717666625977, 6.402146816253662]}, {"id": "main.543", "pos": [9.746498107910156, 6.564801216125488]}, {"id": "main.544", "pos": [10.066536903381348, 6.499984264373779]}, {"id": "main.545", "pos": [13.074326515197754, 5.864340782165527]}, {"id": "main.546", "pos": [8.8167142868042, 4.396934509277344]}, {"id": "main.547", "pos": [10.564141273498535, 10.481749534606934]}, {"id": "main.548", "pos": [8.183187484741211, 6.3575520515441895]}, {"id": "main.549", "pos": [8.461390495300293, 4.089859962463379]}, {"id": "main.550", "pos": [8.132760047912598, 7.023376941680908]}, {"id": "main.551", "pos": [11.267610549926758, 6.266496658325195]}, {"id": "main.552", "pos": [11.204099655151367, 5.328760147094727]}, {"id": "main.553", "pos": [9.546486854553223, 2.871922254562378]}, {"id": "main.554", "pos": [11.489304542541504, 7.5932793617248535]}, {"id": "main.555", "pos": [9.646135330200195, 2.789179801940918]}, {"id": "main.556", "pos": [11.818424224853516, 6.433623313903809]}, {"id": "main.557", "pos": [10.045186996459961, 2.3330562114715576]}, {"id": "main.558", "pos": [9.501493453979492, 9.573803901672363]}, {"id": "main.559", "pos": [10.362125396728516, 6.219387054443359]}, {"id": "main.560", "pos": [11.031290054321289, 7.156970977783203]}, {"id": "main.561", "pos": [10.053423881530762, 4.889774322509766]}, {"id": "main.562", "pos": [11.801910400390625, 4.1295084953308105]}, {"id": "main.563", "pos": [7.288875579833984, 7.053957462310791]}, {"id": "main.564", "pos": [7.266507148742676, 7.361298084259033]}, {"id": "main.565", "pos": [9.29810905456543, 5.511541843414307]}, {"id": "main.566", "pos": [6.5495524406433105, 6.562410831451416]}, {"id": "main.567", "pos": [6.9559006690979, 7.258937835693359]}, {"id": "main.568", "pos": [7.590386390686035, 8.679206848144531]}, {"id": "main.569", "pos": [9.613117218017578, 4.770356178283691]}, {"id": "main.570", "pos": [11.433711051940918, 6.0648193359375]}, {"id": "main.571", "pos": [9.101329803466797, 2.5145649909973145]}, {"id": "main.572", "pos": [7.913778781890869, 5.03514289855957]}, {"id": "main.573", "pos": [12.935066223144531, 8.384659767150879]}, {"id": "main.574", "pos": [9.42877197265625, 5.713113307952881]}, {"id": "main.575", "pos": [8.924917221069336, 4.483031749725342]}, {"id": "main.576", "pos": [6.811279773712158, 5.503922462463379]}, {"id": "main.577", "pos": [10.671849250793457, 5.400087833404541]}, {"id": "main.578", "pos": [8.987625122070312, 2.6214208602905273]}, {"id": "main.579", "pos": [7.967756748199463, 5.561314105987549]}, {"id": "main.580", "pos": [9.979328155517578, 3.351513624191284]}, {"id": "main.581", "pos": [14.144124984741211, 6.689162731170654]}, {"id": "main.582", 
"pos": [8.1378755569458, 4.341980457305908]}, {"id": "main.583", "pos": [12.288122177124023, 5.410400390625]}, {"id": "main.584", "pos": [7.762563705444336, 5.036013126373291]}, {"id": "main.585", "pos": [9.857786178588867, 7.688915252685547]}, {"id": "main.586", "pos": [10.125486373901367, 7.017152786254883]}, {"id": "main.587", "pos": [11.534646987915039, 8.240358352661133]}, {"id": "main.588", "pos": [8.158440589904785, 3.8983676433563232]}, {"id": "main.589", "pos": [11.75790786743164, 4.110620498657227]}, {"id": "main.590", "pos": [8.998007774353027, 7.141569137573242]}, {"id": "main.591", "pos": [9.288573265075684, 3.6804020404815674]}, {"id": "main.592", "pos": [9.639872550964355, 5.3014936447143555]}, {"id": "main.593", "pos": [8.07917594909668, 6.233005046844482]}, {"id": "main.594", "pos": [10.253453254699707, 9.323185920715332]}, {"id": "main.595", "pos": [10.4224853515625, 10.56066608428955]}, {"id": "main.596", "pos": [10.223464965820312, 5.844649314880371]}, {"id": "main.597", "pos": [11.486957550048828, 9.323211669921875]}, {"id": "main.598", "pos": [10.371617317199707, 5.820277690887451]}, {"id": "main.599", "pos": [10.944463729858398, 4.048521995544434]}, {"id": "main.600", "pos": [8.286042213439941, 6.438269138336182]}, {"id": "main.601", "pos": [10.21570110321045, 6.381131649017334]}, {"id": "main.602", "pos": [11.31518268585205, 4.195547580718994]}, {"id": "main.603", "pos": [10.84411334991455, 4.891856670379639]}, {"id": "main.604", "pos": [11.126331329345703, 4.169346809387207]}, {"id": "main.605", "pos": [9.367731094360352, 2.5644779205322266]}, {"id": "main.606", "pos": [10.850228309631348, 5.980023384094238]}, {"id": "main.607", "pos": [9.17897891998291, 2.4522416591644287]}, {"id": "main.608", "pos": [10.675936698913574, 5.550411224365234]}, {"id": "main.609", "pos": [9.102733612060547, 2.684370279312134]}, {"id": "main.610", "pos": [11.146503448486328, 7.049361228942871]}, {"id": "main.611", "pos": [10.92764949798584, 10.746176719665527]}, {"id": "main.612", "pos": [9.42244815826416, 3.2949531078338623]}, {"id": "main.613", "pos": [11.601500511169434, 7.675454616546631]}, {"id": "main.614", "pos": [9.663468360900879, 8.595338821411133]}, {"id": "main.615", "pos": [8.92335319519043, 8.687665939331055]}, {"id": "main.616", "pos": [8.904693603515625, 4.897627353668213]}, {"id": "main.617", "pos": [7.843273162841797, 4.66559362411499]}, {"id": "main.618", "pos": [11.842056274414062, 8.806562423706055]}, {"id": "main.619", "pos": [10.7557373046875, 7.988889217376709]}, {"id": "main.620", "pos": [9.36010456085205, 6.233351230621338]}, {"id": "main.621", "pos": [9.579604148864746, 4.868085861206055]}, {"id": "main.622", "pos": [11.201069831848145, 4.129180431365967]}, {"id": "main.623", "pos": [8.881033897399902, 8.185576438903809]}, {"id": "main.624", "pos": [7.719282150268555, 6.297266960144043]}, {"id": "main.625", "pos": [10.081151962280273, 6.734342098236084]}, {"id": "main.626", "pos": [12.161705017089844, 5.714570045471191]}, {"id": "main.627", "pos": [12.061187744140625, 5.69438362121582]}, {"id": "main.628", "pos": [11.060047149658203, 5.838627815246582]}, {"id": "main.629", "pos": [9.611848831176758, 2.698974609375]}, {"id": "main.630", "pos": [11.648175239562988, 5.45200777053833]}, {"id": "main.631", "pos": [8.963278770446777, 8.104299545288086]}, {"id": "main.632", "pos": [6.7009077072143555, 8.258219718933105]}, {"id": "main.633", "pos": [6.555423736572266, 8.307462692260742]}, {"id": "main.634", "pos": [7.149266242980957, 7.7225494384765625]}, {"id": 
"main.635", "pos": [9.62736701965332, 10.666576385498047]}, {"id": "main.636", "pos": [7.631594181060791, 6.9286675453186035]}, {"id": "main.637", "pos": [7.533660411834717, 8.007782936096191]}, {"id": "main.638", "pos": [6.998809337615967, 7.701532363891602]}, {"id": "main.639", "pos": [9.428488731384277, 7.45352029800415]}, {"id": "main.640", "pos": [7.912675857543945, 2.4461846351623535]}, {"id": "main.641", "pos": [9.792010307312012, 8.995309829711914]}, {"id": "main.642", "pos": [7.802595615386963, 2.6426596641540527]}, {"id": "main.643", "pos": [8.152608871459961, 2.42509126663208]}, {"id": "main.644", "pos": [13.402478218078613, 5.021605968475342]}, {"id": "main.645", "pos": [9.156731605529785, 7.8597564697265625]}, {"id": "main.646", "pos": [8.51418685913086, 6.205021381378174]}, {"id": "main.647", "pos": [8.984228134155273, 6.258940696716309]}, {"id": "main.648", "pos": [10.721517562866211, 10.928569793701172]}, {"id": "main.649", "pos": [10.451641082763672, 4.107645511627197]}, {"id": "main.650", "pos": [11.608728408813477, 7.307736396789551]}, {"id": "main.651", "pos": [11.4404935836792, 4.908907890319824]}, {"id": "main.652", "pos": [7.405154705047607, 6.876832962036133]}, {"id": "main.653", "pos": [11.79940128326416, 7.419101715087891]}, {"id": "main.654", "pos": [10.100871086120605, 6.815808296203613]}, {"id": "main.655", "pos": [8.198083877563477, 4.426885604858398]}, {"id": "main.656", "pos": [11.400467872619629, 4.992027759552002]}, {"id": "main.657", "pos": [8.622111320495605, 2.754929304122925]}, {"id": "main.658", "pos": [11.199911117553711, 7.443203449249268]}, {"id": "main.659", "pos": [10.885919570922852, 7.28772497177124]}, {"id": "main.660", "pos": [10.09066104888916, 5.125513076782227]}, {"id": "main.661", "pos": [8.632131576538086, 8.35066032409668]}, {"id": "main.662", "pos": [6.480947017669678, 6.672099590301514]}, {"id": "main.663", "pos": [11.801078796386719, 5.336148738861084]}, {"id": "main.664", "pos": [9.79361343383789, 4.612331390380859]}, {"id": "main.665", "pos": [12.216249465942383, 5.553732872009277]}, {"id": "main.666", "pos": [11.057211875915527, 6.5217766761779785]}, {"id": "main.667", "pos": [11.553080558776855, 5.70838737487793]}, {"id": "main.668", "pos": [11.556708335876465, 3.857116937637329]}, {"id": "main.669", "pos": [8.521708488464355, 5.088283061981201]}, {"id": "main.670", "pos": [10.930610656738281, 4.449621200561523]}, {"id": "main.671", "pos": [8.134041786193848, 5.607027530670166]}, {"id": "main.672", "pos": [12.223207473754883, 4.672244548797607]}, {"id": "main.673", "pos": [9.877165794372559, 7.825038909912109]}, {"id": "main.674", "pos": [13.014628410339355, 8.082590103149414]}, {"id": "main.675", "pos": [11.718358039855957, 8.712732315063477]}, {"id": "main.676", "pos": [10.582721710205078, 5.576508045196533]}, {"id": "main.677", "pos": [10.859352111816406, 3.0855116844177246]}, {"id": "main.678", "pos": [11.190373420715332, 6.335867881774902]}, {"id": "main.679", "pos": [12.295111656188965, 8.551342964172363]}, {"id": "main.680", "pos": [11.105451583862305, 5.971916198730469]}, {"id": "main.681", "pos": [7.793633460998535, 6.152310371398926]}, {"id": "main.682", "pos": [9.282885551452637, 6.971508979797363]}, {"id": "main.683", "pos": [12.096875190734863, 5.229269027709961]}, {"id": "main.684", "pos": [13.640905380249023, 6.065169811248779]}, {"id": "main.685", "pos": [13.548347473144531, 6.362760543823242]}, {"id": "main.686", "pos": [12.812840461730957, 4.449853420257568]}, {"id": "main.687", "pos": [10.466181755065918, 
8.7620210647583]}, {"id": "main.688", "pos": [10.113901138305664, 8.263510704040527]}, {"id": "main.689", "pos": [10.423245429992676, 5.385894298553467]}, {"id": "main.690", "pos": [9.857036590576172, 8.778060913085938]}, {"id": "main.691", "pos": [10.5576171875, 8.633173942565918]}, {"id": "main.692", "pos": [9.950783729553223, 5.728858470916748]}, {"id": "main.693", "pos": [11.151116371154785, 5.5383710861206055]}, {"id": "main.694", "pos": [8.672966957092285, 6.485506057739258]}, {"id": "main.695", "pos": [9.996896743774414, 5.989992618560791]}, {"id": "main.696", "pos": [10.298633575439453, 7.630858898162842]}, {"id": "main.697", "pos": [11.305516242980957, 3.6111979484558105]}, {"id": "main.698", "pos": [9.01972484588623, 9.106351852416992]}, {"id": "main.699", "pos": [11.706140518188477, 4.159221172332764]}, {"id": "main.700", "pos": [12.523703575134277, 6.044020652770996]}, {"id": "main.701", "pos": [10.973943710327148, 4.72235631942749]}, {"id": "main.702", "pos": [9.513267517089844, 8.859674453735352]}, {"id": "main.703", "pos": [10.551161766052246, 7.608888626098633]}, {"id": "main.704", "pos": [10.210076332092285, 8.885656356811523]}, {"id": "main.705", "pos": [13.771791458129883, 7.434942245483398]}, {"id": "main.706", "pos": [10.734597206115723, 4.911586761474609]}, {"id": "main.707", "pos": [11.46912956237793, 6.647791862487793]}, {"id": "main.708", "pos": [11.086711883544922, 5.262233734130859]}, {"id": "main.709", "pos": [12.152320861816406, 7.286009788513184]}, {"id": "main.710", "pos": [10.470845222473145, 5.380087375640869]}, {"id": "main.711", "pos": [5.327550888061523, 8.633814811706543]}, {"id": "main.712", "pos": [8.168755531311035, 2.5789308547973633]}, {"id": "main.713", "pos": [8.972797393798828, 2.8734958171844482]}, {"id": "main.714", "pos": [10.951858520507812, 5.679384708404541]}, {"id": "main.715", "pos": [9.612703323364258, 4.171562671661377]}, {"id": "main.716", "pos": [10.361242294311523, 7.797347545623779]}, {"id": "main.717", "pos": [9.571013450622559, 3.122764825820923]}, {"id": "main.718", "pos": [10.364023208618164, 3.567267894744873]}, {"id": "main.719", "pos": [6.681036949157715, 5.255532741546631]}, {"id": "main.720", "pos": [11.969162940979004, 7.589808940887451]}, {"id": "main.721", "pos": [8.875177383422852, 2.8895530700683594]}, {"id": "main.722", "pos": [8.714356422424316, 7.8275370597839355]}, {"id": "main.723", "pos": [7.74325704574585, 4.577731609344482]}, {"id": "main.724", "pos": [10.38969612121582, 4.832041263580322]}, {"id": "main.725", "pos": [10.507591247558594, 4.43467903137207]}, {"id": "main.726", "pos": [9.121553421020508, 8.175599098205566]}, {"id": "main.727", "pos": [9.140032768249512, 8.769248008728027]}, {"id": "main.728", "pos": [10.56411075592041, 8.159512519836426]}, {"id": "main.729", "pos": [13.185701370239258, 4.838235855102539]}, {"id": "main.730", "pos": [12.67458438873291, 5.60382080078125]}, {"id": "main.731", "pos": [10.930167198181152, 7.688587188720703]}, {"id": "main.732", "pos": [11.365700721740723, 8.410866737365723]}, {"id": "main.733", "pos": [11.460663795471191, 7.424380302429199]}, {"id": "main.734", "pos": [10.687667846679688, 11.005390167236328]}, {"id": "main.735", "pos": [10.46640396118164, 11.024932861328125]}, {"id": "main.736", "pos": [11.56674575805664, 8.552309036254883]}, {"id": "main.737", "pos": [12.494336128234863, 7.232755661010742]}, {"id": "main.738", "pos": [10.612929344177246, 4.1468915939331055]}, {"id": "main.739", "pos": [8.702728271484375, 4.298409461975098]}, {"id": "main.740", 
"pos": [11.887946128845215, 6.501669406890869]}, {"id": "main.741", "pos": [9.745821952819824, 8.217913627624512]}, {"id": "main.742", "pos": [10.724400520324707, 3.0777268409729004]}, {"id": "main.743", "pos": [9.267111778259277, 5.792608737945557]}, {"id": "main.744", "pos": [8.064322471618652, 6.275618076324463]}, {"id": "main.745", "pos": [10.823275566101074, 2.9365291595458984]}, {"id": "main.746", "pos": [9.009940147399902, 2.733186960220337]}, {"id": "main.747", "pos": [11.937179565429688, 7.546385288238525]}, {"id": "main.748", "pos": [9.160443305969238, 4.348589897155762]}, {"id": "main.749", "pos": [9.795663833618164, 5.835976600646973]}, {"id": "main.750", "pos": [8.153244972229004, 7.599206924438477]}, {"id": "main.751", "pos": [9.418703079223633, 4.730806827545166]}, {"id": "main.752", "pos": [10.00322437286377, 6.204073905944824]}, {"id": "main.753", "pos": [9.050345420837402, 7.678377151489258]}, {"id": "main.754", "pos": [12.338725090026855, 7.592973709106445]}, {"id": "main.755", "pos": [8.706719398498535, 7.005222797393799]}, {"id": "main.756", "pos": [8.786550521850586, 7.487487316131592]}, {"id": "main.757", "pos": [9.175643920898438, 5.895702838897705]}, {"id": "main.758", "pos": [12.349114418029785, 4.5485124588012695]}, {"id": "main.759", "pos": [7.697848320007324, 8.45029354095459]}, {"id": "main.760", "pos": [9.144455909729004, 3.6549065113067627]}, {"id": "main.761", "pos": [7.991159439086914, 4.840510368347168]}, {"id": "main.762", "pos": [9.376192092895508, 4.25636100769043]}, {"id": "main.763", "pos": [11.402896881103516, 7.1549482345581055]}, {"id": "main.764", "pos": [10.000396728515625, 7.147685527801514]}, {"id": "main.765", "pos": [7.95156717300415, 9.068066596984863]}, {"id": "main.766", "pos": [11.646387100219727, 7.404158115386963]}, {"id": "main.767", "pos": [10.234045028686523, 3.7952980995178223]}, {"id": "main.768", "pos": [12.873103141784668, 8.63630485534668]}, {"id": "main.769", "pos": [9.050115585327148, 6.827986717224121]}, {"id": "main.770", "pos": [9.31102180480957, 8.49223804473877]}, {"id": "main.771", "pos": [12.173035621643066, 5.632710933685303]}, {"id": "main.772", "pos": [12.898731231689453, 5.810518741607666]}, {"id": "main.773", "pos": [9.682090759277344, 8.061910629272461]}, {"id": "main.774", "pos": [8.820637702941895, 9.269468307495117]}, {"id": "main.775", "pos": [9.627619743347168, 8.213127136230469]}, {"id": "main.776", "pos": [12.236767768859863, 8.299094200134277]}, {"id": "main.777", "pos": [8.785978317260742, 7.592190742492676]}, {"id": "main.778", "pos": [10.099983215332031, 2.4647324085235596]}, {"id": "demo.24", "pos": [9.391716003417969, 4.8608927726745605]}, {"id": "demo.28", "pos": [12.31663703918457, 3.583505630493164]}, {"id": "demo.31", "pos": [10.343623161315918, 11.052046775817871]}, {"id": "demo.32", "pos": [8.008642196655273, 8.02009391784668]}, {"id": "demo.33", "pos": [11.370047569274902, 7.948054313659668]}, {"id": "demo.35", "pos": [11.288697242736816, 10.245737075805664]}, {"id": "demo.37", "pos": [13.001131057739258, 5.470852851867676]}, {"id": "demo.39", "pos": [11.49230670928955, 3.6878039836883545]}, {"id": "demo.41", "pos": [13.226678848266602, 4.888883590698242]}, {"id": "demo.44", "pos": [11.678454399108887, 3.2782187461853027]}, {"id": "demo.45", "pos": [11.940421104431152, 3.3629331588745117]}, {"id": "demo.46", "pos": [12.273554801940918, 3.578479051589966]}, {"id": "demo.47", "pos": [12.172415733337402, 3.8587894439697266]}, {"id": "demo.48", "pos": [10.92530345916748, 3.446927547454834]}, 
{"id": "demo.49", "pos": [7.734013557434082, 8.134902954101562]}, {"id": "demo.54", "pos": [9.135287284851074, 7.1438798904418945]}, {"id": "demo.58", "pos": [8.72391414642334, 4.653102397918701]}, {"id": "demo.59", "pos": [8.179064750671387, 3.0147669315338135]}, {"id": "demo.61", "pos": [12.312792778015137, 3.9638054370880127]}, {"id": "demo.66", "pos": [11.98661994934082, 7.000277996063232]}, {"id": "demo.67", "pos": [12.488608360290527, 6.441026210784912]}, {"id": "demo.69", "pos": [9.529315948486328, 3.7218680381774902]}, {"id": "demo.79", "pos": [6.642405986785889, 6.127103328704834]}, {"id": "demo.84", "pos": [12.224326133728027, 6.1386260986328125]}, {"id": "demo.86", "pos": [7.239123344421387, 5.527375221252441]}, {"id": "demo.87", "pos": [8.131061553955078, 6.410661220550537]}, {"id": "demo.89", "pos": [8.11219596862793, 7.486374378204346]}, {"id": "demo.90", "pos": [12.209939956665039, 3.9457297325134277]}, {"id": "demo.91", "pos": [12.199222564697266, 3.750478982925415]}, {"id": "demo.93", "pos": [7.7423834800720215, 6.9201436042785645]}, {"id": "demo.94", "pos": [11.443426132202148, 3.082396984100342]}, {"id": "demo.96", "pos": [5.136790752410889, 8.526924133300781]}, {"id": "demo.100", "pos": [6.614457130432129, 9.241703987121582]}, {"id": "demo.101", "pos": [9.670450210571289, 4.706907749176025]}, {"id": "demo.102", "pos": [7.54175329208374, 7.470064163208008]}, {"id": "demo.104", "pos": [6.799896240234375, 5.424972057342529]}, {"id": "demo.107", "pos": [11.719770431518555, 4.036208629608154]}, {"id": "demo.115", "pos": [12.384943962097168, 3.6884021759033203]}, {"id": "demo.116", "pos": [11.981887817382812, 6.941270351409912]}, {"id": "demo.120", "pos": [6.270053386688232, 8.512995719909668]}, {"id": "demo.124", "pos": [8.209931373596191, 2.7886946201324463]}, {"id": "demo.130", "pos": [11.009413719177246, 3.6633238792419434]}, {"id": "demo.139", "pos": [13.601885795593262, 7.302314758300781]}, {"id": "srw.2", "pos": [10.843957901000977, 5.71698522567749]}, {"id": "srw.5", "pos": [11.556538581848145, 4.419581413269043]}, {"id": "srw.9", "pos": [10.670413970947266, 8.156561851501465]}, {"id": "srw.14", "pos": [10.3727445602417, 10.37224292755127]}, {"id": "srw.15", "pos": [12.716314315795898, 8.909284591674805]}, {"id": "srw.16", "pos": [12.550524711608887, 4.548882484436035]}, {"id": "srw.17", "pos": [11.337430953979492, 6.252729892730713]}, {"id": "srw.18", "pos": [5.97659969329834, 8.319357872009277]}, {"id": "srw.19", "pos": [11.350642204284668, 7.3999433517456055]}, {"id": "srw.22", "pos": [7.690663814544678, 7.975511074066162]}, {"id": "srw.28", "pos": [10.867597579956055, 6.848644733428955]}, {"id": "srw.35", "pos": [7.437699317932129, 5.362488746643066]}, {"id": "srw.36", "pos": [8.707271575927734, 6.567407131195068]}, {"id": "srw.39", "pos": [10.268771171569824, 7.491389751434326]}, {"id": "srw.42", "pos": [8.70948314666748, 4.8469319343566895]}, {"id": "srw.46", "pos": [6.551723957061768, 6.452007293701172]}, {"id": "srw.48", "pos": [10.843680381774902, 10.718575477600098]}, {"id": "srw.49", "pos": [10.324798583984375, 8.226448059082031]}, {"id": "srw.52", "pos": [9.6734619140625, 8.567570686340332]}, {"id": "srw.53", "pos": [7.025976181030273, 5.477107524871826]}, {"id": "srw.54", "pos": [10.997946739196777, 8.861496925354004]}, {"id": "srw.55", "pos": [9.43945026397705, 8.362242698669434]}, {"id": "srw.58", "pos": [7.265152931213379, 8.27270793914795]}, {"id": "srw.69", "pos": [11.035445213317871, 6.722524166107178]}, {"id": "srw.79", "pos": [10.363227844238281, 
9.135799407958984]}, {"id": "srw.82", "pos": [11.573432922363281, 7.184135437011719]}, {"id": "srw.84", "pos": [9.101308822631836, 6.710806369781494]}, {"id": "srw.85", "pos": [6.366297245025635, 8.018675804138184]}, {"id": "srw.90", "pos": [9.96233081817627, 9.49477481842041]}, {"id": "srw.95", "pos": [8.124500274658203, 4.849302291870117]}, {"id": "srw.98", "pos": [10.915780067443848, 6.138043403625488]}, {"id": "srw.99", "pos": [8.154001235961914, 6.788134574890137]}, {"id": "srw.104", "pos": [5.114825248718262, 8.378327369689941]}, {"id": "srw.105", "pos": [9.298247337341309, 6.795305252075195]}, {"id": "srw.106", "pos": [11.121394157409668, 7.60977840423584]}, {"id": "srw.109", "pos": [12.008110046386719, 6.255063056945801]}, {"id": "srw.114", "pos": [10.113154411315918, 3.9498445987701416]}, {"id": "srw.115", "pos": [11.752195358276367, 6.655194282531738]}, {"id": "srw.116", "pos": [7.97794771194458, 7.823441505432129]}, {"id": "srw.122", "pos": [6.257230758666992, 8.360031127929688]}, {"id": "srw.123", "pos": [8.95978832244873, 7.725756645202637]}, {"id": "srw.127", "pos": [10.892711639404297, 7.3793840408325195]}, {"id": "srw.128", "pos": [11.067269325256348, 7.842207431793213]}, {"id": "srw.129", "pos": [10.902847290039062, 7.227745056152344]}, {"id": "srw.131", "pos": [8.029378890991211, 6.710271835327148]}, {"id": "srw.135", "pos": [11.042181015014648, 5.245833396911621]}, {"id": "srw.137", "pos": [12.242218017578125, 7.507055759429932]}, {"id": "srw.144", "pos": [11.08476734161377, 5.523712635040283]}, {"id": "cl.1482", "pos": [11.052345275878906, 7.010970592498779]}, {"id": "cl.1508", "pos": [6.658796787261963, 9.032265663146973]}, {"id": "cl.1543", "pos": [10.763266563415527, 5.570960521697998]}, {"id": "cl.1547", "pos": [11.00387954711914, 7.521714687347412]}, {"id": "cl.1550", "pos": [11.685138702392578, 7.702889442443848]}, {"id": "cl.1552", "pos": [12.013227462768555, 7.426867961883545]}, {"id": "cl.1554", "pos": [12.43586540222168, 4.3494110107421875]}, {"id": "tacl.1709", "pos": [10.33511734008789, 6.697405815124512]}, {"id": "tacl.1720", "pos": [12.11512565612793, 4.475583076477051]}, {"id": "tacl.1727", "pos": [8.24050521850586, 6.620774745941162]}, {"id": "tacl.1743", "pos": [13.386107444763184, 5.645632743835449]}, {"id": "tacl.1756", "pos": [12.869604110717773, 6.112816333770752]}, {"id": "tacl.1759", "pos": [10.51839542388916, 9.106481552124023]}, {"id": "tacl.1766", "pos": [9.622814178466797, 7.688530445098877]}, {"id": "tacl.1779", "pos": [12.131568908691406, 3.7685184478759766]}, {"id": "tacl.1780", "pos": [8.8445405960083, 9.17034912109375]}, {"id": "tacl.1801", "pos": [9.585051536560059, 5.4272236824035645]}, {"id": "tacl.1805", "pos": [7.917147636413574, 2.5740861892700195]}, {"id": "tacl.1811", "pos": [9.756397247314453, 6.4286417961120605]}, {"id": "tacl.1815", "pos": [11.396145820617676, 5.602360725402832]}, {"id": "tacl.1834", "pos": [8.100253105163574, 9.285673141479492]}, {"id": "tacl.1843", "pos": [9.146112442016602, 5.43355655670166]}, {"id": "tacl.1845", "pos": [11.115071296691895, 3.7032854557037354]}, {"id": "tacl.1849", "pos": [10.34272575378418, 5.354431629180908]}, {"id": "tacl.1852", "pos": [12.624476432800293, 8.546205520629883]}, {"id": "tacl.1853", "pos": [12.39880084991455, 6.691556453704834]}, {"id": "tacl.1876", "pos": [10.883768081665039, 10.43184757232666]}, {"id": "tacl.1882", "pos": [10.192509651184082, 10.555179595947266]}, {"id": "tacl.1886", "pos": [8.445252418518066, 8.921631813049316]}, {"id": "tacl.1892", "pos": 
[9.499344825744629, 4.889459133148193]}, {"id": "tacl.1901", "pos": [9.721879005432129, 10.71410846710205]}, {"id": "tacl.1903", "pos": [8.730875015258789, 4.25508975982666]}, {"id": "tacl.1906", "pos": [9.736845016479492, 8.544329643249512]}, {"id": "tacl.1912", "pos": [12.100733757019043, 6.126567840576172]}, {"id": "tacl.1915", "pos": [9.444076538085938, 9.363091468811035]}, {"id": "tacl.1929", "pos": [13.213932991027832, 5.723365783691406]}, {"id": "tacl.1967", "pos": [9.619535446166992, 7.335033416748047]}, {"id": "tacl.2001", "pos": [10.88821792602539, 7.851185321807861]}] +[ + { + "id": "main.1004", + "pos": [ + 8.184931755065918, + 10.130494117736816 + ] + }, + { + "id": "main.1006", + "pos": [ + 5.168003559112549, + 9.426759719848633 + ] + }, + { + "id": "main.1009", + "pos": [ + 5.930365562438965, + 10.379329681396484 + ] + }, + { + "id": "main.1010", + "pos": [ + 10.538464546203613, + 9.024141311645508 + ] + }, + { + "id": "main.1011", + "pos": [ + 11.248796463012695, + 9.249489784240723 + ] + }, + { + "id": "main.1012", + "pos": [ + 5.0803608894348145, + 9.653098106384277 + ] + }, + { + "id": "main.1018", + "pos": [ + 7.17668342590332, + 9.18215560913086 + ] + }, + { + "id": "main.1022", + "pos": [ + 8.380731582641602, + 10.35976791381836 + ] + }, + { + "id": "main.1023", + "pos": [ + 9.087031364440918, + 9.541060447692871 + ] + }, + { + "id": "main.1024", + "pos": [ + 8.333172798156738, + 12.853711128234863 + ] + }, + { + "id": "main.1030", + "pos": [ + 8.457670211791992, + 10.415514945983887 + ] + }, + { + "id": "main.1032", + "pos": [ + 10.18299388885498, + 11.016000747680664 + ] + }, + { + "id": "main.1046", + "pos": [ + 10.863092422485352, + 11.217057228088379 + ] + }, + { + "id": "main.1049", + "pos": [ + 8.621140480041504, + 12.633464813232422 + ] + }, + { + "id": "main.1052", + "pos": [ + 8.37385082244873, + 11.80783462524414 + ] + }, + { + "id": "main.106", + "pos": [ + 8.726348876953125, + 13.26508903503418 + ] + }, + { + "id": "main.1061", + "pos": [ + 8.168220520019531, + 13.575611114501953 + ] + }, + { + "id": "main.1070", + "pos": [ + 10.859148025512695, + 9.858744621276855 + ] + }, + { + "id": "main.1071", + "pos": [ + 8.705020904541016, + 11.516244888305664 + ] + }, + { + "id": "main.108", + "pos": [ + 9.553475379943848, + 11.872015953063965 + ] + }, + { + "id": "main.1085", + "pos": [ + 6.257370948791504, + 10.21512222290039 + ] + }, + { + "id": "main.1086", + "pos": [ + 8.223355293273926, + 10.005450248718262 + ] + }, + { + "id": "main.1091", + "pos": [ + 9.224579811096191, + 9.252211570739746 + ] + }, + { + "id": "main.110", + "pos": [ + 9.816535949707031, + 11.069907188415527 + ] + }, + { + "id": "main.1100", + "pos": [ + 9.464727401733398, + 14.176521301269531 + ] + }, + { + "id": "main.1103", + "pos": [ + 7.317256450653076, + 10.25833511352539 + ] + }, + { + "id": "main.1107", + "pos": [ + 8.64432430267334, + 11.910917282104492 + ] + }, + { + "id": "main.1113", + "pos": [ + 6.163842678070068, + 10.654666900634766 + ] + }, + { + "id": "main.1116", + "pos": [ + 10.758061408996582, + 8.43391227722168 + ] + }, + { + "id": "main.1123", + "pos": [ + 9.20920467376709, + 9.394920349121094 + ] + }, + { + "id": "main.1129", + "pos": [ + 8.507781028747559, + 9.471328735351562 + ] + }, + { + "id": "main.1130", + "pos": [ + 8.900128364562988, + 12.207240104675293 + ] + }, + { + "id": "main.1135", + "pos": [ + 7.674680233001709, + 10.406473159790039 + ] + }, + { + "id": "main.1140", + "pos": [ + 4.867302417755127, + 9.780634880065918 + ] + }, + { + "id": "main.1141", + 
"pos": [ + 9.591733932495117, + 10.442893028259277 + ] + }, + { + "id": "main.1146", + "pos": [ + 9.533297538757324, + 13.622273445129395 + ] + }, + { + "id": "main.1159", + "pos": [ + 9.867904663085938, + 10.059473037719727 + ] + }, + { + "id": "main.1179", + "pos": [ + 6.984335422515869, + 11.844132423400879 + ] + }, + { + "id": "main.1180", + "pos": [ + 8.91893482208252, + 8.30911922454834 + ] + }, + { + "id": "main.1187", + "pos": [ + 9.062108039855957, + 10.210319519042969 + ] + }, + { + "id": "main.1191", + "pos": [ + 7.579293251037598, + 10.394017219543457 + ] + }, + { + "id": "main.1196", + "pos": [ + 8.63536262512207, + 10.693572044372559 + ] + }, + { + "id": "main.1201", + "pos": [ + 5.173103332519531, + 9.655495643615723 + ] + }, + { + "id": "main.1205", + "pos": [ + 10.424774169921875, + 9.698698043823242 + ] + }, + { + "id": "main.1208", + "pos": [ + 9.513643264770508, + 12.15544319152832 + ] + }, + { + "id": "main.1210", + "pos": [ + 8.323463439941406, + 12.4014310836792 + ] + }, + { + "id": "main.1217", + "pos": [ + 9.364623069763184, + 8.473187446594238 + ] + }, + { + "id": "main.1219", + "pos": [ + 9.628305435180664, + 12.324319839477539 + ] + }, + { + "id": "main.1220", + "pos": [ + 9.83303451538086, + 12.859074592590332 + ] + }, + { + "id": "main.1225", + "pos": [ + 10.511384010314941, + 12.040221214294434 + ] + }, + { + "id": "main.1227", + "pos": [ + 9.68139934539795, + 11.682480812072754 + ] + }, + { + "id": "main.1231", + "pos": [ + 10.72814655303955, + 9.309203147888184 + ] + }, + { + "id": "main.1248", + "pos": [ + 9.96466064453125, + 13.419503211975098 + ] + }, + { + "id": "main.125", + "pos": [ + 9.034789085388184, + 12.916142463684082 + ] + }, + { + "id": "main.1250", + "pos": [ + 9.857666015625, + 9.859053611755371 + ] + }, + { + "id": "main.1258", + "pos": [ + 6.910449981689453, + 12.776697158813477 + ] + }, + { + "id": "main.1262", + "pos": [ + 8.330497741699219, + 10.793278694152832 + ] + }, + { + "id": "main.1263", + "pos": [ + 8.906732559204102, + 14.11041259765625 + ] + }, + { + "id": "main.1267", + "pos": [ + 7.776847839355469, + 11.981037139892578 + ] + }, + { + "id": "main.1271", + "pos": [ + 9.922042846679688, + 11.00949478149414 + ] + }, + { + "id": "main.1275", + "pos": [ + 7.337594509124756, + 12.754110336303711 + ] + }, + { + "id": "main.128", + "pos": [ + 5.457560062408447, + 9.69342041015625 + ] + }, + { + "id": "main.1280", + "pos": [ + 8.812809944152832, + 12.450216293334961 + ] + }, + { + "id": "main.1282", + "pos": [ + 7.150064468383789, + 13.524724960327148 + ] + }, + { + "id": "main.1287", + "pos": [ + 7.848379135131836, + 8.470157623291016 + ] + }, + { + "id": "main.1289", + "pos": [ + 9.000923156738281, + 8.213265419006348 + ] + }, + { + "id": "main.1298", + "pos": [ + 8.14731502532959, + 14.209450721740723 + ] + }, + { + "id": "main.1299", + "pos": [ + 9.825312614440918, + 12.36821460723877 + ] + }, + { + "id": "main.130", + "pos": [ + 9.841195106506348, + 13.598791122436523 + ] + }, + { + "id": "main.1305", + "pos": [ + 10.134678840637207, + 11.873078346252441 + ] + }, + { + "id": "main.1320", + "pos": [ + 8.186949729919434, + 13.362020492553711 + ] + }, + { + "id": "main.1322", + "pos": [ + 10.617491722106934, + 11.262009620666504 + ] + }, + { + "id": "main.1339", + "pos": [ + 9.319986343383789, + 13.385730743408203 + ] + }, + { + "id": "main.1351", + "pos": [ + 9.76822280883789, + 11.905786514282227 + ] + }, + { + "id": "main.1356", + "pos": [ + 10.33792495727539, + 12.269543647766113 + ] + }, + { + "id": "main.1377", + "pos": [ + 
[... the remaining "+" lines of this JSON hunk continue in the same pattern: one { "id": "<paper UID>", "pos": [<x>, <y>] } entry for each main.*, CL.*, and TACL.* paper ...]
+ { + "id": "TACL.2141", + "pos": [ + 7.151585578918457, + 12.782059669494629 + ] + }, + { + "id": "TACL.2143", + "pos": [ + 5.1364946365356445, + 9.676339149475098 + ] + }, + { + "id": "TACL.2169", + "pos": [ + 9.464832305908203, + 12.31582260131836 + ] + }, + { + "id": "TACL.2221", + "pos": [ + 9.41541576385498, + 14.135518074035645 + ] + }, + { + "id": "TACL.2255", + "pos": [ + 10.092090606689453, + 11.977276802062988 + ] + }, + { + "id": "TACL.2389", + "pos": [ + 11.881233215332031, + 11.150455474853516 + ] + }, + { + "id": "TACL.2411", + "pos": [ + 8.793354988098145, + 11.822294235229492 + ] + } +] \ No newline at end of file diff --git a/sitedata/srw_paper_sessions.yml b/sitedata/srw_paper_sessions.yml deleted file mode 100644 index 774f517..0000000 --- a/sitedata/srw_paper_sessions.yml +++ /dev/null @@ -1,172 +0,0 @@ -1A: - date: 2020-07-06_05:00:00 - papers: - - srw.16 - - srw.17 - - srw.19 - - srw.35 -1B: - date: 2020-07-06_06:00:00 - papers: - - srw.15 - - srw.36 - - srw.39 - - srw.52 -2A: - date: 2020-07-06_08:00:00 - papers: - - srw.5 - - srw.9 - - srw.14 - - srw.18 -2B: - date: 2020-07-06_09:00:00 - papers: - - srw.49 - - srw.53 - - srw.54 - - srw.55 -3A: - date: 2020-07-06_12:00:00 - papers: - - srw.22 - - srw.46 - - srw.49 - - srw.69 -3B: - date: 2020-07-06_13:00:00 - papers: - - srw.79 - - srw.82 - - srw.84 - - srw.128 -4A: - date: 2020-07-06_17:00:00 - papers: - - srw.18 - - srw.42 - - srw.99 - - srw.104 -4B: - date: 2020-07-06_18:00:00 - papers: - - srw.105 - - srw.106 - - srw.109 - - srw.116 -6A: - date: 2020-07-07_05:00:00 - papers: - - srw.58 - - srw.90 - - srw.98 - - srw.115 -6B: - date: 2020-07-07_06:00:00 - papers: - - srw.123 - - srw.129 - - srw.131 -7A: - date: 2020-07-07_08:00:00 - papers: - - srw.19 - - srw.52 - - srw.90 - - srw.123 -7B: - date: 2020-07-07_09:00:00 - papers: - - srw.127 - - srw.129 - - srw.137 -8A: - date: 2020-07-07_12:00:00 - papers: - - srw.127 - - srw.135 - - srw.137 - - srw.144 -8B: - date: 2020-07-07_13:00:00 - papers: - - srw.48 - - srw.131 -9A: - date: 2020-07-07_17:00:00 - papers: - - srw.2 - - srw.95 - - srw.114 - - srw.122 -9B: - date: 2020-07-07_18:00:00 - papers: - - srw.28 - - srw.39 - - srw.85 - - srw.99 - - srw.114 -11A: - date: 2020-07-08_05:00:00 - papers: - - srw.35 - - srw.55 -11B: - date: 2020-07-08_06:00:00 - papers: - - srw.53 - - srw.54 -12A: - date: 2020-07-08_08:00:00 - papers: - - srw.14 - - srw.36 - - srw.58 -12B: - date: 2020-07-08_09:00:00 - papers: - - srw.16 - - srw.79 - - srw.115 -13A: - date: 2020-07-08_12:00:00 - papers: - - srw.48 - - srw.82 - - srw.95 -13B: - date: 2020-07-08_13:00:00 - papers: - - srw.98 - - srw.109 - - srw.116 - - srw.135 -14A: - date: 2020-07-08_17:00:00 - papers: - - srw.5 - - srw.9 - - srw.42 - - srw.46 -14B: - date: 2020-07-08_18:00:00 - papers: - - srw.69 - - srw.84 - - srw.105 - - srw.144 -15A: - date: 2020-07-08_20:00:00 - papers: - - srw.2 - - srw.17 - - srw.22 - - srw.28 -15B: - date: 2020-07-08_21:00:00 - papers: - - srw.85 - - srw.104 - - srw.106 - - srw.122 diff --git a/sitedata/srw_paper_slideslive_ids.csv b/sitedata/srw_paper_slideslive_ids.csv deleted file mode 100644 index 0ccebaf..0000000 --- a/sitedata/srw_paper_slideslive_ids.csv +++ /dev/null @@ -1,49 +0,0 @@ -UID,presentation_id -srw.2,38928632 -srw.5,38928633 -srw.9,38928634 -srw.14,38928635 -srw.15,38928636 -srw.16,38928637 -srw.17,38928638 -srw.18,38928639 -srw.19,38928640 -srw.22,38928641 -srw.28,38928642 -srw.35,38928643 -srw.36,38928644 -srw.39,38928645 -srw.42,38928646 -srw.46,38928647 -srw.48,38928648 
-srw.49,38928649 -srw.52,38928650 -srw.53,38928651 -srw.54,38928652 -srw.55,38928653 -srw.58,38928654 -srw.69,38928655 -srw.79,38928656 -srw.82,38928657 -srw.84,38928658 -srw.85,38928659 -srw.90,38928660 -srw.95,38928661 -srw.98,38928662 -srw.99,38928663 -srw.104,38928664 -srw.105,38928665 -srw.106,38928666 -srw.109,38928667 -srw.114,38928668 -srw.115,38928669 -srw.116,38928670 -srw.122,38928672 -srw.123,38928673 -srw.127,38928674 -srw.128,38928675 -srw.129,38928676 -srw.131,38928677 -srw.135,38928678 -srw.137,38928679 -srw.144,38928680 diff --git a/sitedata/srw_paper_zoom_links.csv b/sitedata/srw_paper_zoom_links.csv deleted file mode 100644 index 13d3fe5..0000000 --- a/sitedata/srw_paper_zoom_links.csv +++ /dev/null @@ -1,95 +0,0 @@ -UID,session_name,starttime,endtime,timezone,zoom_join_link -srw.2,9A,2020-07-07T17:00:00Z,2020-07-07T18:00:00Z,UTC,https://zoom.us/j/95699746685 -srw.2,15A,2020-07-08T20:00:00Z,2020-07-08T21:00:00Z,UTC,https://zoom.us/j/96837053577 -srw.5,2A,2020-07-06T08:00:00Z,2020-07-06T09:00:00Z,UTC,https://zoom.us/j/93278433931 -srw.5,14A,2020-07-08T17:00:00Z,2020-07-08T18:00:00Z,UTC,https://zoom.us/j/93332960285 -srw.9,2A,2020-07-06T08:00:00Z,2020-07-06T09:00:00Z,UTC,https://zoom.us/j/99618389924 -srw.9,14A,2020-07-08T17:00:00Z,2020-07-08T18:00:00Z,UTC,https://zoom.us/j/91723289082 -srw.14,2A,2020-07-06T08:00:00Z,2020-07-06T09:00:00Z,UTC,https://zoom.us/j/93754596223 -srw.14,12A,2020-07-08T08:00:00Z,2020-07-08T09:00:00Z,UTC,https://zoom.us/j/95765365011 -srw.15,1B,2020-07-06T06:00:00Z,2020-07-06T07:00:00Z,UTC,https://zoom.us/j/93468134389 -srw.16,1A,2020-07-06T05:00:00Z,2020-07-06T06:00:00Z,UTC,https://zoom.us/j/96141587180 -srw.16,12B,2020-07-08T09:00:00Z,2020-07-08T10:00:00Z,UTC,https://zoom.us/j/91640607988 -srw.17,1A,2020-07-06T05:00:00Z,2020-07-06T06:00:00Z,UTC,https://zoom.us/j/95587857568 -srw.17,15A,2020-07-08T20:00:00Z,2020-07-08T21:00:00Z,UTC,https://zoom.us/j/99666475671 -srw.18,2A,2020-07-06T08:00:00Z,2020-07-06T09:00:00Z,UTC,https://zoom.us/j/98257429421 -srw.18,4A,2020-07-06T17:00:00Z,2020-07-06T18:00:00Z,UTC,https://zoom.us/j/99726908640 -srw.19,1A,2020-07-06T05:00:00Z,2020-07-06T06:00:00Z,UTC,https://zoom.us/j/98566105847 -srw.19,7A,2020-07-07T08:00:00Z,2020-07-07T09:00:00Z,UTC,https://zoom.us/j/96475200846 -srw.22,3A,2020-07-06T12:00:00Z,2020-07-06T13:00:00Z,UTC,https://zoom.us/j/94967984817 -srw.22,15A,2020-07-08T20:00:00Z,2020-07-08T21:00:00Z,UTC,https://zoom.us/j/92532770894 -srw.28,9B,2020-07-07T18:00:00Z,2020-07-07T19:00:00Z,UTC,https://zoom.us/j/91311598993 -srw.28,15A,2020-07-08T20:00:00Z,2020-07-08T21:00:00Z,UTC,https://zoom.us/j/95177575947 -srw.35,1A,2020-07-06T05:00:00Z,2020-07-06T06:00:00Z,UTC,https://zoom.us/j/93678253668 -srw.35,11A,2020-07-08T05:00:00Z,2020-07-08T06:00:00Z,UTC,https://zoom.us/j/94522695946 -srw.36,1B,2020-07-06T06:00:00Z,2020-07-06T07:00:00Z,UTC,https://zoom.us/j/97445163158 -srw.36,12A,2020-07-08T08:00:00Z,2020-07-08T09:00:00Z,UTC,https://zoom.us/j/97477341631 -srw.39,1B,2020-07-06T06:00:00Z,2020-07-06T07:00:00Z,UTC,https://zoom.us/j/94207402059 -srw.39,9B,2020-07-07T18:00:00Z,2020-07-07T19:00:00Z,UTC,https://zoom.us/j/91673134753 -srw.42,4A,2020-07-06T17:00:00Z,2020-07-06T18:00:00Z,UTC,https://zoom.us/j/95788599784 -srw.42,14A,2020-07-08T17:00:00Z,2020-07-08T18:00:00Z,UTC,https://zoom.us/j/97473165380 -srw.46,3A,2020-07-06T12:00:00Z,2020-07-06T13:00:00Z,UTC,https://zoom.us/j/91950327476 -srw.46,14A,2020-07-08T17:00:00Z,2020-07-08T18:00:00Z,UTC,https://zoom.us/j/98779497472 
-srw.48,8B,2020-07-07T13:00:00Z,2020-07-07T14:00:00Z,UTC,https://zoom.us/j/98138254309 -srw.48,13A,2020-07-08T12:00:00Z,2020-07-08T13:00:00Z,UTC,https://zoom.us/j/92178877253 -srw.49,2B,2020-07-06T09:00:00Z,2020-07-06T10:00:00Z,UTC,https://zoom.us/j/94926557991 -srw.49,3A,2020-07-06T12:00:00Z,2020-07-06T13:00:00Z,UTC,https://zoom.us/j/94275493818 -srw.52,1B,2020-07-06T06:00:00Z,2020-07-06T07:00:00Z,UTC,https://zoom.us/j/95610810864 -srw.52,7A,2020-07-07T08:00:00Z,2020-07-07T09:00:00Z,UTC,https://zoom.us/j/97524408720 -srw.53,2B,2020-07-06T09:00:00Z,2020-07-06T10:00:00Z,UTC,https://zoom.us/j/93572698912 -srw.53,11B,2020-07-08T06:00:00Z,2020-07-08T07:00:00Z,UTC,https://zoom.us/j/95076979492 -srw.54,2B,2020-07-06T09:00:00Z,2020-07-06T10:00:00Z,UTC,https://zoom.us/j/91489894357 -srw.54,11B,2020-07-08T06:00:00Z,2020-07-08T07:00:00Z,UTC,https://zoom.us/j/95329796551 -srw.55,2B,2020-07-06T09:00:00Z,2020-07-06T10:00:00Z,UTC,https://zoom.us/j/96133103458 -srw.55,11A,2020-07-08T05:00:00Z,2020-07-08T06:00:00Z,UTC,https://zoom.us/j/95628996447 -srw.58,6A,2020-07-07T05:00:00Z,2020-07-07T06:00:00Z,UTC,https://zoom.us/j/98221240007 -srw.58,12A,2020-07-08T08:00:00Z,2020-07-08T09:00:00Z,UTC,https://zoom.us/j/99658173297 -srw.69,3A,2020-07-06T12:00:00Z,2020-07-06T13:00:00Z,UTC,https://zoom.us/j/92617482842 -srw.69,14B,2020-07-08T18:00:00Z,2020-07-08T19:00:00Z,UTC,https://zoom.us/j/98566526963 -srw.79,3B,2020-07-06T13:00:00Z,2020-07-06T14:00:00Z,UTC,https://zoom.us/j/93395182207 -srw.79,12B,2020-07-08T09:00:00Z,2020-07-08T10:00:00Z,UTC,https://zoom.us/j/91249660406 -srw.82,3B,2020-07-06T13:00:00Z,2020-07-06T14:00:00Z,UTC,https://zoom.us/j/97235098471 -srw.82,13A,2020-07-08T12:00:00Z,2020-07-08T13:00:00Z,UTC,https://zoom.us/j/99339856765 -srw.84,3B,2020-07-06T13:00:00Z,2020-07-06T14:00:00Z,UTC,https://zoom.us/j/92419809994 -srw.84,14B,2020-07-08T18:00:00Z,2020-07-08T19:00:00Z,UTC,https://zoom.us/j/96434913808 -srw.85,9B,2020-07-07T18:00:00Z,2020-07-07T19:00:00Z,UTC,https://zoom.us/j/99355708807 -srw.85,15B,2020-07-08T21:00:00Z,2020-07-08T22:00:00Z,UTC,https://zoom.us/j/96920729247 -srw.90,6A,2020-07-07T05:00:00Z,2020-07-07T06:00:00Z,UTC,https://zoom.us/j/96316172412 -srw.90,7A,2020-07-07T08:00:00Z,2020-07-07T09:00:00Z,UTC,https://zoom.us/j/92535641111 -srw.95,9A,2020-07-07T17:00:00Z,2020-07-07T18:00:00Z,UTC,https://zoom.us/j/94126847773 -srw.95,13A,2020-07-08T12:00:00Z,2020-07-08T13:00:00Z,UTC,https://zoom.us/j/97801405942 -srw.98,6A,2020-07-07T05:00:00Z,2020-07-07T06:00:00Z,UTC,https://zoom.us/j/94035754771 -srw.98,13B,2020-07-08T13:00:00Z,2020-07-08T14:00:00Z,UTC,https://zoom.us/j/99497711772 -srw.99,4A,2020-07-06T17:00:00Z,2020-07-06T18:00:00Z,UTC,https://zoom.us/j/95006912001 -srw.99,9B,2020-07-07T18:00:00Z,2020-07-07T19:00:00Z,UTC,https://zoom.us/j/95899978727 -srw.104,4A,2020-07-06T17:00:00Z,2020-07-06T18:00:00Z,UTC,https://zoom.us/j/91363154831 -srw.104,15B,2020-07-08T21:00:00Z,2020-07-08T22:00:00Z,UTC,https://zoom.us/j/98753707960 -srw.105,4B,2020-07-06T18:00:00Z,2020-07-06T19:00:00Z,UTC,https://zoom.us/j/96490652785 -srw.105,14B,2020-07-08T18:00:00Z,2020-07-08T19:00:00Z,UTC,https://zoom.us/j/92629842062 -srw.106,4B,2020-07-06T18:00:00Z,2020-07-06T19:00:00Z,UTC,https://zoom.us/j/92575199326 -srw.106,15B,2020-07-08T21:00:00Z,2020-07-08T22:00:00Z,UTC,https://zoom.us/j/99050557675 -srw.109,4B,2020-07-06T18:00:00Z,2020-07-06T19:00:00Z,UTC,https://zoom.us/j/95706712542 -srw.109,13B,2020-07-08T13:00:00Z,2020-07-08T14:00:00Z,UTC,https://zoom.us/j/96870697271 
-srw.114,9A,2020-07-07T17:00:00Z,2020-07-07T18:00:00Z,UTC,https://zoom.us/j/95768858488 -srw.114,9B,2020-07-07T18:00:00Z,2020-07-07T19:00:00Z,UTC,https://zoom.us/j/98911307300 -srw.115,6A,2020-07-07T05:00:00Z,2020-07-07T06:00:00Z,UTC,https://zoom.us/j/98293472125 -srw.115,12B,2020-07-08T09:00:00Z,2020-07-08T10:00:00Z,UTC,https://zoom.us/j/91485665965 -srw.116,4B,2020-07-06T18:00:00Z,2020-07-06T19:00:00Z,UTC,https://zoom.us/j/92547793090 -srw.116,13B,2020-07-08T13:00:00Z,2020-07-08T14:00:00Z,UTC,https://zoom.us/j/91311724209 -srw.122,9A,2020-07-07T17:00:00Z,2020-07-07T18:00:00Z,UTC,https://zoom.us/j/92094947765 -srw.122,15B,2020-07-08T21:00:00Z,2020-07-08T22:00:00Z,UTC,https://zoom.us/j/94242426080 -srw.123,6B,2020-07-07T06:00:00Z,2020-07-07T07:00:00Z,UTC,https://zoom.us/j/96719900148 -srw.123,7A,2020-07-07T08:00:00Z,2020-07-07T09:00:00Z,UTC,https://zoom.us/j/98149622766 -srw.127,7B,2020-07-07T09:00:00Z,2020-07-07T10:00:00Z,UTC,https://zoom.us/j/93604276395 -srw.127,8A,2020-07-07T12:00:00Z,2020-07-07T13:00:00Z,UTC,https://zoom.us/j/95667055624 -srw.128,3B,2020-07-06T13:00:00Z,2020-07-06T14:00:00Z,UTC,https://zoom.us/j/93625432530 -srw.129,6B,2020-07-07T06:00:00Z,2020-07-07T07:00:00Z,UTC,https://zoom.us/j/91088308631 -srw.129,7B,2020-07-07T09:00:00Z,2020-07-07T10:00:00Z,UTC,https://zoom.us/j/92075907708 -srw.131,6B,2020-07-07T06:00:00Z,2020-07-07T07:00:00Z,UTC,https://zoom.us/j/91219372246 -srw.131,8B,2020-07-07T13:00:00Z,2020-07-07T14:00:00Z,UTC,https://zoom.us/j/93787397693 -srw.135,8A,2020-07-07T12:00:00Z,2020-07-07T13:00:00Z,UTC,https://zoom.us/j/91381995054 -srw.135,13B,2020-07-08T13:00:00Z,2020-07-08T14:00:00Z,UTC,https://zoom.us/j/94902380638 -srw.137,7B,2020-07-07T09:00:00Z,2020-07-07T10:00:00Z,UTC,https://zoom.us/j/99301204161 -srw.137,8A,2020-07-07T12:00:00Z,2020-07-07T13:00:00Z,UTC,https://zoom.us/j/96945538190 -srw.144,8A,2020-07-07T12:00:00Z,2020-07-07T13:00:00Z,UTC,https://zoom.us/j/92608464474 -srw.144,14B,2020-07-08T18:00:00Z,2020-07-08T19:00:00Z,UTC,https://zoom.us/j/94072816020 diff --git a/sitedata/srw_papers.csv b/sitedata/srw_papers.csv deleted file mode 100644 index 825a4e2..0000000 --- a/sitedata/srw_papers.csv +++ /dev/null @@ -1,49 +0,0 @@ -UID,title,authors,abstract,keywords,track,paper_type,pdf_url -srw.135,Logical Inferences with Comparatives and Generalized Quantifiers,Izumi Haruta|Koji Mineshima|Daisuke Bekki,"Comparative constructions pose a challenge in Natural Language Inference (NLI), which is the task of determining whether a text entails a hypothesis. Comparatives are structurally complex in that they interact with other linguistic phenomena such as quantifiers, numerals, and lexical antonyms. In formal semantics, there is a rich body of work on comparatives and gradable expressions using the notion of degree. However, a logical inference system for comparatives has not been sufficiently developed for use in the NLI task. In this paper, we present a compositional semantics that maps various comparative constructions in English to semantic representations via Combinatory Categorial Grammar (CCG) parsers and combine it with an inference system based on automated theorem proving. We evaluate our system on three NLI datasets that contain complex logical inferences with comparatives, generalized quantifiers, and numerals. 
We show that the system outperforms previous logic-based systems as well as recent deep learning-based models.",Logical Inferences|Comparative constructions|NLI task|automated proving,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.35.pdf -srw.109,Understanding Points of Correspondence between Sentences for Abstractive Summarization,Logan Lebanoff|John Muchovej|Franck Dernoncourt|Doo Soon Kim|Lidan Wang|Walter Chang|Fei Liu,"Fusing sentences containing disparate content is a remarkable human ability that helps create informative and succinct summaries. Such a simple task for humans has remained challenging for modern abstractive summarizers, substantially restricting their applicability in real-world scenarios. In this paper, we present an investigation into fusing sentences drawn from a document by introducing the notion of points of correspondence, which are cohesive devices that tie any two sentences together into a coherent text. The types of points of correspondence are delineated by text cohesion theory, covering pronominal and nominal referencing, repetition and beyond. We create a dataset containing the documents, source and fusion sentences, and human annotations of points of correspondence between sentences. Our dataset bridges the gap between coreference resolution and summarization. It is publicly shared to serve as a basis for future work to measure the success of sentence fusion systems.",Abstractive Summarization|coreference resolution|summarization|cohesive devices,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.26.pdf -srw.2,Checkpoint Reranking: An Approach to Select Better Hypothesis for Neural Machine Translation Systems,Vinay Pandramish|Dipti Misra Sharma,"In this paper, we propose a method of re-ranking the outputs of Neural Machine Translation (NMT) systems. After the decoding process, we select a few last iteration outputs in the training process as the N-best list. After training a Neural Machine Translation (NMT) baseline system, it has been observed that these iteration outputs have an oracle score higher than baseline up to 1.01 BLEU points compared to the last iteration of the trained system.We come up with a ranking mechanism by solely focusing on the decoder's ability to generate distinct tokens and without the usage of any language model or data. With this method, we achieved a translation improvement up to +0.16 BLEU points over baseline.We also evaluate our approach by applying the coverage penalty to the training process.In cases of moderate coverage penalty, the oracle scores are higher than the final iteration up to +0.99 BLEU points, and our algorithm gives an improvement up to +0.17 BLEU points.With excessive penalty, there is a decrease in translation quality compared to the baseline system. Still, an increase in oracle scores up to +1.30 is observed with the re-ranking algorithm giving an improvement up to +0.15 BLEU points is found in case of excessive penalty.The proposed re-ranking method is a generic one and can be extended to other language pairs as well.",Neural Systems|Checkpoint Reranking|Neural systems|decoding process,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.38.pdf -srw.122,Exploring the Role of Context to Distinguish Rhetorical and Information-Seeking Questions,Yuan Zhuang|Ellen Riloff,"Social media posts often contain questions, but many of the questions are rhetorical and do not seek information. Our work studies the problem of distinguishing rhetorical and information-seeking questions on Twitter. 
Most work has focused on features of the question itself, but we hypothesize that the prior context plays a role too. This paper introduces a new dataset containing questions in tweets paired with their prior tweets to provide context. We create classification models to assess the difficulty of distinguishing rhetorical and information-seeking questions, and experiment with different properties of the prior context. Our results show that the prior tweet and topic features can improve performance on this task.",distinguishing questions|classification models|Rhetorical Questions|features,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.41.pdf -srw.48,Pre-training via Leveraging Assisting Languages for Neural Machine Translation,Haiyue Song|Raj Dabre|Zhuoyuan Mao|Fei Cheng|Sadao Kurohashi|Eiichiro Sumita,"Sequence-to-sequence (S2S) pre-training using large monolingual data is known to improve performance for various S2S NLP tasks. However, large monolingual corpora might not always be available for the languages of interest (LOI). Thus, we propose to exploit monolingual corpora of other languages to complement the scarcity of monolingual corpora for the LOI. We utilize script mapping (Chinese to Japanese) to increase the similarity (number of cognates) between the monolingual corpora of helping languages and LOI. An empirical case study of low-resource Japanese-English neural machine translation (NMT) reveals that leveraging large Chinese and French monolingual corpora can help overcome the shortage of Japanese and English monolingual corpora, respectively, for S2S pre-training. Using only Chinese and French monolingual corpora, we were able to improve Japanese-English translation quality by up to 8.5 BLEU in low-resource scenarios.",Neural Translation|S2S tasks|LOI|low-resource translation,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.37.pdf -srw.49,SCAR: Sentence Compression using Autoencoders for Reconstruction,Chanakya Malireddy|Tirth Maniar|Manish Shrivastava,"Sentence compression is the task of shortening a sentence while retaining its meaning. Most methods proposed for this task rely on labeled or paired corpora (containing pairs of verbose and compressed sentences), which is often expensive to collect. To overcome this limitation, we present a novel unsupervised deep learning framework (SCAR) for deletion-based sentence compression. SCAR is primarily composed of two encoder-decoder pairs: a compressor and a reconstructor. The compressor masks the input, and the reconstructor tries to regenerate it. The model is entirely trained on unlabeled data and does not require additional inputs such as explicit syntactic information or optimal compression length. SCAR’s merit lies in the novel Linkage Loss function, which correlates the compressor and its effect on reconstruction, guiding it to drop inferable tokens. SCAR achieves higher ROUGE scores on benchmark datasets than the existing state-of-the-art methods and baselines. We also conduct a user study to demonstrate the application of our model as a text highlighting system. 
Using our model to underscore salient information facilitates speed-reading and reduces the time required to skim a document.",Sentence Compression|Reconstruction|deletion-based compression|reconstructor,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.13.pdf -srw.123,Embeddings of Label Components for Sequence Labeling: A Case Study of Fine-grained Named Entity Recognition,Takuma Kato|Kaori Abe|Hiroki Ouchi|Shumpei Miyawaki|Jun Suzuki|Kentaro Inui,"In general, the labels used in sequence labeling consist of different types of elements.For example, IOB-format entity labels, such as B-Person and I-Person, can be decomposed into span (B and I) and type information (Person).However, while most sequence labeling models do not consider such label components, the shared components across labels, such as Person, can be beneficial for label prediction.In this work, we propose to integrate label component information as embeddings into models.Through experiments on English and Japanese fine-grained named entity recognition, we demonstrate that the proposed method improves performance, especially for instances with low-frequency labels.",Sequence Labeling|Fine-grained Recognition|label prediction|English recognition,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.30.pdf -srw.137,Unsupervised Multilingual Sentence Embeddings for Parallel Corpus Mining,Ivana Kvapilíková|Mikel Artetxe|Gorka Labaka|Eneko Agirre|Ondřej Bojar,"Existing models of multilingual sentence embeddings require large parallel data resources which are not available for low-resource languages. We propose a novel unsupervised method to derive multilingual sentence embeddings relying only on monolingual data. We first produce a synthetic parallel corpus using unsupervised machine translation, and use it to fine-tune a pretrained cross-lingual masked language model (XLM) to derive the multilingual sentence representations. The quality of the representations is evaluated on two parallel corpus mining tasks with improvements of up to 22 F1 points over vanilla XLM. In addition, we observe that a single synthetic bilingual corpus is able to improve results for other language pairs.",Unsupervised Embeddings|Parallel Mining|multilingual embeddings|parallel tasks,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.34.pdf -srw.127,How much complexity does an RNN architecture need to learn syntax-sensitive dependencies?,Gantavya Bhatt|Hritik Bansal|Rishubh Singh|Sumeet Agarwal,"Long short-term memory (LSTM) networks and their variants are capable of encapsulating long-range dependencies, which is evident from their performance on a variety of linguistic tasks. On the other hand, simple recurrent networks (SRNs), which appear more biologically grounded in terms of synaptic connections, have generally been less successful at capturing long-range dependencies as well as the loci of grammatical errors in an unsupervised setting. In this paper, we seek to develop models that bridge the gap between biological plausibility and linguistic competence. We propose a new architecture, the Decay RNN, which incorporates the decaying nature of neuronal activations and models the excitatory and inhibitory connections in a population of neurons. Besides its biological inspiration, our model also shows competitive performance relative to LSTMs on subject-verb agreement, sentence grammaticality, and language modeling tasks. 
These results provide some pointers towards probing the nature of the inductive biases required for RNN architectures to model linguistic phenomena successfully.",linguistic tasks|unsupervised setting|sentence grammaticality|language tasks,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.33.pdf -srw.5,Topic Balancing with Additive Regularization of Topic Models,Eugeniia Veselova|Konstantin Vorontsov,"This article proposes a new approach for building topic models on unbalanced collections in topic modelling, based on the existing methods and our experiments with such methods. Real-world data collections contain topics in various proportions, and often documents of the relatively small theme become distributed all over the larger topics instead of being grouped into one topic. To address this issue, we design a new regularizer for Theta and Phi matrices in probabilistic Latent Semantic Analysis (pLSA) model. We make sure this regularizer increases the quality of topic models, trained on unbalanced collections. Besides, we conceptually support this regularizer by our experiments.",Topic Balancing|unbalanced modelling|probabilistic model|Additive Models,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.9.pdf -srw.58,uBLEU: Uncertainty-Aware Automatic Evaluation Method for Open-Domain Dialogue Systems,Tsuta Yuma|Naoki Yoshinaga|Masashi Toyoda,"Because open-domain dialogues allow diverse responses, basic reference-based metrics such as BLEU do not work well unless we prepare a massive reference set of high-quality responses for input utterances. To reduce this burden, a human-aided, uncertainty-aware metric, ΔBLEU, has been proposed; it embeds human judgment on the quality of reference outputs into the computation of multiple-reference BLEU. In this study, we instead propose a fully automatic, uncertainty-aware evaluation method for open-domain dialogue systems, υBLEU. This method first collects diverse reference responses from massive dialogue data and then annotates their quality judgments by using a neural network trained on automatically collected training data. Experimental results on massive Twitter data confirmed that υBLEU is comparable to ΔBLEU in terms of its correlation with human judgment and that the state of the art automatic evaluation method, RUBER, is improved by integrating υBLEU.",Open-Domain Systems|uBLEU|Uncertainty-Aware Method|ΔBLEU,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.27.pdf -srw.99,Research Replication Prediction Using Weakly Supervised Learning,Tianyi Luo|Xingyu Li|Hainan Wang|Yang Liu,"Knowing whether a published research result can be replicated or not is important. Carrying out direct replication of published research incurs high cost. It is therefore desirable to have a machine learning aided automatic prediction of a result's replicability. Such predictions can provide a confidence score for each article which can further provide guidelines for spot-checks.Since we will only have access to a small size of annotated dataset to train a machine predictor, we explore the possibility of using weakly supervised learning approaches to improve the prediction accuracy of research replication using both labelled and unlabelled datasets based on text information of research papers. 
Our experiments over real-world datasets show that much better prediction performance can be obtained compared to the supervised models utilizing only a small size of labelled dataset.",Research Prediction|Replication Prediction|automatic prediction|prediction replication,SRW,SRW, -srw.98,AraDIC: Arabic Document Classification Using Image-Based Character Embeddings and Class-Balanced Loss,Mahmoud Daif|Shunsuke Kitada|Hitoshi Iyatomi,"Classical and some deep learning techniques for Arabic text classification often depend on complex morphological analysis, word segmentation, and hand-crafted feature engineering.These could be eliminated by using character-level features.We propose a novel end-to-end Arabic document classification framework, Arabic document image-based classifier (AraDIC), inspired by the work on image-based character embeddings.AraDIC consists of an image-based character encoder and a classifier.They are trained in an end-to-end fashion using the class balanced loss to deal with the long-tailed data distribution problem.To evaluate the effectiveness of AraDIC, we created and published two datasets, the Arabic Wikipedia title (AWT) dataset and the Arabic poetry (AraP) dataset.To the best of our knowledge, this is the first image-based character embedding framework addressing the problem of Arabic text classification. We also present the first deep learning-based text classifier widely evaluated on modern standard Arabic, colloquial Arabic, and Classical Arabic.AraDIC shows performance improvement over classical and deep learning baselines by 12.29% and 23.05% for the micro and macro F-score, respectively.",Arabic Classification|Class-Balanced Loss|word segmentation|long-tailed problem,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.29.pdf -srw.131,Preventing Critical Scoring Errors in Short Answer Scoring with Confidence Estimation,Hiroaki Funayama|Shota Sasaki|Yuichiroh Matsubayashi|Tomoya Mizumoto|Jun Suzuki|Masato Mita|Kentaro Inui,"Many recent Short Answer Scoring (SAS) systems have employed Quadratic Weighted Kappa (QWK) as the evaluation measure of their systems. However, we hypothesize that QWK is unsatisfactory for the evaluation of the SAS systems when we consider measuring their effectiveness in actual usage. We introduce a new task formulation of SAS that matches the actual usage. In our formulation, the SAS systems should extract as many scoring predictions that are not critical scoring errors (CSEs). We conduct the experiments in our new task formulation and demonstrate that a typical SAS system can predict scores with zero CSE for approximately 50% of test data at maximum by filtering out low-reliablility predictions on the basis of a certain confidence estimation. This result directly indicates the possibility of reducing half the scoring cost of human raters, which is more preferable for the evaluation of SAS systems.",Short Scoring|evaluation systems|Confidence Estimation|Short systems,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.32.pdf -srw.28,Compositional Generalization by Factorizing Alignment and Translation,Jacob Russin|Jason Jo|Randall O'Reilly|Yoshua Bengio,"Standard methods in deep learning for natural language processing fail to capture the compositional structure of human language that allows for systematic generalization outside of the training distribution. However, human learners readily generalize in this way, e.g. by applying known grammatical rules to novel words. 
Inspired by work in cognitive science suggesting a functional distinction between systems for syntactic and semantic processing, we implement a modification to an existing approach in neural machine translation, imposing an analogous separation between alignment and translation. The resulting architecture substantially outperforms standard recurrent networks on the SCAN dataset, a compositional generalization task, without any additional supervision. Our work suggests that learning to align and to translate in separate modules may be a useful heuristic for capturing compositional structure.",Compositional Generalization|Translation|natural processing|cognitive science,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.42.pdf -srw.14,Zero-shot North Korean to English Neural Machine Translation by Character Tokenization and Phoneme Decomposition,Hwichan Kim|Tosho Hirasawa|Mamoru Komachi,"The primary limitation of North Korean to English translation is the lack of a parallel corpus; therefore, high translation accuracy cannot be achieved. To address this problem, we propose a zero-shot approach using South Korean data, which are remarkably similar to North Korean data. We train a neural machine translation model after tokenizing a South Korean text at the character level and decomposing characters into phonemes.We demonstrate that our method can effectively learn North Korean to English translation and improve the BLEU scores by +1.01 points in comparison with the baseline.",English Translation|Character Tokenization|Phoneme Decomposition|zero-shot approach,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.11.pdf -srw.15,Grammatical Error Correction Using Pseudo Learner Corpus Considering Learner's Error Tendency,Yujin Takahashi|Satoru Katsumata|Mamoru Komachi,"Recently, several studies have focused on improving the performance of grammatical error correction (GEC) tasks using pseudo data. However, a large amount of pseudo data are required to train an accurate GEC model. To address the limitations of language and computational resources, we assume that introducing pseudo errors into sentences similar to those written by the language learners is more efficient, rather than incorporating random pseudo errors into monolingual data. In this regard, we study the effect of pseudo data on GEC task performance using two approaches. First, we extract sentences that are similar to the learners' sentences from monolingual data. Second, we generate realistic pseudo errors by considering error types that learners often make. Based on our comparative results, we observe that F0.5 scores for the Russian GEC task are significantly improved.",Grammatical Correction|grammatical tasks|GEC task|Russian task,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.5.pdf -srw.17,Story-level Text Style Transfer: A Proposal,Yusu Qian,"Text style transfer aims to change the style of the input text to the target style while preserving the content to some extent. Previous works on this task are on the sentence level. We aim to work on story-level text style transfer to generate stories that preserve the plot of the input story while exhibiting a strong target style. The challenge in this task compared to previous work is that the structure of the input story, consisting of named entities and their relations with each other, needs to be preserved, and that the generated story needs to be consistent after adding flavors. 
We plan to explore three methods including the BERT-based method, the Story Realization method, and the Graph-based method.",Story-level Transfer|Text transfer|BERT-based method|Story method,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.2.pdf -srw.16,Adaptive Transformers for Learning Multimodal Representations,Prajjwal Bhargava,"The usage of transformers has grown from learning about language semantics to forming meaningful visiolinguistic representations. These architectures are often over-parametrized, requiring large amounts of computation. In this work, we extend adaptive approaches to learn more about model interpretability and computational efficiency. Specifically, we study attention spans, sparse, and structured dropout methods to help understand how their attention mechanism extends for vision and language tasks. We further show that these approaches can help us learn more about how the network perceives the complexity of input sequences, sparsity preferences for different modalities, and other related phenomena.",Multimodal Representations|vision tasks|Adaptive Transformers|transformers,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.1.pdf -srw.144,Enhancing Word Embeddings with Knowledge Extracted from Lexical Resources,Magdalena Biesialska|Bardia Rafieian|Marta R. Costa-jussà,"In this work, we present an effective method for semantic specialization of word vector representations. To this end, we use traditional word embeddings and apply specialization methods to better capture semantic relations between words. In our approach, we leverage external knowledge from rich lexical resources such as BabelNet. We also show that our proposed post-specialization method based on an adversarial neural network with the Wasserstein distance allows to gain improvements over state-of-the-art methods on two tasks: word similarity and dialog state tracking.",semantic representations|dialog tracking|word embeddings|specialization methods,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.36.pdf -srw.39,RPD: A Distance Function Between Word Embeddings,Xuhui Zhou|Shujian Huang|Zaixiang Zheng,"It is well-understood that different algorithms, training processes, and corpora produce different word embeddings. However, less is known about the relation between different embedding spaces, i.e. how far different sets of em-beddings deviate from each other. In this paper, we propose a novel metric called Relative Pairwise Inner Product Distance (RPD) to quantify the distance between different sets of word embeddings. This unitary-invariant metric has a unified scale for comparing different sets of word embeddings. Based on the properties of RPD, we study the relations of word embeddings of different algorithms systematically and investigate the influence of different training processes and corpora. The results shed light on the poorly understood word embeddings and justify RPD as a measure of the distance of embedding space.",RPD|Word Embeddings|training processes|Relative Distance,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.7.pdf -srw.35,HGCN4MeSH: Hybrid Graph Convolution Network for MeSH Indexing,Miaomiao Yu|Yujiu Yang|Chenhui Li,"Recently deep learning has been used in Medical subject headings (MeSH) indexing to reduce the time and monetary cost by manual annotation, including DeepMeSH, TextCNN, etc. However, these models still suffer from failing to capture the complex correlations between MeSH terms. 
To this end, we introduce Graph Convolution Network (GCN) to learn the relationship between these terms, and present a novel Hybrid Graph Convolution Net for MeSH index (HGCN4MeSH). Basically, we utilize two BiGRUs to learn the embedding representation of the abstract and the title of the MeSH index text respectively. At the same time, we establish the adjacency matrix of MeSH terms based on the co-occurrence relationships in Corpus, which is easy to apply for GCN representation learning. On the basis of learning the mixed representation, the prediction problem of the MeSH index keywords is transformed into an extreme multi-label classification problem after the attention layer operation. Experimental results on two datasets show that HGCN4MeSH is competitive compared with the state-of-the-art methods.",MeSH Indexing|Medical indexing|manual annotation|prediction problem,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.4.pdf -srw.36,Research on Task Discovery for Transfer Learning in Deep Neural Networks,Arda Akdemir,"Deep neural network based machine learning models are shown to perform poorly on unseen or out-of-domain examples by numerous recent studies. Transfer learning aims to avoid overfitting and to improve generalizability by leveraging the information obtained from multiple tasks. Yet, the benefits of transfer learning depend largely on task selection and finding the right method of sharing. In this thesis, we hypothesize that current deep neural network based transfer learning models do not achieve their fullest potential for various tasks and there are still many task combinations that will benefit from transfer learning that are not considered by the current models. To this end, we started our research by implementing a novel multi-task learner with relaxed annotated data requirements and obtained a performance improvement on two NLP tasks. We will further devise models to tackle tasks from multiple areas of machine learning, such as Bioinformatics and Computer Vision, in addition to NLP.",Task Discovery|Transfer Learning|task selection|NLP tasks,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.6.pdf -srw.22,Non-Topical Coherence in Social Talk: A Call for Dialogue Model Enrichment,Alex Luu|Sophia A. Malamud,"Current models of dialogue mainly focus on utterances within a topically coherent discourse segment, rather than new-topic utterances (NTUs), which begin a new topic not correlating with the content of prior discourse. As a result, these models may sufficiently account for discourse context of task-oriented but not social conversations. We conduct a pilot annotation study of NTUs as a first step towards a model capable of rationalizing conversational coherence in social talk. We start with the naturally occurring social dialogues in the Disco-SPICE corpus, annotated with discourse relations in the Penn Discourse Treebank and Cognitive approach to Coherence Relations frameworks. We first annotate content-based coherence relations that are not available in Disco-SPICE, and then heuristically identify NTUs, which lack a coherence relation to prior discourse. Based on the interaction between NTUs and their discourse context, we construct a classification for NTUs that actually convey certain non-topical coherence in social talk. This classification introduces new sequence-based social intents that traditional taxonomies of speech acts do not capture. 
The new findings advocates the development of a Bayesian game-theoretic model for social talk.",Dialogue Enrichment|Coherence frameworks|classification|NTUs,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.17.pdf -srw.18,"Media Bias, the Social Sciences, and NLP: Automating Frame Analyses to Identify Bias by Word Choice and Labeling",Felix Hamborg,"Media bias can strongly impact the public perception of topics reported in the news. A difficult to detect, yet powerful form of slanted news coverage is called bias by word choice and labeling (WCL). WCL bias can occur, for example, when journalists refer to the same semantic concept by using different terms that frame the concept differently and consequently may lead to different assessments by readers, such as the terms ""freedom fighters"" and ""terrorists,"" or ""gun rights"" and ""gun control."" In this research project, I aim to devise methods that identify instances of WCL bias and estimate the frames they induce, e.g., not only is ""terrorists"" of negative polarity but also ascribes to aggression and fear. To achieve this, I plan to research methods using natural language processing and deep learning while employing models and using analysis concepts from the social sciences, where researchers have studied media bias for decades. The first results indicate the effectiveness of this interdisciplinary research approach. My vision is to devise a system that helps news readers to become aware of the differences in media coverage caused by bias.",NLP|labeling|natural processing|Frame Analyses,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.12.pdf -srw.19,Unsupervised Paraphasia Classification in Aphasic Speech,Sharan Pai|Nikhil Sachdeva|Prince Sachdeva|Rajiv Ratn Shah,"Aphasia is a speech and language disorder which results from brain damage, often characterized by word retrieval deficit (anomia) resulting in naming errors (paraphasia).Automatic paraphasia detection has many benefits for both treatment and diagnosis of Aphasia and its type. But supervised learning methods cant be properly utilized as there is a lack of aphasic speech data. In this paper, we describe our novel unsupervised method which can be implemented without the need for labeled paraphasia data. Our evaluations show that our method outperforms previous work based on supervised learning and transfer learning approaches for English. We demonstrate the utility of our method as an essential first step in developing augmentative and alternative communication (AAC) devices for patients suffering from aphasia in any language.",Unsupervised Classification|speech disorder|naming detection|treatment,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.3.pdf -srw.114,Inducing Grammar from Long Short-Term Memory Networks by Shapley Decomposition,Yuhui Zhang|Allen Nie,"The principle of compositionality has deep roots in linguistics: the meaning of an expression is determined by its structure and the meanings of its constituents. However, modern neural network models such as long short-term memory network process expressions in a linear fashion and do not seem to incorporate more complex compositional patterns. In this work, we show that we can explicitly induce grammar by tracing the computational process of a long short-term memory network. 
We show: (i) the multiplicative nature of long short-term memory network allows complex interaction beyond sequential linear combination; (ii) we can generate compositional trees from the network without external linguistic knowledge; (iii) we evaluate the syntactic difference between the generated trees, randomly generated trees and gold reference trees produced by constituency parsers; (iv) we evaluate whether the generated trees contain the rich semantic information.",Inducing Grammar|Long Networks|Shapley Decomposition|neural models,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.40.pdf -srw.128,Efficient Neural Machine Translation for Low-Resource Languages via Exploiting Related Languages,Vikrant Goyal|Sourav Kumar|Dipti Misra Sharma,"A large percentage of the world’s population speaks a language of the Indian subcontinent, comprising languages from both Indo-Aryan (e.g. Hindi, Punjabi, Gujarati, etc.) and Dravidian (e.g. Tamil, Telugu, Malayalam, etc.) families. A universal characteristic of Indian languages is their complex morphology, which, when combined with the general lack of sufficient quantities of high-quality parallel data, can make developing machine translation (MT) systems for these languages difficult. Neural Machine Translation (NMT) is a rapidly advancing MT paradigm and has shown promising results for many language pairs, especially in large training data scenarios. Since the condition of large parallel corpora is not met for Indian-English language pairs, we present our efforts towards building efficient NMT systems between Indian languages (specifically Indo-Aryan languages) and English via efficiently exploiting parallel data from the related languages. We propose a technique called Unified Transliteration and Subword Segmentation to leverage language similarity while exploiting parallel data from related language pairs. We also propose a Multilingual Transfer Learning technique to leverage parallel data from multiple related languages to assist translation for low resource language pair of interest. Our experiments demonstrate an overall average improvement of 5 BLEU points over the standard Transformer-based NMT baselines.",Neural Translation|machine systems|translation|NMT,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.22.pdf -srw.95,Cross-Lingual Disaster-related Multi-label Tweet Classification with Manifold Mixup,Jishnu Ray Chowdhury|Cornelia Caragea|Doina Caragea,"Distinguishing informative and actionable messages from a social media platform like Twitter is critical for facilitating disaster management. For this purpose, we compile a multilingual dataset of over 130K samples for multi-label classification of disaster-related tweets. We present a masking-based loss function for partially labelled samples and demonstrate the effectiveness of Manifold Mixup in the text domain. Our main model is based on Multilingual BERT, which we further improve with Manifold Mixup. We show that our model generalizes to unseen disasters in the test set. Furthermore, we analyze the capability of our model for zero-shot generalization to new languages. 
Our code, dataset, and other resources are available on Github.",Cross-Lingual Classification|Distinguishing messages|disaster management|multi-label tweets,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.39.pdf -srw.42,Exploring Interpretability in Event Extraction: Multitask Learning of a Neural Event Classifier and an Explanation Decoder,Zheng Tang|Gus Hahn-Powell|Mihai Surdeanu,"We propose an interpretable approach for event extraction that mitigates the tension between generalization and interpretability by jointly training for the two goals. Our approach uses an encoder-decoder architecture, which jointly trains a classifier for event extraction, and a rule decoder that generates syntactico-semantic rules that explain the decisions of the event classifier. We evaluate the proposed approach on three biomedical events and show that the decoder generates interpretable rules that serve as accurate explanations for the event classifier's decisions, and, importantly, that the joint training generally improves the performance of the event classifier. Lastly, we show that our approach can be used for semi-supervised learning, and that its performance improves when trained on automatically-labeled data generated by a rule-based system.",Interpretability Extraction|Event Extraction|semi-supervised learning|Multitask Learning,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.23.pdf -srw.129,Building a Japanese Typo Dataset from Wikipedia's Revision History,Yu Tanaka|Yugo Murawaki|Daisuke Kawahara|Sadao Kurohashi,"User generated texts contain many typos for which correction is necessary for NLP systems to work. Although a large number of typo–correction pairs are needed to develop a data-driven typo correction system, no such dataset is available for Japanese. In this paper, we extract over half a million Japanese typo–correction pairs from Wikipedia’s revision history. Unlike other languages, Japanese poses unique challenges: (1) Japanese texts are unsegmented so that we cannot simply apply a spelling checker, and (2) the way people inputting kanji logographs results in typos with drastically different surface forms from correct ones. We address them by combining character-based extraction rules, morphological analyzers to guess readings, and various filtering methods. We evaluate the dataset using crowdsourcing and run a baseline seq2seq model for typo correction.",NLP systems|typo correction|data-driven system|spelling checker,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.31.pdf -srw.115,Self-Attention is Not Only a Weight: Analyzing BERT with Vector Norms,Goro Kobayashi|Tatsuki Kuribayashi|Sho Yokoi|Kentaro Inui,"Self-attention modules are essential building blocks of Transformer-based language models and hence are the subject of a large number of studies aiming to discover which linguistic capabilities these models possess (Rogers et al., 2020). Such studies are commonly conducted by analyzing correlations of attention weights with specific linguistic phenomena. In this paper, we show that attention weights alone are only one of two factors determining the output of self-attention modules and propose to incorporate the other factor, namely the norm of the transformed input vectors, into the analysis, as well. Our analysis of self-attention modules in BERT (Devlin et al., 2019) shows that the proposed method produces insights that better agree with linguistic intuitions than an analysis based on attention-weights alone. 
Our analysis further reveals that BERT controls the amount of the contribution from frequent informative and less informative tokens not by attention weights but via vector norms.",BERT|Self-attention modules|Transformer-based models|output modules,SRW,SRW, -srw.9,Combining Subword Representations into Word-level Representations in the Transformer Architecture,Noe Casas|Marta R. Costa-jussà|José A. R. Fonollosa,"In Neural Machine Translation, using word-level tokens leads to degradation in translation quality. The dominant approaches use subword-level tokens, but this increases the length of the sequences and makes it difficult to profit from word-level information such as POS tags or semantic dependencies.We propose a modification to the Transformer model to combine subword-level representations into word-level ones in the first layers of the encoder, reducing the effective length of the sequences in the following layers and providing a natural point to incorporate extra word-level information.Our experiments show that this approach maintains the translation quality with respect to the normal Transformer model when no extra word-level information is injected and that it is superior to the currently dominant method for incorporating word-level source language information to models based on subword-level vocabularies.",Neural Translation|Subword Representations|Word-level Representations|Transformer Architecture,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.10.pdf -srw.82,A Simple and Effective Dependency Parser for Telugu,Sneha Nallani|Manish Shrivastava|Dipti Sharma,"We present a simple and effective dependency parser for Telugu, a morphologically rich, free word order language. We propose to replace the rich linguistic feature templates used in the past approaches with a minimal feature function using contextual vector representations. We train a BERT model on the Telugu Wikipedia data and use vector representations from this model to train the parser. Each sentence token is associated with a vector representing the token in the context of that sentence and the feature vectors are constructed by concatenating two token representations from the stack and one from the buffer. We put the feature representations through a feedforward network and train with a greedy transition based approach. The resulting parser has a very simple architecture with minimal feature engineering and achieves state-of-the-art results for Telugu.",Telugu|Dependency Parser|contextual representations|BERT model,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.19.pdf -srw.69,Why is penguin more similar to polar bear than to sea gull? Analyzing conceptual knowledge in distributional models,Pia Sommerauer,"What do powerful models of word mean- ing created from distributional data (e.g. Word2vec (Mikolov et al., 2013) BERT (Devlin et al., 2019) and ELMO (Peters et al., 2018)) represent? What causes words to be similar in the semantic space? What type of information is lacking? This thesis proposal presents a framework for investigating the information encoded in distributional semantic models. Several analysis methods have been suggested, but they have been shown to be limited and are not well understood. This approach pairs observations made on actual corpora with insights obtained from data manipulation experiments. 
The expected outcome is a better understanding of (1) the semantic information we can infer purely based on linguistic co-occurrence patterns and (2) the potential of distributional semantic models to pick up linguistic evidence.",word ing|distributional models|BERT|ELMO,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.18.pdf -srw.55,Considering Likelihood in NLP Classification Explanations with Occlusion and Language Modeling,David Harbecke|Christoph Alt,"Recently, state-of-the-art NLP models gained an increasing syntactic and semantic understanding of language, and explanation methods are crucial to understand their decisions. Occlusion is a well established method that provides explanations on discrete language data, e.g. by removing a language unit from an input and measuring the impact on a model's decision. We argue that current occlusion-based methods often produce invalid or syntactically incorrect language data, neglecting the improved abilities of recent NLP models. Furthermore, gradient-based explanation methods disregard the discrete distribution of data in NLP. Thus, we propose OLM: a novel explanation method that combines occlusion and language models to sample valid and syntactically correct replacements with high likelihood, given the context of the original input. We lay out a theoretical foundation that alleviates these weaknesses of other explanation methods in NLP and provide results that underline the importance of considering data likelihood in occlusion-based explanation.",NLP|NLP Explanations|Language Modeling|NLP models,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.16.pdf -srw.54,Multi-Task Neural Model for Agglutinative Language Translation,Yirong Pan|Xiao Li|Yating Yang|Rui Dong,"Neural machine translation (NMT) has achieved impressive performance recently by using large-scale parallel corpora. However, it struggles in the low-resource and morphologically-rich scenarios of agglutinative language translation task. Inspired by the finding that monolingual data can greatly improve the NMT performance, we propose a multi-task neural model that jointly learns to perform bi-directional translation and agglutinative language stemming. Our approach employs the shared encoder and decoder to train a single model without changing the standard NMT architecture but instead adding a token before each source-side sentence to specify the desired target outputs of the two different tasks. Experimental results on Turkish-English and Uyghur-Chinese show that our proposed approach can significantly improve the translation performance on agglutinative languages by using a small amount of monolingual data.",Agglutinative Translation|agglutinative task|bi-directional translation|agglutinative stemming,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.15.pdf -srw.116,Noise-Based Augmentation Techniques for Emotion Datasets: What do we Recommend?,Mimansa Jaiswal|Emily Mower Provost,"Emotion recognition systems are widely used for many downstream applications such as mental health monitoring, educational problems diagnosis, hate speech classification and targeted advertising. Yet, these systems are generally trained on audio or multimodal datasets collected in a laboratory environment.While acoustically different, they are generally free of major environmental noises. The result is that systems trained on these datasets falter when presented with noisy data, even when that noise doesn’t affect the human perception of emotions. 
In this work, we use multiple categories of environmental and synthetic noises to generate black box adversarial examples and use these noises to modify the samples in the IEMOCAP dataset. We evaluate how both human and machine emotion perception changes when noise is introduced. We find that the trained state-of-the-art models fail to classify even moderately noisy samples that humans usually have no trouble comprehend-ing, demonstrating the brittleness of these systems in real world conditions.",mental monitoring|educational diagnosis|hate classification|targeted advertising,SRW,SRW, -srw.106,Effectively Aligning and Filtering Parallel Corpora under Sparse Data Conditions,Steinþór Steingrímsson|Hrafn Loftsson|Andy Way,"Parallel corpora are key to developing good machine translation systems. However, abundant parallel data are hard to come by, especially for languages with a low number of speakers. When rich morphology exacerbates the data sparsity problem, it is imperative to have accurate alignment and filtering methods that can help make the most of what is available by maximising the number of correctly translated segments in a corpus and minimising noise by removing incorrect translations and segments containing extraneous data.This paper sets out a research plan for improving alignment and filtering methods for parallel texts in low-resource settings. We propose an effective unsupervised alignment method to tackle the alignment problem. Moreover, we propose a strategy to supplement state-of-the-art models with automatically extracted information using basic NLP tools to effectively handle rich morphology.",Aligning Corpora|machine systems|data problem|alignment problem,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.25.pdf -srw.79,Transferring Monolingual Model to Low-Resource Language: The Case of Tigrinya,Abrhalei Frezghi Tela|Abraham Woubie Zewoudie|Ville Hautamäki,"In recent years, transformer models have achieved great success in natural language processing tasks. Most of the current state-of-the-art NLP results are achieved by using monolingual transformer models, where the model is pre-trained using a single language unlabelled text corpus. Then, the model is fine-tuned to the specific downstream task. However, the cost of pre-training a new transformer model is high for most languages. In this work, we propose a novel transfer learning method to adopt a strong source language model, trained from a large monolingual corpus to a low-resource language. Thus, using XLNet language model, we demonstrate competitive performance with mBERT and a pre-trained target language model on the Cross-lingual Sentiment (CLS) dataset and on a new sentiment analysis dataset for low-resourced language Tigrinya. With only 10k examples of the given Tigrinya sentiment analysis dataset, English XLNet has achieved 78.88% F1-Score outperforming BERT and mBERT by 10% and 7%, respectively. More interestingly, fine-tuning (English) XLNet model on the CLS dataset has promising results compared to mBERT and even outperformed mBERT for one dataset of the Japanese language.",natural tasks|NLP|downstream task|pre-training,SRW,SRW, -srw.105,A Geometry-Inspired Attack for Generating Natural Language Adversarial Examples,Zhao Meng|Roger Wattenhofer,"Generating adversarial examples for natural language is hard, as natural language consists of discrete symbols and examples are often of variable lengths. In this paper, we propose a geometry-inspired attack for generating natural language adversarial examples. 
Our attack generates adversarial examples by iteratively approximating the decision boundary of deep neural networks. Experiments on two datasets with two different models show that our attack fools the models with high success rates, while only replacing a few words. Human evaluation shows that adversarial examples generated by our attack are hard for humans to recognize. Further experiments show that adversarial training can improve model robustness against our attack.",Generating Examples|Human evaluation|Geometry-Inspired Attack|decision networks,SRW,SRW, -srw.84,Pointwise Paraphrase Appraisal is Potentially Problematic,Hannah Chen|Yangfeng Ji|David Evans,"The prevailing approach for training and evaluating paraphrase identification models is constructed as a binary classification problem: the model is given a pair of sentences, and is judged by how accurately it classifies pairs as either paraphrases or non-paraphrases. This pointwise-based evaluation method does not match well the objective of most real world applications, so the goal of our work is to understand how models which perform well under pointwise evaluation may fail in practice and find better methods for evaluating paraphrase identification models. As a first step towards that goal, we show that although the standard way of fine-tuning BERT for paraphrase identification by pairing two sentences as one sequence results in a model with state-of-the-art performance, that model may perform poorly on simple tasks like identifying pairs with two identical sentences. Moreover, we show that these models may even predict a pair of randomly-selected sentences with higher paraphrase score than a pair of identical ones.",Pointwise Appraisal|binary problem|paraphrase identification|paraphrase models,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.20.pdf -srw.90,To compress or not to compress? A Finite-State approach to Nen verbal morphology,Saliha Muradoglu|Nicholas Evans|Hanna Suominen,"This paper describes the development of a verbal morphological parser for an under-resourced Papuan language, Nen. Nen verbal morphology is particularly complex, with a transitive verb taking up to 1,740 unique features. The structural properties exhibited by Nen verbs raises interesting choices for analysis. Here we compare two possible methods of analysis: ‘Chunking’ and decomposition. ‘Chunking’ refers to the concept of collating morphological segments into one, whereas the decomposition model follows a more classical linguistic approach. Both models are built using the Finite-State Transducer toolkit foma. The resultant architecture shows differences in size and structural clarity. While the ‘Chunking’ model is under half the size of the full de-composed counterpart, the decomposition displays higher structural order. In this paper, we describe the challenges encountered when modelling a language exhibiting distributed exponence and present the first morphological analyser for Nen, with an overall accuracy of 80.3%.",Finite-State approach|verbal parser|Chunking|decomposition model,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.28.pdf -srw.53,Feature Difference Makes Sense: A medical image captioning model exploiting feature difference and tag information,Hyeryun Park|Kyungmo Kim|Jooyoung Yoon|Seongkeun Park|Jinwook Choi,"Medical image captioning can reduce the workload of physicians and save time and expense by automatically generating reports. 
However, current datasets are small and limited, creating additional challenges for researchers. In this study, we propose a feature difference and tag information combined long short-term memory (LSTM) model for chest x-ray report generation. A feature vector extracted from the image conveys visual information, but its ability to describe the image is limited. Other image captioning studies exhibited improved performance by exploiting feature differences, so the proposed model also utilizes them. First, we propose a difference and tag (DiTag) model containing the difference between the patient and normal images. Then, we propose a multi-difference and tag (mDiTag) model that also contains information about low-level differences, such as contrast, texture, and localized area. Evaluation of the proposed models demonstrates that the mDiTag model provides more information to generate captions and outperforms all other models.",chest generation|image studies|medical model|difference model,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.14.pdf -srw.52,Reflection-based Word Attribute Transfer,Yoichi Ishibashi|Katsuhito Sudoh|Koichiro Yoshino|Satoshi Nakamura,"Word embeddings, which often represent such analogic relations as king - man + woman ~ queen, can be used to change a word's attribute, including its gender. For transferring king into queen in this analogy-based manner, we subtract a difference vector man - woman based on the knowledge that king is male. However, developing such knowledge is very costly for words and attributes. In this work, we propose a novel method for word attribute transfer based on reflection mappings without such an analogy operation. Experimental results show that our proposed method can transfer the word attributes of the given words without changing the words that do not have the target attributes.",Reflection-based Transfer|word transfer|Word embeddings|analogy-based manner,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.8.pdf -srw.46,Dominance as an Indicator of Rapport and Learning in Human-Agent Communication,Amanda Buddemeyer|Xiaoyi Tian|Erin Walker,"Power dynamics in human-human communication can impact rapport-building and learning gains, but little is known about how power impacts human-agent communication. In this paper, we examine dominance behavior in utterances between middle-school students and a teachable robot as they work through math problems, as coded by Rogers and Farace's Relational Communication Control Coding Scheme (RCCCS). We hypothesize that relatively dominant students will show increased learning gains, as will students with greater dominance agreement with the robot. We also hypothesize that gender could be an indicator of differences in dominance behavior. We present a preliminary analysis of dominance characteristics in some of the transactions between robot and student. Ultimately, we hope to determine if manipulating the dominance behavior of a learning robot could support learning.",rapport-building gains|learning|Relational Scheme|RCCCS,SRW,SRW, -srw.85,#NotAWhore! A Computational Linguistic Perspective of Rape Culture and Victimization on Social Media,Ashima Suvarna|Grusha Bhalla,"The recent surge in online forums and movements supporting sexual assault survivors has led to the emergence of a `virtual bubble' where survivors can recount their stories. However, this also makes the survivors vulnerable to bullying, trolling and victim blaming. 
Specifically, victim blaming has been shown to have acute psychological effects on the survivors and further discourage formal reporting of such crimes. Therefore, it is important to devise computationally relevant methods to identify and prevent victim blaming to protect the victims. In our work, we discuss the drastic effects of victim blaming through a short case study and then propose a single step transfer-learning based classification method to identify victim blaming language on Twitter. Finally, we compare the performance of our proposed model against various deep learning and machine learning models on a manually annotated domain-specific dataset.",sexual survivors|victim blaming|computationally methods|transfer-learning method,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.43.pdf -srw.104,Crossing the Line: Where do Demographic Variables Fit into Humor Detection?,J. A. Meaney,"Recent humor classification shared tasks have struggled with two issues: either the data comprises a highly constrained genre of humor which does not broadly represent humor, or the data is so indiscriminate that the inter-annotator agreement on its humor content is drastically low. These tasks typically average over all annotators' judgments, in spite of the fact that humor is a highly subjective phenomenon. We argue that demographic factors influence whether a text is perceived as humorous or not. We propose the addition of demographic information about the humor annotators in order to bin ratings more sensibly. We also suggest the addition of an 'offensive' label to distinguish between different generations, in terms of humor. This would allow for more nuanced shared tasks and could lead to better performance on downstream tasks, such as content moderation.",Humor Detection|humor tasks|Demographic Variables|humor,SRW,SRW,https://www.aclweb.org/anthology/2020.acl-srw.24.pdf diff --git a/sitedata/tacl_paper_sessions.yml b/sitedata/tacl_paper_sessions.yml deleted file mode 100644 index 9d74d6f..0000000 --- a/sitedata/tacl_paper_sessions.yml +++ /dev/null @@ -1,137 +0,0 @@ -1A: - date: 2020-07-06_05:00:00 - papers: - - tacl.1815 - - tacl.1915 -1B: - date: 2020-07-06_06:00:00 - papers: - - tacl.1901 -2B: - date: 2020-07-06_09:00:00 - papers: - - tacl.1815 - - tacl.1915 -3B: - date: 2020-07-06_13:00:00 - papers: - - tacl.1849 - - tacl.1901 -4A: - date: 2020-07-06_17:00:00 - papers: - - tacl.1720 - - tacl.1780 - - tacl.1834 -4B: - date: 2020-07-06_18:00:00 - papers: - - tacl.1727 - - tacl.1766 - - tacl.1849 - - tacl.1903 -5A: - date: 2020-07-06_20:00:00 - papers: - - tacl.1720 - - tacl.1780 - - tacl.1834 -5B: - date: 2020-07-06_21:00:00 - papers: - - tacl.1727 - - tacl.1766 - - tacl.1903 -6A: - date: 2020-07-07_05:00:00 - papers: - - tacl.1843 - - tacl.1876 -6B: - date: 2020-07-07_06:00:00 - papers: - - tacl.1709 - - tacl.1779 -7A: - date: 2020-07-07_08:00:00 - papers: - - tacl.1843 - - tacl.1876 - - tacl.2001 -8A: - date: 2020-07-07_12:00:00 - papers: - - tacl.1845 -8B: - date: 2020-07-07_13:00:00 - papers: - - tacl.1709 - - tacl.1756 - - tacl.1779 - - tacl.1852 - - tacl.1892 - - tacl.2001 -9A: - date: 2020-07-07_17:00:00 - papers: - - tacl.1853 -9B: - date: 2020-07-07_18:00:00 - papers: - - tacl.1845 - - tacl.1882 - - tacl.1892 - - tacl.1929 -10A: - date: 2020-07-07_20:00:00 - papers: - - tacl.1756 - - tacl.1852 - - tacl.1882 - - tacl.1929 -10B: - date: 2020-07-07_21:00:00 - papers: - - tacl.1853 -11B: - date: 2020-07-08_06:00:00 - papers: - - tacl.1805 - - tacl.1811 -12A: - date: 2020-07-08_08:00:00 - 
papers: - - tacl.1805 - - tacl.1811 -13A: - date: 2020-07-08_12:00:00 - papers: - - tacl.1906 - - tacl.1967 -13B: - date: 2020-07-08_13:00:00 - papers: - - tacl.1759 -14A: - date: 2020-07-08_17:00:00 - papers: - - tacl.1759 - - tacl.1886 - - tacl.1906 - - tacl.1912 - - tacl.1967 -14B: - date: 2020-07-08_18:00:00 - papers: - - tacl.1743 - - tacl.1801 -15A: - date: 2020-07-08_20:00:00 - papers: - - tacl.1886 - - tacl.1912 -15B: - date: 2020-07-08_21:00:00 - papers: - - tacl.1743 - - tacl.1801 diff --git a/sitedata/tacl_paper_slideslive_ids.csv b/sitedata/tacl_paper_slideslive_ids.csv deleted file mode 100644 index 0d6038a..0000000 --- a/sitedata/tacl_paper_slideslive_ids.csv +++ /dev/null @@ -1,32 +0,0 @@ -UID,presentation_id -tacl.1709,38929484 -tacl.1720,38929485 -tacl.1727,38929486 -tacl.1743,38929487 -tacl.1756,38929488 -tacl.1759,38929489 -tacl.1766,38929490 -tacl.1779,38929491 -tacl.1780,38929492 -tacl.1801,38929493 -tacl.1805,38929494 -tacl.1811,38929495 -tacl.1815,38929496 -tacl.1834,38929497 -tacl.1843,38929498 -tacl.1845,38929499 -tacl.1849,38929500 -tacl.1852,38929501 -tacl.1853,38929502 -tacl.1876,38929503 -tacl.1882,38929504 -tacl.1886,38929505 -tacl.1892,38929506 -tacl.1901,38929507 -tacl.1903,38929508 -tacl.1906,38929509 -tacl.1912,38929510 -tacl.1915,38929511 -tacl.1929,38929512 -tacl.1967,38929513 -tacl.2001,38929514 diff --git a/sitedata/tacl_paper_zoom_links.csv b/sitedata/tacl_paper_zoom_links.csv deleted file mode 100644 index b41cc13..0000000 --- a/sitedata/tacl_paper_zoom_links.csv +++ /dev/null @@ -1,63 +0,0 @@ -UID,session_name,starttime,endtime,timezone,zoom_join_link -tacl.1709,6B,2020-07-07T06:00:00Z,2020-07-07T07:00:00Z,UTC,https://zoom.us/j/96590105613 -tacl.1709,8B,2020-07-07T13:00:00Z,2020-07-07T14:00:00Z,UTC,https://zoom.us/j/98572534304 -tacl.1720,4A,2020-07-06T17:00:00Z,2020-07-06T18:00:00Z,UTC,https://zoom.us/j/97896468186 -tacl.1720,5A,2020-07-06T20:00:00Z,2020-07-06T21:00:00Z,UTC,https://zoom.us/j/94696909873 -tacl.1727,4B,2020-07-06T18:00:00Z,2020-07-06T19:00:00Z,UTC,https://zoom.us/j/99534470603 -tacl.1727,5B,2020-07-06T21:00:00Z,2020-07-06T22:00:00Z,UTC,https://zoom.us/j/97977868953 -tacl.1743,14B,2020-07-08T18:00:00Z,2020-07-08T19:00:00Z,UTC,https://zoom.us/j/97199467059 -tacl.1743,15B,2020-07-08T21:00:00Z,2020-07-08T22:00:00Z,UTC,https://zoom.us/j/96456195642 -tacl.1756,8B,2020-07-07T13:00:00Z,2020-07-07T14:00:00Z,UTC,https://zoom.us/j/92596648393 -tacl.1756,10A,2020-07-07T20:00:00Z,2020-07-07T21:00:00Z,UTC,https://zoom.us/j/95850103088 -tacl.1759,13B,2020-07-08T13:00:00Z,2020-07-08T14:00:00Z,UTC,https://zoom.us/j/94962054808 -tacl.1759,14A,2020-07-08T17:00:00Z,2020-07-08T18:00:00Z,UTC,https://zoom.us/j/91186540555 -tacl.1766,4B,2020-07-06T18:00:00Z,2020-07-06T19:00:00Z,UTC,https://zoom.us/j/98662755691 -tacl.1766,5B,2020-07-06T21:00:00Z,2020-07-06T22:00:00Z,UTC,https://zoom.us/j/91925354346 -tacl.1779,6B,2020-07-07T06:00:00Z,2020-07-07T07:00:00Z,UTC,https://zoom.us/j/93815623121 -tacl.1779,8B,2020-07-07T13:00:00Z,2020-07-07T14:00:00Z,UTC,https://zoom.us/j/95369197123 -tacl.1780,4A,2020-07-06T17:00:00Z,2020-07-06T18:00:00Z,UTC,https://zoom.us/j/97067462588 -tacl.1780,5A,2020-07-06T20:00:00Z,2020-07-06T21:00:00Z,UTC,https://zoom.us/j/97768719888 -tacl.1801,14B,2020-07-08T18:00:00Z,2020-07-08T19:00:00Z,UTC,https://zoom.us/j/95587071452 -tacl.1801,15B,2020-07-08T21:00:00Z,2020-07-08T22:00:00Z,UTC,https://zoom.us/j/91330157384 -tacl.1805,11B,2020-07-08T06:00:00Z,2020-07-08T07:00:00Z,UTC,https://zoom.us/j/95155789983 
-tacl.1805,12A,2020-07-08T08:00:00Z,2020-07-08T09:00:00Z,UTC,https://zoom.us/j/99920913912 -tacl.1811,11B,2020-07-08T06:00:00Z,2020-07-08T07:00:00Z,UTC,https://zoom.us/j/98939618192 -tacl.1811,12A,2020-07-08T08:00:00Z,2020-07-08T09:00:00Z,UTC,https://zoom.us/j/91811783241 -tacl.1815,1A,2020-07-06T05:00:00Z,2020-07-06T06:00:00Z,UTC,https://zoom.us/j/93780804165 -tacl.1815,2B,2020-07-06T09:00:00Z,2020-07-06T10:00:00Z,UTC,https://zoom.us/j/93265538460 -tacl.1834,4A,2020-07-06T17:00:00Z,2020-07-06T18:00:00Z,UTC,https://zoom.us/j/98110497740 -tacl.1834,5A,2020-07-06T20:00:00Z,2020-07-06T21:00:00Z,UTC,https://zoom.us/j/91200659482 -tacl.1843,6A,2020-07-07T05:00:00Z,2020-07-07T06:00:00Z,UTC,https://zoom.us/j/94203807932 -tacl.1843,7A,2020-07-07T08:00:00Z,2020-07-07T09:00:00Z,UTC,https://zoom.us/j/99009708460 -tacl.1845,8A,2020-07-07T12:00:00Z,2020-07-07T13:00:00Z,UTC,https://zoom.us/j/97834974219 -tacl.1845,9B,2020-07-07T18:00:00Z,2020-07-07T19:00:00Z,UTC,https://zoom.us/j/98052172973 -tacl.1849,3B,2020-07-06T13:00:00Z,2020-07-06T14:00:00Z,UTC,https://zoom.us/j/96940604947 -tacl.1849,4B,2020-07-06T18:00:00Z,2020-07-06T19:00:00Z,UTC,https://zoom.us/j/99604446502 -tacl.1852,8B,2020-07-07T13:00:00Z,2020-07-07T14:00:00Z,UTC,https://zoom.us/j/92831637858 -tacl.1852,10A,2020-07-07T20:00:00Z,2020-07-07T21:00:00Z,UTC,https://zoom.us/j/95904047072 -tacl.1853,9A,2020-07-07T17:00:00Z,2020-07-07T18:00:00Z,UTC,https://zoom.us/j/93666619676 -tacl.1853,10B,2020-07-07T21:00:00Z,2020-07-07T22:00:00Z,UTC,https://zoom.us/j/95209003576 -tacl.1876,6A,2020-07-07T05:00:00Z,2020-07-07T06:00:00Z,UTC,https://zoom.us/j/95620198857 -tacl.1876,7A,2020-07-07T08:00:00Z,2020-07-07T09:00:00Z,UTC,https://zoom.us/j/98295165471 -tacl.1882,9B,2020-07-07T18:00:00Z,2020-07-07T19:00:00Z,UTC,https://zoom.us/j/98617784044 -tacl.1882,10A,2020-07-07T20:00:00Z,2020-07-07T21:00:00Z,UTC,https://zoom.us/j/91626696411 -tacl.1886,14A,2020-07-08T17:00:00Z,2020-07-08T18:00:00Z,UTC,https://zoom.us/j/91952176349 -tacl.1886,15A,2020-07-08T20:00:00Z,2020-07-08T21:00:00Z,UTC,https://zoom.us/j/93955657226 -tacl.1892,8B,2020-07-07T13:00:00Z,2020-07-07T14:00:00Z,UTC,https://zoom.us/j/98903305485 -tacl.1892,9B,2020-07-07T18:00:00Z,2020-07-07T19:00:00Z,UTC,https://zoom.us/j/92788220456 -tacl.1901,1B,2020-07-06T06:00:00Z,2020-07-06T07:00:00Z,UTC,https://zoom.us/j/97218458626 -tacl.1901,3B,2020-07-06T13:00:00Z,2020-07-06T14:00:00Z,UTC,https://zoom.us/j/97385752394 -tacl.1903,4B,2020-07-06T18:00:00Z,2020-07-06T19:00:00Z,UTC,https://zoom.us/j/98008219134 -tacl.1903,5B,2020-07-06T21:00:00Z,2020-07-06T22:00:00Z,UTC,https://zoom.us/j/92797221295 -tacl.1906,13A,2020-07-08T12:00:00Z,2020-07-08T13:00:00Z,UTC,https://zoom.us/j/99364222722 -tacl.1906,14A,2020-07-08T17:00:00Z,2020-07-08T18:00:00Z,UTC,https://zoom.us/j/91401252683 -tacl.1912,14A,2020-07-08T17:00:00Z,2020-07-08T18:00:00Z,UTC,https://zoom.us/j/94204082942 -tacl.1912,15A,2020-07-08T20:00:00Z,2020-07-08T21:00:00Z,UTC,https://zoom.us/j/95490501224 -tacl.1915,1A,2020-07-06T05:00:00Z,2020-07-06T06:00:00Z,UTC,https://zoom.us/j/97378826825 -tacl.1915,2B,2020-07-06T09:00:00Z,2020-07-06T10:00:00Z,UTC,https://zoom.us/j/93557573447 -tacl.1929,9B,2020-07-07T18:00:00Z,2020-07-07T19:00:00Z,UTC,https://zoom.us/j/92739183372 -tacl.1929,10A,2020-07-07T20:00:00Z,2020-07-07T21:00:00Z,UTC,https://zoom.us/j/91799727657 -tacl.1967,13A,2020-07-08T12:00:00Z,2020-07-08T13:00:00Z,UTC,https://zoom.us/j/91040465945 -tacl.1967,14A,2020-07-08T17:00:00Z,2020-07-08T18:00:00Z,UTC,https://zoom.us/j/99520602996 
-tacl.2001,7A,2020-07-07T08:00:00Z,2020-07-07T09:00:00Z,UTC,https://zoom.us/j/99812625195 -tacl.2001,8B,2020-07-07T13:00:00Z,2020-07-07T14:00:00Z,UTC,https://zoom.us/j/91756402149 diff --git a/sitedata/tacl_papers.csv b/sitedata/tacl_papers.csv deleted file mode 100644 index 0ce567e..0000000 --- a/sitedata/tacl_papers.csv +++ /dev/null @@ -1,32 +0,0 @@ -UID,title,authors,abstract,keywords,track,paper_type,pdf_url,emails -tacl.1779,Membership Inference Attacks on Sequence-to-Sequence Models: Is My Data In Your Machine Translation System?,Sorami Hisamoto|Matt Post|Kevin Duh,"Data privacy is an important issue for ""machine learning as a service"" providers. We focus on the problem of membership inference attacks: given a data sample and black-box access to a model's API, determine whether the sample existed in the model's training data. Our contribution is an investigation of this problem in the context of sequence-to-sequence models, which are important in applications such as machine translation and video captioning. We define the membership inference problem for sequence generation, provide an open dataset based on state-of-the-art machine translation models, and report initial results on whether these models leak private information against several kinds of membership inference attacks. ",Machine System|Data privacy|machine learning|membership attacks,Interpretability and Analysis of Models for NLP,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00299,s@89.io|post@cs.jhu.edu|kevinduh@cs.jhu.edu| -tacl.1815,Theoretical Limitations of Self-Attention in Neural Sequence Models,Michael Hahn,"Transformers are emerging as the new workhorse of NLP, showing great success across tasks. Unlike LSTMs, transformers process input sequences entirely through self-attention. Previous work has suggested that the computational capabilities of self-attention to process hierarchical structures are limited. In this work, we mathematically investigate the computational power of self-attention to model formal languages. Across both soft and hard attention, we show strong theoretical limitations of the computational abilities of self-attention, finding that it cannot model periodic finite-state languages, nor hierarchical structure, unless the number of layers or heads increases with input length. These limitations seem surprising given the practical success of self-attention and the prominent role assigned to hierarchical structure in linguistics, suggesting that natural language can be approximated well with models that are too weak for the formal languages typically assumed in theoretical linguistics.",NLP|Self-Attention Models|Neural Models|Transformers,Theory and Formalism in NLP (Linguistic and Mathematical),TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00306,mhahn2@stanford.edu -tacl.1801,Deep Contextualized Self-training for Low Resource Dependency Parsing ,Guy Rotman|Roi Reichart,"Neural dependency parsing has proven very effective, achieving state-of-the-art results on numerous domains and languages. Unfortunately, it requires large amounts of labeled data, that is costly and laborious to create. In this paper we propose a self-training algorithm that alleviates this annotation bottleneck by training a parser on its own output. 
Our Deep Contextualized Selftraining (DCST) algorithm utilizes representation models trained on sequence labeling tasks that are derived from the parser’s output when applied to unlabeled data, and integrates these models with the base parserthrough a gating mechanism. We conduct experiments across multiple languages, both in low resource in-domain and in cross-domain setups, and demonstrate that DCST substantially outperforms traditional self-training as well as recent semi-supervised training methods.",Low Parsing|sequence tasks|Deep Self-training|Neural parsing,"Syntax: Tagging, Chunking and Parsing",TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00294,rotmanguy@gmail.com|roireichart@gmail.com -tacl.1780,Inherent Disagreements in Human Textual Inferences,Ellie Pavlick|Tom Kwiatkowski,"We analyze human’s disagreements about the validity of natural language inferences. We show that, very often, disagreements are not dismissible as annotation “noise”, but rather persist as we collect more ratings and as we vary the amount of context provided to raters. We further show that the type of uncertainty captured by current state-of-the-art models for natural language inference is not reflective of the type of uncertainty present in human disagreements. We discuss implications of our results in relation to the recognizing textual entailment (RTE)/natural language inference (NLI) task. We argue for a refined evaluation objective which requires models to explicitly capture the full distribution of plausible human judgments.",Human Inferences|natural inferences|natural inference|evaluation objective,Semantics: Textual Inference and Other Areas of Semantics,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00293,ellie_pavlick@brown.edu|tomkwiat@google.com -tacl.1743,Machine Learning-Driven Language Assessment,Burr Settles|Masato Hagiwara|Geoffrey T. LaFlair,"We describe a method for rapidly creating language proficiency assessments, and provide experimental evidence that such tests can be valid, reliable, and secure. Our approach is the first to use machine learning and natural language processing to induce proficiency scales based on a given standard, and then use linguistic models to estimate item difficulty directly for computer-adaptive testing. This alleviates the need for expensive pilot testing with human subjects. We used these methods to develop an online proficiency exam called the Duolingo English Test, and demonstrate that its scores align significantly with other high-stakes English assessments. Furthermore, our approach produces test scores that are highly reliable, while generating item banks large enough to satisfy security requirements.",Machine Assessment|language assessments|natural processing|computer-adaptive testing,NLP Applications,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00310,burr@duolingo.com|geoff@duolingo.com|masato@octanove.com -tacl.1756,Paraphrase-Sense-Tagged Sentences,Anne Cocos|Chris Callison-Burch,"Many natural language processing tasks require discriminating the particular meaning of a word in context, but building corpora for developing sense-aware models can be a challenge. We present a large resource of example usages for words having a particular meaning, called Paraphrase-Sense-Tagged Sentences (PSTS). Built upon the premise that a word's paraphrases instantiate its fine-grained meanings -- i.e. 
'bug' has different meanings corresponding to its paraphrases 'fly' and 'microbe' -- the resource contains up to 10,000 sentences for each of 3 million target-paraphrase pairs where the target word takes on the meaning of the paraphrase. We describe an automatic method based on bilingual pivoting used to enumerate sentences for PSTS, and present two models for ranking PSTS sentences based on their quality. Finally, we demonstrate the utility of PSTS by using it to build a dataset for the task of hypernym prediction in context. Training a model on this automatically-generated dataset produces accuracy that is competitive with a model trained on smaller datasets crafted with some manual effort.",natural tasks|ranking sentences|hypernym prediction|sense-aware models,Resources and Evaluation,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00295,acocos@alumni.upenn.edu|ccb@cis.upenn.edu| -tacl.1805,AMR-To-Text Generation with Graph Transformer,Tianming Wang|Xiaojun Wan|Hanqi Jin,"Abstract meaning representation (AMR)-to-text generation is the challenging task of generating natural language texts from AMR graphs, where nodes represent concepts and edges denote relations. The current state-of-the-art methods use graph-to-sequence models; however, they still cannot significantly outperform the previous sequence-to-sequence models or statistical approaches. In this paper, we propose a novel graph-to-sequence model (Graph Transformer) to address the above-mentioned task. The model directly encodes the AMR graphs and learns the node representations. A pairwise interaction function is used for computing the semantic relations between the concepts. Moreover, attention mechanisms are employed for aggregating the information from the incoming and outgoing neighbors, which help the model to capture the semantic information effectively. Our model outperforms the state-of-the-art neural approach by 1.5 BLEU points on LDC2015E86 and 4.8 BLEU points on LDC2017T10 and achieves new state-of-the-art performances.",AMR-To-Text Generation|Abstract generation|generating texts|Graph Transformer,Semantics: Sentence Level,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00297,wangtm@pku.edu.cn|wanxiaojun@pku.edu.cn|jinhanqi@pku.edu.cn -tacl.1811,Unsupervised Discourse Constituency Parsing Using Viterbi EM,Noriki Nishida|Hideki Nakayama,"In this paper, we introduce an unsupervised discourse constituency parsing algorithm. We use Viterbi EM with a margin-based criterion to train a span-based discourse parser in an unsupervised manner. We also propose initialization methods for Viterbi training of discourse constituents based on our prior knowledge of text structures. Experimental results demonstrate that our unsupervised parser achieves comparable or even superior performance to fully supervised parsers. We also investigate discourse constituents that are learned by our method.",Viterbi constituents|Unsupervised Parsing|Viterbi EM|unsupervised algorithm,Discourse and Pragmatics,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00312,norikinishida@gmail.com|nakayama@nlab.ci.i.u-tokyo.ac.jp -tacl.1967,Syntax-guided Controlled Generation of Paraphrases,Ashutosh Kumar|Kabir Ahuja|Raghuram Vadapalli|Partha Talukdar,"Given a sentence (e.g., ""I like mangoes"") and a constraint (e.g., negative sentiment), the goal of controlled text generation is to produce a sentence that adapts the input sentence to meet the requirements of the constraint (e.g., ""I hate mangoes""). 
Going beyond such simple constraints, recent works have started exploring the incorporation of complex syntactic-guidance as constraints in the task of controlled paraphrase generation. In these methods, syntactic-guidance is sourced from a separate exemplar sentence. However, these prior works have only utilized limited syntactic information available in the parse tree of the exemplar sentence. We address this limitation in the paper and propose Syntax Guided Controlled Paraphraser (SGCP), an end-to-end framework for syntactic paraphrase generation. We find that S GCP can generate syntax-conforming sentences while not compromising on relevance. We perform extensive automated and human evaluations over multiple real-world datasets to demonstrate the efficacy of SGCP over state-of-the-art baselines. To drive future research, we have made SGCP ’s source code available.",Syntax-guided Paraphrases|controlled generation|syntactic generation|automated evaluations,Generation,TEMNLP,https://arxiv.org/abs/2005.08417,ashutosh@iisc.ac.in|kabirahuja2431@gmail.com|raghuram.4350@gmail.com|ppt@iisc.ac.in| -tacl.1849,Leveraging Pre-trained Checkpoints for Sequence Generation Tasks,Sascha Rothe|Shashi Narayan and Aliaksei Severyn,"Unsupervised pre-training of large neural models has recently revolutionized Natural Language Processing. By warm-starting from the publicly released checkpoints, NLP practitioners have pushed the state-of-the-art on multiple benchmarks while saving significant amounts of compute time. So far the focus has been mainly on the Natural Language Understanding tasks. In this paper, we demonstrate the efficacy of pre-trained checkpoints for Sequence Generation. We developed a Transformer-based sequence-to-sequence model that is compatible with publicly available pre-trained BERT, GPT-2 and RoBERTa checkpoints and conducted an extensive empirical study on the utility of initializing our model, both encoder and decoder, with these checkpoints. Our models result in new state-of-the-art results on Machine Translation, Text Summarization, Sentence Splitting, and Sentence Fusion.",Sequence Tasks|Natural Processing|Natural tasks|Sequence Generation,Generation,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00313,rothe@google.com|shashinarayan@google.com|severyn@google.com -tacl.1929,TyDi QA: A Benchmark for Information-Seeking Question Answering in Typologically Diverse Languages,Jonathan H Clark|Jennimaria Palomaki|Vitaly Nikolaev|Eunsol Choi|Dan Garrette|Michael Collins|Tom Kwiatkowski,"Confidently making progress on multilingual modeling requires challenging, trustworthy evaluations. We present TyDi QA, a question answering dataset covering 11 typologically diverse languages with 141K question-answer pairs. The languages of TyDi QA are diverse with regard to their typology --- the set of linguistic features that each language expresses --- such that we expect models performing well on this set to generalize across a large number of the languages in the world. We present a quantitative analysis of the data quality and example-level qualitative linguistic analyses of observed language phenomena that would not be found in English-only corpora. To provide a realistic information-seeking task and avoid priming effects, questions are written by people who want to know the answer, but don't know the answer yet, and the data is collected directly in each language without the use of translation. 
We provide initial quality measurements with a baseline model, suggesting a significant room for future work on this data.",Information-Seeking Answering|multilingual modeling|information-seeking task|translation,Question Answering,TEMNLP,https://arxiv.org/abs/2003.05002,jpalomaki@google.com|vitalyn@google.com|eunsolc@google.com|dhgarrette@google.com|mjcollins@google.com|tomkwiat@google.com>|jhclark@google.com -tacl.1915,How Furiously Can Colourless Green Ideas Sleep? Sentence Acceptability in Context,Jey Han Lau|Carlos Santos Armendariz|Matthew Purver|Chang Shu|Shalom Lappin,"We study the influence of context on sentence acceptability. First we compare the acceptability ratings of sentences judged in isolation, with a relevant context, and with an irrelevant context. Our results show that context induces a cognitive load for humans, which compresses the distribution of ratings. Moreover, in relevant contexts we observe a discourse coherence effect which uniformly raises acceptability. Next, we test unidirectional and bidirectional language models in their ability to predict acceptability ratings. The bidirectional models show very promising results, with the best model achieving a new state-of-the-art for unsupervised acceptability prediction. The two sets of experiments provide insights into the cognitive aspects of sentence processing and central issues in the computational modelling of text and discourse.",unsupervised prediction|cognitive processing|computational discourse|unidirectional models,Cognitive Modeling and Psycholinguistics,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00315,jeyhan.lau@gmail.com|c.santosarmendariz@qmul.ac.uk shalom.lappin@gu.se|m.purver@qmul.ac.uk|shuchang0011@gmail.com -tacl.1901,CrossWOZ: A Large-Scale Chinese Cross-Domain Task-Oriented Dialogue Dataset,Qi Zhu|Kaili Huang|Zheng Zhang|Xiaoyan Zhu|Minlie Huang,"To advance multi-domain (cross-domain) dialogue modeling as well as alleviate the shortage of Chinese task-oriented datasets, we propose CrossWOZ, the first large-scale Chinese Cross-Domain Wizard-of-Oz task-oriented dataset. It contains 6K dialogue sessions and 102K utterances for 5 domains, including hotel, restaurant, attraction, metro, and taxi. Moreover, the corpus contains rich annotation of dialogue states and dialogue acts at both user and system sides. About 60% of the dialogues have cross-domain user goals that favor inter-domain dependency and encourage natural transition across domains in conversation. We also provide a user simulator and several benchmark models for pipelined task-oriented dialogue systems, which will facilitate researchers to compare and evaluate their models on this corpus. The large size and rich annotation of CrossWOZ make it suitable to investigate a variety of tasks in cross-domain dialogue modeling, such as dialogue state tracking, policy learning, user simulation, etc.",multi-domain modeling|pipelined systems|cross-domain modeling|dialogue tracking,Dialogue and Interactive Systems,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00314,zhuq96@gmail.com|kaili.khuang@gmail.com|zhangz.goal@gmail.com|zxy-dcs@tsinghua.edu.cn|aihuang@tsinghua.edu.cn -tacl.1727,Perturbation Based Learning for Structured NLP tasks with Application to Dependency Parsing ,Amichay Doitch|Ram Yazdi|Tamir Hazan|Roi Reichart,"The best solution of structured prediction models in NLP is often inaccurate due to limited expressive power of the model or to non-exact parameter estimation. 
One way to mitigate this problem is sampling candidate solutions from the model’s solution space, reasoning that effective exploration of this space should yield high quality solutions. Unfortunately, sampling is often computationally hard and many works hence back-off to sub-optimal strategies such as extraction of the best scoring solutions of the model, which are not as diverse as sampled solutions. In this paper we propose a perturbation-based approach where sampling from a probabilistic model is computationally efficient. We present a learning algorithm for the variance of the perturbations, and empirically demonstrate its importance. Moreover, while finding the argmax in our model is intractable, we propose an efficient and effective approximation. We apply our framework to cross-lingual dependency parsing across 72 corpora from 42 languages and to lightly supervised dependency parsing across 13 corpora from 12 languages and demonstrate strong results in terms of both the quality of the entire solution list and of the final solution.",Structured tasks|Dependency Parsing|NLP|sampling,Machine Learning for NLP,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00291,amichay.d@gmail.com|ramyazdi1012@gmail.com|tamir.hazan@gmail.com|roireichart@gmail.com -tacl.1876,A Graph-based Model for Joint Chinese Word Segmentation and Dependency Parsing,Hang Yan|Xipeng Qiu|Xuanjing Huang,"Chinese word segmentation and dependency parsing are two fundamental tasks for Chinese natural language processing. The dependency parsing is defined on word-level. Therefore, word segmentation is the precondition of dependency parsing, which makes dependency parsing suffer from error propagation and unable to directly make use of the character-level pre-trained language model (such as BERT). In this paper, we propose a graph-based model to integrate Chinese word segmentation and dependency parsing. Different from previous transition-based joint models, our proposed model is more concise, which results in fewer efforts of feature engineering. Our graph-based joint model achieves better performance than previous joint models and state-of-the-art results in both Chinese word segmentation and dependency parsing. Besides, when BERT is combined, our model can substantially reduce the performance gap of dependency parsing between joint models and gold-segmented word-based models. Our code is publicly available at https://github.com/fastnlp/JointCwsParser.",Joint Segmentation|Joint Parsing|Chinese segmentation|dependency parsing,"Syntax: Tagging, Chunking and Parsing",TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00301,hyan11@fudan.edu.cn|xjhuang@fudan.edu.cn|xpqiu@fudan.edu.cn -tacl.1903,Learning Lexical Subspaces in a Distributional Vector Space,Kushal Arora|Aishik Chakraborty|Jackie Chi Kit Cheung,"In this paper, we propose LEXSUB, a novel approach towards unifying lexical and distributional semantics. We inject knowledge about lexical-semantic relations into distributional word embeddings by defining subspaces of the distributional vector space in which a lexical relation should hold. Our framework can handle symmetric attract and repel relations (e.g., synonymy and antonymy, respectively), as well as asymmetric relations (e.g., hypernymy and meronomy). In a suite of intrinsic benchmarks, we show that our model outperforms previous post-hoc approaches on relatedness tasks, and on hypernymy classification and detection while being competitive on word similarity tasks. 
It also outperforms previous systems on extrinsic classification tasks that benefit from exploiting lexical relational cues. We perform a series of analyses to understand the behaviors of our model.",relatedness tasks|hypernymy classification|detection|word tasks,Semantics: Lexical,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00316,kushal.arora@mail.mcgill.ca|jcheung@cs.mcgill.ca|aishik.chakraborty@mail.mcgill.ca -tacl.1720,"Decomposing Generalization: Models of Generic, Habitual and Episodic Statements",Venkata Subrahmanyan Govindarajan|Benjamin Van Durme|Aaron Steven White,"We present a novel semantic framework for modeling linguistic expressions of generalization— generic, habitual, and episodic statements—as combinations of simple, real-valued referential properties of predicates and their arguments. We use this framework to construct a dataset covering the entirety of the Universal Dependencies English Web Treebank. We use this dataset to probe the efficacy of type-level and token-level information—including hand-engineered features and static (GloVe) and contextual (ELMo) word embeddings—for predicting expressions of generalization.",linguistic generalization—|predicting generalization|expressions generalization|Decomposing Generalization,Semantics: Textual Inference and Other Areas of Semantics,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00285,venkatasg@utexas.edu|vandurme@jhu.edu|Aaron.White@rochester.edu -tacl.1709,Tabula nearly Rasa: Probing the linguistic knowledge of character-level neural language models trained on unsegmented text,Michael Hahn|Marco Baroni,"Recurrent neural networks (RNNs) reached striking performance in many natural language processing tasks. This has renewed interest in whether these generic sequence processing devices are inducing genuine linguistic knowledge. Nearly all current analytical studies, however, initialize the RNNs with a vocabulary of known words, and feed them tokenized input during training. We present a multi-lingual study of the linguistic knowledge encoded in RNNs trained as character-level language models, on input data with word boundaries removed. These networks face a tougher and more cognitively realistic task, having to discover and store any useful linguistic unit from scratch, based on input statistics. The results show that our ""near tabula rasa"" RNNs are mostly able to solve morphological, syntactic and semantic tasks that intuitively presuppose word-level knowledge, and indeed they learned to track ""soft"" word boundaries. Our study opens the door to speculations about the necessity of an explicit word lexicon in language learning and usage.",natural tasks|morphological tasks|language usage|Tabula,Interpretability and Analysis of Models for NLP,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00283,mhahn2@stanford.edu|mbaroni@gmail.com -tacl.1912,Decoding Brain Activity Associated with Literal and Metaphoric Sentence Comprehension using Distributional Semantic Models,Vesna G. Djokic|Jean Maillard|Luana Bulat|Ekaterina Shutova,"Recent years have seen a growing interest within the natural language processing (NLP) community in evaluating the ability of semantic models to capture human meaning representation in the brain. Existing research has mainly focused on applying semantic models to decode brain activity patterns associated with the meaning of individual words, and, more recently, this approach has been extended to sentences and larger text fragments. 
Our work is the first to investigate metaphor processing in the brain in this context. We evaluate a range of semantic models (word embeddings, compositional, and visual models) in their ability to decode brain activity associated with reading of both literal and metaphoric sentences. Our results suggest that compositional models and word embeddings are able to capture differences in the processing of literal and metaphoric sentences, providing support for the idea that the literal meaning is not fully accessible during familiar metaphor comprehension.",Decoding Activity|Literal Comprehension|human representation|metaphor processing,Semantics: Sentence Level,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00307,vgdjokic@gmail.com|jean@maillard.it|ltf24@cam.ac.uk|shutova.e@gmail.com|gamezdjo@usc.edu -tacl.1906,Improving Candidate Generation for Low-resource Cross-lingual Entity Linking,Shuyan Zhou|Shruti Rijhwani|John Wieting|Jaime Carbonell|Graham Neubig,"Cross-lingual entity linking (XEL) is the task of finding referents in a target-language knowledge base (KB) for mentions extracted from source-language texts. The first step of (X)EL is candidate generation, which retrieves a list of plausible candidate entities from the target-language KB for each mention. Approaches based on resources from Wikipedia have proven successful in the realm of relatively high-resource languages (HRL), but these do not extend well to low-resource languages (LRL) with few, if any, Wikipedia pages. Recently, transfer learning methods have been shown to reduce the demand for resources in the LRL by utilizing resources in closely-related languages, but the performance still lags far behind their high-resource counterparts. In this paper, we first assess the problems faced by current entity candidate generation methods for low-resource XEL, then propose three improvements that (1) reduce the disconnect between entity mentions and KB entries, and (2) improve the robustness of the model to low-resource scenarios. The methods are simple, but effective: we experiment with our approach on seven XEL datasets and find that they yield an average gain of 16.9% in Top-30 gold candidate recall, compared to state-of-the-art baselines. Our improved model also yields an average gain of 7.9% in in-KB accuracy of end-to-end XEL.",Candidate Generation|Low-resource Linking|Cross-lingual linking|Cross-lingual XEL,Information Extraction,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00303,gneubig@cs.cmu.edu|shuyanzh@andrew.cmu.edu|jwieting@andrew.cmu.edu|srijhwan@andrew.cmu.edu -tacl.2001,Better Document-level Machine Translation with Bayes' Rule,Lei Yu|Laurent Sartran|Wojciech Stokowiec|Wang Ling|Lingpeng Kong|Phil Blunsom|Chris Dyer,"We show that Bayes' rule provides an effective mechanism for creating document translation models that can be learned from only parallel sentences and monolingual documents---a compelling benefit as parallel documents are not always available. In our formulation, the posterior probability of a candidate translation is the product of the unconditional (prior) probability of the candidate output document and the ``reverse translation probability'' of translating the candidate output back into the source language. Our proposed model uses a powerful autoregressive language model as the prior on target language documents, but it assumes that each sentence is translated independently from the target to the source language. 
Crucially, at test time, when a source document is observed, the document language model prior induces dependencies between the translations of the source sentences in the posterior. The model's independence assumption not only enables efficient use of available data, but it additionally admits a practical left-to-right beam-search algorithm for carrying out inference. Experiments show that our model benefits from using cross-sentence context in the language model, and it outperforms existing document translation approaches.",Document-level Translation|inference|Bayes Rule|document models,Machine Translation,TEMNLP,,leiyu@google.com|lsartran@google.com|wstokowiec@google.com|lingwang@google.com|lingpenk@google.com|pblunsom@google.com|cdyer@google.com -tacl.1882,Investigating Prior Knowledge for Challenging Chinese Machine Reading Comprehension,Kai Sun|Dian Yu|Dong Yu|Claire Cardie,"Machine reading comprehension tasks require a machine reader to answer questions relevant to the given document. In this paper, we present the first free-form multiple-Choice Chinese machine reading Comprehension dataset (C^3), containing 13,369 documents (dialogues or more formally written mixed-genre texts) and their associated 19,577 multiple-choice free-form questions collected from Chinese-as-a-second-language examinations. We present a comprehensive analysis of the prior knowledge (i.e., linguistic, domain-specific, and general world knowledge) needed for these real-world problems. We implement rule-based and popular neural methods and find that there is still a significant performance gap between the best performing model (68.5%) and human readers (96.0%), especially on problems that require prior knowledge. We further study the effects of distractor plausibility and data augmentation based on translated relevant datasets for English on model performance. We expect C^3 to present great challenges to existing systems as answering 86.8% of questions requires both knowledge within and beyond the accompanying document, and we hope that C^3 can serve as a platform to study how to leverage various kinds of prior knowledge to better understand a given written or orally oriented text. C^3 is available at https://dataset.org/c3/.",Chinese Comprehension|Machine tasks|real-world problems|data augmentation,Question Answering,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00305,ks985@cornell.edu|yudian@tencent.com|dyu@tencent.com|cardie@cs.cornell.edu -tacl.1843,Target-Guided Structured Attention Network for Target-dependent Sentiment Analysis,Ji Zhang|Chengyao Chen|Pengfei Liu|Chao He|Cane Wing-Ki Leung,"Target-dependent sentiment analysis (TDSA) aims to classify the sentiment of a text towards a given target. The major challenge of this task lies in modeling the semantic relatedness between a target and its context sentence. This paper proposes a novel Target-Guided Structured Attention Network (TG-SAN), which captures target-related contexts for TDSA in a fine-to-coarse manner. Given a target and its context sentence, the proposed TG-SAN first identifies multiple semantic segments from the sentence using a target-guided structured attention mechanism. It then fuses the extracted segments based on their relatedness with the target for sentiment classification. We present comprehensive comparative experiments on three benchmarks with three major findings. Firstly, TG-SAN outperforms the state-of-the-art by up to 1.61% and 3.58% in terms of accuracy and Marco-F1 respectively. 
Secondly, it shows a strong advantage in determining the sentiment of a target when the context sentence contains multiple semantic segments. Lastly, the attention results produced by TG-SAN are highly interpretable as visualization results shown.",Target-dependent Analysis|TDSA|sentiment classification|visualization,"Sentiment Analysis, Stylistic Analysis, and Argument Mining",TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00308,stacychen@wisers.com|ppfliu@gmail.com|chaohe@wisers.com|caneleung@wisers.com|jasonzhang@wisers.com -tacl.1852,What BERT Is Not: Lessons from a New Suite of Psycholinguistic Diagnostics for Language Models,Allyson Ettinger,"Pre-training by language modeling has become a popular and successful approach to NLP tasks, but we have yet to understand exactly what linguistic capacities these pre-training processes confer upon models. In this paper we introduce a suite of diagnostics drawn from human language experiments, which allow us to ask targeted questions about information used by language models for generating predictions in context. As a case study, we apply these diagnostics to the popular BERT model, finding that it can generally distinguish good from bad completions involving shared category or role reversal, albeit with less sensitivity than humans, and it robustly retrieves noun hypernyms, but it struggles with challenging inference and role-based event prediction — and, in particular, it shows clear insensitivity to the contextual impacts of negation.",Pre-training|NLP tasks|inference|role-based prediction,Interpretability and Analysis of Models for NLP,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00298,aettinger@uchicago.edu -tacl.1853,SpanBERT: Improving Pre-training by Representing and Predicting Spans,Mandar Joshi|Danqi Chen|Yinhan Liu|Daniel S. Weld|Luke Zettlemoyer|Omer Levy,"We present SpanBERT, a pre-training method that is designed to better represent and predict spans of text. Our approach extends BERT by (1) masking contiguous random spans, rather than random tokens, and (2) training the span boundary representations to predict the entire content of the masked span, without relying on the individual token representations within it. SpanBERT consistently outperforms BERT and our better-tuned baselines, with substantial gains on span selection tasks such as question answering and coreference resolution. In particular, with the same training data and model size as BERT-Large, our single model obtains 94.6% and 88.7% F1 on SQuAD 1.1 and 2.0 respectively. We also achieve a new state of the art on the OntoNotes coreference resolution task (79.6% F1), strong performance on the TACRED relation extraction benchmark, and even gains on GLUE.",span tasks|question answering|coreference resolution|OntoNotes task,Machine Learning for NLP,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00300,danqic@cs.princeton.edu|yinhan@ai2incubator.com|weld@cs.washington.edu|lsz@cs.washington.edu|omerlevy@fb.com|mandar90@cs.washington.edu -tacl.1845,Break It Down: A Question Understanding Benchmark,Tomer Wolfson|Mor Geva|Ankit Gupta|Yoav Goldberg|Matt Gardner|Daniel Deutch|Jonathan Berant,"Understanding natural language questions entails the ability to break down a question into the requisite steps for computing its answer. In this work, we introduce a Question Decomposition Meaning Representation (QDMR) for questions. 
QDMR constitutes the ordered list of steps, expressed through natural language, that are necessary for answering a question. We develop a crowdsourcing pipeline, showing that quality QDMRs can be annotated at scale, and release the Break dataset, containing over 83K pairs of questions and their QDMRs. We demonstrate the utility of QDMR by showing that (a) it can be used to improve open-domain question answering on the HotpotQA dataset, (b) it can be deterministically converted to a pseudo-SQL formal language, which can alleviate annotation in semantic parsing applications. Last, we use Break to train a sequence-to-sequence model with copying that parses questions into QDMR structures, and show that it substantially outperforms several natural baselines.",Question Benchmark|Understanding questions|open-domain answering|annotation,Question Answering,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00309,tomerwol@mail.tau.ac.il|morgeva@mail.tau.ac.il|ankitgupta.iitkanpur@gmail.com|yoav.goldberg@gmail.com|danielde@post.tau.ac.il|joberant@cs.tau.ac.il -tacl.1886,A Knowledge-Enhanced Pretraining Model for Commonsense Story Generation,Jian Guan|Fei Huang|Minlie Huang|Zhihao Zhao|Xiaoyan Zhu,"Story generation, namely generating a reasonable story from a leading context, is an important but challenging task. In spite of the success in modeling fluency and local coherence, existing neural language generation models (e.g., GPT-2) still suffer from repetition, logic conflicts, and lack of long-range coherence in generated stories. We conjecture that this is because of the difficulty of associating relevant commonsense knowledge, understanding the causal relationships, and planning entities and events with proper temporal order. In this paper, we devise a knowledge-enhanced pretraining model for commonsense story generation. We propose to utilize commonsense knowledge from external knowledge bases to generate reasonable stories. To further capture the causal and temporal dependencies between the sentences in a reasonable story, we employ multi-task learning which combines a discriminative objective to distinguish true and fake stories during fine-tuning. Automatic and manual evaluation shows that our model can generate more reasonable stories than state-of-the-art baselines, particularly in terms of logic and global coherence.",Commonsense Generation|Story generation|generating story|Automatic evaluation,Generation,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00302,j-guan19@mails.tsinghua.edu.cn|f-huang18@mails.tsinghua.edu.cn|extsuioku@gmail.com|zxy-dcs@tsinghua.edu.cn|aihuang@tsinghua.edu.cn -tacl.1892,Does Syntax Need to Grow on Trees? Sources of Hierarchical Inductive Bias in Sequence-to-Sequence Networks,R. Thomas McCoy|Robert Frank|Tal Linzen,"Learners that are exposed to the same training data might generalize differently due to differing inductive biases. In neural network models, inductive biases could in theory arise from any aspect of the model architecture. We investigate which architectural factors affect the generalization behavior of neural sequence-to-sequence models trained on two syntactic tasks, English question formation and English tense reinflection. For both tasks, the training set is consistent with a generalization based on hierarchical structure and a generalization based on linear order. 
All architectural factors that we investigated qualitatively affected how models generalized, including factors with no clear connection to hierarchical structure. For example, LSTMs and GRUs displayed qualitatively different inductive biases. However, the only factor that consistently contributed a hierarchical bias across tasks was the use of a tree-structured model rather than a model with sequential recurrence, suggesting that human-like syntactic generalization requires architectural syntactic structure.",syntactic tasks|English formation|Sequence-to-Sequence Networks|neural models,Interpretability and Analysis of Models for NLP,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00304,robert.frank@yale.edu|tal.linzen@jhu.edu|tom.mccoy@jhu.edu -tacl.1759,Phonotactic Complexity and Its Trade-offs,Tiago Pimentel|Brian Roark|Ryan D. Cotterell,"We present methods for calculating a measure of phonotactic complexity—bits per phoneme—that permits a straightforward cross-linguistic comparison. When given a word, represented as a sequence of phonemic segments such as symbols in the international phonetic alphabet, and a statistical model trained on a sample of word types from the language, we can approximately measure bits per phoneme using the negative log-probability of that word under the model. This simple measure allows us to compare the entropy across languages, giving insight into how complex a language’s phonotactics is. Using a collection of 1016 basic concept words across 106 languages, we demonstrate a very strong negative correlation of −0.74 between bits per phoneme and the average length of words.",cross-linguistic comparison|statistical model|Phonotactic Complexity|phonemic segments,"Phonology, Morphology and Word Segmentation",TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00296,"tiagopms@gmail.com, roarkbr@gmail.com, ryan.cotterell@gmail.com" -tacl.1834,Acoustic-Prosodic and Lexical Cues to Deception and Trust: Deciphering How People Detect Lies,Xi (Leslie) Chen|Sarah Ita Levitan|Michelle Levine|Marko Mandic|and Julia Hirschberg,"Humans rarely perform better than chance at lie detection. To better understand human perception of deception, we created a game framework, LieCatcher, to collect ratings of perceived deception using a large corpus of deceptive and truthful interviews. We analyzed the acoustic-prosodic and linguistic characteristics of language trusted and mistrusted by raters and compared these to characteristics of actual truthful and deceptive language to understand how perception aligns with reality. With this data we built classifiers to automatically distinguish trusted from mistrusted speech, achieving an F1 of 66.1%. We next evaluated whether the strategies raters said they used to discriminate between truthful and deceptive responses were in fact useful. Our results show that, while several prosodic and lexical features were consistently perceived as trustworthy, they were not reliable cues. Also, the strategies that judges reported using in deception detection were not helpful for the task. 
Our work sheds light on the nature of trusted language and provides insight into the challenging problem of human deception detection.",Deception|lie detection|human deception|deception detection,Speech and Multimodality,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00311,"xi_chen@cs.columbia.edu, sarahita@cs.columbia.edu, mlevine@cs.columbia.edu, mm5305@columbia.edu, julia@cs.columbia.edu" -tacl.1766,Efficient Contextual Representation Learning With Continuous Outputs,Liunian Harold Li|Patrick H. Chen|Cho-Jui Hsieh|Kai-Wei Chang,"Contextual representation models have achieved great success in improving various downstream natural language processing tasks. However, these language-model-based encoders are difficult to train due to their large parameter size and high computational complexity By carefully examining the training procedure, we observe that the softmax layer, which predicts a distribution of the target word, often induces significant overhead, especially when the vocabulary size is large. Therefore, we revisit the design of the output layer and consider directly predicting the pre-trained embedding of the target word for a given context. When applied to ELMo, the proposed approach achieves a 4 times speedup and eliminates 80% trainable parameters while achieving competitive performance on downstream tasks. Further analysis shows that the approach maintains the speed advantage under various settings, even when the sentence encoder is scaled up.",natural tasks|Contextual Learning|Contextual models|language-model-based encoders,Machine Learning for NLP,TEMNLP,https://www.mitpressjournals.org/doi/pdf/10.1162/tacl_a_00289,liliunian@pku.edu.cn|patrickchen@g.ucla.edu|chohsieh@cs.ucla.edu|kw@kwchang.net diff --git a/sitedata/tutorials.yml b/sitedata/tutorials.yml index c93e1fb..1a3f84a 100644 --- a/sitedata/tutorials.yml +++ b/sitedata/tutorials.yml @@ -1,34 +1,6 @@ - UID: T1 - title: Machine Reasoning for Knowledgeable, Explainable and Inferable Models - organizers: Nan Duan, Duyu Tang and Ming Zhou - abstract: Broadly speaking, machine reasoning research aims to build AI systems - that can draw new conclusions or solve unseen problems from what they are told - (i.e. environment) and already know (i.e. knowledge) by using inference techniques - such as induction and deduction. In this tutorial, we will (1) review definitions, - key components and characteristics of machine reasoning; (2) introduce typical - machine reasoning frameworks, including symbolic reasoning, statistical relational - learning, neural-symbolic reasoning and environment-based reasoning; (3) talk - about necessary technologies of applying machine reasoning in NLP scenarios, including - natural language understanding (NLU), human-computer interaction (HCI), natural - language generation (NLG) and model interpretation; (4) show examples of applying - machine reasoning techniques in real-world applications; (5) summarize challenges - of current methods and discuss possible future directions. - website: '' - material: https://2020.emnlp.org/files/emnlp2020-templates.zip - slides: https://v1.overleaf.com/latex/templates/emnlp-2020/knvtcxwzhbbz.pdf - info: |- - This tutorial has a prerecorded talk on this page (see below) that you can watch anytime during the conference. It - also has two live sessions that will be conducted on Zoom and will be livestreamed on this page. 
Additionally, it has - a chat window that you can use to have discussions with the tutorial teachers and other attendees anytime during the - conference. - sessions: - - name: Live Session 1 - start_time: 2020-11-19 00:00:00-07:00 - end_time: 2020-11-19 03:00:00-07:00 - livestream_id: 581567 -- UID: T2 title: Interpreting Predictions of NLP Models - organizers: Eric Wallace, Matt Gardner and Sameer Singh + organizers: Eric Wallace, Matt Gardner, Sameer Singh abstract: Although neural NLP models are highly expressive and empirically successful, they also systematically fail in counterintuitive ways and are opaque in their decision-making process. This tutorial will provide a background on interpretation @@ -41,22 +13,63 @@ interpretations for a diverse set of NLP tasks. Finally, we will discuss open problems in the field, e.g., evaluating, extending, and improving interpretation methods. + info: |- + This tutorial has slides that you can see anytime (it does not have any + prerecorded talk). It will be conducted entirely live on Zoom and will be + livestreamed on this page. It has a chat window that you can use to have + discussions with the tutorial teachers and other attendees anytime during the + conference. + rocketchat_channel: tutorials_T1 + sessions: + - name: Part 1 + start_time: 2020-11-19 15:00:00+00:00 + end_time: 2020-11-19 16:30:00+00:00 + livestream_id: 42 + - name: QA 1 + start_time: 2020-11-19 16:30:00+00:00 + end_time: 2020-11-19 17:00:00+00:00 + livestream_id: 42 + - name: BREAK + start_time: 2020-11-19 17:00:00+00:00 + end_time: 2020-11-19 17:30:00+00:00 + livestream_id: 42 + - name: Part 2 + start_time: 2020-11-19 17:30:00+00:00 + end_time: 2020-11-19 19:00:00+00:00 + livestream_id: 42 + - name: QA 2 + start_time: 2020-11-19 19:00:00+00:00 + end_time: 2020-11-19 19:30:00+00:00 + livestream_id: 42 website: https://www.ericswallace.com/interpretability - material: https://2020.emnlp.org/files/emnlp2020-templates.zip - slides: https://v1.overleaf.com/latex/templates/emnlp-2020/knvtcxwzhbbz.pdf +- UID: T2 + title: 'Fact-Checking, Fake News, Propaganda, and Media Bias: Truth Seeking in the + Post-Truth Era' + organizers: Preslav Nakov, Giovanni Da San Martino + abstract: |- + The rise of social media has democratized content creation and has made it easy for anybody to share and to spread information online. On the positive side, this has given rise to citizen journalism, thus enabling much faster dissemination of information compared to what was possible with newspapers, radio, and TV. On the negative side, stripping traditional media from their gate-keeping role has left the public unprotected against the spread of disinformation, which could now travel at breaking-news speed over the same democratic channel. This situation gave rise to the proliferation of false information specifically created to affect individual people's beliefs, and ultimately to influence major events such as political elections; it also set the dawn of the Post-Truth Era, where appeal to emotions has become more important than the truth. More recently, with the emergence of the COVID-19 pandemic, a new blending of medical and political misinformation and disinformation has given rise to the first global infodemic. Limiting the impact of these negative developments has become a major focus for journalists, social media companies, and regulatory authorities.
+ + The tutorial offers an overview of the emerging and inter-connected research areas of fact-checking, misinformation, disinformation, “fake news”, propaganda, and media bias detection, with focus on text and on computational approaches. It further explores the general fact-checking pipeline and important elements thereof such as check-worthiness estimation, spotting previous fact-checked claims, stance detection, source reliability estimation, and detecting malicious users in social media. Finally, it covers some recent developments such as the emergence of large-scale pre-trained language models, and the challenges and opportunities they offer. info: |- This tutorial has a prerecorded talk on this page (see below) that you can watch anytime during the conference. It - also has two live sessions that will be conducted on Zoom and will be livestreamed on this page. Additionally, it has - a chat window that you can use to have discussions with the tutorial teachers and other attendees anytime during the - conference. + also has two live sessions that will be conducted on Zoom and will be livestreamed on this page. Additionally, it has + a chat window that you can use to have discussions with the tutorial teachers and other attendees anytime during the + conference. + rocketchat_channel: tutorials_T2 sessions: - - name: Live Session 1 - start_time: 2020-11-19 01:00:00-07:00 - end_time: 2020-11-19 04:00:00-07:00 - livestream_id: 482292 + - name: Q&A + start_time: 2020-11-19 09:00:00+00:00 + end_time: 2020-11-19 10:00:00+00:00 + livestream_id: 42 + - name: Q&A + start_time: 2020-11-19 16:00:00+00:00 + end_time: 2020-11-19 17:00:00+00:00 + livestream_id: 42 + website: https://propaganda.qcri.org/emnlp20-tutorial - UID: T3 title: High Performance Natural Language Processing - organizers: Gabriel Ilharco, Cesar Ilharco Magalhaes and Kenton Lee + organizers: Gabriel Ilharco, Cesar Ilharco, Iulia Turc, Tim Dettmers, Felipe Ferreira, + Kenton Lee abstract: Scale has played a central role in the rapid progress natural language processing has enjoyed in recent years. While benchmarks are dominated by ever larger models, efficient hardware use is critical for their widespread adoption @@ -66,43 +79,97 @@ for improving efficiency, including knowledge distillation, quantization, pruning, more efficient architectures, along with case studies and practical implementation tricks. - website: '' - material: https://2020.emnlp.org/files/emnlp2020-templates.zip - slides: https://v1.overleaf.com/latex/templates/emnlp-2020/knvtcxwzhbbz.pdf info: |- This tutorial has a prerecorded talk on this page (see below) that you can watch anytime during the conference. It - also has two live sessions that will be conducted on Zoom and will be livestreamed on this page. Additionally, it has - a chat window that you can use to have discussions with the tutorial teachers and other attendees anytime during the - conference. + also has two live sessions that will be conducted on Zoom and will be livestreamed on this page. Additionally, it has + a chat window that you can use to have discussions with the tutorial teachers and other attendees anytime during the + conference. 
+ rocketchat_channel: tutorials_T3 sessions: - - name: Live Session 1 - start_time: 2020-11-19 02:00:00-07:00 - end_time: 2020-11-19 05:00:00-07:00 - livestream_id: 420051 + - name: Q&A + start_time: 2020-11-19 17:00:00+00:00 + end_time: 2020-11-19 18:00:00+00:00 + livestream_id: 42 + - name: Q&A + start_time: 2020-11-20 00:00:00+00:00 + end_time: 2020-11-20 01:00:00+00:00 + livestream_id: 42 - UID: T4 - title: 'Fact-Checking, Fake News, Propaganda, and Media Bias: Truth Seeking in the - Post-Truth Era' - organizers: Preslav Nakov and Giovanni Da San Martino - abstract: |- - The rise of social media has democratized content creation and has made it easy for anybody to share and to spread information online. On the positive side, this has given rise to citizen journalism, thus enabling much faster dissemination of information compared to what was possible with newspapers, radio, and TV. On the negative side, stripping traditional media from their gate-keeping role has left the public unprotected against the spread of disinformation, which could now travel at breaking-news speed over the same democratic channel. This situation gave rise to the proliferation of false information specifically created to affect individual people's beliefs, and ultimately to influence major events such as political elections; it also set the dawn of the Post-Truth Era, where appeal to emotions has become more important than the truth. More recently, with the emergence of the COVID-19 pandemic, a new blending of medical and political misinformation and disinformation has given rise to the first global infodemic. Limiting the impact of these negative developments has become a major focus for journalists, social media companies, and regulatory authorities. - - The tutorial offers an overview of the emerging and inter-connected research areas of fact-checking, misinformation, disinformation, “fake news”, propaganda, and media bias detection, with focus on text and on computational approaches. It further explores the general fact-checking pipeline and important elements thereof such as check-worthiness estimation, spotting previous fact-checked claims, stance detection, source reliability estimation, and detecting malicious users in social media. Finally, it covers some recent developments such as the emergence of large-scale pre-trained language models, and the challenges and opportunities they offer. - website: https://propaganda.qcri.org/emnlp20-tutorial - material: https://2020.emnlp.org/files/emnlp2020-templates.zip - slides: https://v1.overleaf.com/latex/templates/emnlp-2020/knvtcxwzhbbz.pdf + title: Machine Reasoning for Knowledgeable, Explainable and Inferable Models + organizers: Nan Duan, Duyu Tang, Ming Zhou + abstract: Broadly speaking, machine reasoning research aims to build AI systems + that can draw new conclusions or solve unseen problems from what they are told + (i.e. environment) and already know (i.e. knowledge) by using inference techniques + such as induction and deduction. 
In this tutorial, we will (1) review definitions, + key components and characteristics of machine reasoning; (2) introduce typical + machine reasoning frameworks, including symbolic reasoning, statistical relational + learning, neural-symbolic reasoning and environment-based reasoning; (3) talk + about necessary technologies of applying machine reasoning in NLP scenarios, including + natural language understanding (NLU), human-computer interaction (HCI), natural + language generation (NLG) and model interpretation; (4) show examples of applying + machine reasoning techniques in real-world applications; (5) summarize challenges + of current methods and discuss possible future directions. info: |- This tutorial has a prerecorded talk on this page (see below) that you can watch anytime during the conference. It - also has two live sessions that will be conducted on Zoom and will be livestreamed on this page. Additionally, it has - a chat window that you can use to have discussions with the tutorial teachers and other attendees anytime during the - conference. + also has two live sessions that will be conducted on Zoom and will be livestreamed on this page. Additionally, it has + a chat window that you can use to have discussions with the tutorial teachers and other attendees anytime during the + conference. + rocketchat_channel: tutorials_T4 sessions: - - name: Live Session 1 - start_time: 2020-11-19 03:00:00-07:00 - end_time: 2020-11-19 06:00:00-07:00 - livestream_id: 143080 + - name: Q&A + start_time: 2020-11-19 09:00:00+00:00 + end_time: 2020-11-19 10:00:00+00:00 + livestream_id: 42 + - name: Q&A + start_time: 2020-11-20 01:00:00+00:00 + end_time: 2020-11-20 02:00:00+00:00 + livestream_id: 42 - UID: T5 + title: Representation, Learning and Reasoning on Spatial Language for Down-stream + NLP Tasks + organizers: Parisa Kordjamshidi, James Pustejovsky, Marie-Francine Moens + abstract: | + Understating spatial semantics expressed in natural language can become highly complex in real-world applications. This includes applications of language grounding, navigation, visual question answering, and more generic human-machine interaction and dialogue systems. In many of such downstream tasks, explicit representation of spatial concepts and relationships can improve the capabilities of machine learning models in reasoning and deep language understanding. In this tutorial, we overview the cutting-edge research results and existing challenges related to spatial language understanding including semantic annotations, existing corpora, symbolic and sub-symbolic representations, qualitative spatial reasoning, spatial common sense, deep and structured learning models. We discuss the recent results on the above-mentioned applications --that need spatial language learning and reasoning -- and highlight the research gaps and future directions. + info: |- + This tutorial has a prerecorded talk on this page (see below) that you can watch anytime during the conference. It + also has two live sessions that will be conducted on Zoom and will be livestreamed on this page. Additionally, it has + a chat window that you can use to have discussions with the tutorial teachers and other attendees anytime during the + conference. 
+ rocketchat_channel: tutorials_T5 + sessions: + - name: Q&A + start_time: 2020-11-20 17:00:00+00:00 + end_time: 2020-11-20 18:00:00+00:00 + livestream_id: 42 + - name: Q&A + start_time: 2020-11-21 00:00:00+00:00 + end_time: 2020-11-21 01:00:00+00:00 + livestream_id: 42 + website: ' https://spatial-language-tutorial.github.io ' +- UID: T6 + title: Simultaneous Translation + organizers: Liang Huang, Colin Cherry, Mingbo Ma, Naveen Arivazhagan, Zhongjun He + abstract: |- + Simultaneous translation, which performs translation concurrently with the source speech, is widely useful in many scenarios such as international conferences, negotiations, press releases, legal proceedings, and medicine. This problem has long been considered one of the hardest problems in AI and one of its holy grails. + Recently, with rapid improvements in machine translation, speech recognition, and speech synthesis, there has been exciting progress towards simultaneous translation. This tutorial will focus on the design and evaluation of policies for simultaneous translation, to leave attendees with a deep technical understanding of the history, the recent advances, and the remaining challenges in this field. + info: |- + This tutorial has a prerecorded talk on this page (see below) that you can watch anytime during the conference. It + also has two live sessions that will be conducted on Zoom and will be livestreamed on this page. Additionally, it has + a chat window that you can use to have discussions with the tutorial teachers and other attendees anytime during the + conference. + rocketchat_channel: tutorials_T6 + sessions: + - name: Q&A + start_time: 2020-11-20 18:00:00+00:00 + end_time: 2020-11-20 19:00:00+00:00 + livestream_id: 42 + - name: Q&A + start_time: 2020-11-21 01:00:00+00:00 + end_time: 2020-11-21 02:00:00+00:00 + livestream_id: 42 +- UID: T7 title: The Amazing World of Neural Language Generation - organizers: Yangfeng Ji, Antoine Bosselut, Thomas Wolf and Asli Celikyilmaz + organizers: Yangfeng Ji, Antoine Bosselut, Thomas Wolf, Asli Celikyilmaz abstract: 'Neural Language Generation (NLG) -- using neural network models to generate coherent text -- is among the most promising methods for automated text creation. Recent years have seen a paradigm shift in neural text generation, caused by the advances @@ -116,55 +183,18 @@ and how they shaped recent research directions in text generation. We will discuss how and why these models succeed/fail at generating coherent text, and provide insights on several applications.' - website: '' - material: https://2020.emnlp.org/files/emnlp2020-templates.zip - slides: https://v1.overleaf.com/latex/templates/emnlp-2020/knvtcxwzhbbz.pdf - info: |- - This tutorial has a prerecorded talk on this page (see below) that you can watch anytime during the conference. It - also has two live sessions that will be conducted on Zoom and will be livestreamed on this page. Additionally, it has - a chat window that you can use to have discussions with the tutorial teachers and other attendees anytime during the - conference. - sessions: - - name: Live Session 1 - start_time: 2020-11-20 04:00:00-07:00 - end_time: 2020-11-20 07:00:00-07:00 - livestream_id: 563079 -- UID: T6 - title: Representation, Learning and Reasoning on Spatial Language for Down-stream - NLP Tasks - organizers: Parisa Kordjamshidi, James Pustejovsky and Marie-Francine Moens - abstract: | - Understating spatial semantics expressed in natural language can become highly complex in real-world applications. 
This includes applications of language grounding, navigation, visual question answering, and more generic human-machine interaction and dialogue systems. In many of such downstream tasks, explicit representation of spatial concepts and relationships can improve the capabilities of machine learning models in reasoning and deep language understanding. In this tutorial, we overview the cutting-edge research results and existing challenges related to spatial language understanding including semantic annotations, existing corpora, symbolic and sub-symbolic representations, qualitative spatial reasoning, spatial common sense, deep and structured learning models. We discuss the recent results on the above-mentioned applications --that need spatial language learning and reasoning -- and highlight the research gaps and future directions. - website: ' https://spatial-language-tutorial.github.io ' - material: https://2020.emnlp.org/files/emnlp2020-templates.zip - slides: https://v1.overleaf.com/latex/templates/emnlp-2020/knvtcxwzhbbz.pdf - info: |- - This tutorial has a prerecorded talk on this page (see below) that you can watch anytime during the conference. It - also has two live sessions that will be conducted on Zoom and will be livestreamed on this page. Additionally, it has - a chat window that you can use to have discussions with the tutorial teachers and other attendees anytime during the - conference. - sessions: - - name: Live Session 1 - start_time: 2020-11-20 05:00:00-07:00 - end_time: 2020-11-20 08:00:00-07:00 - livestream_id: 327318 -- UID: T7 - title: Simultaneous Translation - organizers: Liang Huang, Colin Cherry, Mingbo Ma, Naveen Arivazhagan and Zhongjun - He - abstract: |- - Simultaneous translation, which performs translation concurrently with the source speech, is widely useful in many scenarios such as international conferences, negotiations, press releases, legal proceedings, and medicine. This problem has long been considered one of the hardest problems in AI and one of its holy grails. - Recently, with rapid improvements in machine translation, speech recognition, and speech synthesis, there has been exciting progress towards simultaneous translation. This tutorial will focus on the design and evaluation of policies for simultaneous translation, to leave attendees with a deep technical understanding of the history, the recent advances, and the remaining challenges in this field. - website: '' - material: https://2020.emnlp.org/files/emnlp2020-templates.zip - slides: https://v1.overleaf.com/latex/templates/emnlp-2020/knvtcxwzhbbz.pdf info: |- This tutorial has a prerecorded talk on this page (see below) that you can watch anytime during the conference. It - also has two live sessions that will be conducted on Zoom and will be livestreamed on this page. Additionally, it has - a chat window that you can use to have discussions with the tutorial teachers and other attendees anytime during the - conference. + also has two live sessions that will be conducted on Zoom and will be livestreamed on this page. Additionally, it has + a chat window that you can use to have discussions with the tutorial teachers and other attendees anytime during the + conference. 
+ rocketchat_channel: tutorials_T7 sessions: - - name: Live Session 1 - start_time: 2020-11-20 06:00:00-07:00 - end_time: 2020-11-20 09:00:00-07:00 - livestream_id: 557870 + - name: Q&A + start_time: 2020-11-20 19:00:00+00:00 + end_time: 2020-11-20 20:00:00+00:00 + livestream_id: 42 + - name: Q&A + start_time: 2020-11-21 01:00:00+00:00 + end_time: 2020-11-21 02:00:00+00:00 + livestream_id: 42 diff --git a/static/js/paper_vis.js b/static/js/paper_vis.js index 02e6988..e409c7e 100644 --- a/static/js/paper_vis.js +++ b/static/js/paper_vis.js @@ -260,7 +260,7 @@ const start = (track) => { d3.json('serve_papers_projection.json') ] if (track != "All tracks") { - loadfiles.push(d3.json("track_" + track + ".json")); + loadfiles.push(d3.json("track_main_" + track + ".json")); } else { trackhighlight =[]; } diff --git a/templates/components.html b/templates/components.html index 6854216..31d6685 100644 --- a/templates/components.html +++ b/templates/components.html @@ -320,6 +320,9 @@

{{ tutorial.organizers }} +
+

{{tutorial.abstract|safe}}

+
{% for session in tutorial.sessions %}
@@ -333,6 +336,7 @@

{% endfor %} + {% endfor %} diff --git a/templates/index.html b/templates/index.html index c28dc6e..78d7117 100644 --- a/templates/index.html +++ b/templates/index.html @@ -59,7 +59,7 @@

Main Conference Papers
Each paper has a pre-recorded 7-12 minute talk, two live Q&A video sessions (on Zoom) at different times, and a linked RocketChat channel (for chatting anytime with the author). - This includes System Demonstrations and Student Research Workshop papers. + This includes System Demonstrations papers.
Tutorials and Workshops
The 7 tutorials and 25 workshops (November 19-20) are all included under conference registration. diff --git a/templates/paper.html b/templates/paper.html index 578a0e7..384ab75 100644 --- a/templates/paper.html +++ b/templates/paper.html @@ -106,7 +106,7 @@

{{ components.live_sessions( paper.content.sessions, paper.content.title, - "https://virtual.acl2020.org/paper_" + paper.id + ".html", + "https://virtual.2020.emnlp.org/paper_" + paper.id + ".html", false) }} diff --git a/templates/papers_vis.html b/templates/papers_vis.html index 8a80cb0..a26d3f8 100644 --- a/templates/papers_vis.html +++ b/templates/papers_vis.html @@ -7,7 +7,7 @@ - + {# #}
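
The restructured sitedata/tutorials.yml above drops the per-tutorial website/material/slides placeholders and gives every session an explicit end_time with a UTC offset. A small validation script can catch regressions in this data before the site build runs. The sketch below is illustrative only and not part of the patch: it assumes PyYAML is installed and that the file keeps the field names shown in the diff (UID, sessions, name, start_time, end_time, livestream_id).

# validate_tutorials.py -- illustrative sketch, not part of this patch.
# Assumes PyYAML and the session field names shown in sitedata/tutorials.yml.
import sys
import yaml

REQUIRED_SESSION_KEYS = {"name", "start_time", "end_time", "livestream_id"}

def validate(path: str) -> int:
    with open(path) as f:
        tutorials = yaml.safe_load(f)

    errors = 0
    for tut in tutorials:
        uid = tut.get("UID", "<missing UID>")
        for session in tut.get("sessions", []):
            missing = REQUIRED_SESSION_KEYS - session.keys()
            if missing:
                print(f"{uid}: session missing keys {sorted(missing)}")
                errors += 1
                continue
            # PyYAML parses timestamps like `2020-11-19 15:00:00+00:00`
            # into datetime objects, so start and end compare directly.
            if session["start_time"] >= session["end_time"]:
                print(f"{uid}: session '{session['name']}' ends before it starts")
                errors += 1
    return errors

if __name__ == "__main__":
    sys.exit(1 if validate(sys.argv[1]) else 0)

Run as, for example, `python validate_tutorials.py sitedata/tutorials.yml`; it reports sessions with missing fields or inverted time ranges and exits non-zero so it could be wired into CI if desired.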