From 19a6d843bc076626d7e7e7018cdced7685dd07e4 Mon Sep 17 00:00:00 2001
From: anthology-assist <126604033+anthology-assist@users.noreply.github.com>
Date: Mon, 6 Nov 2023 17:39:48 -0600
Subject: [PATCH 01/12] Ingestion: GWC 2023 (#2861)
---
data/xml/2023.gwc.xml | 457 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 457 insertions(+)
create mode 100644 data/xml/2023.gwc.xml
diff --git a/data/xml/2023.gwc.xml b/data/xml/2023.gwc.xml
new file mode 100644
index 0000000000..50c18a108b
--- /dev/null
+++ b/data/xml/2023.gwc.xml
@@ -0,0 +1,457 @@
+
+
+
+
+ Proceedings of the 12th Global Wordnet Conference
+ German Rigau
+ Francis Bond
+ Alexandre Rademaker
+ Global Wordnet Association
+ University of the Basque Country, Donostia - San Sebastian, Basque Country
+ January
+ 2023
+ 2023.gwc-1
+ gwc
+
+
+ 2023.gwc-1.0
+ gwc-2023-global
+
+
+ Probing Taxonomic and Thematic Embeddings for Taxonomic Information
+ Filip Klubička
+ John Kelleher
+ 1–13
+ Modelling taxonomic and thematic relatedness is important for building AI with comprehensive natural language understanding. The goal of this paper is to learn more about how taxonomic information is structurally encoded in embeddings. To do this, we design a new hypernym-hyponym probing task and perform a comparative probing study of taxonomic and thematic SGNS and GloVe embeddings. Our experiments indicate that both types of embeddings encode some taxonomic information, but the amount, as well as the geometric properties of the encodings, depends on both the encoder architecture and the embedding training data. Specifically, we find that only taxonomic embeddings carry taxonomic information in their norm, which is determined by the underlying distribution in the data.
+ 2023.gwc-1.1
+ klubicka-kelleher-2023-probing
+
+
+ A WordNet View on Crosslingual Transformers
+ Wondimagegnhue Tufa
+ Lisa Beinborn
+ Piek Vossen
+ 14–24
+ WordNet is a database that represents relations between words and concepts as an abstraction of the contexts in which words are used. Contextualized language models represent words in contexts but leave the underlying concepts implicit. In this paper, we investigate how different layers of a pre-trained language model shape the abstract lexical relationship toward the actual contextual concept. Can we define the amount of contextualized concept forming needed given the abstracted representation of a word? Specifically, we consider samples of words with different polysemy profiles shared across three languages, assuming that words with a different polysemy profile require a different degree of concept shaping by context. We conduct probing experiments to investigate the impact of prior polysemy profiles on the representation in different layers. We analyze how contextualized models can approximate meaning through context and examine crosslingual interference effects.
+ 2023.gwc-1.2
+ tufa-etal-2023-wordnet
+
+
+ What to Make of make? Sense Distinctions for Light Verbs
+ Julie Kallini
+ Christiane Fellbaum
+ 25–30
+ Verbs like make, have and get present challenges for applications requiring automatic word sense discrimination. These verbs are both highly frequent and polysemous, with semantically “full” readings, as in make dinner, and “light” readings, as in make a request. Lexical resources like WordNet encode dozens of senses, making discrimination difficult and inviting proposals for reducing the number of entries or grouping them into coarser-grained supersenses. We propose a data-driven, linguistically-based approach to establishing a motivated sense inventory, focusing on make to establish a proof of concept. From several large, syntactically annotated corpora, we extract nouns that are complements of the verb make, and group them into clusters based on their Word2Vec semantic vectors. We manually inspect, for each cluster, the words with vectors closest to the centroid as well as a random sample of words within the cluster. The results show that the clusters reflect an intuitively plausible sense discrimination of make. As an evaluation, we test whether words within a given cluster cooccur in coordination phrases, such as apples and oranges, as prior work has shown that such conjoined nouns are semantically related. Conversely, noun complements from different clusters are less likely to be conjoined. Thus, coordination provides a similarity metric independent of the contextual embeddings used for clustering. Our results pave the way for a WordNet sense inventory that, while not inconsistent with the present one, would reduce it significantly and hold promise for improved automatic word sense discrimination.
+ 2023.gwc-1.3
+ kallini-fellbaum-2023-make
+
+
+ Towards Effective Correction Methods Using WordNet Meronymy Relations
+ Javier Álvez
+ Itziar Gonzalez-Dios
+ German Rigau
+ 31–40
+ In this paper, we analyse and compare several correction methods for knowledge resources, with the purpose of improving the abilities of systems that require commonsense reasoning with the least possible human effort. To this end, we cross-check the WordNet meronymy relation member against the knowledge encoded in a SUMO-based first-order logic ontology on the basis of the mapping between WordNet and SUMO. In particular, we focus on the knowledge in WordNet regarding the taxonomy of animals and plants. Despite being created manually, these knowledge resources — WordNet, SUMO and their mapping — are not free of errors and discrepancies. Thus, we propose three correction methods: semi-automatically improving the alignment between WordNet and SUMO, performing a few corrections in SUMO, and combining the two strategies. The evaluation of each method includes the required human effort and the achieved improvement on unseen data from the WebChild project, which is tested using first-order logic automated theorem provers.
+ 2023.gwc-1.4
+ alvez-etal-2023-towards
+
+
+ On the Acquisition of WordNet Relations in Portuguese from Pretrained Masked Language Models
+ Hugo Gonçalo Oliveira
+ 41–49
+ This paper studies the application of pretrained BERT to the acquisition of synonyms, antonyms, hypernyms and hyponyms in Portuguese. Masked patterns indicating those relations were compiled with the help of a service for validating semantic relations, and then used for prompting three pretrained BERT models, one multilingual and two for Portuguese (base and large). Predictions for the masks were evaluated on two different test sets. Results achieved by the monolingual models are interesting enough to consider these models as a source for enriching wordnets, especially when predicting hypernyms of nouns. Previously reported prediction performance was improved with new patterns and with the large model. When it comes to selecting the related word from a set of four options, performance is even better, but not enough to outperform the selection of the most similar word, as computed with static word embeddings.
+ 2023.gwc-1.5
+ oliveira-2023-acquisition
+
+
+ Wordnet for Definition Augmentation with Encoder-Decoder Architecture
+ Konrad Wojtasik
+ Arkadiusz Janz
+ Maciej Piasecki
+ 50–59
+ Data augmentation is a difficult task in Natural Language Processing. Simple methods that can be relatively easily applied in other domains, like insertion, deletion or substitution, mostly result in changing the sentence meaning significantly and obtaining an incorrect example. Wordnets are potentially a perfect source of rich, high-quality data which, when integrated with the powerful capacity of generative models, can help to solve this complex task. In this work, we use plWordNet, a wordnet of the Polish language, to explore the capability of encoder-decoder architectures in the data augmentation of sense glosses. We discuss the limitations of generative methods and perform a qualitative review of generated data samples.
+ 2023.gwc-1.6
+ wojtasik-etal-2023-wordnet
+
+
+ Data Augmentation Method for Boosting Multilingual Word Sense Disambiguation
+ Arkadiusz Janz
+ Marek Maziarz
+ 60–66
+ Recent advances in Word Sense Disambiguation suggest that neural language models can be successfully improved by incorporating knowledge base structure. Such models are called hybrid solutions. We propose a method of improving hybrid WSD models by harnessing data augmentation techniques and bilingual training. The data augmentation consists of structure augmentation using interlingual connections between wordnets, and text data augmentation based on multilingual glosses and usage examples. We utilise a language-agnostic neural model trained both with SemCor and the Princeton WordNet gloss and example corpora, as well as with Polish WordNet glosses and usage examples. This augmentation technique makes a well-known hybrid WSD architecture competitive with current, even more complex, state-of-the-art models.
+ 2023.gwc-1.7
+ janz-maziarz-2023-data
+
+
+ Mapping Wordnets on the Fly with Permanent Sense Keys
+ Eric Kafe
+ 67–76
+ Most of the major databases on the semantic web have links to Princeton WordNet (PWN) synonym set (synset) identifiers, which differ for each PWN release and are thus incompatible between versions. On the other hand, both PWN and the more recent Open English Wordnet (OEWN) provide permanent word sense identifiers (the sense keys), which can solve this interoperability problem. We present an algorithm that runs in linear time to automatically derive a synset mapping between any pair of Wordnet versions that use PWN sense keys. This makes it possible to update old WordNet links and seamlessly interoperate with newer English Wordnet versions for which no prior mapping exists. By applying the proposed algorithm on the fly, at load time, we combine the Open Multilingual Wordnet (OMW 1.4, which uses old PWN 3.0 identifiers) with OEWN Edition 2021, and obtain almost perfect precision and recall. We compare the results of our approach using synset offsets versus the Collaborative InterLingual Index (CILI version 1.0) as synset identifiers, and find that the synset offsets perform better than CILI 1.0 in all cases, except a few ties.
+ 2023.gwc-1.8
+ kafe-2023-mapping
+
+
+ Linking the Sanskrit WordNet to the Vedic Dependency Treebank: a pilot study
+ Erica Biagetti
+ Chiara Zanchi
+ Silvia Luraghi
+ 77–83
+ The Sanskrit WordNet is a resource currently under development, whose core was induced from a Vedic text sample semantically annotated by means of an ontology mapped onto the Princeton WordNet synsets. Building on a previous case study on Ancient Greek (Zanchi et al. 2021), we show how sentence frames can be extracted from morphosyntactically parsed corpora by linking an existing dependency treebank of Vedic Sanskrit to verbal synsets in the Sanskrit WordNet. Our case study focuses on two verbs of asking, yāc- and prach-, featuring a high degree of variability in sentence frames. Treebanks enhanced with WordNet-based semantic information proved to be of crucial help in motivating sentence frame alternations.
+ 2023.gwc-1.9
+ biagetti-etal-2023-linking
+
+
+ StarNet: A WordNet Editor Interface
+ Oğuzhan Kuyrukçu
+ Ezgi Sanıyar
+ Olcay Taner Yildiz
+ 84–90
+ In this paper, we introduce the StarNet WordNet Editor, an open-source annotation tool designed for natural language processing. It is mainly used for creating and maintaining machine-readable dictionaries like WordNet (Miller, 1995) or domain-specific dictionaries. The editor provides a user-friendly interface and, being open-source, is easy to use and extend. Besides English and Turkish WordNet (KeNet) (Bakay et al., 2020), it is also applicable to several other languages and their domain-specific dictionaries.
+ 2023.gwc-1.10
+ kuyrukcu-etal-2023-starnet
+
+
+ Identifying FrameNet Lexical Semantic Structures for Knowledge Graph Extraction from Financial Customer Interactions
+ Cécile Robin
+ Atharva Kulkarni
+ Paul Buitelaar
+ 91–100
+ We explore the use of the well established lexical resource and theory of the Berkeley FrameNet project to support the creation of a domain-specific knowledge graph in the financial domain, more precisely from financial customer interactions. We introduce a domain independent and unsupervised method that can be used across multiple applications, and test our experiments on the financial domain. We use an existing tool for term extraction and taxonomy generation in combination with information taken from FrameNet. By using principles from frame semantic theory, we show that we can connect domain-specific terms with their semantic concepts (semantic frames) and their properties (frame elements) to enrich knowledge about these terms, in order to improve the customer experience in customer-agent dialogue settings.
+ 2023.gwc-1.11
+ robin-etal-2023-identifying
+
+
+ Some Considerations in the Construction of a Historical Language WordNet
+ Fahad Khan
+ John P. McCrae
+ Francisco Javier Minaya Gómez
+ Rafael Cruz González
+ Javier E. Díaz-Vera
+ 101–105
+ This article describes the manual construction of a part of the Old English WordNet (Old-EWN) covering the semantic field of emotion terms. This manually constructed part of the wordnet is to be eventually integrated with the automatically generated/manually checked part covering the whole of the rest of the Old English lexicon (currently under construction). We present the workflow for the definition of these emotion synsets on the basis of a dataset produced by a specialist in this area. We also look at the enrichment of the original Global WordNet Association Lexical Markup Framework (GWA LMF) schema to include the extra information which this part of the OldEWN requires. In the final part of the article we discuss how the wordnet style of lexicon organisation can be used to share and disseminate research findings/datasets in lexical semantics.
+ 2023.gwc-1.12
+ khan-etal-2023-considerations
+
+
+ Hidden in Plain Sight: Can German Wiktionary and Wordnets Facilitate the Detection of Antithesis?
+ Ramona Kuehn
+ Jelena Mitrović
+ Michael Granitzer
+ 106–116
+ Existing wordnets mainly focus on synonyms, while antonyms have often been neglected, especially in wordnets in languages other than English. In this paper, we show how regular expressions are used to generate an antonym resource for German by using Wiktionary as a source. This resource contains antonyms for 45,499 words. The antonyms can be used to extend existing wordnets. We show that this is important by comparing our antonym resource to the antonyms in OdeNet, the only freely available German wordnet with antonyms, which contains them for 3,059 words. We demonstrate that antonyms are relevant for the detection of the rhetorical figure antithesis. This figure is known to influence the audience by creating contradiction, using a parallel sentence structure combined with antonyms. We first detect parallelism with part-of-speech tags and then apply our rule-based antithesis detection algorithm to a dataset from the messenger service Telegram. We evaluate our approach and achieve a precision of 57% and a recall of 45%, thus outperforming existing approaches.
+ 2023.gwc-1.13
+ kuehn-etal-2023-hidden
+
+
+ How do We Treat Systematic Polysemy in Wordnets and Similar Resources? – Using Human Intuition and Contextualized Embeddings as Guidance
+ Nathalie Sørensen
+ Sanni Nimb
+ Bolette Pedersen
+ 117–126
+ Systematic polysemy is a well-known linguistic phenomenon where a group of lemmas follow the same polysemy pattern. However, when compiling a lexical resource like a wordnet, a problem arises regarding when to underspecify the two (or more) meanings by one (complex) sense and when to systematically split into separate senses. In this work, we present an extensive analysis of the systematic polysemy patterns in Danish, and in our preliminary study, we examine a subset of these with experiments on human intuition and contextual embeddings. The aim of this preparatory work is to enable future guidelines for each polysemy type. In the future, we hope to expand this approach and thereby hopefully obtain a sense inventory which is distributionally verified and thereby more suitable for NLP.
+ 2023.gwc-1.14
+ sorensen-etal-2023-treat
+
+
+ The Romanian Wordnet in Linked Open Data Format
+ Elena Irimia
+ Verginica Mititelu
+ 127–132
+ In this paper we present the standardization of the Romanian Wordnet by means of conversion to the Linked Open Data format. We describe the vocabularies used to encode the data and metadata of this resource. The decisions made are in accordance with the characteristics of the Romanian Wordnet, which are the outcome of the development method, enrichment strategies and resources used for its creation. By interlinking with other resources, words in the Romanian Wordnet now have their pronunciation associated, as well as syntagmatic information in the form of contexts of occurrence.
+ 2023.gwc-1.15
+ irimia-mititelu-2023-romanian
+
+
+ Combining WordNets with Treebanks to study idiomatic language: A pilot study on Rigvedic formulas through the lenses of the Sanskrit WordNet and the Vedic Treebank
+ Luca Brigada Villa
+ Erica Biagetti
+ Riccardo Ginevra
+ Chiara Zanchi
+ 133–139
+ This paper shows how WordNets can be employed in tandem with morpho-syntactically annotated corpora to study poetic formulas. Pairing the lexico-semantic information of the Sanskrit WordNet with morpho-syntactic annotation from the Vedic Treebank, we perform a pilot study of formulas including SPEECH verbs in the Rigveda, the most ancient text of Sanskrit literature.
+ 2023.gwc-1.16
+ villa-etal-2023-combining
+
+
+ Word Sense Disambiguation Based on Iterative Activation Spreading with Contextual Embeddings for Sense Matching
+ Arkadiusz Janz
+ Maciej Piasecki
+ 140–149
+ Many knowledge-based solutions have been proposed to solve the Word Sense Disambiguation (WSD) problem with limited annotated resources. Such WSD algorithms are able to cover very large sense repositories, but are still outperformed by supervised ones on benchmark data. In this paper, we start with an analysis identifying key properties and issues in the application of spreading activation algorithms to knowledge-based WSD, e.g. the influence of local network structures, interaction with context information, and sense frequency. Taking our observations as a point of departure, we introduce a novel solution with new context-to-sense matching using BERT embeddings, an iterative parallel spreading activation function, and selective sense alignment using contextual BERT embeddings. The proposed solution obtains performance beyond the state of the art for contemporary knowledge-based WSD approaches on both English and Polish data.
+ 2023.gwc-1.17
+ janz-piasecki-2023-word
+
+
+ Documenting the Open Multilingual Wordnet
+ Francis Bond
+ Michael Wayne Goodman
+ Ewa Rudnicka
+ Luis Morgado da Costa
+ Alexandre Rademaker
+ John P. McCrae
+ 150–157
+ In this project note we describe our work to provide better documentation for the Open Multilingual Wordnet (OMW), a platform integrating many open wordnets. This includes the documentation of the OMW website itself as well as of the semantic relations used by the component wordnets. Some of this documentation work was done with the support of the Google Season of Docs. The OMW project page, which links to both the actual OMW server and the documentation, has been moved to a new location: https://omwn.org.
+ 2023.gwc-1.18
+ bond-etal-2023-documenting
+
+
+ Mapping GermaNet for the Semantic Web using OntoLex-Lemon
+ Claus Zinn
+ Marie Hinrichs
+ Erhard Hinrichs
+ 158–166
+ GermaNet is a large lexical-semantic net that relates German nouns, verbs, and adjectives semantically. The word net has been manually constructed over the last 25 years and hence presents a high-quality, valuable resource for German. While GermaNet is maintained in a Postgres database, all its content can be exported as an XML-based serialisation. Recently, this XML representation has been converted into RDF, largely by staying close to GermaNet’s principle of arrangement where lexunits that share the same meaning are grouped together into so-called synsets. With each lexical unit and synset now globally addressable via a unique resource identifier, it has become much easier to link together GermaNet entries with other lexical and semantic resources. In terms of semantic interoperability, however, the RDF variant of GermaNet leaves much to be desired. In this paper, we describe yet another conversion from GermaNet’s XML representation to RDF. The new conversion makes use of the OntoLex-Lemon ontology, and therefore, presents a decisive step toward a GermaNet representation with a much higher level of semantic interoperability, and which makes it possible to use GermaNet with other wordnets that already support this conceptualisation of lexica.
+ 2023.gwc-1.19
+ zinn-etal-2023-mapping
+
+
+ Incorporating prepositions in the BulTreeBank WordNet
+ Zara Kancheva
+ 167–171
+ A model for incorporating prepositions in the BulTreeBank WordNet is presented, which follows the model for presenting open-class words in wordnets. An adapted semantic classification of prepositions is made on the basis of Bulgarian grammars, and the classes are used as synset categories. The good coverage of prepositions in the wordnet will be used for creating neural language models for Bulgarian. This extension of the wordnet improves its utility for semantic annotation.
+ 2023.gwc-1.20
+ kancheva-2023-incorporating
+
+
+ Are there just WordNets or also SignNets?
+ Ineke Schuurman
+ Thierry Declerck
+ Caro Brosens
+ Margot Janssens
+ Vincent Vandeghinste
+ Bram Vanroy
+ 172–178
+ For Sign Languages (SLs), can we create a SignNet, like a WordNet for spoken languages: a network of semantic relations between constitutive elements of SLs? We first discuss approaches that link SL data to wordnets, or integrate such elements with some adaptations into the structure of WordNet. Then, we present requirements for a SignNet, which is built on SL data and then linked to WordNet.
+ 2023.gwc-1.21
+ schuurman-etal-2023-just
+
+
+ The Japanese Wordnet 2.0
+ Francis Bond
+ Takayuki Kuribayashi
+ 179–186
+ This paper describes a new release of the Japanese wordnet. It uses the new global wordnet formats (McCrae et al., 2021) to incorporate a range of new information: orthographic variants (including hiragana, katakana and Latin representations) first described in Kuroda et al. (2011), classifiers, pronouns and exclamatives (Morgado da Costa and Bond, 2016) and many new senses, motivated both by corpus annotation and by linking to the TUFS basic vocabulary (Bond et al., 2020). The wordnet has been moved to GitHub and is available at https://bond-lab.github.io/wnja/.
+ 2023.gwc-1.22
+ bond-kuribayashi-2023-japanese
+
+
+ Latvian WordNet
+ Peteris Paikens
+ Agute Klints
+ Ilze Lokmane
+ Lauma Pretkalniņa
+ Laura Rituma
+ Madara Stāde
+ Laine Strankale
+ 187–196
+ This paper describes the recently developed Latvian WordNet and the main linguistic principles used in its development. The inventory of words and senses is based on the Tēzaurs.lv online dictionary, restructuring the senses of the most frequently used words based on corpus evidence. The semantic linking methodology adapts Princeton WordNet principles to fit Latvian language usage and the existing linguistic tradition. The semantic links include hyponymy, meronymy, antonymy, similarity, conceptual connection and gradation. We also measure inter-annotator agreement for different types of semantic links. The dataset consists of 7609 words linked in 6515 synsets. 1266 of these words are considered fully completed, as they have all the outgoing semantic links annotated, corpus examples assigned for each sense, and links to the English Princeton WordNet in place. The data is available to the public on Tēzaurs.lv as an addition to the general dictionary data, and is also published as a downloadable dataset.
+ 2023.gwc-1.23
+ paikens-etal-2023-latvian
+
+
+ Initial Experiments for Building a Guarani WordNet
+ Luis Chiruzzo
+ Marvin Agüero-Torales
+ Aldo Alvarez
+ Yliana Rodríguez
+ 197–204
+ This paper presents a work in progress about creating a Guarani version of the WordNet database. Guarani is an indigenous South American language and is a low-resource language from the NLP perspective. Following the expand approach, we aim to find Guarani lemmas that correspond to the concepts defined in WordNet. We do this through three strategies that try to select the correct lemmas from Guarani-Spanish datasets. We ran them through three different bilingual dictionaries and had native speakers assess the results. This procedure found Guarani lemmas for about 6.5 thousand synsets, including 27% of the base WordNet concepts. However, more work on the quality of the selected words will be needed in order to create a final version of the dataset.
+ 2023.gwc-1.24
+ chiruzzo-etal-2023-initial
+
+
+ A CCGbank for Turkish: From Dependency to CCG
+ Aslı Kuzgun
+ Oğuz Kerem Yıldız
+ Olcay Taner Yildiz
+ 205–213
+ In this paper, we present the building of a CCGbank for Turkish by using standardised dependency corpora. We automatically induce Combinatory Categorial Grammar (CCG) categories for each word token in the Turkish dependency corpora. The CCG induction algorithm we present here is based on the dependency relations that are defined in the latest release of the Universal Dependencies (UD) framework. We aim for an algorithm that can easily be used in all the Turkish treebanks that are annotated in this framework. Therefore, we employ a lexicalist approach in order to make full use of the dependency relations while creating a semantically transparent corpus. We present the treebanks we employed in this study as well as their annotation framework. We introduce the structure of the algorithm we used along with the specific issues that are different from previous studies. Lastly, we show how the results change with this lexical approach in CCGbank for Turkish compared to the previous CCGbank studies in Turkish.
+ 2023.gwc-1.25
+ kuzgun-etal-2023-ccgbank
+
+
+ Reusing the Danish WordNet for a New Central Word Register for Danish - a Project Report
+ Bolette Pedersen
+ Sanni Nimb
+ Nathalie Sørensen
+ Sussi Olsen
+ Ida Flörke
+ Thomas Troelsgård
+ 214–219
+ In this paper we report on a new Danish lexical initiative, the Central Word Register for Danish (COR), which aims at providing an open-source, well-curated and large-coverage lexicon for AI purposes. The semantic part of the lexicon (COR-S) relies to a large extent on the lexical-semantic information provided in the Danish wordnet, DanNet. However, we have taken the opportunity to evaluate and curate the wordnet information while compiling the new resource. Some information types have been simplified and more systematically curated. This is the case for the hyponymy relations, the ontological typing, and the sense inventory, i.e. the treatment of polysemy, including systematic polysemy.
+ 2023.gwc-1.26
+ pedersen-etal-2023-reusing
+
+
+ Recent Developments in BTB-WordNet
+ Kiril Simov
+ Petya Osenova
+ 220–227
+ The paper reports on recent developments in the Bulgarian BTB-WordNet (BTB-WN). This resource is viewed as playing a central role in the integration and interlinking of various language resources, such as: e-dictionaries (morphological, terminological, bilingual, orthographic, etymological and explanatory, etc., including editions from previous periods); corpora (coming from outside or internal, like the corpus of definitions as well as the corpus of examples for synset meanings); ontologies (such as CIDOC-CRM, DBpedia, etc.); and sources of world knowledge (such as information from the Bulgarian Encyclopedia, Wikipedia, etc.). The paper also gives information about a number of applications built on BTB-WN: the Bulgaria-centered knowledge graph, the All about word application, and some education-oriented exercises.
+ 2023.gwc-1.27
+ simov-osenova-2023-recent
+
+
+ Lexicalised and non-lexicalized multi-word expressions in WordNet: a cross-encoder approach
+ Marek Maziarz
+ Łukasz Grabowski
+ Tadeusz Piotrowski
+ Ewa Rudnicka
+ Maciej Piasecki
+ 228–234
+ Focusing on the recognition of multi-word expressions (MWEs), we address the problem of recording MWEs in WordNet. In fact, not all MWEs recorded in that lexical database can undoubtedly be considered lexicalised (e.g. elements of wordnet taxonomy, quantifier phrases, certain collocations). In this paper, we use a cross-encoder approach to improve our earlier method of distinguishing between lexicalised and non-lexicalised MWEs found in WordNet, which used custom-designed rule-based and statistical approaches. We achieve an F1-measure close to 80% for the class of lexicalised word combinations, easily beating two baselines (a random one and a majority-class one). The language model also proves to be better than a feature-based logistic regression model.
+ 2023.gwc-1.28
+ maziarz-etal-2023-lexicalised
+
+
+ Towards an RDF Representation of the Infrastructure consisting in using Wordnets as a conceptual Interlingua between multilingual Sign Language Datasets
+ Thierry Declerck
+ Thomas Troelsgård
+ Sussi Olsen
+ 235–242
+ We present ongoing work dealing with a Linked Data compliant representation of infrastructures using wordnets for connecting multilingual Sign Language data sets. We build for this on already existing RDF and OntoLex representations of Open Multilingual Wordnet (OMW) data sets and work done by the European EASIER research project on the use of the CSV files of OMW for linking glosses and basic semantic information associated with Sign Language data sets in two languages: German and Greek. In this context, we started the transformation into RDF of a Danish data set, which links Danish Sign Language data and the wordnet for Danish, DanNet. The final objective of our work is to include Sign Language data sets (and their conceptual cross-linking via wordnets) in the Linguistic Linked Open Data cloud.
+ 2023.gwc-1.29
+ declerck-etal-2023-towards
+
+
+ Semantic Parsing and Sense Tagging the Princeton WordNet Gloss Corpus
+ Alexandre Rademaker
+ Abhishek Basu
+ Rajkiran Veluri
+ 243–253
+ In 2008, the Princeton team released the last version of the “Princeton Annotated Gloss Corpus”. In this corpus, the word forms from the definitions and examples (glosses) of Princeton WordNet are manually linked to the context-appropriate sense in WordNet. However, the annotation was not complete, and the dataset was never officially released as part of WordNet 3.0, remaining one of the standoff files available for download. Eleven years later, in 2019, one of the authors of this paper restarted the project, aiming to complete the sense annotation of the approximately 200 thousand word forms not yet annotated. Here, we provide additional motivations for completing this dataset and report on the progress of the work and its evaluation. Intending to provide an extra level of consistency in the sense annotation and a deep semantic representation of the definitions and examples, promoting WordNet from a lexical resource to a lightweight ontology, we now employ the English Resource Grammar (ERG), a broad-coverage HPSG grammar of English, to parse the sentences and project the sense annotations from the surface words to the ERG predicates. We also report some initial steps in upgrading the corpus to WordNet 3.1 to facilitate mapping the data to other lexical resources.
+ 2023.gwc-1.30
+ rademaker-etal-2023-semantic
+
+
+ Context-Gloss Augmentation for Improving Arabic Target Sense Verification
+ Sanad Malaysha
+ Mustafa Jarrar
+ Mohammed Khalilia
+ 254–262
+ The Arabic language lacks semantic datasets and sense inventories. The most common semantically-labeled dataset for Arabic is ArabGlossBERT, a relatively small dataset that consists of 167K context-gloss pairs (about 60K positive and 107K negative pairs) collected from Arabic dictionaries. This paper presents an enrichment of the ArabGlossBERT dataset, augmenting it using (Arabic-English-Arabic) machine back-translation. Augmentation increased the dataset size to 352K pairs (149K positive and 203K negative pairs). We measure the impact of augmentation using different data configurations to fine-tune BERT on the target sense verification (TSV) task. Overall, the accuracy ranges from 78% to 84% across data configurations. Although our approach performed on par with the baseline, we did observe improvements for some POS tags in some experiments. Furthermore, our fine-tuned models are trained on a larger dataset covering a larger vocabulary and more contexts. We provide an in-depth analysis of the accuracy for each part-of-speech (POS) tag.
+ 2023.gwc-1.31
+ malaysha-etal-2023-context
+
+
+ The Open Cantonese Sense-Tagged Corpus
+ JoannaSio
+ Luis Morgado DaCosta
+ 263–268
+ This paper introduces the Open Cantonese Sense-Tagged Corpus, a new and ongoing project that serves as the companion to the development of the Cantonese Wordnet. This corpus is built on top of the Cantonese Wordnet Corpus, which currently provides example sentences for most verbs in this wordnet. This paper motivates the choice of starting a sense-tagged corpus from both linguistic and educational perspectives, and discusses the current solutions to issues arising from the sense-tagging exercise. In total, we have tagged over 5,000 concepts, with more than 3,700 direct links to the Cantonese Wordnet.
+ 2023.gwc-1.32
+ sio-costa-2023-open
+
+
+ Correcting Sense Annotations Using Wordnets and Translations
+ ArnobMallik
+ GrzegorzKondrak
+ 269–273
+ Acquiring large amounts of high-quality annotated data is an open issue in word sense disambiguation. This problem has become more critical recently with the advent of supervised models based on neural networks, which require large amounts of annotated data. We propose two algorithms for making selective corrections on a sense-annotated parallel corpus, based on cross-lingual synset mappings. We show that, when applied to bilingual parallel corpora, these algorithms can rectify noisy sense annotations, and thereby produce multilingual sense-annotated data of improved quality.
+ 2023.gwc-1.33
+ mallik-kondrak-2023-correcting
+
+
+ A Benchmark and Scoring Algorithm for Enriching Arabic Synonyms
+ SanaGhanem
+ MustafaJarrar
+ RadiJarrar
+ IbrahimBounhas
+ 274–283
+ This paper addresses the task of extending a given synset with additional synonyms taking into account synonymy strength as a fuzzy value. Given a mono/multilingual synset and a threshold (a fuzzy value [0−1]), our goal is to extract new synonyms above this threshold from existing lexicons. We present twofold contributions: an algorithm and a benchmark dataset. The dataset consists of 3K candidate synonyms for 500 synsets. Each candidate synonym is annotated with a fuzzy value by four linguists. The dataset is important for (i) understanding how much linguists (dis)agree on synonymy, in addition to (ii) using the dataset as a baseline to evaluate our algorithm. Our proposed algorithm extracts synonyms from existing lexicons and computes a fuzzy value for each candidate. Our evaluations show that the algorithm behaves like a linguist and its fuzzy values are close to those proposed by linguists (using RMSE and MAE). The dataset and a demo page are publicly available at https://portal.sina.birzeit.edu/synonyms.
+ 2023.gwc-1.34
+ ghanem-etal-2023-benchmark
+
+
+ Expanding the Conceptual Description of Verbs in WordNet with Semantic and Syntactic Information
+ IvelinaStoyanova
+ SvetlozaraLeseva
+ 284–294
+ This paper describes an ongoing effort towards expanding the semantic and conceptual description of verbs in WordNet by combining information from two other resources, FrameNet and VerbNet, as well as enriching the verbs’ description with syntactic patterns extracted from the three resources. The conceptual description of verb synsets is provided by assigning a FrameNet frame which provides the relevant set of frame elements denoting the predicate’s participants and props. This information is supplemented by assigning a VerbNet class and the set of semantic roles associated with it. The information extracted from FrameNet and VerbNet and assigned to a synset is aligned (semi-automatically with subsequent manual corrections) at the following levels: (i) FrameNet frame: VerbNet class; (ii) FrameNet frame elements: VerbNet semantic roles; (iii) FrameNet semantic types and restrictions: VerbNet selectional restrictions. We then link the syntactic patterns associated with the units in FrameNet, VerbNet and WordNet, by unifying their representation and by matching the corresponding patterns at the level of syntactic groups. The alignment of the semantic components and their syntactic realisations is essential for the better exploitation of the abundance of information across resources, including shedding light on cross-resource similarities, discrepancies and inconsistencies. The syntactic patterns can facilitate the extraction of examples illustrating the use of verb synset literals in corpora and their semantic characterisation through the association of the syntactic groups with the components of semantic description (frame elements or semantic roles) and can be employed in various tasks requiring semantic and syntactic description. The resource is publicly available to the community. The components of the conceptual description are visualised showing the links to the original resources each component is drawn from.
+ 2023.gwc-1.35
+ stoyanova-leseva-2023-expanding
+
+
+ An Experiment: Finding Parents for Parentless Synsets by Means of CILI
+ AhtiLohk
+ MartinRebane
+ HeiliOrav
+ 295–302
+ Identifying and correcting inconsistencies in wordnets is a natural part of their development. Focusing only on the subproblem of missing links, we aim to automatically find possible parents for parentless synsets in the IS-A hierarchies of a target wordnet by means of source wordnets, where both target and source wordnets are in XML format and equipped with the Collaborative Interlingual Index (CILI). In this paper, we describe the algorithm and provide statistics on the possible parents of parentless synsets of the wordnets included in the study. Additionally, we investigate the suitability of the proposed potential parent synsets for correcting noun and verb synsets within the Estonian wordnet.
+ 2023.gwc-1.36
+ lohk-etal-2023-experiment
+
+
+ Extending the usage of adjectives in the Zulu AfWN
+ LauretteMarais
+ LaurettePretorius
+ 303–314
+ The African languages Wordnet (AfWN) for Zulu (ZWN) was built using the expand approach, which relies on the translation of concepts in the Princeton WordNet (PWN), while retaining their PWN lexical categories. In this paper the focus is on the adjective as a PWN lexical category. What is considered adjectival information (provided both attributively and predicatively) in English is usually verbalised quite differently in Zulu - often as verb or copulative constructions - as may be seen by inspecting the Zulu written forms in “adjective” entries in ZWN. These written forms are not complete Zulu verb or copulative constructions, and in order for them to be useful, tense, polarity and agreement have to be added. This paper presents a grammar-based approach to recover important morphosyntactic information implicit in the ZWN “adjective” written forms in order to derive a tool that would assist a user of the ZWN in rendering and analysing correct full forms automatically, as required by the context in which an “adjective” is used.
+ 2023.gwc-1.37
+ marais-pretorius-2023-extending
+
+
+ Linking SIL Semantic Domains to Wordnet and Expanding the Abui Wordnet through Rapid Word Collection Methodology
+ Luis Morgado DaCosta
+ FrantišekKratochvíl
+ GeorgeSaad
+ BenidiktusDelpada
+ Daniel SimonLanma
+ FrancisBond
+ NatálieWolfová
+ A.l.Blake
+ 315–324
+ In this paper we describe a new methodology to expand the Abui Wordnet through data collected using the Rapid Word Collection (RWC) method – based on SIL’s Semantic Domains. Using a multilingual sense-intersection algorithm, we created a ranked list of concept suggestions for each domain, and then used this ranked list as a filter to link the Abui RWC data to wordnet. This made use of translations from both the structure of SIL’s Semantic Domains and their example words, available through SIL’s Fieldworks software and the RWC project. We release both the new mapping of the SIL Semantic Domains to wordnet and an expansion of the Abui Wordnet.
+ 2023.gwc-1.38
+ costa-etal-2023-linking
+
+
+ Wordnet-oriented recognition of derivational relations
+ WiktorWalentynowicz
+ MaciejPiasecki
+ 325–330
+ Derivational relations are an important element in defining meanings, as they help to explore word-formation schemes and predict senses of derivates (derived words). In this work, we analyse different methods of representing derivational forms obtained from WordNet – from quantitative vectors to contextual learned embedding methods – and compare ways of classifying the derivational relations occurring between them. Our research focuses on the explainability of the obtained representations and results. The data source for our research is plWordNet, which is the wordnet of the Polish language and includes a rich set of derivation examples.
+ 2023.gwc-1.39
+ walentynowicz-piasecki-2023-wordnet
+
+
+ What do Language Models know about word senses? Zero-Shot WSD with Language Models and Domain Inventories
+ OscarSainz
+ Oier Lopezde Lacalle
+ EnekoAgirre
+ GermanRigau
+ 331–342
+ Language Models are at the core of almost any Natural Language Processing system nowadays. One of their particularities is their contextualized representations, a game-changing feature when disambiguation between word senses is necessary. In this paper we aim to explore to what extent language models are capable of discerning among senses at inference time. We performed this analysis by prompting commonly used Language Models such as BERT or RoBERTa to perform the task of Word Sense Disambiguation (WSD). We leverage the relation between word senses and domains, and cast WSD as a textual entailment problem, where the different hypotheses refer to the domains of the word senses. Our results show that this approach is indeed effective, coming close to supervised systems.
+ 2023.gwc-1.40
+ sainz-etal-2023-language
+
+
+ Resolving Multiple Hyperonymy
+ SvetlaKoeva
+ DimitarHristov
+ 343–351
+ WordNet contains a fair number of synsets with multiple hyperonyms. In parent–child relations, a child can have only one parent (ancestor). Consequently, multiple hyperonymy represents distinct semantic relations. In order to reclassify the multiple hyperonyms, we define a small set of new semantic relations (such as function, origin and form) that cover the various instances of multiple hyperonyms. The synsets with multiple hyperonyms that lead to the same root and belong to the same semantic class were grouped automatically, resulting in semantic patterns that serve as a point of departure for the classification. The proposed changes are based on semantic analysis and may involve the redefinition of one or several multiple hyperonymy relations to new ones, the removal of one or several multiple hyperonymy relations, and rarely the addition of a new hyperonymy relation. As a result, we incorporate the newly defined semantic relations that resolve the former multiple hyperonymy relations and propose an updated WordNet structure without multiple hyperonyms. The resulting WordNet structure without multiple hyperonyms may be used for a variety of purposes that require proper inheritance.
+ 2023.gwc-1.41
+ koeva-hristov-2023-resolving
+
+
+ Towards the integration of WordNet into ClinIDMap
+ ElenaZotova
+ MontseCuadros
+ GermanRigau
+ 352–362
+ This paper presents the integration of the WordNet knowledge resource into the ClinIDMap tool, which aims to map identifiers between clinical ontologies and lexical resources. ClinIDMap interlinks identifiers from UMLS, SNOMED-CT and ICD-10 with the corresponding Wikidata and Wikipedia articles for concepts from the UMLS Metathesaurus. The main goal of the tool is to provide semantic interoperability across the clinical concepts from various knowledge bases. As a side effect, the mapping enriches already annotated medical corpora in multiple languages with new labels. In this new release, we add WordNet 3.0 and 3.1 synsets using the available mappings through Wikidata. Thanks to the cross-lingual links in the MCR, we also include the corresponding synsets in other languages and further extend ClinIDMap with domain information. Finally, the resulting resource helps in the task of enriching already annotated clinical corpora with additional semantic annotations.
+ 2023.gwc-1.42
+ zotova-etal-2023-towards
+
+
+ Connecting Multilingual Wordnets: Strategies for Improving ILI Classification in OdeNet
+ MelanieSiegel
+ JohannBergh
+ 363–368
+ The Open Multilingual Wordnet (OMW) is an open source project that was launched with the goal of making it easy to use wordnets in multiple languages without having to pay expensive proprietary licensing costs. As OMW evolved, the Interlingual Index (ILI) was used to allow semantically equivalent synsets in different languages to be linked to each other. OdeNet is the German-language wordnet which forms part of the OMW project. This paper analyses the shortcomings of the initial ILI classification in OdeNet and the consequent methods used to improve this classification.
+ 2023.gwc-1.43
+ siegel-bergh-2023-connecting
+
+
+
From 9ee66d81f57bc58f799da9c9d9a89a399ac0a710 Mon Sep 17 00:00:00 2001
From: acl-pwc-bot <94475230+acl-pwc-bot@users.noreply.github.com>
Date: Thu, 9 Nov 2023 02:05:36 +0100
Subject: [PATCH 02/12] Update metadata from Papers with Code
---
data/xml/2020.aacl.xml | 2 +-
data/xml/2020.acl.xml | 1 +
data/xml/2020.bionlp.xml | 1 -
data/xml/2020.lrec.xml | 1 +
data/xml/2020.starsem.xml | 1 -
data/xml/2021.acl.xml | 1 +
data/xml/2021.naacl.xml | 1 +
data/xml/2022.acl.xml | 1 +
data/xml/2022.coling.xml | 3 ++-
data/xml/2022.ecnlp.xml | 1 +
data/xml/2022.lrec.xml | 1 +
data/xml/2022.naacl.xml | 1 +
data/xml/W19.xml | 1 +
13 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/data/xml/2020.aacl.xml b/data/xml/2020.aacl.xml
index add44b9725..b6764cb4ec 100644
--- a/data/xml/2020.aacl.xml
+++ b/data/xml/2020.aacl.xml
@@ -1548,7 +1548,7 @@
We introduce fairseq S2T, a fairseq extension for speech-to-text (S2T) modeling tasks such as end-to-end speech recognition and speech-to-text translation. It follows fairseq’s careful design for scalability and extensibility. We provide end-to-end workflows from data pre-processing, model training to offline (online) inference. We implement state-of-the-art RNN-based as well as Transformer-based models and open-source detailed training recipes. Fairseq’s machine translation models and language models can be seamlessly integrated into S2T workflows for multi-task learning or transfer learning. Fairseq S2T is available at https://github.com/pytorch/fairseq/tree/master/examples/speech_to_text.
2020.aacl-demo.6
wang-etal-2020-fairseq
- pytorch/fairseq
+ pytorch/fairseq
LibriSpeech
MuST-C
diff --git a/data/xml/2020.acl.xml b/data/xml/2020.acl.xml
index d7649eb30b..86c58c3a39 100644
--- a/data/xml/2020.acl.xml
+++ b/data/xml/2020.acl.xml
@@ -1343,6 +1343,7 @@
miao-etal-2020-diverse
chaochun/nlu-asdiv-dataset
+ ASDiv
MathQA
diff --git a/data/xml/2020.bionlp.xml b/data/xml/2020.bionlp.xml
index 3de390c31c..87d451a93b 100644
--- a/data/xml/2020.bionlp.xml
+++ b/data/xml/2020.bionlp.xml
@@ -288,7 +288,6 @@
2020.bionlp-1.19
10.18653/v1/2020.bionlp-1.19
nejadgholi-etal-2020-extensive
- nrc-cnrc/NRC-MedNER-Eval
MedMentions
diff --git a/data/xml/2020.lrec.xml b/data/xml/2020.lrec.xml
index d59271f1ba..bf10dfe367 100644
--- a/data/xml/2020.lrec.xml
+++ b/data/xml/2020.lrec.xml
@@ -10823,6 +10823,7 @@
We discovered an evaluation metric error in one independent component in the paper (Named Entity Recognition, Section 8) which hid a flaw in its implementation. In this revision, we update the component with an improved implementation, and correct the evaluation metric error. This change has no consequence on the overall conclusions of the paper. We also fixed a few minor typos, and added an acknowledgment.
obeid-etal-2020-camel
CAMeL-Lab/camel_tools
+ Arabic-ToD
ReSiPC: a Tool for Complex Searches in Parallel Corpora
diff --git a/data/xml/2020.starsem.xml b/data/xml/2020.starsem.xml
index 4e0678409d..e92adeeff6 100644
--- a/data/xml/2020.starsem.xml
+++ b/data/xml/2020.starsem.xml
@@ -191,7 +191,6 @@
We introduce a new dataset for training and evaluating grounded language models. Our data is collected within a virtual reality environment and is designed to emulate the quality of language data to which a pre-verbal child is likely to have access: That is, naturalistic, spontaneous speech paired with richly grounded visuospatial context. We use the collected data to compare several distributional semantics models for verb learning. We evaluate neural models based on 2D (pixel) features as well as feature-engineered models based on 3D (symbolic, spatial) features, and show that neither modeling approach achieves satisfactory performance. Our results are consistent with evidence from child language acquisition that emphasizes the difficulty of learning verbs from naive distributional data. We discuss avenues for future work on cognitively-inspired grounded language learning, and release our corpus with the intent of facilitating research on the topic.
2020.starsem-1.16
ebert-pavlick-2020-visuospatial
- dylanebert/nbc_starsem
New Brown Corpus
diff --git a/data/xml/2021.acl.xml b/data/xml/2021.acl.xml
index 8a45b3c280..5c14ceeed4 100644
--- a/data/xml/2021.acl.xml
+++ b/data/xml/2021.acl.xml
@@ -5476,6 +5476,7 @@
alipay/KnowledgeGraphEmbeddingsViaPairedRelationVectors_PairRE
FB15k-237
OGB
+ Open Graph Benchmark
Hierarchy-aware Label Semantics Matching Network for Hierarchical Text Classification
diff --git a/data/xml/2021.naacl.xml b/data/xml/2021.naacl.xml
index a28eaae04f..4b0fafff4d 100644
--- a/data/xml/2021.naacl.xml
+++ b/data/xml/2021.naacl.xml
@@ -2629,6 +2629,7 @@
arkilpatel/SVAMP
SVAMP
+ ASDiv
MAWPS
Math23K
diff --git a/data/xml/2022.acl.xml b/data/xml/2022.acl.xml
index 0ccc5ab8dd..d274b9bd9c 100644
--- a/data/xml/2022.acl.xml
+++ b/data/xml/2022.acl.xml
@@ -3097,6 +3097,7 @@
automl-research/kgtuner
FB15k-237
OGB
+ Open Graph Benchmark
A Meta-framework for Spatiotemporal Quantity Extraction from Text
diff --git a/data/xml/2022.coling.xml b/data/xml/2022.coling.xml
index 9c08a59f97..7092cdfa16 100644
--- a/data/xml/2022.coling.xml
+++ b/data/xml/2022.coling.xml
@@ -1475,7 +1475,7 @@
Natural Language Processing (NLP) has become increasingly utilized to provide adaptivity in educational applications. However, recent research has highlighted a variety of biases in pre-trained language models. While existing studies investigate bias in different domains, they are limited in addressing fine-grained analysis on educational corpora and text that is not English. In this work, we analyze bias across text and through multiple architectures on a corpus of 9,165 German peer-reviews collected from university students over five years. Notably, our corpus includes labels such as helpfulness, quality, and critical aspect ratings from the peer-review recipient as well as demographic attributes. We conduct a Word Embedding Association Test (WEAT) analysis on (1) our collected corpus in connection with the clustered labels, (2) the most common pre-trained German language models (T5, BERT, and GPT-2) and GloVe embeddings, and (3) the language models after fine-tuning on our collected data-set. In contrast to our initial expectations, we found that our collected corpus does not reveal many biases in the co-occurrence analysis or in the GloVe embeddings. However, the pre-trained German language models find substantial conceptual, racial, and gender bias and have significant changes in bias across conceptual and racial axes during fine-tuning on the peer-review data. With our research, we aim to contribute to the fourth UN sustainability goal (quality education) with a novel dataset, an understanding of biases in natural language education data, and the potential harms of not counteracting biases in language models for educational tasks.
2022.coling-1.115
wambsganss-etal-2022-bias
- epfl-ml4ed/bias-at-a-second-glance
+ epfl-ml4ed/bias-at-a-second-glance
Dynamic Relevance Graph Network for Knowledge-Aware Question Answering
@@ -4525,6 +4525,7 @@
cha-etal-2022-noun
Adds corresponding author information.
+ ASDiv
MAWPS
diff --git a/data/xml/2022.ecnlp.xml b/data/xml/2022.ecnlp.xml
index ef09912882..891f0a285f 100644
--- a/data/xml/2022.ecnlp.xml
+++ b/data/xml/2022.ecnlp.xml
@@ -379,6 +379,7 @@
liu-etal-2022-towards
10.18653/v1/2022.ecnlp-1.26
+ WANDS
Can Pretrained Language Models Generate Persuasive, Faithful, and Informative Ad Text for Product Descriptions?
diff --git a/data/xml/2022.lrec.xml b/data/xml/2022.lrec.xml
index b2ce6bdb05..a6ab2775b8 100644
--- a/data/xml/2022.lrec.xml
+++ b/data/xml/2022.lrec.xml
@@ -4671,6 +4671,7 @@
2022.lrec-1.373
sharma-etal-2022-hawp
Pruthwik/Hindi-Word-Problem-Solver
+ ASDiv
MAWPS
diff --git a/data/xml/2022.naacl.xml b/data/xml/2022.naacl.xml
index fe1da2e113..87aa5ab9d3 100644
--- a/data/xml/2022.naacl.xml
+++ b/data/xml/2022.naacl.xml
@@ -5065,6 +5065,7 @@
10.18653/v1/2022.naacl-main.310
kevivk/mwp-augmentation
+ ASDiv
MAWPS
SVAMP
diff --git a/data/xml/W19.xml b/data/xml/W19.xml
index 45b94101ea..a9a3a81dd7 100644
--- a/data/xml/W19.xml
+++ b/data/xml/W19.xml
@@ -14123,6 +14123,7 @@ One of the references was wrong therefore it is corrected to cite the appropriat
W19-5945
10.18653/v1/W19-5945
keizer-etal-2019-user
+ skeizer/madrigal
Dialogue Act Classification in Team Communication for Robot Assisted Disaster Response
From 8a39df381ffd922301c2cc6b87326367a9218dc5 Mon Sep 17 00:00:00 2001
From: anthology-assist <126604033+anthology-assist@users.noreply.github.com>
Date: Thu, 9 Nov 2023 09:56:26 -0600
Subject: [PATCH 03/12] Ingestion: Ranlp 2023 (#2865)
---
data/xml/2023.alp.xml | 276 +++++
data/xml/2023.case.xml | 262 +++++
data/xml/2023.contents.xml | 94 ++
data/xml/2023.dravidianlangtech.xml | 536 +++++++++
data/xml/2023.humeval.xml | 188 ++++
data/xml/2023.isa.xml | 2 +-
data/xml/2023.ltedi.xml | 533 +++++++++
data/xml/2023.nlp4tia.xml | 106 ++
data/xml/2023.ranlp.xml | 1560 +++++++++++++++++++++++++++
data/xml/2023.tsar.xml | 161 +++
data/yaml/venues/alp.yaml | 2 +
data/yaml/venues/contents.yaml | 3 +
data/yaml/venues/nlp4tia.yaml | 2 +
13 files changed, 3724 insertions(+), 1 deletion(-)
create mode 100644 data/xml/2023.alp.xml
create mode 100644 data/xml/2023.case.xml
create mode 100644 data/xml/2023.contents.xml
create mode 100644 data/xml/2023.dravidianlangtech.xml
create mode 100644 data/xml/2023.humeval.xml
create mode 100644 data/xml/2023.ltedi.xml
create mode 100644 data/xml/2023.nlp4tia.xml
create mode 100644 data/xml/2023.ranlp.xml
create mode 100644 data/xml/2023.tsar.xml
create mode 100644 data/yaml/venues/alp.yaml
create mode 100644 data/yaml/venues/contents.yaml
create mode 100644 data/yaml/venues/nlp4tia.yaml
diff --git a/data/xml/2023.alp.xml b/data/xml/2023.alp.xml
new file mode 100644
index 0000000000..6682d62c2e
--- /dev/null
+++ b/data/xml/2023.alp.xml
@@ -0,0 +1,276 @@
+
+
+
+
+ Proceedings of the Ancient Language Processing Workshop
+ AdamAnderson
+ ShaiGordin
+ BinLi
+ YudongLiu
+ Marco C.Passarotti
+ INCOMA Ltd., Shoumen, Bulgaria
+ Varna, Bulgaria
+ September
+ 2023
+ 2023.alp-1
+ alp
+ ws
+
+
+ 2023.alp-1.0
+ alp-2023-ancient
+
+
+ Training and Evaluation of Named Entity Recognition Models for Classical Latin
+ MarijkeBeersmans
+ Eveliende Graaf
+ TimVan de Cruys
+ MargheritaFantoli
+ 1–12
+ We evaluate the performance of various models on the task of named entity recognition (NER) for classical Latin. Using an existing dataset, we train two transformer-based LatinBERT models and one shallow conditional random field (CRF) model. The performance is assessed using both standard metrics and a detailed manual error analysis, and compared to the results obtained by different already released Latin NER tools. Both analyses demonstrate that the BERT models achieve a better f1-score than the other models. Furthermore, we annotate new, unseen data for further evaluation of the models, and we discuss the impact of annotation choices on the results.
+ 2023.alp-1.1
+ beersmans-etal-2023-training
+
+
+ Sentence Embedding Models for Ancient Greek Using Multilingual Knowledge Distillation
+ KevinKrahn
+ DerrickTate
+ Andrew C.Lamicela
+ 13–22
+ Contextual language models have been trained on Classical languages, including Ancient Greek and Latin, for tasks such as lemmatization, morphological tagging, part of speech tagging, authorship attribution, and detection of scribal errors. However, high-quality sentence embedding models for these historical languages are significantly more difficult to achieve due to the lack of training data. In this work, we use a multilingual knowledge distillation approach to train BERT models to produce sentence embeddings for Ancient Greek text. The state-of-the-art sentence embedding approaches for high-resource languages use massive datasets, but our distillation approach allows our Ancient Greek models to inherit the properties of these models while using a relatively small amount of translated sentence data. We build a parallel sentence dataset using a sentence-embedding alignment method to align Ancient Greek documents with English translations, and use this dataset to train our models. We evaluate our models on translation search, semantic similarity, and semantic retrieval tasks and investigate translation bias. We make our training and evaluation datasets freely available.
+ 2023.alp-1.2
+ krahn-etal-2023-sentence
+
+
+ A Transformer-based parser for Syriac morphology
+ MartijnNaaijer
+ ConstantijnSikkel
+ MathiasCoeckelbergs
+ JiskAttema
+ Willem Th.Van Peursen
+ 23–29
+ In this project we train a Transformer-based model from scratch, with the goal of parsing the morphology of Ancient Syriac texts as accurately as possible. Syriac is still a low-resource language, so only a relatively small training set was available. Therefore, the training set was expanded by adding Biblical Hebrew data to it. Five different experiments were done: the model was trained on Syriac data only, it was trained with mixed Syriac and (un)vocalized Hebrew data, and it was pretrained on (un)vocalized Hebrew data and then finetuned on Syriac data. The models trained on Hebrew and Syriac data consistently outperform the models trained on Syriac data only. This shows that the differences between Syriac and Hebrew are small enough that it is worth adding Hebrew data when training a model for parsing Syriac morphology. Training models on different languages is an important trend in NLP; we show that this works well for relatively small datasets of Syriac and Hebrew.
+ 2023.alp-1.3
+ naaijer-etal-2023-transformer
+
+
+ Graecia capta ferum victorem cepit. Detecting Latin Allusions to Ancient Greek Literature
+ FrederickRiemenschneider
+ AnetteFrank
+ 30–38
+ Intertextual allusions hold a pivotal role in Classical Philology, with Latin authors frequently referencing Ancient Greek texts. Until now, the automatic identification of these intertextual references has been constrained to monolingual approaches, seeking parallels solely within Latin or Greek texts. In this study, we introduce SPhilBERTa, a trilingual Sentence-RoBERTa model tailored for Classical Philology, which excels at cross-lingual semantic comprehension and identification of identical sentences across Ancient Greek, Latin, and English. We generate new training data by automatically translating English texts into Ancient Greek. Further, we present a case study, demonstrating SPhilBERTa’s capability to facilitate automated detection of intertextual parallels.
+ 2023.alp-1.4
+ riemenschneider-frank-2023-graecia
+
+
+ Larth: Dataset and Machine Translation for Etruscan
+ GianlucaVico
+ GerasimosSpanakis
+ 39–48
+ Etruscan is an ancient language spoken in Italy from the 7th century BC to the 1st century AD. There are no native speakers of the language today, and its resources are scarce, as there are only an estimated 12,000 known inscriptions. To the best of our knowledge, there are no publicly available Etruscan corpora for natural language processing. Therefore, we propose a dataset for machine translation from Etruscan to English, which contains 2891 translated examples from existing academic sources. Some examples are extracted manually, while others are acquired automatically. Along with the dataset, we benchmark different machine translation models, observing that it is possible to achieve a BLEU score of 10.1 with a small transformer model. Releasing the dataset can help enable future research on this language, similar languages, or other languages with scarce resources.
+ 2023.alp-1.5
+ vico-spanakis-2023-larth
+
+
+ Evaluation of Distributional Semantic Models of Ancient Greek: Preliminary Results and a Road Map for Future Work
+ SilviaStopponi
+ NiloPedrazzini
+ SaskiaPeels
+ BarbaraMcGillivray
+ MalvinaNissim
+ 49–58
+ We evaluate four count-based and predictive distributional semantic models of Ancient Greek against AGREE, a composite benchmark of human judgements, to assess their ability to retrieve semantic relatedness. On the basis of the observations deriving from the analysis of the results, we design a procedure for a larger-scale intrinsic evaluation of count-based and predictive language models, including syntactic embeddings. We also propose possible ways of exploiting the different layers of the whole AGREE benchmark (including both human- and machine-generated data) and different evaluation metrics.
+ 2023.alp-1.6
+ stopponi-etal-2023-evaluation
+
+
+ Latin Morphology through the Centuries: Ensuring Consistency for Better Language Processing
+ FedericaGamba
+ DanielZeman
+ 59–67
+ This paper focuses on the process of harmonising the five Latin treebanks available in Universal Dependencies with respect to morphological annotation. We propose a workflow that allows us to first spot inconsistencies and missing information, in order to detect to what extent the annotations differ, and then correct the retrieved bugs, with the goal of equalising the annotation of morphological features in the treebanks and producing more consistent linguistic data. Subsequently, we present some experiments carried out with UDPipe and Stanza in order to assess the impact of such harmonisation on parsing accuracy.
+ 2023.alp-1.7
+ gamba-zeman-2023-latin
+
+
+ Cross-Lingual Constituency Parsing for Middle High German: A Delexicalized Approach
+ ErcongNie
+ HelmutSchmid
+ HinrichSchütze
+ 68–79
+ Constituency parsing plays a fundamental role in advancing natural language processing (NLP) tasks. However, training an automatic syntactic analysis system for ancient languages solely relying on annotated parse data is a formidable task due to the inherent challenges in building treebanks for such languages. It demands extensive linguistic expertise, leading to a scarcity of available resources. To overcome this hurdle, cross-lingual transfer techniques which require minimal or even no annotated data for low-resource target languages offer a promising solution. In this study, we focus on building a constituency parser for Middle High German (MHG) under realistic conditions, where no annotated MHG treebank is available for training. In our approach, we leverage the linguistic continuity and structural similarity between MHG and Modern German (MG), along with the abundance of MG treebank resources. Specifically, by employing the delexicalization method, we train a constituency parser on MG parse datasets and perform cross-lingual transfer to MHG parsing. Our delexicalized constituency parser demonstrates remarkable performance on the MHG test set, achieving an F1-score of 67.3%. It outperforms the best zero-shot cross-lingual baseline by a margin of 28.6 percentage points. The encouraging results underscore the practicality and potential for automatic syntactic analysis in other ancient languages that face similar challenges as MHG.
+ 2023.alp-1.8
+ nie-etal-2023-cross-lingual
+
+
+ Can Large Language Models Comprehend Ancient Chinese? A Preliminary Test on ACLUE
+ YixuanZhang
+ HaonanLi
+ 80–87
+ Large language models (LLMs) have demonstrated exceptional language understanding and generation capabilities. However, their ability to comprehend ancient languages, specifically ancient Chinese, remains largely unexplored. To bridge this gap, we introduce ACLUE, an evaluation benchmark designed to assess the language abilities of models in relation to ancient Chinese. ACLUE consists of 15 tasks that cover a range of skills, including phonetic, lexical, syntactic, semantic, inference and knowledge. By evaluating 8 state-of-the-art multilingual and Chinese LLMs, we have observed a significant divergence in their performance between modern Chinese and ancient Chinese. Among the evaluated models, ChatGLM2 demonstrates the highest level of performance, achieving an average accuracy of 37.45%. We have established a leaderboard for communities to assess their models.
+ 2023.alp-1.9
+ zhang-li-2023-large
+
+
+ Unveiling Emotional Landscapes in Plautus and Terentius Comedies: A Computational Approach for Qualitative Analysis
+ DavidePicca
+ CarolineRichard
+ 88–95
+ This ongoing study explores emotion recognition in Latin texts, specifically focusing on Latin comedies. Leveraging Natural Language Processing and classical philology insights, the project navigates the challenges of Latin’s intricate grammar and nuanced emotional expression. Despite initial challenges with lexicon translation and emotional alignment, the work provides a foundation for a more comprehensive analysis of emotions in Latin literature.
+ 2023.alp-1.10
+ picca-richard-2023-unveiling
+
+
+ Morphological and Semantic Evaluation of Ancient Chinese Machine Translation
+ KaiJin
+ DanZhao
+ WuyingLiu
+ 96–102
+ Machine translation (MT) of ancient Chinese texts presents unique challenges due to the complex grammatical structures, cultural nuances, and polysemy of the language. This paper focuses on evaluating the translation quality of different platforms for ancient Chinese texts using The Analects as a case study. The evaluation is conducted using the BLEU, LMS, and ESS metrics, and the platforms compared include three machine translation platforms (Baidu Translate, Bing Microsoft Translator, and DeepL), and one language generation model ChatGPT that can engage in translation endeavors. Results show that Baidu performs the best, surpassing the other platforms in all three metrics, while ChatGPT ranks second and demonstrates unique advantages. The translations generated by ChatGPT are deemed highly valuable as references. The study contributes to understanding the challenges of MT for ancient Chinese texts and provides insights for users and researchers in this field. It also highlights the importance of considering specific domain requirements when evaluating MT systems.
+ 2023.alp-1.11
+ jin-etal-2023-morphological
+
+
+ A tailored Handwritten-Text-Recognition System for Medieval Latin
+ PhilippKoch
+ Gilary VeraNuñez
+ EstebanGarces Arias
+ ChristianHeumann
+ MatthiasSchöffel
+ AlexanderHäberlin
+ MatthiasAssenmacher
+ 103–110
+ The Bavarian Academy of Sciences and Humanities aims to digitize the Medieval Latin Dictionary. This dictionary entails record cards referring to lemmas in medieval Latin, a low-resource language. A crucial step of the digitization process is the handwritten text recognition (HTR) of the handwritten lemmas on the record cards. In our work, we introduce an end-to-end pipeline, tailored for the medieval Latin dictionary, for locating, extracting, and transcribing the lemmas. We employ two state-of-the-art image segmentation models to prepare the initial data set for the HTR task. Further, we experiment with different transformer-based models and conduct a set of experiments to explore the capabilities of different combinations of vision encoders with a GPT-2 decoder. Additionally, we also apply extensive data augmentation resulting in a highly competitive model. The best-performing setup achieved a character error rate of 0.015, which is even superior to the commercial Google Cloud Vision model, and shows more stable performance.
+ 2023.alp-1.12
+ koch-etal-2023-tailored
+
+
+ Evaluating Existing Lemmatisers on Unedited Byzantine Greek Poetry
+ ColinSwaelens
+ IlseDe Vos
+ ElsLefever
+ 111–116
+ This paper reports on the results of a comparative evaluation in view of the development of a new lemmatizer for unedited, Byzantine Greek texts. For the experiment, the performance of four existing lemmatizers, all pre-trained on Ancient Greek texts, was evaluated on how well they could handle texts stemming from the Middle Ages and displaying quite a few peculiarities. The aim of this study is to gain insight into the pitfalls of existing lemmatisation approaches as well as the specific challenges of our Byzantine Greek corpus, in order to develop a lemmatizer that can cope with its peculiarities. The results of the experiment show an accuracy drop of 20 pp. on our corpus, which is further investigated in a qualitative error analysis.
+ 2023.alp-1.13
+ swaelens-etal-2023-evaluating
+
+
+ Vector Based Stylistic Analysis on Ancient Chinese Books: Take the Three Commentaries on the Spring and Autumn Annals as an Example
+ YueQi
+ LiuLiu
+ BinLi
+ DongboWang
+ 117–121
+ Commentary of Gongyang, Commentary of Guliang, and Commentary of Zuo are collectively called the Three Commentaries on the Spring and Autumn Annals; they supplement and interpret the content of the Spring and Autumn Annals and are valuable for historical and literary research. In traditional research paradigms, scholars have explored the differences between the Three Commentaries through close attention to contextual details. From the perspective of computational humanities, this paper examines the differences in the language style of the Three Commentaries through language representations learned with deep learning methods. Specifically, this study vectorizes the texts at the word and sentence levels and maps them into the same plane to find differences in the use of words and sentences across the Three Commentaries. The results show that the Commentary of Gongyang and the Commentary of Guliang are relatively similar, while the Commentary of Zuo is significantly different. This paper verifies the feasibility of deep learning methods for stylistic study within computational humanities and provides a valuable perspective for studying the Three Commentaries on the Spring and Autumn Annals.
+ 2023.alp-1.14
+ qi-etal-2023-vector
+
+
+ A Joint Model of Automatic Word Segmentation and Part-Of-Speech Tagging for Ancient Classical Texts Based on Radicals
+ BolinChang
+ YiguoYuan
+ BinLi
+ ZhixingXu
+ MinxuanFeng
+ DongboWang
+ 122–132
+ The digitization of ancient books necessitates the implementation of automatic word segmentation and part-of-speech tagging. However, the existing research on this topic encounters pressing issues, including suboptimal efficiency and precision, which require immediate resolution. This study employs a methodology that combines word segmentation and part-of-speech tagging. It establishes a correlation between fonts and radicals, trains the Radical2Vec radical vector representation model, and integrates it with the SikuRoBERTa word vector representation model. Finally, it connects the BiLSTM-CRF neural network. The study investigates the combination of word segmentation and part-of-speech tagging through an experimental approach using a specific data set. In the evaluation dataset, the F1 score for word segmentation is 95.75%, indicating a high level of accuracy. Similarly, the F1 score for part-of-speech tagging is 91.65%, suggesting a satisfactory performance in this task. This model enhances the efficiency and precision of the processing of ancient books, thereby facilitating the advancement of digitization efforts for ancient books and ensuring the preservation and advancement of ancient book heritage.
+ 2023.alp-1.15
+ chang-etal-2023-joint
+
+
+ Introducing an Open Source Library for Sumerian Text Analysis
+ HanselGuzman-Soto
+ YudongLiu
+ 133–137
+ The study of Sumerian texts often requires domain experts to examine a vast number of tables. However, the absence of user-friendly tools for this process poses challenges and consumes significant time. In addressing this issue, we introduce an open-source library that empowers domain experts with minimal technical expertise to automate manual and repetitive tasks using a no-code dashboard. Our library includes an information extraction module that enables the automatic extraction of names and relations based on the user-defined lists of name tags and relation types. By utilizing the tool to facilitate the creation of knowledge graphs which is a data representation method offering insights into the relationships among entities in the data, we demonstrate its practical application in the analysis of Sumerian texts.
+ 2023.alp-1.16
+ guzman-soto-liu-2023-introducing
+
+
+ Coding Design of Oracle Bone Inscriptions Input Method Based on “ZhongHuaZiKu” Database
+ DongxinHu
+ 138–147
+ Based on the oracle bone glyph data in the “ZhongHuaZiKu” database, this paper designs a new input method coding scheme that is easy to search in the database, providing a feasible basis for the design of oracle bone glyph input method software in the future. The coding scheme builds on the experience of past oracle bone inscription input method designs. In view of the particularity of oracle bone inscriptions, difference factors such as component combination, sound code, and shape code (letter) are added, and the coding format is designed as follows: single-component characters among the identified characters are arranged according to the format “structural code + full-spelling pronunciation code + tone code”; multi-component characters among the identified characters are arranged according to the format “structural code + split-component full-spelling pronunciation code + overall-glyph full-spelling pronunciation code”; unidentified characters are arranged according to the format “y + identified-component full spelling + unidentified-component shape code (letter)”. The identified component codes and the unidentified component shape codes are entered in turn according to the specific glyph, from left to right, from top to bottom, and from outside to inside. Encoding with these formats yields a low duplicate-code rate while accommodating the input habits of most users.
+ 2023.alp-1.17
+ hu-2023-coding
+
+
+ Word Sense Disambiguation for Ancient Greek: Sourcing a training corpus through translation alignment
+ AlekKeersmaekers
+ WouterMercelis
+ ToonVan Hal
+ 148–159
+ This paper seeks to leverage translations of Ancient Greek texts to enhance the performance of automatic word sense disambiguation (WSD). Satisfactory WSD in Ancient Greek is achievable, provided that the system can rely on annotated data. This study, acknowledging the challenges of manually assigning meanings to every Greek lemma, explores the strategies to derive WSD data from parallel texts using sentence and word alignment. Our results suggest that, assuming the condition of high word frequency is met, this technique permits us to automatically produce a significant volume of annotated data, although there are still significant obstacles when trying to automate this process.
+ 2023.alp-1.18
+ keersmaekers-etal-2023-word
+
+
+ Enhancing State-of-the-Art NLP Models for Classical Arabic
+ TariqYousef
+ LisaMischer
+ Hamid RezaHakimi
+ MaximRomanov
+ 160–169
+ Classical Arabic, like all other historical languages, lacks adequate training datasets and accurate “off-the-shelf” models that can be directly employed in processing pipelines. In this paper, we present our in-progress work in developing and training deep learning models tailored for handling diverse tasks relevant to classical Arabic texts. Specifically, we focus on Named Entity Recognition, person relationship classification, toponym sub-classification, onomastic section boundary detection, onomastic entity classification, as well as date recognition and classification. Our work aims to address the challenges associated with these tasks and provide effective solutions for analyzing classical Arabic texts. Although this work is still in progress, the preliminary results reported in the paper indicate excellent to satisfactory performance of the fine-tuned models, effectively meeting the intended goal for which they were trained.
+ 2023.alp-1.19
+ yousef-etal-2023-enhancing
+
+
+ Logion: Machine-Learning Based Detection and Correction of Textual Errors in Greek Philology
+ CharlieCowen-Breen
+ CrestonBrooks
+ BarbaraGraziosi
+ JohannesHaubold
+ 170–178
+ We present statistical and machine-learning based techniques for detecting and correcting errors in text and apply them to the challenge of textual corruption in Greek philology. Most ancient Greek texts reach us through a long process of copying, in relay, from earlier manuscripts (now lost). In this process of textual transmission, copying errors tend to accrue. After training a BERT model on the largest premodern Greek dataset used for this purpose to date, we identify and correct previously undetected errors made by scribes in the process of textual transmission, in what is, to our knowledge, the first successful identification of such errors via machine learning. The premodern Greek BERT model we train is available for use at https://huggingface.co/cabrooks/LOGION-base.
+ 2023.alp-1.20
+ cowen-breen-etal-2023-logion
+
+
+ Classical Philology in the Time of AI: Exploring the Potential of Parallel Corpora in Ancient Language
+ TariqYousef
+ ChiaraPalladino
+ FarnooshShamsian
+ 179–192
+ This paper provides an overview of diverse applications of parallel corpora in ancient languages, particularly Ancient Greek. In the first part, we provide the fundamental principles of parallel corpora and a short overview of their applications in the study of ancient texts. In the second part, we illustrate how to leverage parallel corpora to perform various NLP tasks, including automatic translation alignment, dynamic lexica induction, and Named Entity Recognition. In the conclusions, we emphasize current limitations and future work.
+ 2023.alp-1.21
+ yousef-etal-2023-classical
+
+
+ Using Word Embeddings for Identifying Emotions Relating to the Body in a Neo-Assyrian Corpus
+ EllieBennett
+ AleksiSahala
+ 193–202
+ Research into emotions is a developing field within Assyriology, and NLP tools for Akkadian texts offer a new perspective on the data. In this submission, we use PMI-based word embeddings to explore the relationship between parts of the body and emotions. Using data downloaded from Oracc, we ask which parts of the body were semantically linked to emotions. We do this through examining which of the top 10 results for a body part could be used to express emotions. After identifying two words for the body that have the most emotion words in their results list (libbu and kabattu), we then examine whether the emotion words in their results lists were indeed used in this manner in the Neo-Assyrian textual corpus. The results indicate that of the two body parts, kabattu was semantically linked to happiness and joy, and had a secondary emotional field of anger.
+ 2023.alp-1.22
+ bennett-sahala-2023-using
+
+
+ A Neural Pipeline for POS-tagging and Lemmatizing Cuneiform Languages
+ AleksiSahala
+ KristerLindén
+ 203–212
+ We presented a pipeline for POS-tagging and lemmatizing cuneiform languages and evaluated its performance on Sumerian, first millennium Babylonian, Neo-Assyrian and Urartian texts extracted from Oracc. The system achieves a POS-tagging accuracy of 95–98% and a lemmatization accuracy of 94–96% depending on the language or dialect. For OOV words only, the current version can predict correct POS-tags for 83–91%, and lemmata for 68–84% of the input words. Compared with the earlier version, the current one has about 10% higher accuracy in OOV lemmatization and POS-tagging due to better neural network performance. We also tested the system for lemmatizing and POS-tagging the PROIEL Ancient Greek and Latin treebanks, achieving results similar to those with the cuneiform languages.
+ 2023.alp-1.23
+ sahala-linden-2023-neural
+
+
+ Tibetan Dependency Parsing with Graph Convolutional Neural Networks
+ BoAn
+ 213–221
+ Dependency parsing is a syntactic analysis method to analyze the dependency relationships between words in a sentence. The interconnection between words through dependency relationships is typical graph data. Traditional Tibetan dependency parsing methods typically model dependency analysis as a transition-based or sequence-labeling task, ignoring the graph information between words. To address this issue, this paper proposes a graph neural network (GNN)-based Tibetan dependency parsing method. This method treats Tibetan words as nodes and the dependency relationships between words as edges, thereby constructing the graph data of Tibetan sentences. Specifically, we use BiLSTM to learn the word representations of Tibetan, utilize GNN to model the relationships between words and employ MLP to predict the types of relationships between words. We conduct experiments on a Tibetan dependency database, and the results show that the proposed method can achieve high-quality Tibetan dependency parsing results.
+ 2023.alp-1.24
+ an-2023-tibetan
+
+
+ On the Development of Interlinearized Ancient Literature of Ethnic Minorities: A Case Study of the Interlinearization of Ancient Written Tibetan Literature
+ CongjunLong
+ BoAn
+ 222–231
+ Ancient ethnic documents are essential to China’s ancient literature and an indispensable civilizational achievement of Chinese culture. However, few research teams are involved due to language and script literacy limitations. To address these issues, this paper proposes an interlinearized annotation strategy for ancient ethnic literature. This strategy aims to alleviate text literacy difficulties, encourage interdisciplinary researchers to participate in studying ancient ethnic literature, and improve the efficiency of ancient ethnic literature development. Concretely, the interlinearized annotation consists of original, word segmentation, Latin, annotated, and translation lines. In this paper, we take ancient Tibetan literature as an example to explore the interlinearized annotation strategy. However, manually building a large-scale corpus is challenging. To build a large-scale interlinearized dataset, we propose a multi-task learning-based interlinearized annotation method, which can generate interlinearized annotation lines based on the original line. Experimental results show that after training on about 10,000 sentences (lines) of data, our model achieves 70.9% and 63.2% F1 values on the segmentation lines and annotated lines, respectively, and 18.7% BLEU on the translation lines. It dramatically enhances the efficiency of data annotation, effectively speeds up interlinearized annotation, and reduces the workload of manual annotation.
+ 2023.alp-1.25
+ long-an-2023-development
+
+
+
diff --git a/data/xml/2023.case.xml b/data/xml/2023.case.xml
new file mode 100644
index 0000000000..d0157bece7
--- /dev/null
+++ b/data/xml/2023.case.xml
@@ -0,0 +1,262 @@
+
+
+
+
+ Proceedings of the 6th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Text
+ AliHürriyetoğlu
+ HristoTanev
+ VanniZavarella
+ ReyyanYeniterzi
+ ErdemYörük
+ MilenaSlavcheva
+ INCOMA Ltd., Shoumen, Bulgaria
+ Varna, Bulgaria
+ September
+ 2023
+ 2023.case-1
+ case
+ ws
+
+
+ 2023.case-1.0
+ case-2023-challenges
+
+
+ Classifying Organized Criminal Violence in Mexico using ML and LLMs
+ JavierOsorio
+ JuanVasquez
+ 1–10
+ Natural Language Processing (NLP) tools have been rapidly adopted in political science for the study of conflict and violence. In this paper, we present an application to analyze various lethal and non-lethal events conducted by organized criminal groups and state forces in Mexico. Based on a large corpus of news articles in Spanish and a set of high-quality annotations, the application evaluates different Machine Learning (ML) algorithms and Large Language Models (LLMs) to classify documents and individual sentences, and to identify specific behaviors related to organized criminal violence and law enforcement efforts. Our experiments support the growing evidence that BERT-like models achieve outstanding classification performance for the study of organized crime. This application amplifies the capacity of conflict scholars to provide valuable information related to important security challenges in the developing world.
+ 2023.case-1.1
+ osorio-vasquez-2023-classifying
+
+
+ Where “where” Matters: Event Location Disambiguation with a BERT Language Model
+ HristoTanev
+ BertrandDe Longueville
+ 11–17
+ The method presented in this paper uses a BERT model for classifying location mentions in event-reporting news texts into two classes: the place of an event, called the main location, or another location mention, called here a secondary location. Our evaluation on articles reporting protests shows promising results and demonstrates the feasibility of our approach and of the event geolocation task in general. We evaluate our method against a simple baseline and state-of-the-art ML models, and we achieve a significant improvement in all cases by using the BERT model. In contrast to other location classification approaches, we completely avoid linguistic preprocessing and feature engineering, which is a prerequisite for multi-domain and multilingual applications.
+ 2023.case-1.2
+ tanev-de-longueville-2023-matters
+
+
+ A Multi-instance Learning Approach to Civil Unrest Event Detection on Twitter
+ AlexandraDeLucia
+ MarkDredze
+ Anna L.Buczak
+ 18–33
+ Social media has become an established platform for people to organize and take offline actions, often in the form of civil unrest. Understanding these events can help support pro-democratic movements. The primary method to detect these events on Twitter relies on aggregating many tweets, but this includes many that are not relevant to the task. We propose a multi-instance learning (MIL) approach, which jointly identifies relevant tweets and detects civil unrest events. We demonstrate that MIL improves civil unrest detection over methods based on simple aggregation. Our best model achieves a 0.73 F1 on the Global Civil Unrest on Twitter (G-CUT) dataset.
+ 2023.case-1.3
+ delucia-etal-2023-multi
+
+
+ MLModeler5 @ Causal News Corpus 2023: Using RoBERTa for Causal Event Classification
+ AmritaBhatia
+ AnanyaThomas
+ NitanshJain
+ JatinBedi
+ 34–37
+ Identifying cause-effect relations plays an integral role in the understanding and interpretation of natural languages. Furthermore, automated mining of causal relations from news and text about socio-political events is a stepping stone in gaining critical insights, including analyzing the scale, frequency and trends across timelines of events, as well as anticipating future ones. Shared Task 3, part of the 6th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Text (CASE @ RANLP 2023), involved the task of Event Causality Identification with the Causal News Corpus. We describe our approach to Subtask 1, dealing with causal event classification, a supervised binary classification problem to annotate given event sentences with whether they contain any cause-effect relations. To address this task, a BERT-based architecture, RoBERTa, was implemented. The results of this model are validated on the dataset provided by the organizers of this task.
+ 2023.case-1.4
+ bhatia-etal-2023-mlmodeler5
+
+
+ BoschAI @ Causal News Corpus 2023: Robust Cause-Effect Span Extraction using Multi-Layer Sequence Tagging and Data Augmentation
+ Timo PierreSchrader
+ SimonRazniewski
+ LukasLange
+ AnnemarieFriedrich
+ 38–43
+ Understanding causality is a core aspect of intelligence. The Event Causality Identification with Causal News Corpus Shared Task addresses two aspects of this challenge: Subtask 1 aims at detecting causal relationships in texts, and Subtask 2 requires identifying signal words and the spans that refer to the cause or effect, respectively. Our system, which is based on pre-trained transformers, stacked sequence tagging, and synthetic data augmentation, ranks third in Subtask 1 and wins Subtask 2 with an F1 score of 72.8, corresponding to a margin of 13 pp. to the second-best system.
+ 2023.case-1.5
+ schrader-etal-2023-boschai
+
+
+ An Evaluation Framework for Mapping News Headlines to Event Classes in a Knowledge Graph
+ Steve FoninMbouadeu
+ MartinLorenzo
+ KenBarker
+ OktieHassanzadeh
+ 44–52
+ Mapping ongoing news headlines to event-related classes in a rich knowledge base can be an important component in a knowledge-based event analysis and forecasting solution. In this paper, we present a methodology for creating a benchmark dataset of news headlines mapped to event classes in Wikidata, and resources for the evaluation of methods that perform the mapping. We use the dataset to study two classes of unsupervised methods for this task: 1) adaptations of classic entity linking methods, and 2) methods that treat the problem as a zero-shot text classification problem. For the first approach, we evaluate off-the-shelf entity linking systems. For the second approach, we explore a) pre-trained natural language inference (NLI) models, and b) pre-trained large generative language models. We present the results of our evaluation, lessons learned, and directions for future work. The dataset and scripts for evaluation are made publicly available.
+ 2023.case-1.6
+ mbouadeu-etal-2023-evaluation
+
+
+ Ometeotl@Multimodal Hate Speech Event Detection 2023: Hate Speech and Text-Image Correlation Detection in Real Life Memes Using Pre-Trained BERT Models over Text
+ JesusArmenta-Segura
+ César JesúsNúñez-Prado
+ Grigori OlegovichSidorov
+ AlexanderGelbukh
+ Rodrigo FranciscoRomán-Godínez
+ 53–59
+ Hate speech detection during times of war has become crucial in recent years, as is evident from the recent Russo-Ukrainian war. In this paper, we present our submissions for both subtasks of the Multimodal Hate Speech Event Detection contest at CASE 2023, RANLP 2023. We used pre-trained BERT models in both submissions, achieving an F1 score of 0.809 in subtask A and an F1 score of 0.567 in subtask B. In the first subtask, our result was not far from first place, which led us to realize the lower impact of images, compared with text, in real-life memes about feelings. However, we observed a higher importance of images when targeting hateful feelings towards a specific entity. The source code to reproduce our results can be found at the GitHub repository https://github.com/JesusASmx/OmeteotlAtCASE2023
+ 2023.case-1.7
+ armenta-segura-etal-2023-ometeotl
+
+
+ InterosML@Causal News Corpus 2023: Understanding Causal Relationships: Supervised Contrastive Learning for Event Classification
+ RajatPatel
+ 60–65
+ Causal events play a crucial role in explaining the intricate relationships between the causes and effects of events. However, comprehending causal events within discourse, text, or speech poses significant semantic challenges. We propose a contrastive learning-based method in this submission to the Causal News Corpus - Event Causality Shared Task 2023, with a specific focus on SubTask1 centered on causal event classification. In our approach we pre-train our base model using Supervised Contrastive (SuperCon) learning. Subsequently, we fine-tune the pre-trained model for the specific task of causal event classification. Our experimentation demonstrates the effectiveness of our method, achieving a competitive performance, and securing the 2nd position on the leaderboard with an F1-Score of 84.36.
+ 2023.case-1.8
+ patel-2023-interosml
+
+
+ SSN-NLP-ACE@Multimodal Hate Speech Event Detection 2023: Detection of Hate Speech and Targets using Logistic Regression and SVM
+ AvanthikaK
+ MrithulaKl
+ ThenmozhiD
+ 66–70
+ In this research paper, we propose a multimodal approach to hate speech detection, directed towards the identification of hate speech and its related targets. Our method uses logistic regression and support vector machines (SVMs) to analyse textual content extracted from social media platforms. We exploit natural language processing techniques to preprocess and extract relevant features from textual content, capturing linguistic patterns, sentiment, and contextual information.
+ 2023.case-1.9
+ k-etal-2023-ssn
+
+
+ ARC-NLP at Multimodal Hate Speech Event Detection 2023: Multimodal Methods Boosted by Ensemble Learning, Syntactical and Entity Features
+ UmitcanSahin
+ Izzet EmreKucukkaya
+ OguzhanOzcelik
+ CagriToraman
+ 71–78
+ Text-embedded images can serve as a means of spreading hate speech, propaganda, and extremist beliefs. Throughout the Russia-Ukraine war, both opposing factions heavily relied on text-embedded images as a vehicle for spreading propaganda and hate speech. Ensuring the effective detection of hate speech and propaganda is of utmost importance to mitigate the negative effect of hate speech dissemination. In this paper, we outline our methodologies for two subtasks of Multimodal Hate Speech Event Detection 2023. For the first subtask, hate speech detection, we utilize multimodal deep learning models boosted by ensemble learning and syntactical text attributes. For the second subtask, target detection, we employ multimodal deep learning models boosted by named entity features. Through experimentation, we demonstrate the superior performance of our models compared to all textual, visual, and text-visual baselines employed in multimodal hate speech detection. Furthermore, our models achieve the first place in both subtasks on the final leaderboard of the shared task.
+ 2023.case-1.10
+ sahin-etal-2023-arc
+
+
+ VerbaVisor@Multimodal Hate Speech Event Detection 2023: Hate Speech Detection using Transformer Model
+ SarikaEsackimuthu
+ PrabavathyBalasundaram
+ 79–83
+ Hate speech detection has emerged as a critical research area in recent years due to the rise of online social platforms and the proliferation of harmful content targeting individuals or specific groups. This task highlights the importance of detecting hate speech in text-embedded images. By leveraging deep learning models, this research aims to uncover the connection between hate speech and the entities it targets.
+ 2023.case-1.11
+ esackimuthu-balasundaram-2023-verbavisor
+
+
+ Lexical Squad@Multimodal Hate Speech Event Detection 2023: Multimodal Hate Speech Detection using Fused Ensemble Approach
+ MohammadKashif
+ MohammadZohair
+ SaquibAli
+ 84–91
+ With a surge in the usage of social media postings to express opinions, emotions, and ideologies, there has been a significant shift towards the calibration of social media as a rapid medium of conveying viewpoints and outlooks across the globe. Concurrently, the emergence of a multitude of conflicts between two entities has given rise to a stream of social media content containing propaganda, hate speech, and inconsiderate views. Thus, the issue of monitoring social media postings is rising swiftly, attracting major attention from those willing to solve such problems. One such problem is hate speech detection. To mitigate this problem, we present our novel ensemble learning approach for detecting hate speech, classifying text-embedded images into two labels, namely “Hate Speech” and “No Hate Speech”. We have incorporated state-of-the-art models including InceptionV3, BERT, and XLNet. Our proposed ensemble model yielded promising results, with an accuracy of 75.21 and an F1 score of 74.96. We also present an empirical evaluation of the text-embedded images to elaborate on how well the model was able to predict and classify.
+ 2023.case-1.12
+ kashif-etal-2023-lexical
+
+
+ On the Road to a Protest Event Ontology for Bulgarian: Conceptual Structures and Representation Design
+ MilenaSlavcheva
+ HristoTanev
+ OnurUca
+ 92–100
+ The paper presents a semantic model of protest events, called Semantic Interpretations of Protest Events (SemInPE). The analytical framework used for building the semantic representations is inspired by the object-oriented paradigm in computer science and a cognitive approach to linguistic analysis. The model is a practical application of the Unified Eventity Representation (UER) formalism, which is based on the Unified Modeling Language (UML). The multi-layered architecture of the model provides flexible means for building the semantic representations of the language objects along a scale of generality and specificity. Thus, it is a suitable environment for creating the elements of ontologies on various topics and for different languages.
+ 2023.case-1.13
+ slavcheva-etal-2023-road
+
+
+ CSECU-DSG@Multimodal Hate Speech Event Detection 2023: Transformer-based Multimodal Hierarchical Fusion Model For Multimodal Hate Speech Detection
+ AbdulAziz
+ MD. AkramHossain
+ Abu NowshedChy
+ 101–107
+ The emergence of social media and e-commerce platforms has enabled perpetrators to rapidly spread negativity and abuse individuals or organisations worldwide. It is critical to detect hate speech in both visual and textual content so that it may be moderated or excluded from online platforms to keep them sound and safe for users. However, multimodal hate speech detection is a complex and challenging task, as people present hate speech sarcastically and different modalities, i.e., image and text, are involved in their content. This paper describes our participation in the CASE 2023 multimodal hate speech event detection task. In this task, the objective is to automatically detect hate speech and its target from a given text-embedded image. We propose a transformer-based multimodal hierarchical fusion model to detect hate speech present in the visual content. We jointly fine-tune a language and a vision pre-trained transformer model to extract a visual-contextualized feature representation of the text-embedded image. We concatenate these features and feed them to a multi-sample dropout strategy. Moreover, the contextual feature vector is fed into a BiLSTM module, whose output also passes into the multi-sample dropout. We employ arithmetic mean fusion to fuse all sample dropout outputs and predict the final label. Experimental results demonstrate that our model obtains competitive performance and ranked 5th among the participants.
+ 2023.case-1.14
+ aziz-etal-2023-csecu-dsg
+
+
+ CSECU-DSG @ Causal News Corpus 2023: Leveraging RoBERTa and DeBERTa Transformer Model with Contrastive Learning for Causal Event Classification
+ MD. AkramHossain
+ AbdulAziz
+ Abu NowshedChy
+ 108–112
+ Cause-effect relationships play a crucial role in human cognition, and distilling cause-effect relations from text helps in ameliorating causal networks for predictive tasks. Many NLP applications can benefit from this task, including natural language-based financial forecasting, text summarization, and question answering. However, the lack of syntactic clues, the ambivalent semantic meaning of words, complex sentence structures, and the implicit meaning of numerical entities in text make it one of the most challenging tasks in NLP. To address these challenges, CASE-2023 introduced Shared Task 3, focusing on event causality identification with the Causal News Corpus. In this paper, we describe our participating systems for this task. We leverage two transformer models, DeBERTa and Twitter-RoBERTa, along with a weighted average fusion technique to tackle Subtask 1, where we need to identify whether a text is causal or not. For Subtask 2, where we need to identify the cause, effect, and signal tokens in the text, we propose a unified neural network of DeBERTa and DistilRoBERTa transformer variants with contrastive learning techniques. The experimental results show that our proposed method achieved competitive performance among the participants’ systems.
+ 2023.case-1.15
+ hossain-etal-2023-csecu
+
+
+ NEXT: An Event Schema Extension Approach for Closed-Domain Event Extraction Models
+ ElenaTuparova
+ PetarIvanov
+ AndreyTagarev
+ SvetlaBoytcheva
+ IvanKoychev
+ 113–123
+ Event extraction from textual data is an NLP research task relevant to a plethora of domains. Most approaches aim to recognize events from a predefined event schema, consisting of event types and their corresponding arguments. For domains such as disinformation, where new event types emerge frequently, there is a need to adapt such fixed event schemas to accommodate new event types. We present NEXT (New Event eXTraction) - a resource-sparse approach to extending a closed-domain model to novel event types, which requires only a very small number of annotated samples for fine-tuning performed on a single GPU. Furthermore, our results suggest that this approach is suitable not only for the extraction of new event types, but also for the recognition of existing event types, as its use on a new dataset leads to improved recall for all existing events while retaining precision.
+ 2023.case-1.16
+ tuparova-etal-2023-next
+
+
+ Negative documents are positive: Improving event extraction performance using overlooked negative data
+ OsmanMutlu
+ AliHürriyetoğlu
+ 124–135
+ The scarcity of data poses a significant challenge in closed-domain event extraction, as is common in complex NLP tasks. This limitation primarily arises from the intricate nature of the annotation process. To address this issue, we present a multi-task model structure and training approach that leverages the additional data, identified as containing no event information at the document and sentence levels, generated during the event annotation process. By incorporating this supplementary data, our proposed framework demonstrates enhanced robustness and, in some scenarios, improved performance. A particularly noteworthy observation is that including only negative documents in addition to the original data contributes to performance enhancement. Our findings offer promising insights into leveraging extra data to mitigate data scarcity challenges in closed-domain event extraction.
+ 2023.case-1.17
+ mutlu-hurriyetoglu-2023-negative
+
+
+ IIC_Team@Multimodal Hate Speech Event Detection 2023: Detection of Hate Speech and Targets using Xlm-Roberta-base
+ KaranpreetSingh
+ VajratiyaVajrobol
+ NitishaAggarwal
+ 136–143
+ Hate speech has emerged as a pressing issue on social media platforms, fueled by the increasing availability of multimodal data and easy internet access. Addressing this problem requires collaborative efforts from researchers, policymakers, and online platforms. In this study, we investigate the detection of hate speech in multimodal data, comprising text-embedded images, by employing advanced deep learning models. The main objective is to identify effective strategies for hate speech detection and content moderation. We conducted experiments using four state-of-the-art classifiers: XLM-RoBERTa-base, BiLSTM, XLNet-base-cased, and ALBERT, on the CrisisHateMM [4] dataset, consisting of over 4700 text-embedded images related to the Russia-Ukraine conflict. The findings reveal that XLM-RoBERTa-base exhibits superior performance, outperforming the other classifiers across all evaluation metrics, including an impressive F1 score of 84.62 for sub-task 1 and 69.73 for sub-task 2. The future scope of this study lies in exploring multimodal approaches to enhance hate speech detection accuracy, integrating ethical considerations to address potential biases, promoting fairness, and safeguarding user rights. Additionally, leveraging larger and more diverse datasets will contribute to developing more robust and generalised hate speech detection solutions.
+ 2023.case-1.18
+ singh-etal-2023-iic
+
+
+ Event Causality Identification - Shared Task 3, CASE 2023
+ Fiona AntingTan
+ HansiHettiarachchi
+ AliHürriyetoğlu
+ NellekeOostdijk
+ OnurUca
+ SurendrabikramThapa
+ Farhana FerdousiLiza
+ 144–150
+ The Event Causality Identification Shared Task of CASE 2023 is the second iteration of a shared task centered around the Causal News Corpus. Two subtasks were involved: In Subtask 1, participants were challenged to predict if a sentence contains a causal relation or not. In Subtask 2, participants were challenged to identify the Cause, Effect, and Signal spans given an input causal sentence. For both subtasks, participants uploaded their predictions for a held-out test set, and ranking was done based on binary F1 and macro F1 scores for Subtask 1 and 2, respectively. This paper includes an overview of the work of the ten teams that submitted their results to our competition and the six system description papers that were received. The highest F1 scores achieved for Subtask 1 and 2 were 84.66% and 72.79%, respectively.
+ 2023.case-1.19
+ tan-etal-2023-event-causality
+
+
+ Multimodal Hate Speech Event Detection - Shared Task 4, CASE 2023
+ SurendrabikramThapa
+ FarhanJafri
+ AliHürriyetoğlu
+ FrancielleVargas
+ Roy Ka-WeiLee
+ UsmanNaseem
+ 151–159
+ Ensuring the moderation of hate speech and its targets emerges as a critical imperative within contemporary digital discourse. To this end, the shared task Multimodal Hate Speech Event Detection was organized in the sixth CASE workshop, co-located with RANLP 2023. The shared task has two subtasks. Sub-task A required participants to pose hate speech detection as a binary problem, i.e., to detect whether a given text-embedded image contained hate speech or not. Similarly, sub-task B required participants to identify the targets of the hate speech, namely individual, community, and organization targets, in text-embedded images. For both sub-tasks, the participants were ranked on the basis of the F1-score. The best F1-scores in sub-task A and sub-task B were 85.65 and 76.34, respectively. This paper provides a comprehensive overview of the performance of the 13 teams that submitted results in Sub-task A and the 10 teams in Sub-task B.
+ 2023.case-1.20
+ thapa-etal-2023-multimodal
+
+
+ Detecting and Geocoding Battle Events from Social Media Messages on the Russo-Ukrainian War: Shared Task 2, CASE 2023
+ HristoTanev
+ NicolasStefanovitch
+ AndrewHalterman
+ OnurUca
+ VanniZavarella
+ AliHurriyetoglu
+ BertrandDe Longueville
+ LeonidaDella Rocca
+ 160–166
+ The purpose of Shared Task 2 at the Challenges and Applications of Automated Extraction of Socio-political Events from Text (CASE) 2023 workshop was to test the abilities of the participating models and systems to detect and geocode armed conflict events in social media messages from Telegram channels reporting on the Russo-Ukrainian war. The evaluation followed an approach introduced in CASE 2021 (Giorgi et al., 2021): for each system, we consider the correlation of the spatio-temporal distribution of its detected events with the events identified for the same period in the ACLED (Armed Conflict Location and Event Data Project) database (Raleigh et al., 2010). We use ACLED as the ground truth, since it is a well-established standard in the field of event extraction and political trend analysis, which relies on human annotators for the encoding of security events using a fine-grained taxonomy. Two systems participated in this shared task; we report in this paper on both the shared task and the participating systems.
+ 2023.case-1.21
+ tanev-etal-2023-detecting
+
+
+ Challenges and Applications of Automated Extraction of Socio-political Events from Text (CASE 2023): Workshop and Shared Task Report
+ AliHürriyetoğlu
+ HristoTanev
+ OsmanMutlu
+ SurendrabikramThapa
+ Fiona AntingTan
+ ErdemYörük
+ 167–175
+ We provide a summary of the sixth edition of the CASE workshop, held in the scope of RANLP 2023. The workshop consists of regular papers, three keynotes, working papers of shared task participants, and shared task overview papers. This workshop series has been bringing together all aspects of event information collection across technical and social science fields. In addition to contributing to progress in text-based event extraction, the workshop provides a space for the organization of a multimodal event information collection task.
+ 2023.case-1.22
+ hurriyetoglu-etal-2023-challenges
+
+
+
diff --git a/data/xml/2023.contents.xml b/data/xml/2023.contents.xml
new file mode 100644
index 0000000000..9ea03de7ec
--- /dev/null
+++ b/data/xml/2023.contents.xml
@@ -0,0 +1,94 @@
+
+
+
+
+ Proceedings of the Workshop on Computational Terminology in NLP and Translation Studies (ConTeNTS) Incorporating the 16th Workshop on Building and Using Comparable Corpora (BUCC)
+ Amal HaddadHaddad
+ Ayla RigoutsTerryn
+ RuslanMitkov
+ ReinhardRapp
+ PierreZweigenbaum
+ SergeSharoff
+ INCOMA Ltd., Shoumen, Bulgaria
+ Varna, Bulgaria
+ September
+ 2023
+ 2023.contents-1
+ contents
+ ws
+
+
+ 2023.contents-1.0
+ contents-2023-terminology
+
+
+ Bilingual Terminology Alignment Using Contextualized Embeddings
+ ImeneSetha
+ HassinaAliane
+ 1–8
+ Terminology alignment faces big challenges in NLP because of the dynamic nature of terms. Fortunately, over the last few years, deep learning models have shown very good progress on several NLP tasks, such as multilingual data resourcing, glossary building, and terminology understanding. In this work, we propose a new method for terminology alignment from a comparable corpus (Arabic/French) for the Algerian culture field. We aim to improve bilingual alignment based on the contextual information of a term and to create a significant term bank, i.e., a bilingual Arabic-French dictionary. We propose to create word embeddings for both Arabic and French using the ELMo model, focusing on the contextual features of terms. Then, we map those embeddings using a Seq2seq model. We use multilingual-BERT and All-MiniLM-L6 as baseline models to compare terminology alignment results. Lastly, we study the performance of these models by applying evaluation methods. Experiments showed quite satisfying alignment results.
+ 2023.contents-1.1
+ setha-aliane-2023-bilingual
+
+
+ Termout: a tool for the semi-automatic creation of term databases
+ RogelioNazar
+ NicolasAcosta
+ 9–18
+ We propose a tool for the semi-automatic production of terminological databases, divided into the steps of corpus processing, terminology extraction, database population, and management. With this tool it is possible to obtain a draft macrostructure (a lemma list) and data for the microstructural level, such as grammatical (morphosyntactic patterns, gender, formation process) and semantic information (hypernyms, equivalence in another language, definitions, and synonyms). In this paper we offer an overall description of the software and an evaluation of its performance, for which we used a linguistics corpus in English and Spanish.
+ 2023.contents-1.2
+ nazar-acosta-2023-termout
+
+
+ Use of NLP Techniques in Translation by ChatGPT: Case Study
+ FeyzaDalayli
+ 19–25
+ Natural Language Processing (NLP) refers to a field of study within the domain of artificial intelligence (AI) and computational linguistics that focuses on the interaction between computers and human language. NLP seeks to develop computational models and algorithms capable of understanding, analyzing, and generating natural language text and speech (Brown et al., 1990). At its core, NLP aims to bridge the gap between human language and machine understanding by employing various techniques from linguistics, computer science, and statistics. It involves the application of linguistic and computational theories to process, interpret, and extract meaningful information from unstructured textual data (Bahdanau, Cho and Bengio, 2015). Researchers and practitioners in NLP employ diverse methodologies, including rule-based approaches, statistical models, machine learning techniques (such as neural networks), and more recently, deep learning architectures. These methodologies enable the development of robust algorithms that can learn from large-scale language data to improve the accuracy and effectiveness of language processing systems (Nilsson, 2010). NLP has numerous real-world applications across various domains, including information retrieval, virtual assistants, chatbots, social media analysis, sentiment monitoring, automated translation services, and healthcare, among others (source). As the field continues to advance, NLP strives to overcome challenges such as understanding the nuances of human language, handling ambiguity, context sensitivity, and incorporating knowledge from diverse sources to enable machines to effectively communicate and interact with humans in a more natural and intuitive manner.
Natural Language Processing (NLP) and translation are interconnected fields that share a symbiotic relationship, as NLP techniques and methodologies greatly contribute to the advancement and effectiveness of machine translation systems. NLP, a subfield of artificial intelligence (AI), focuses on the interaction between computers and human language. It encompasses a wide range of tasks, including text analysis, syntactic and semantic parsing, sentiment analysis, information extraction, and machine translation (Bahdanau, Cho and Bengio, 2014). NMT models employ deep learning architectures, such as recurrent neural networks (RNNs) and, more specifically, long short-term memory (LSTM) networks, to learn the mapping between source and target language sentences. These models are trained on large-scale parallel corpora, consisting of aligned sentence pairs in different languages. The training process involves optimizing model parameters to minimize the discrepancy between predicted translations and human-generated translations (Wu et al., 2016). NLP techniques are crucial at various stages of machine translation. Preprocessing techniques, such as tokenization, sentence segmentation, and morphological analysis, help break down input text into meaningful linguistic units, making it easier for translation models to process and understand the content. Syntactic and semantic parsing techniques aid in capturing the structural and semantic relationships within sentences, improving the overall coherence and accuracy of translations. Furthermore, NLP-based methods are employed for handling specific translation challenges, such as handling idiomatic expressions, resolving lexical ambiguities, and addressing syntactic divergences between languages. For instance, statistical alignment models, based on NLP algorithms, enable the identification of correspondences between words or phrases in source and target languages, facilitating the generation of more accurate translations (source).
Several studies have demonstrated the effectiveness of NLP techniques in enhancing machine translation quality. For example, Bahdanau et al. (2015) introduced the attention mechanism, an NLP technique that enables NMT models to focus on relevant parts of the source sentence during translation. This attention mechanism significantly improved the translation quality of neural machine translation models. ChatGPT is a language model developed by OpenAI that utilizes the principles of Natural Language Processing (NLP) for various tasks, including translations. NLP is a field of artificial intelligence that focuses on the interaction between computers and human language. It encompasses a range of techniques and algorithms for processing, analyzing, and understanding natural language. When it comes to translation, NLP techniques can be applied to facilitate the conversion of text from one language to another. ChatGPT employs a sequence-to-sequence model, a type of neural network architecture commonly used in machine translation tasks. This model takes an input sequence in one language and generates a corresponding output sequence in the target language (OpenAI, 2023). The training process for ChatGPT involves exposing the model to large amounts of multilingual data, allowing it to learn patterns, syntax, and semantic relationships across different languages. This exposure enables the model to develop a general understanding of language structures and meanings, making it capable of performing translation tasks. To enhance translation quality, ChatGPT leverages the Transformer architecture, which has been highly successful in NLP tasks. Transformers utilize attention mechanisms, enabling the model to focus on different parts of the input sequence during the translation process. This attention mechanism allows the model to capture long-range dependencies and improve the overall coherence and accuracy of translations. 
Additionally, techniques such as subword tokenization, which divides words into smaller units, are commonly employed in NLP translation systems like ChatGPT. Subword tokenization helps handle out-of-vocabulary words and improves the model’s ability to handle rare or unknown words (GPT-4 Technical Report, 2023). As can be seen, there have been significant developments in artificial intelligence translation thanks to NLP. However, it is not yet possible to say that it has fully reached the quality of translation produced by people. The ultimate goal of artificial intelligence translation is to reach the quality of translations made by humans. In general, there are some fundamental differences between human and ChatGPT translations. Human-made translations and translations generated by ChatGPT (or similar language models) have several key differences (Kelly and Zetzsche, 2014; Koehn, 2010; Sutskever, Vinyals and Le, 2014; Costa-jussà and Fonollosa, 2018). Translation Quality: Human translators are capable of producing high-quality translations with a deep understanding of both the source and target languages. They can accurately capture the nuances, cultural references, idioms, and context of the original text. On the other hand, ChatGPT translations can sometimes be less accurate or may not fully grasp the intended meaning due to the limitations of the training data and the model’s inability to comprehend context in the same way a human can. While ChatGPT can provide reasonable translations, they may lack the finesse and precision of a human translator. Natural Language Processing: Human translators are skilled at processing and understanding natural language, taking into account the broader context, cultural implications, and the intended audience. They can adapt their translations to suit the target audience, tone, and purpose of the text. ChatGPT, although trained on a vast amount of text data, lacks the same level of natural language understanding.
It often relies on pattern matching and statistical analysis to generate translations, which can result in less nuanced or contextually appropriate outputs. Subject Matter Expertise: Human translators often specialize in specific domains or subject areas, allowing them to have deep knowledge and understanding of technical or specialized terminology. They can accurately translate complex or industry-specific texts, ensuring the meaning is preserved. ChatGPT, while having access to a wide range of general knowledge, may struggle with domain-specific vocabulary or terminology, leading to inaccuracies or incorrect translations in specialized texts. Cultural Sensitivity: Human translators are well-versed in the cultural nuances of both the source and target languages. They can navigate potential pitfalls, adapt the translation to the cultural context, and avoid unintended offensive or inappropriate language choices. ChatGPT lacks this level of cultural sensitivity and may produce translations that are culturally tone-deaf or insensitive, as it lacks the ability to understand the subtleties and implications of language choices. Revision and Editing: Human translators go through an iterative process of revision and editing to refine their translations, ensuring accuracy, clarity, and quality. They can self-correct errors and refine their translations based on feedback or additional research. ChatGPT, while capable of generating translations, does not have the same ability to self-correct or improve based on feedback. It generates translations in a single pass, without the iterative refinement process that humans can employ. In summary, while ChatGPT can be a useful tool for generating translations, human-made translations generally outperform machine-generated translations in terms of quality, accuracy, contextuality, cultural sensitivity, and domain-specific expertise. 
In conclusion, NLP and machine translation are closely intertwined, with NLP providing essential tools, methodologies, and techniques that contribute to the development and improvement of machine translation systems. The integration of NLP methods has led to significant advancements in translation accuracy, fluency, and the ability to handle various linguistic complexities. As NLP continues to evolve, its impact on the field of machine translation is expected to grow, enabling the creation of more sophisticated and context-aware translation systems. On the basis of all this information, this research aims to compare translations from English to Turkish made by ChatGPT, one of the most advanced artificial intelligences, with translations made by humans. In this context, a one-page academic English text was chosen. The text was translated both by ChatGPT and by a translator who is an academic in the field of translation with 10 years of experience. Afterwards, the two different translations were examined comparatively by 5 different translators who are experts in their fields. Semi-structured in-depth interviews were conducted with these translators. The aim of this study is to reveal the role of artificial intelligence tools in translation, which are increasing day by day and suggesting that there will be no need for language learning in the future. On the other hand, many translators argue that artificial intelligence and human translations can be distinguished. Therefore, if artificial intelligence is successful, there will be no profession called translator in the future. This research seems to be very useful in terms of shedding light on the future. The method of this research is the semi-structured in-depth interview. References: Bahdanau, D., Cho, K. and Bengio, Y. (2015). Neural machine translation by jointly learning to align and translate. In International Conference on Learning Representations. Brown, P. F., Cocke, J., Pietra, S. A. D., Pietra, V. J. D., Jelinek, F., Lafferty, J. D., Mercer, R. L., and Roossin, P. S. (1990). A statistical approach to machine translation. Computational Linguistics, 16(2), 79–85. Costa-jussà, M. R., and Fonollosa, J. A. R. (2018). An Overview of Neural Machine Translation. IEEE Transactions on Neural Networks and Learning Systems. GPT-4 Technical Report (2023). https://arxiv.org/abs/2303.08774. Kelly, N. and Zetzsche, J. (2014). Found in Translation: How Language Shapes Our Lives and Transforms the World. USA: Penguin Books. Koehn, P. (2010). Statistical Machine Translation. Cambridge University Press. Nilsson, N. J. (2010). The Quest for AI: A History of Ideas and Achievements. http://ai.stanford.edu/~nilsson/. OpenAI (2023). https://openai.com/blog/chatgpt/. Sutskever, I., Vinyals, O., and Le, Q. V. (2014). Sequence to Sequence Learning with Neural Networks. Advances in Neural Information Processing Systems. Wu, Y., Schuster, M., Chen, Z., Le, Q. V., and Norouzi, M. (2016). Google’s Neural Machine Translation System: Bridging the Gap between Human and Machine Translation. https://arxiv.org/pdf/1609.08144.pdf.
+ 2023.contents-1.3
+ dalayli-2023-use
+
+
+ On the Evaluation of Terminology Translation Errors in NMT and PB-SMT in the Legal Domain: a Study on the Translation of Arabic Legal Documents into English and French
+ KhadijaAit ElFqih
+ JohannaMonti
+ 26–35
+ In the translation process, terminological resources are used to solve translation problems, so information on terminological equivalence is crucial to making the most appropriate choices in terms of translation equivalence. In the context of machine translation, neural models have considerably improved the state of the art in recent years. However, they still underperform in domain-specific fields and for under-resourced languages. This is particularly evident in the translation of legal terminology for Arabic, where current machine translation outputs do not adhere to the contextual, linguistic, cultural, and terminological constraints posed by translating legal terms in Arabic. In this paper, we conduct a comparative qualitative evaluation and comprehensive error analysis of legal terminology translation in Phrase-Based Statistical Machine Translation and Neural Machine Translation for two translation language pairs: Arabic-English and Arabic-French. We propose an error typology that takes the translation of legal terminology from Arabic into account. We present our findings, highlighting the strengths and weaknesses of both approaches in the area of legal terminology translation for Arabic. We also introduce a multilingual gold standard dataset that we developed using our Arabic legal corpus. This dataset serves as a reliable benchmark and/or reference during the evaluation process to decide the degree of adequacy and fluency of the Phrase-Based Statistical Machine Translation and Neural Machine Translation systems.
+ 2023.contents-1.4
+ ait-elfqih-monti-2023-evaluation
+
+
+ Automatic Student Answer Assessment using LSA
+ TeodoraMihajlov
+ 36–44
+ Implementing technology in a modern-day classroom is an ongoing challenge. In this paper, we created a system for the automatic assessment of student answers using Latent Semantic Analysis (LSA) – a method with the underlying assumption that words with similar meanings will appear in the same contexts. The system will be used within digital lexical flash-cards for L2 vocabulary acquisition in a CLIL classroom. Results presented in this paper indicate that while LSA does well in creating semantic spaces for longer texts, it somewhat struggles with detecting topics in short texts. After obtaining the LSA semantic spaces, answer accuracy was assessed by calculating the cosine similarity between a student’s answer and the gold standard. The answers were classified by accuracy using KNN, for both binary and multinomial classification. The results of KNN classification are as follows: precision P = 0.73, recall R = 1.00, F1 = 0.85 for binary classification, and P = 0.50, R = 0.47, F1 = 0.46 for the multinomial classifier. The results are to be taken with a grain of salt, due to the small test and training datasets.
+ 2023.contents-1.5
+ mihajlov-2023-automatic
+
+
+ Semantic Specifics of Bulgarian Verbal Computer Terms
+ MariaTodorova
+ 45–50
+ This paper presents a description of Bulgarian verbal computer terms with a view to the specifics of their translation into English. The study employs a subset of 100 verbs extracted from the Bulgarian WordNet (BulNet) and from the internet. The analysis of their syntactic and semantic structure is part of a study of the general lexis of Bulgarian. The aim of the paper is to (1) identify some problem areas in the description and translation of general lexis verbs; (2) offer an approach to the semantic description of metaphor-based terms from the perspective of Frame Semantics; and (3) raise questions about the definition of general lexis with respect to Bulgarian and across languages.
+ 2023.contents-1.6
+ todorova-2023-semantic
+
+
+ BanMANI: A Dataset to Identify Manipulated Social Media News in Bangla
+ MahammedKamruzzaman
+ Md. Minul IslamShovon
+ GeneKim
+ 51–58
+ Initial work has been done to address fake news detection and misrepresentation of news in the Bengali language. However, no work in Bengali yet addresses the identification of specific claims in social media news that falsely manipulate a related news article. At this point, this problem has been tackled in English and a few other languages, but not in the Bengali language. In this paper, we curate a dataset of social media content labeled with information manipulation relative to reference articles, called BanMANI. The dataset collection method we describe works around the limitations of the available NLP tools in Bangla. We expect these techniques will carry over to building similar datasets in other low-resource languages. BanMANI forms the basis both for evaluating the capabilities of existing NLP systems and for training or fine-tuning new models specifically on this task. In our analysis, we find that this task challenges current LLMs both under zero-shot and fine-tuned settings.
+ 2023.contents-1.7
+ kamruzzaman-etal-2023-banmani
+
+
+ Supervised Feature-based Classification Approach to Bilingual Lexicon Induction from Specialised Comparable Corpora
+ AylaRigouts Terryn
+ 59–68
+ This study, submitted to the BUCC2023 shared task on bilingual term alignment in comparable specialised corpora, introduces a supervised, feature-based classification approach. The approach employs both static cross-lingual embeddings and contextual multilingual embeddings, combined with surface-level indicators such as Levenshtein distance and term length, as well as linguistic information. Results exhibit improved performance over previous methodologies, illustrating the merit of integrating diverse features. However, the error analysis also reveals remaining challenges.
+ 2023.contents-1.8
+ rigouts-terryn-2023-supervised
+
+
+
diff --git a/data/xml/2023.dravidianlangtech.xml b/data/xml/2023.dravidianlangtech.xml
new file mode 100644
index 0000000000..4b2e819480
--- /dev/null
+++ b/data/xml/2023.dravidianlangtech.xml
@@ -0,0 +1,536 @@
+
+
+
+
+ Proceedings of the Third Workshop on Speech and Language Technologies for Dravidian Languages
+ Bharathi R.Chakravarthi
+ RubaPriyadharshini
+ Anand KumarM
+ SajeethaThavareesan
+ ElizabethSherly
+ INCOMA Ltd., Shoumen, Bulgaria
+ Varna, Bulgaria
+ September
+ 2023
+ 2023.dravidianlangtech-1
+ dravidianlangtech
+ ws
+
+
+ 2023.dravidianlangtech-1.0
+ dravidianlangtech-2023-speech
+
+
+ On the Errors in Code-Mixed Tamil-English Offensive Span Identification
+ ManikandanRavikiran
+ Bharathi RajaChakravarthi
+ 1–9
+ In recent times, offensive span identification in code-mixed Tamil-English language has seen traction with the release of datasets, shared tasks, and the development of multiple methods. However, the details of various errors shown by these methods are currently unclear. This paper presents a detailed analysis of various errors in state-of-the-art Tamil-English offensive span identification methods. Our study reveals the strengths and weaknesses of the widely used sequence labeling and zero-shot models for offensive span identification. In the due process, we identify data-related errors, improve data annotation and release additional diagnostic data to evaluate models’ quality and stability. Disclaimer: This paper contains examples that may be considered profane, vulgar, or offensive. The examples do not represent the views of the authors or their employers/graduate schools towards any person(s), group(s), practice(s), or entity/entities. Instead, they emphasize the complexity of various errors and linguistic research challenges.
+
+ 2023.dravidianlangtech-1.1
+ ravikiran-chakravarthi-2023-errors
+
+
+ Hate and Offensive Keyword Extraction from CodeMix Malayalam Social Media Text Using Contextual Embedding
+ MariyaRaphel
+ PremjithB
+ SreelakshmiK
+ Bharathi RajaChakravarthi
+ 10–18
+ This paper focuses on identifying hate and offensive keywords from codemix Malayalam social media text. As part of this work, a dataset for hate and offensive keyword extraction for codemix Malayalam language was created. Two different methods were experimented with to extract Hate and Offensive language (HOL) keywords from social media text. In the first method, intrinsic evaluation was performed on the dataset to identify the hate and offensive keywords. Three different approaches namely – unigram approach, bigram approach and trigram approach were performed to extract the HOL keywords, sequence of HOL words and the sequence that contribute HOL meaning even in the absence of a HOL word. Five different transformer models were used in each of the approaches for extracting the embeddings for the ngrams. Later, HOL keywords were extracted based on the similarity score obtained using the cosine similarity. Out of the five transformer models, the best results were obtained with multilingual BERT. In the second method, the multilingual BERT transformer model was fine-tuned with the dataset to develop a HOL keyword tagger model. This work is a new beginning for HOL keyword identification in Dravidian language – Malayalam.
+ 2023.dravidianlangtech-1.2
+ raphel-etal-2023-hate
+
+
+ Acoustic Analysis of the Fifth Liquid in Malayalam
+ PunnooseA K
+ 19–24
+ This paper investigates the claim of rhoticity of the fifth liquid in Malayalam using various acoustic characteristics. The Malayalam liquid phonemes are analyzed in terms of the smoothness of the pitch window, formants, formant bandwidth, the effect on surrounding vowels, duration, and classification patterns by an unrelated classifier. We report, for the fifth liquid, a slight similarity in terms of pitch smoothness with one of the laterals, similarity with the laterals in terms of F1 for males, and similarity with the laterals and one of the rhotics in terms of F1 for females. The similarity in terms of formant bandwidth between the fifth liquid and the other liquids is inconclusive. Similarly, the effect of the fifth liquid on the surrounding vowels is inconclusive. No similarity is observed between the fifth liquid and the other liquids in phoneme duration. Classification of the fifth liquid section implies higher order signal level similarity with both laterals and rhotics.
+ 2023.dravidianlangtech-1.3
+ a-k-2023-acoustic
+
+
+ Transformer-based Context Aware Morphological Analyzer for Telugu
+ PriyankaDasari
+ AbhijithChelpuri
+ NagarajuVuppala
+ MounikaMarreddy
+ ParameshwariKrishnamurthy
+ RadhikaMamidi
+ 25–32
+ This paper addresses the challenges faced by Indian languages in leveraging deep learning for natural language processing (NLP) due to limited resources, annotated datasets, and Transformer-based architectures. We specifically focus on Telugu and aim to construct a Telugu morph analyzer dataset comprising 10,000 sentences. Furthermore, we assess the performance of established multi-lingual Transformer models (m-Bert, XLM-R, IndicBERT) and mono-lingual Transformer models trained from scratch on an extensive Telugu corpus comprising 80,15,588 sentences (BERT-Te). Our findings demonstrate the efficacy of Transformer-based representations pretrained on Telugu data in improving the performance of the Telugu morph analyzer, surpassing existing multi-lingual approaches. This highlights the necessity of developing dedicated corpora, annotated datasets, and machine learning models in a mono-lingual setting. We present benchmark results for the Telugu morph analyzer achieved through simple fine-tuning on our dataset.
+ 2023.dravidianlangtech-1.4
+ dasari-etal-2023-transformer
+
+
+ Improving Reinforcement Learning Agent Training using Text based Guidance: A study using Commands in Dravidian Languages
+ Nikhil ChowdaryPaleti
+ Sai AravindVadlapudi
+ Sai AashishMenta
+ Sai AkshayMenta
+ Vishnu VardhanGorantla V N S L
+ JanakiramChandu
+ SomanK P
+ Sachin KumarS
+ 33–42
+ Reinforcement learning (RL) agents have achieved remarkable success in various domains, such as game-playing and protein structure prediction. However, most RL agents rely on exploration to find optimal solutions without explicit guidance. This paper proposes a methodology for training RL agents using text-based instructions in Dravidian languages, including Telugu, Tamil, and Malayalam, along with English. The agents are trained in a modified Lunar Lander environment, where they must follow specific paths to successfully land the lander. The methodology involves collecting a dataset of human demonstrations and textual instructions, encoding the instructions into numerical representations using text-based embeddings, and training RL agents using state-of-the-art algorithms. The results demonstrate that the trained Soft Actor-Critic (SAC) agent can effectively understand and generalize instructions in different languages, outperforming other RL algorithms such as Proximal Policy Optimization (PPO) and Deep Deterministic Policy Gradient (DDPG).
+ 2023.dravidianlangtech-1.5
+ paleti-etal-2023-improving
+
+
+ Social Media Data Analysis for Malayalam YouTube Comments: Sentiment Analysis and Emotion Detection using ML and DL Models
+ AbeeraV P
+ Dr. SachinKumar
+ Dr. SomanK P
+ 43–51
+ In this paper, we present a study on social media data analysis of Malayalam YouTube comments, specifically focusing on sentiment analysis and emotion detection. Our research aims to investigate the effectiveness of various machine learning (ML) and deep learning (DL) models in addressing these two tasks. For sentiment analysis, we collected a dataset consisting of 3064 comments, while for two-class emotion detection, we used a dataset of 817 comments. In the sentiment analysis phase, we explored multiple ML and DL models, including traditional algorithms such as Support Vector Machines (SVM), Naïve Bayes, K-Nearest Neighbors (KNN), MLP Classifier, Decision Tree, and Random Forests. Additionally, we utilized DL models such as Recurrent Neural Networks (RNN), LSTM, and GRU. To enhance the performance of these models, we preprocessed the Malayalam YouTube comments by tokenizing and removing stop words. Experimental results revealed that DL models achieved higher accuracy compared to ML models, indicating their ability to capture the complex patterns and nuances in the Malayalam language. Furthermore, we extended our analysis to emotion detection, which involved dealing with limited annotated data. This task is closely related to social media data analysis. For emotion detection, we employed the same ML models used in the sentiment analysis phase. Our dataset of 817 comments was annotated with two emotions: Happy and Sad. We trained the models to classify the comments into these emotion classes and analyzed the accuracy of the different models.
+ 2023.dravidianlangtech-1.6
+ v-p-etal-2023-social
+
+
+ Findings of the Second Shared Task on Offensive Span Identification from Code-Mixed Tamil-English Comments
+ ManikandanRavikiran
+ AnanthGanesh
+ Anand KumarM
+ RRajalakshmi
+ Bharathi RajaChakravarthi
+ 52–58
+ Maintaining effective control over offensive content is essential on social media platforms to foster constructive online discussions. Yet, when it comes to code-mixed Dravidian languages, the current prevalence of offensive content moderation is restricted to categorizing entire comments, failing to identify specific portions that contribute to the offensiveness. Such limitation is primarily due to the lack of annotated data and open source systems for offensive spans. To alleviate this issue, in this shared task, we offer a collection of Tamil-English code-mixed social comments that include offensive comments. This paper provides an overview of the released dataset, the algorithms employed, and the outcomes achieved by the systems submitted for this task.
+ 2023.dravidianlangtech-1.7
+ ravikiran-etal-2023-findings
+
+
+ Overview of the shared task on Fake News Detection from Social Media Text
+ MalligaS
+ Bharathi RajaChakravarthi
+ KogilavaniS V
+ SanthiyaPandiyan
+ Prasanna KumarKumaresan
+ BalasubramanianPalani
+ MuskaanSingh
+ 59–63
+ This document contains the instructions for preparing a manuscript for the proceedings of RANLP 2023. The document itself conforms to its own specifications and is therefore an example of what your manuscript should look like. These instructions should be used for both papers submitted for review and for final versions of accepted papers. Authors are asked to conform to all the directions reported in this document.
+ 2023.dravidianlangtech-1.8
+ s-etal-2023-overview
+
+
+ Findings of the Shared Task on Sentiment Analysis in Tamil and Tulu Code-Mixed Text
+ AshaHegde
+ Bharathi RajaChakravarthi
+ Hosahalli LakshmaiahShashirekha
+ RahulPonnusamy
+ SubalalithaCn
+ LavanyaS K
+ ThenmozhiD.
+ MarthaKarunakar
+ ShreyaShreeram
+ SarahAymen
+ 64–71
+ In recent years, there has been a growing focus on Sentiment Analysis (SA) of code-mixed Dravidian languages. However, the majority of social media text in these languages is code-mixed, presenting a unique challenge. Despite this, there is currently a lack of research on SA specifically tailored for code-mixed Dravidian languages, highlighting the need for further exploration and development in this domain. In this view, the “Sentiment Analysis in Tamil and Tulu - DravidianLangTech” shared task at Recent Advances in Natural Language Processing (RANLP) 2023 is organized. This shared task consists of two language tracks, code-mixed Tamil and Tulu, and Tulu text is explored for SA in the public domain for the first time. We describe the task, its organization, and the submitted systems, followed by the results. 57 research teams registered for the shared task, and we received 27 systems each for code-mixed Tamil and Tulu texts. The performance of the systems (developed by participants) has been evaluated in terms of macro average F1 score. The top systems for code-mixed Tamil and Tulu texts scored macro average F1 scores of 0.32 and 0.542, respectively. The high quality and substantial quantity of submissions demonstrate significant interest and attention in the analysis of code-mixed Dravidian languages. However, the current state of the art in this domain indicates the need for further advancements and improvements to effectively address the challenges posed by code-mixed Dravidian language SA.
+ 2023.dravidianlangtech-1.9
+ hegde-etal-2023-findings
+
+
+ Findings of the Shared Task on Multimodal Abusive Language Detection and Sentiment Analysis in Tamil and Malayalam
+ PremjithB
+ Jyothish LalG
+ SowmyaV
+ Bharathi RajaChakravarthi
+ RajeswariNatarajan
+ NandhiniK
+ AbiramiMurugappan
+ BharathiB
+ KaushikM
+ PrasanthSn
+ Aswin RajR
+ Vijai SimmonS
+ 72–79
+ This paper summarizes the shared task on multimodal abusive language detection and sentiment analysis in Dravidian languages as part of the third Workshop on Speech and Language Technologies for Dravidian Languages at RANLP 2023. This shared task provides a platform for researchers worldwide to submit their models on two crucial social media data analysis problems in Dravidian languages - abusive language detection and sentiment analysis. Abusive language detection identifies social media content with abusive information, whereas sentiment analysis refers to the problem of determining the sentiments expressed in a text. This task aims to build models for detecting abusive content and analyzing fine-grained sentiment from multimodal data in Tamil and Malayalam. The multimodal data consists of three modalities - video, audio and text. The datasets for both tasks were prepared by collecting videos from YouTube. Sixty teams participated in both tasks. However, only two teams submitted their results. The submissions were evaluated using macro F1-score.
+ 2023.dravidianlangtech-1.10
+ b-etal-2023-findings
+
+
+ Overview of Shared-task on Abusive Comment Detection in Tamil and Telugu
+ RubaPriyadharshini
+ Bharathi RajaChakravarthi
+ MalligaS
+ SubalalithaCn
+ KogilavaniS V
+ PremjithB
+ AbiramiMurugappan
+ Prasanna KumarKumaresan
+ 80–87
+ This paper discusses the submissions to the shared task on abusive comment detection in Tamil and Telugu code-mixed social media text conducted as part of the third Workshop on Speech and Language Technologies for Dravidian Languages at RANLP 2023. The task encourages researchers to develop models to detect contents containing abusive information in Tamil and Telugu code-mixed social media text. The task has three subtasks - abusive comment detection in Tamil, Tamil-English and Telugu-English. The dataset for all the tasks was developed by collecting comments from YouTube. The submitted models were evaluated using macro F1-score, and the rank list was prepared accordingly.
+ 2023.dravidianlangtech-1.11
+ priyadharshini-etal-2023-overview
+
+
+ CoPara: The First Dravidian Paragraph-level n-way Aligned Corpus
+ NikhilE
+ MukundChoudhary
+ RadhikaMamidi
+ 88–96
+ We present CoPara, the first publicly available paragraph-level (n-way aligned) multilingual parallel corpora for Dravidian languages. The collection contains 2856 paragraph/passage pairs between English and four Dravidian languages. We source the parallel paragraphs from the New India Samachar magazine and align them with English as a pivot language. We do human and artificial evaluations to validate the high-quality alignment and richness of the parallel paragraphs of a range of lengths. To show one of the many ways this dataset can be wielded, we finetuned IndicBART, a seq2seq NMT model, on all XX-En language pairs in CoPara, which performs better than existing sentence-level models on standard metrics (like BLEU) for both sentence-level translations and longer text. We show how this dataset can enrich a model trained for a task like this with more contextual cues and beyond-sentence understanding, even in low-resource settings like that of Dravidian languages. Finally, the dataset and models are made available publicly at CoPara to help advance research in Dravidian NLP, parallel multilingual corpora, and beyond-sentence-level tasks like NMT.
+ 2023.dravidianlangtech-1.12
+ e-etal-2023-copara
+
+
+ ChatGPT-Powered Tourist Aid Applications: Proficient in Hindi, Yet To Master Telugu and Kannada
+ SanjanaKolar
+ RohitKumar
+ 97–107
+ This research investigates the effectiveness of ChatGPT, an AI language model by OpenAI, in translating English into Hindi, Telugu, and Kannada languages, aimed at assisting tourists in India’s linguistically diverse environment. To measure the translation quality, a test set of 50 questions from diverse fields such as general knowledge, food, and travel was used. These were assessed by five volunteers for accuracy and fluency, and the scores were subsequently converted into a BLEU score. The BLEU score evaluates the closeness of a machine-generated translation to a human translation, with a higher score indicating better translation quality. The Hindi translations outperformed others, showcasing superior accuracy and fluency, whereas Telugu translations lagged behind. Human evaluators rated both the accuracy and fluency of translations, offering a comprehensive perspective on the language model’s performance.
+ 2023.dravidianlangtech-1.13
+ kolar-kumar-2023-chatgpt
+
+
+ Enhancing Telugu News Understanding: Comparative Study of ML Algorithms for Category Prediction
+ Manish Rama GopalNadella
+ Venkata Krishna RayaluGarapati
+ Eswar SudhanS.k.
+ GouthamiJangala
+ SomanK.p.
+ SachinKumar
+ 108–115
+ As one of the most extensively used languages in India, Telugu has a sizable audience and a huge library of news articles. Predicting the categories of Telugu news items not only helps with efficient organization but also makes it possible to do trend research, advertise to a certain demographic, and provide individualized recommendations. In order to identify the most effective method for accurate Telugu news category prediction, this study compares and contrasts various machine learning (ML) techniques, including support vector machines (SVM), random forests, and naive Bayes. Accuracy, precision, recall, and F1-score will be utilized as performance indicators to gauge how well these algorithms perform. The outcomes of this comparative analysis will address the particular difficulties and complexities of the Telugu language and add to the body of knowledge on news category prediction. For Telugu-speaking consumers, the study intends to improve news organization and recommendation systems, giving them more relevant and customized news consumption experiences. Our results emphasize that, although other models can be taken into account for further research and comparison, W2Vec skip-gram with polynomial SVM is the best-performing combination.
+ 2023.dravidianlangtech-1.14
+ nadella-etal-2023-enhancing
+
+
+ Revisiting Automatic Speech Recognition for Tamil and Hindi Connected Number Recognition
+ RahulMishra
+ Senthil RajaGunaseela Boopathy
+ ManikandanRavikiran
+ ShreyasKulkarni
+ MayurakshiMukherjee
+ AnanthGanesh
+ KingshukBanerjee
+ 116–123
+ Automatic Speech Recognition and its applications are rising in popularity across domains, with reasonable inference results. Recent state-of-the-art approaches often employ significantly large-scale models to show high accuracy for ASR as a whole, but often do not provide a detailed analysis of performance across low-resource language applications. In this preliminary work, we propose to revisit ASR in the context of Connected Number Recognition (CNR). More specifically, we (i) present a new dataset HCNR collected to understand various errors of ASR models for CNR, (ii) establish a preliminary benchmark and baseline model for CNR, (iii) explore error mitigation strategies and their after-effects on CNR. In the process, we also compare with end-to-end large-scale ASR models for reference, to show its effectiveness.
+ 2023.dravidianlangtech-1.15
+ mishra-etal-2023-revisiting
+
+
+ Poorvi@DravidianLangTech: Sentiment Analysis on Code-Mixed Tulu and Tamil Corpus
+ PoorviShetty
+ 124–132
+ Sentiment analysis in code-mixed languages poses significant challenges, particularly for highly under-resourced languages such as Tulu and Tamil. Existing corpora, primarily sourced from YouTube comments, suffer from class imbalance across sentiment categories. Moreover, the limited number of samples in these corpora hampers effective sentiment classification. This study introduces a new corpus tailored for sentiment analysis in Tulu code-mixed texts. The research applies standard pre-processing techniques to ensure data quality and consistency and handle class imbalance. Subsequently, multiple classifiers are employed to analyze the sentiment of the code-mixed texts, yielding promising results. By leveraging the new corpus, the study contributes to advancing sentiment analysis techniques in under-resourced code-mixed languages. This work serves as a stepping stone towards better understanding and addressing the challenges posed by sentiment analysis in highly under-resourced languages.
+ 2023.dravidianlangtech-1.16
+ shetty-2023-poorvi
+
+
+ NLP_SSN_CSE@DravidianLangTech: Fake News Detection in Dravidian Languages using Transformer Models
+ VarshaBalaji
+ Shahul HameedT
+ BharathiB
+ 133–139
+ The proposed system procures a systematic workflow in fake news identification utilizing machine learning classification in order to recognize and distinguish between real and made-up news. Using the Natural Language Toolkit (NLTK), the procedure starts with data preprocessing, which includes operations like text cleaning, tokenization, and stemming. This guarantees that the data is translated into an analytically-ready format. The preprocessed data is subsequently supplied into transformer models like M-BERT, ALBERT, XLNet, and BERT. By utilizing their extensive training on substantial datasets to identify complex patterns and significant traits that discriminate between authentic and false news pieces, these transformer models excel at capturing contextual information. The most successful model among those used is M-BERT, which boasts an astounding F1 score of 0.74. This supports M-BERT’s supremacy over its competitors in the field of fake news identification, outperforming them in terms of performance. The program can draw more precise conclusions and more effectively counteract the spread of false information because of its comprehension of contextual nuance. Organizations and platforms can strengthen their fake news detection systems and their attempts to stop the spread of false information by utilizing M-BERT’s capabilities.
+ 2023.dravidianlangtech-1.17
+ balaji-etal-2023-nlp
+
+
+ AbhiPaw@DravidianLangTech: Multimodal Abusive Language Detection and Sentiment Analysis
+ AbhinabaBala
+ ParameswariKrishnamurthy
+ 140–146
+ Detecting abusive language in multimodal videos has become a pressing need in ensuring a safe and inclusive online environment. This paper focuses on addressing this challenge through the development of a novel approach for multimodal abusive language detection in Tamil videos and sentiment analysis for Tamil/Malayalam videos. By leveraging state-of-the-art models such as Multiscale Vision Transformers (MViT) for video analysis, OpenL3 for audio analysis, and the bert-base-multilingual-cased model for textual analysis, our proposed framework integrates visual, auditory, and textual features. Through extensive experiments and evaluations, we demonstrate the effectiveness of our model in accurately detecting abusive content and predicting sentiment categories. The limited availability of effective tools for performing these tasks in Dravidian Languages has prompted a new avenue of research in these domains.
+ 2023.dravidianlangtech-1.18
+ bala-krishnamurthy-2023-abhipaw
+
+
+ Athena@DravidianLangTech: Abusive Comment Detection in Code-Mixed Languages using Machine Learning Techniques
+ HemaM
+ AnzaPrem
+ RajalakshmiSivanaiah
+ Angel DeborahS
+ 147–151
+ The amount of digital material that is disseminated through various social media platforms has significantly increased in recent years. Online networks have gained popularity and have established themselves as go-to resources for news, information, and entertainment. Nevertheless, despite the many advantages of using online networks, mounting evidence indicates that an increasing number of malicious actors are taking advantage of these networks to spread poison and hurt other people. This work aims to detect abusive content in YouTube comments written in Tamil, Tamil-English (code-mixed), and Telugu-English (code-mixed). This work was undertaken as part of the “DravidianLangTech@RANLP 2023” shared task. The macro F1 values for the Tamil, Tamil-English, and Telugu-English datasets were 0.28, 0.37, and 0.6137, securing 5th, 7th, and 8th rank respectively.
+ 2023.dravidianlangtech-1.19
+ m-etal-2023-athena
+
+
+ AlphaBrains@DravidianLangTech: Sentiment Analysis of Code-Mixed Tamil and Tulu by Training Contextualized ELMo Word Representations
+ ToqeerEhsan
+ AminaTehseen
+ KengatharaiyerSarveswaran
+ AmjadAli
+ 152–159
+ Sentiment analysis in natural language processing (NLP) endeavors to computationally identify and extract subjective information from textual data. In code-mixed text, sentiment analysis presents a unique challenge due to the mixing of languages within a single textual context. For low-resourced languages such as Tamil and Tulu, predicting sentiment becomes a challenging task due to the presence of text comprising various scripts. In this research, we present sentiment analysis of code-mixed Tamil and Tulu YouTube comments. We developed Bidirectional Long Short-Term Memory (BiLSTM) network-based models for both languages, which use contextualized word embeddings at their input layers. For that purpose, ELMo embeddings were trained on larger unannotated code-mixed text corpora. Our models performed with macro average F1-scores of 0.2877 and 0.5133 on the Tamil and Tulu code-mixed datasets, respectively.
+ 2023.dravidianlangtech-1.20
+ ehsan-etal-2023-alphabrains
+
+
+ HARMONY@DravidianLangTech: Transformer-based Ensemble Learning for Abusive Comment Detection
+ AmrishRaaj P
+ AbiramiMurugappan
+ LysaPackiam R S
+ DeivamaniM
+ 160–165
+ Millions of posts and comments are created every minute as a result of the widespread use of social media and easy access to the internet. It is essential to create an inclusive environment and forbid the use of abusive language against any individual or group of individuals. This paper describes the approach of team HARMONY for the “Abusive Comment Detection” shared task at the Third Workshop on Speech and Language Technologies for Dravidian Languages. A Transformer-based ensemble learning approach is proposed for detecting abusive comments in code-mixed (Tamil-English) language and Tamil language. The proposed architecture achieved rank 2 in the Tamil text classification subtask and rank 3 in the code-mixed text classification subtask, with macro-F1 scores of 0.41 for Tamil and 0.50 for code-mixed data.
+ 2023.dravidianlangtech-1.21
+ raaj-p-etal-2023-harmony
+
+
+ Avalanche at DravidianLangTech: Abusive Comment Detection in Code Mixed Data Using Machine Learning Techniques with Under Sampling
+ RajalakshmiSivanaiah
+ RajasekarS
+ SrilakshmisaiK
+ Angel DeborahS
+ MirnalineeThankaNadar
+ 166–170
+ In recent years, the growth of online platforms and social media has given rise to a concerning increase in the presence of abusive content. This poses significant challenges for maintaining a safe and inclusive digital environment. In order to resolve this issue, this paper experiments with an approach for detecting abusive comments. We use a combination of pipelining and vectorization techniques, along with algorithms such as the stochastic gradient descent (SGD) classifier and support vector machine (SVM) classifier. We conducted experiments on a Tamil-English code-mixed dataset to evaluate the performance of this approach. Using the stochastic gradient descent classifier algorithm, we achieved a weighted F1 score of 0.76 and a macro score of 0.45 on the development dataset. Furthermore, by using the support vector machine classifier algorithm, we obtained a weighted F1 score of 0.78 and a macro score of 0.42 on the development dataset. With the test dataset, the SGD approach secured 5th rank with a 0.44 macro F1 score, while SVM scored 8th rank with a 0.35 macro F1 score in the shared task. The top-ranked team secured a 0.55 macro F1 score.
+ 2023.dravidianlangtech-1.22
+ sivanaiah-etal-2023-avalanche
+
+
+ DeepBlueAI@DravidianLangTech-RANLP 2023
+ ZhipengLuo
+ JiahuiWang
+ 171–175
+ This paper presents a study on the language understanding of the Dravidian languages. Three specific tasks related to text classification are focused on in this study, including abusive comment detection, sentiment analysis and fake news detection. The paper provides a detailed description of the tasks, including dataset information and task definitions, as well as the model architectures and training details used to tackle them. Finally, the competition results are presented, demonstrating the effectiveness of the proposed approach for handling these challenging NLP tasks in the context of the Dravidian languages.
+ 2023.dravidianlangtech-1.23
+ luo-wang-2023-deepblueai
+
+
+ Selam@DravidianLangTech:Sentiment Analysis of Code-Mixed Dravidian Texts using SVM Classification
+ SelamKanta
+ GrigoriSidorov
+ 176–179
+ This paper addresses sentiment analysis in code-mixed text written in Dravidian languages, specifically Tamil-English and Tulu-English, and describes our system for the RANLP-2023 shared task. The goal of this shared task is to develop systems that accurately classify the sentiment polarity of code-mixed comments and posts. Participants were provided with development, training, and test data sets containing code-mixed text in Tamil-English and Tulu-English. The task involves message-level polarity classification: classifying YouTube comments into positive, negative, neutral, or mixed emotions. The code-mixed data was compiled by the RANLP-2023 organizers from posts on social media. We use SVM classification and achieve an F1 score of 0.147 for Tamil-English and 0.518 for Tulu-English.
+ 2023.dravidianlangtech-1.24
+ kanta-sidorov-2023-selam
+
+
+ LIDOMA@DravidianLangTech: Convolutional Neural Networks for Studying Correlation Between Lexical Features and Sentiment Polarity in Tamil and Tulu Languages
+ MoeinTash
+ JesusArmenta-Segura
+ ZahraAhani
+ OlgaKolesnikova
+ GrigoriSidorov
+ AlexanderGelbukh
+ 180–185
+ With the prevalence of code-mixing among speakers of Dravidian languages, DravidianLangTech proposed the shared task on Sentiment Analysis in Tamil and Tulu at RANLP 2023. This paper presents the submission of LIDOMA, which proposes a methodology that combines lexical features and Convolutional Neural Networks (CNNs) to address the challenge. A fine-tuned 6-layered CNN model is employed, achieving macro F1 scores of 0.542 and 0.199 for Tulu and Tamil, respectively.
+ 2023.dravidianlangtech-1.25
+ tash-etal-2023-lidoma
+
+
+ nlpt malayalm@DravidianLangTech : Fake News Detection in Malayalam using Optimized XLM-RoBERTa Model
+ EduriRaja
+ BadalSoni
+ Sami KumarBorgohain
+ 186–191
+ The paper demonstrates the submission of the team nlpt_malayalm to the Fake News Detection in Dravidian Languages-DravidianLangTech@LT-EDI-2023. The rapid dissemination of fake news and misinformation in today’s digital age poses significant societal challenges. This research paper addresses the issue of fake news detection in the Malayalam language by proposing a novel approach based on the XLM-RoBERTa base model. The objective is to develop an effective classification model that accurately differentiates between genuine and fake news articles in Malayalam. The XLM-RoBERTa base model, known for its multilingual capabilities, is fine-tuned using the prepared dataset to adapt it specifically to the nuances of the Malayalam language. A thorough analysis is also performed to identify any biases or limitations in the model’s performance. The results demonstrate that the proposed model achieves a remarkable macro-averaged F-Score of 87% in the Malayalam fake news dataset, ranking 2nd on the respective task. This indicates its high accuracy and reliability in distinguishing between real and fake news in Malayalam.
+ 2023.dravidianlangtech-1.26
+ raja-etal-2023-nlpt
+
+
+ ML&AI_IIITRanchi@DravidianLangTech: Fine-Tuning IndicBERT for Exploring Language-specific Features for Sentiment Classification in Code-Mixed Dravidian Languages
+ KirtiKumari
+ Shirish ShekharJha
+ Zarikunte KunalDayanand
+ PraneeshSharma
+ 192–197
+ Code-mixing presents challenges to sentiment analysis due to the limited availability of annotated data for low-resource languages such as Tulu. To address this issue, comprehensive work was done in creating a gold-standard labeled corpus that incorporates both languages while facilitating accurate analysis of the sentiments involved. This research employed varied techniques including data collection, cleaning, and preprocessing, leading up to effective annotation, with results obtained by fine-tuning IndicBERT and performing experiments with TF-IDF plus bag-of-words features. The outcome is an invaluable resource for developing custom-tailored models for analyzing sentiment in code-mixed texts across the Tamil and Tulu domains, allowing focused insight into what makes up such expressions. Remarkably, the adoption of hybrid models yielded promising outcomes, culminating in a 10th-rank achievement for Tulu and a 14th-rank achievement for Tamil, supported by macro F1 scores of 0.471 and 0.124 respectively.
+ 2023.dravidianlangtech-1.27
+ kumari-etal-2023-ml
+
+
+ ML&AI_IIITRanchi@DravidianLangTech:Leveraging Transfer Learning for the discernment of Fake News within the Linguistic Domain of Dravidian Language
+ KirtiKumari
+ Shirish ShekharJha
+ Zarikunte KunalDayanand
+ PraneeshSharma
+ 198–206
+ The primary focus of this research endeavor lies in detecting and mitigating misinformation within the intricate framework of the Dravidian language. A notable feat was achieved by employing fine-tuning methodologies on the highly acclaimed Indic BERT model, securing a commendable fourth rank in a prestigious competition organized by DravidianLangTech 2023 while attaining a noteworthy macro F1-Score of 0.78. To facilitate this undertaking, a diverse and comprehensive dataset was meticulously gathered from prominent social media platforms, including but not limited to Facebook and Twitter. The overarching objective of this collaborative initiative was to proficiently discern and categorize news articles into either the realm of veracity or deceit through the astute application of advanced machine learning techniques, coupled with the astute exploitation of the distinctive linguistic idiosyncrasies inherent to the Dravidian language.
+ 2023.dravidianlangtech-1.28
+ kumari-etal-2023-ml-ai
+
+
+ NITK-IT-NLP@DravidianLangTech: Impact of Focal Loss on Malayalam Fake News Detection using Transformers
+ HariharanR L
+ Anand KumarM
+ 207–210
+ Fake News Detection in Dravidian Languages is a shared task that targets YouTube comments in the Malayalam language for fake news detection. In this work, we propose a transformer-based model with cross-entropy loss and focal loss, which classifies the comments into fake or authentic news. We used different transformer-based models on the dataset with modifications in the experimental setup, out of which the fine-tuned model based on MuRIL with focal loss achieved the best overall macro F1-score of 0.87, securing second position on the final leaderboard.
+ 2023.dravidianlangtech-1.29
+ r-l-m-2023-nitk
+
+
+ VEL@DravidianLangTech: Sentiment Analysis of Tamil and Tulu
+ Kishore KumarPonnusamy
+ CharmathiRajkumar
+ Prasanna KumarKumaresan
+ ElizabethSherly
+ RubaPriyadharshini
+ 211–216
+ We participated in the Sentiment Analysis in Tamil and Tulu - DravidianLangTech 2023-RANLP 2023 task under the team name VEL. This research focuses on addressing the challenge of sentiment analysis in social media code-mixed comments written in the Tamil and Tulu languages. Code-mixed text in social media often deviates from strict grammar rules and incorporates non-native scripts, making sentiment identification a complex task. To tackle this issue, we employ pre-processing techniques to remove unnecessary content and develop a model specifically designed for sentiment analysis. Additionally, we explore the effectiveness of traditional machine-learning models combined with feature extraction techniques. Our best model, a logistic regression configuration, achieves promising macro F1 scores of 0.43 on the Tamil test set and 0.51 on the Tulu test set, indicating promising results in accurately detecting instances of sentiment in code-mixed comments.
+ 2023.dravidianlangtech-1.30
+ ponnusamy-etal-2023-vel
+
+
+ hate-alert@DravidianLangTech: Multimodal Abusive Language Detection and Sentiment Analysis in Dravidian Languages
+ ShubhankarBarman
+ MithunDas
+ 217–224
+ The use of abusive language on social media platforms is a prevalent issue that requires effective detection. Researchers actively engage in abusive language detection and sentiment analysis on social media platforms. However, most of the studies are in English. Hence, there is a need to develop models for low-resource languages. Further, the multimodal content on social media platforms is expanding rapidly. Our research aims to address this gap by developing multimodal abusive language detection and performing sentiment analysis for Tamil and Malayalam, two under-resourced languages, based on the shared task “Multimodal Abusive Language Detection and Sentiment Analysis in Dravidian Languages: DravidianLangTech@RANLP 2023”. In our study, we conduct extensive experiments utilizing multiple deep-learning models to detect abusive language in Tamil and perform sentiment analysis in Tamil and Malayalam. For feature extraction, we use the mBERT transformer-based model for texts, the ViT model for images, and MFCCs for audio. In the abusive language detection task, we achieved a weighted average F1 score of 0.5786, securing the first rank in this task. For sentiment analysis, we achieved a weighted average F1 score of 0.357 for Tamil and 0.233 for Malayalam, ranking first in this task.
+ 2023.dravidianlangtech-1.31
+ barman-das-2023-hate
+
+
+ Supernova@DravidianLangTech 2023@Abusive Comment Detection in Tamil and Telugu - (Tamil, Tamil-English, Telugu-English)
+ AnkithaReddy
+ PranavMoorthi
+ Ann MariaThomas
+ 225–230
+ This paper focuses on using Support Vector Machine (SVM) classifiers with TF-IDF feature extraction to classify whether a comment is abusive or not. The paper tries to identify abusive content in regional languages. The dataset analysis presents the distribution of target variables in the Tamil-English, Telugu-English, and Tamil datasets. The methodology section describes the preprocessing steps, including consistency checks, removal of special characters and emojis, removal of stop words, and stemming of the data. Overall, the study contributes to the field of abusive comment detection in the Tamil and Telugu languages.
+ 2023.dravidianlangtech-1.32
+ reddy-etal-2023-supernova
+
+
+ AbhiPaw@ DravidianLangTech: Abusive Comment Detection in Tamil and Telugu using Logistic Regression
+ AbhinabaBala
+ ParameswariKrishnamurthy
+ 231–234
+ Abusive comments in online platforms have become a significant concern, necessitating the development of effective detection systems. However, limited work has been done in low resource languages, including Dravidian languages. This paper addresses this gap by focusing on abusive comment detection in a dataset containing Tamil, Tamil-English and Telugu-English code-mixed comments. Our methodology involves logistic regression and explores suitable embeddings to enhance the performance of the detection model. Through rigorous experimentation, we identify the most effective combination of logistic regression and embeddings. The results demonstrate the performance of our proposed model, which contributes to the development of robust abusive comment detection systems in low resource language settings. Keywords: Abusive comment detection, Dravidian languages, logistic regression, embeddings, low resource languages, code-mixed dataset.
+ 2023.dravidianlangtech-1.33
+ bala-krishnamurthy-2023-abhipaw-dravidianlangtech
+
+
+ AbhiPaw@ DravidianLangTech: Fake News Detection in Dravidian Languages using Multilingual BERT
+ AbhinabaBala
+ ParameswariKrishnamurthy
+ 235–238
+ This study addresses the challenge of detecting fake news in Dravidian languages by leveraging Google’s MuRIL (Multilingual Representations for Indian Languages) model. Drawing upon previous research, we investigate the intricacies involved in identifying fake news and explore the potential of transformer-based models for linguistic analysis and contextual understanding. Through supervised learning, we fine-tune the “muril-base-cased” variant of MuRIL using a carefully curated dataset of labeled comments and posts in Dravidian languages, enabling the model to discern between original and fake news. During the inference phase, the fine-tuned MuRIL model analyzes new textual content, extracting contextual and semantic features to predict the content’s classification. We evaluate the model’s performance using standard metrics, highlighting the effectiveness of MuRIL in detecting fake news in Dravidian languages and contributing to the establishment of a safer digital ecosystem. Keywords: fake news detection, Dravidian languages, MuRIL, transformer-based models, linguistic analysis, contextual understanding.
+ 2023.dravidianlangtech-1.34
+ bala-krishnamurthy-2023-abhipaw-dravidianlangtech-fake
+
+
+ Habesha@DravidianLangTech: Utilizing Deep and Transfer Learning Approaches for Sentiment Analysis.
+ Mesay GemedaYigezu
+ TadesseKebede
+ OlgaKolesnikova
+ GrigoriSidorov
+ AlexanderGelbukh
+ 239–243
+ This research paper focuses on sentiment analysis of Tamil and Tulu texts using a BERT model and an RNN model. The BERT model, which was pretrained, achieved satisfactory performance for the Tulu language, with a Macro F1 score of 0.352. On the other hand, the RNN model showed good performance for Tamil language sentiment analysis, obtaining a Macro F1 score of 0.208. As future work, the researchers aim to fine-tune the models to further improve their results after the training process.
+ 2023.dravidianlangtech-1.35
+ yigezu-etal-2023-habesha
+
+
+ Habesha@DravidianLangTech: Abusive Comment Detection using Deep Learning Approach
+ Mesay GemedaYigezu
+ SelamKanta
+ OlgaKolesnikova
+ GrigoriSidorov
+ AlexanderGelbukh
+ 244–249
+ This research focuses on identifying abusive language in comments. The study utilizes deep learning models, including Long Short-Term Memory (LSTM) and Recurrent Neural Networks (RNNs), to analyze linguistic patterns. Specifically, the LSTM model, a type of RNN, is used to understand the context by capturing long-term dependencies and intricate patterns in the input sequences. The LSTM model achieves better accuracy and is enhanced through the addition of a dropout layer and early stopping. For detecting abusive language in Telugu and Tamil-English, an LSTM model is employed, while in Tamil abusive language detection, a word-level RNN is developed to identify abusive words. These models process text sequentially, considering overall content and capturing contextual dependencies.
+ 2023.dravidianlangtech-1.36
+ yigezu-etal-2023-habesha-dravidianlangtech
+
+
+ SADTech@DravidianLangTech: Multimodal Sentiment Analysis of Tamil and Malayalam
+ AbhinavPatil
+ SamBriggs
+ TaraWueger
+ Daniel D.O’Connell
+ 250–257
+ We present several models for sentiment analysis of multimodal movie reviews in Tamil and Malayalam into 5 separate classes: highly negative, negative, neutral, positive, and highly positive, based on the shared task, “Multimodal Abusive Language Detection and Sentiment Analysis” at RANLP-2023. We use transformer language models to build text and audio embeddings and then compare the performance of multiple classifier models trained on these embeddings: a Multinomial Naive Bayes baseline, a Logistic Regression, a Random Forest, and an SVM. To account for class imbalance, we use both naive resampling and SMOTE. We found that without resampling, the baseline models have the same performance as a naive Majority Class Classifier. However, with resampling, logistic regression and random forest both demonstrate gains over the baseline.
+ 2023.dravidianlangtech-1.37
+ patil-etal-2023-sadtech
+
+
+ MUCS@DravidianLangTech2023: Sentiment Analysis in Code-mixed Tamil and Tulu Texts using fastText
+ RachanaK
+ PrajnashreeM
+ AshaHegde
+ H. LShashirekha
+ 258–265
+ Sentiment Analysis (SA) is a field of computational study that focuses on analyzing and understanding people’s opinions, attitudes, and emotions towards an entity. An entity could be an individual, an event, a topic, a product, etc., which is most likely to be covered by reviews, and such reviews can be found in abundance on social media platforms. The increase in the number of social media users and the growing amount of user-generated code-mixed content such as reviews, comments, posts, etc., on social media have resulted in a rising demand for efficient tools capable of effectively analyzing such content to detect the sentiments. However, SA of social media text is challenging due to the complex nature of code-mixed text. To tackle this issue, in this paper, we, team MUCS, describe the learning models submitted to “Sentiment Analysis in Tamil and Tulu” - DravidianLangTech at Recent Advances In Natural Language Processing (RANLP) 2023. Using fastText embeddings to train Machine Learning (ML) models to perform SA on code-mixed Tamil and Tulu texts, the proposed methodology exhibited F1 scores of 0.14 and 0.204, securing 13th and 15th rank for Tamil and Tulu texts respectively.
+ 2023.dravidianlangtech-1.38
+ k-etal-2023-mucs
+
+
+ MUCS@DravidianLangTech2023: Leveraging Learning Models to Identify Abusive Comments in Code-mixed Dravidian Languages
+ AshaHegde
+ KavyaG
+ SharalCoelho
+ Hosahalli LakshmaiahShashirekha
+ 266–274
+ Abusive language detection in user-generated online content has become a pressing concern due to its negative impact on users and challenges for policy makers. Online platforms are faced with the task of moderating abusive content to mitigate societal harm, adhere to legal requirements, and foster inclusivity. Despite numerous methods developed for automated detection of abusive language, the problem continues to persist. This ongoing challenge necessitates further research and development to enhance the effectiveness of abusive content detection systems and implement proactive measures to create safer and more respectful online spaces. To address the automatic detection of abusive languages in social media platforms, this paper describes the models submitted by our team - MUCS to the shared task “Abusive Comment Detection in Tamil and Telugu” at DravidianLangTech - in Recent Advances in Natural Language Processing (RANLP) 2023. This shared task addresses the abusive comment detection in code-mixed Tamil, Telugu, and romanized Tamil (Tamil-English) texts. Two distinct models: i) AbusiveML - a model implemented utilizing Linear Support Vector Classifier (LinearSVC) algorithm fed with n-grams of words and character sequences within word boundary (char_wb) features and ii) AbusiveTL - a Transfer Learning (TL ) model with three different Bidirectional Encoder Representations from Transformers (BERT) models along with random oversampling to deal with data imbalance, are submitted to the shared task for detecting abusive language in the given code-mixed texts. The AbusiveTL model fared well among these two models, with macro F1 scores of 0.46, 0.74, and 0.49 for code-mixed Tamil, Telugu, and Tamil-English texts respectively.
+ 2023.dravidianlangtech-1.39
+ hegde-etal-2023-mucs
+
+
+ MUNLP@DravidianLangTech2023: Learning Approaches for Sentiment Analysis in Code-mixed Tamil and Tulu Text
+ AshaHegde
+ KavyaG
+ SharalCoelho
+ PoojaLamani
+ Hosahalli LakshmaiahShashirekha
+ 275–281
+ Sentiment Analysis (SA) examines the subjective content of a statement, such as opinions, assessments, feelings, or attitudes towards a subject, person, or thing. Though several models have been developed for SA in high-resource languages like English, Spanish, German, etc., under-resourced languages like the Dravidian languages are less explored. To address the challenges of SA in low-resource Dravidian languages, in this paper, we, team MUNLP, describe the models submitted to the “Sentiment Analysis in Tamil and Tulu - DravidianLangTech” shared task at Recent Advances in Natural Language Processing (RANLP)-2023. n-gramsSA, EmbeddingsSA, and BERTSA are the models proposed for the SA shared task. Among all the models, BERTSA exhibited a maximum macro F1 score of 0.26 for code-mixed Tamil texts, securing 2nd place in the shared task. EmbeddingsSA exhibited a maximum macro F1 score of 0.53, securing 2nd place for Tulu code-mixed texts.
+ 2023.dravidianlangtech-1.40
+ hegde-etal-2023-munlp
+
+
+ MUCSD@DravidianLangTech2023: Predicting Sentiment in Social Media Text using Machine Learning Techniques
+ SharalCoelho
+ AshaHegde
+ PoojaLamani
+ KavyaG
+ Hosahalli LakshmaiahShashirekha
+ 282–287
+ User-generated social media texts are a blend of resource-rich languages like English and low-resource Dravidian languages like Tamil, Kannada, Tulu, etc. These texts, referred to as code-mixed texts, are enriching social media since they are written in two or more languages using either a common language script or various language scripts. Due to the complex nature of code-mixed text, in this paper, we, team MUCSD, describe the Machine Learning (ML) models submitted to the “Sentiment Analysis in Tamil and Tulu” shared task at DravidianLangTech@RANLP 2023. The proposed methodology makes use of ML models such as Linear Support Vector Classifier (LinearSVC), LR, and an ensemble model (LR, DT, and SVM) to perform SA in the Tamil and Tulu languages. The proposed LinearSVC model’s predictions submitted to the shared task obtained 8th and 9th rank for Tamil-English and Tulu-English respectively.
+ 2023.dravidianlangtech-1.41
+ coelho-etal-2023-mucsd
+
+
+ MUCS@DravidianLangTech2023: Malayalam Fake News Detection Using Machine Learning Approach
+ SharalCoelho
+ AshaHegde
+ KavyaG
+ Hosahalli LakshmaiahShashirekha
+ 288–292
+ Social media is widely used to spread fake news, which affects a large population, so detecting fake news spread on social media platforms is considered a very important task. To address the challenges in the identification of fake news in the Malayalam language, in this paper, we, team MUCS, describe the Machine Learning (ML) models submitted to the “Fake News Detection in Dravidian Languages” shared task at DravidianLangTech@RANLP 2023. Three different models, namely Multinomial Naive Bayes (MNB), Logistic Regression (LR), and an Ensemble model (MNB, LR, and SVM), are trained using Term Frequency - Inverse Document Frequency (TF-IDF) of word unigrams. Among the three models, the ensemble model performed best, with a macro F1-score of 0.83, placing 3rd in the shared task.
+ 2023.dravidianlangtech-1.42
+ coelho-etal-2023-mucs
+
+
+ KEC_AI_NLP@DravidianLangTech: Abusive Comment Detection in Tamil Language
+ KogilavaniShanmugavadivel
+ MalligaSubramanian
+ Shri DurgaR
+ SrighaS
+ Sree HareneJ S
+ Yasvanth BalaP
+ 293–299
+ Our work aims to identify negative comments associated with the Counter-speech, Xenophobia, Homophobia, Transphobia, Misandry, Misogyny, and None-of-the-above categories. In order to identify these categories in the given dataset, we propose three different models: traditional machine learning techniques, a deep learning model, and a transfer learning model, BERT, which is also used to analyze the texts. For the Tamil dataset, we train the models with the train dataset and test them with the validation data. Our team participated in the shared task organised by DravidianLangTech and secured 4th rank in the task of abusive comment detection in Tamil with a macro F1 score of 0.35. Our run was also submitted for abusive comment detection in code-mixed languages (Tamil-English) and secured 6th rank with a macro F1 score of 0.42.
+ 2023.dravidianlangtech-1.43
+ shanmugavadivel-etal-2023-kec
+
+
+ KEC_AI_NLP@DravidianLangTech: Sentiment Analysis in Code Mixture Language
+ KogilavaniShanmugavadivel
+ MalligaSubaramanian
+ VetriVendhanS
+ Pramoth KumarM
+ KarthickeyanS
+ Kavin VishnuN
+ 300–305
+ Sentiment Analysis is a process that involves analyzing digital text to determine the emotional tone, such as positive, negative, neutral, or unknown. Sentiment Analysis of code-mixed languages presents challenges in natural language processing due to the complexity of code-mixed data, which combines vocabulary and grammar from multiple languages and creates unique structures. The scarcity of annotated data and the unstructured nature of code-mixed data are major challenges. To address these challenges, we explored various techniques, including Machine Learning models such as Decision Trees, Random Forests, Logistic Regression, and Gaussian Naïve Bayes, a Deep Learning model, Long Short-Term Memory (LSTM), and a Transfer Learning model, BERT. In this work, we obtained the dataset from the DravidianLangTech shared task by participating in the competition and accessing the train, development, and test data for the Tamil language. The results demonstrated promising performance in sentiment analysis of code-mixed text. Among all the models, the deep learning model LSTM provides the best accuracy of 0.61 for the Tamil language.
+ 2023.dravidianlangtech-1.44
+ shanmugavadivel-etal-2023-kec-ai
+
+
+ CSSCUTN@DravidianLangTech:Abusive comments Detection in Tamil and Telugu
+ KathiravanPannerselvam
+ SaranyaRajiakodi
+ RahulPonnusamy
+ SajeethaThavareesan
+ 306–312
+ Code-mixing is a word- or phrase-level act of interchanging two or more languages during a conversation or in written text within a sentence. This phenomenon is widespread on social media platforms, and understanding the underlying abusive comments in a code-mixed sentence is a complex challenge. We present our system in our submission for the DravidianLangTech Shared Task on Abusive Comment Detection in Tamil and Telugu. Our approach involves building a multiclass abusive detection model that recognizes 8 different labels. The provided samples are code-mixed Tamil-English text, where Tamil is represented in romanised form. We focused on the multiclass classification subtask, and we leveraged Support Vector Machine (SVM), Random Forest (RF), and Logistic Regression (LR). Our method exhibited its effectiveness in the shared task by earning the ninth rank out of all competing systems for the classification of abusive comments in the code-mixed text. Our proposed classifier achieves an impressive accuracy of 0.99 and an F1-score of 0.99 for a balanced dataset using TF-IDF with SVM. It can be used effectively to detect abusive comments in Tamil-English code-mixed text.
+ 2023.dravidianlangtech-1.45
+ pannerselvam-etal-2023-csscutn
+
+
+
diff --git a/data/xml/2023.humeval.xml b/data/xml/2023.humeval.xml
new file mode 100644
index 0000000000..9bc117f75c
--- /dev/null
+++ b/data/xml/2023.humeval.xml
@@ -0,0 +1,188 @@
+
+
+
+
+ Proceedings of the 3rd Workshop on Human Evaluation of NLP Systems
+ AnyaBelz
+ MajaPopović
+ EhudReiter
+ CraigThomson
+ JoãoSedoc
+ INCOMA Ltd., Shoumen, Bulgaria
+ Varna, Bulgaria
+ September
+ 2023
+ 2023.humeval-1
+ humeval
+ ws
+
+
+ 2023.humeval-1.0
+ humeval-2023-human
+
+
+ A Manual Evaluation Method of Neural MT for Indigenous Languages
+ LindaWiechetek
+ FlammiePirinen
+ PerKummervold
+ 1–10
+ Indigenous language expertise is not encoded in written text in the same way as it is for languages that have a long literary tradition; in many cases it is, on the contrary, mostly conserved orally. Therefore the evaluation of neural MT systems based solely on an algorithm learning from written texts is not adequate to measure the quality of a system that is used by the language community. Extensive use of tools based on a large amount of non-native language can even contribute to language change in a way that is not desired by the language community. It can also pollute the internet with automatically created texts that outweigh native texts. We propose a manual evaluation method focusing on flow and content separately, and additionally we use existing rule-based NLP to evaluate other factors such as spelling, grammar and grammatical richness. Our main conclusion is that the language expertise of a native speaker is necessary to properly evaluate a given system. We test the method by manually evaluating two neural MT tools for an indigenous low-resource language. We present an experiment on two different neural translations to and from North Sámi, an indigenous language of Northern Europe.
+ 2023.humeval-1.1
+ wiechetek-etal-2023-manual
+
+
+ Hierarchical Evaluation Framework: Best Practices for Human Evaluation
+ IvaBojic
+ JessicaChen
+ Si YuanChang
+ Qi ChwenOng
+ ShafiqJoty
+ JosipCar
+ 11–22
+ Human evaluation plays a crucial role in Natural Language Processing (NLP) as it assesses the quality and relevance of developed systems, thereby facilitating their enhancement. However, the absence of widely accepted human evaluation metrics in NLP hampers fair comparisons among different systems and the establishment of universal assessment standards. Through an extensive analysis of existing literature on human evaluation metrics, we identified several gaps in NLP evaluation methodologies. These gaps served as motivation for developing our own hierarchical evaluation framework. The proposed framework offers notable advantages, particularly in providing a more comprehensive representation of the NLP system’s performance. We applied this framework to evaluate the developed Machine Reading Comprehension system, which was utilized within a human-AI symbiosis model. The results highlighted the associations between the quality of inputs and outputs, underscoring the necessity to evaluate both components rather than solely focusing on outputs. In future work, we will investigate the potential time-saving benefits of our proposed framework for evaluators assessing NLP systems.
+ 2023.humeval-1.2
+ bojic-etal-2023-hierarchical
+
+
+ Designing a Metalanguage of Differences Between Translations: A Case Study for English-to-Japanese Translation
+ TomonoHonda
+ AtsushiFujita
+ MayukaYamamoto
+ KyoKageura
+ 23–34
+ In both the translation industry and translation education, analytic and systematic assessment of translations plays a vital role. However, due to lack of a scheme for describing differences between translations, such assessment has been realized only in an ad-hoc manner. There is prior work on a scheme for describing differences between translations, but it has coverage and objectivity issues. To alleviate these issues and realize more fine-grained analyses, we developed an improved scheme by referring to diverse types of translations and adopting hierarchical linguistic units for analysis, taking English-to-Japanese translation as an example.
+ 2023.humeval-1.3
+ honda-etal-2023-designing
+
+
+ The 2023 ReproNLP Shared Task on Reproducibility of Evaluations in NLP: Overview and Results
+ AnyaBelz
+ CraigThomson
+ 35–48
+ This paper presents an overview of, and the results from, the 2023 Shared Task on Reproducibility of Evaluations in NLP (ReproNLP’23), following on from two previous shared tasks on reproducibility of evaluations in NLG, ReproGen’21 and ReproGen’22. This shared task series forms part of an ongoing research programme designed to develop theory and practice of reproducibility assessment in NLP and machine learning, all against a background of an interest in reproducibility that continues to grow in the two fields. This paper describes the ReproNLP’23 shared task, summarises results from the reproduction studies submitted, and provides comparative analysis of the results.
+ 2023.humeval-1.4
+ belz-thomson-2023-2023
+
+
+ Some lessons learned reproducing human evaluation of a data-to-text system
+ JavierGonzález Corbelle
+ JoseAlonso
+ AlbertoBugarín-Diz
+ 49–68
+ This paper presents a human evaluation reproduction study regarding the data-to-text generation task. The evaluation focuses on counting the supported and contradicting facts generated by a neural data-to-text model with a macro planning stage. The model is tested generating sports summaries for the ROTOWIRE dataset. We first describe the approach to reproduction that was agreed upon in the context of the ReproHum project. Then, we detail the entire configuration of the original human evaluation and the adaptations that had to be made to reproduce such an evaluation. Finally, we compare the reproduction results with those reported in the paper that was taken as reference.
+ 2023.humeval-1.5
+ gonzalez-corbelle-etal-2023-lessons
+
+
+ Unveiling NLG Human-Evaluation Reproducibility: Lessons Learned and Key Insights from Participating in the ReproNLP Challenge
+ LewisWatson
+ DimitraGkatzia
+ 69–74
+ Human evaluation is crucial for NLG systems as it provides a reliable assessment of the quality, effectiveness, and utility of generated language outputs. However, concerns about the reproducibility of such evaluations have emerged, casting doubt on the reliability and generalisability of reported results. In this paper, we present the findings of a reproducibility study on a data-to-text system, conducted under two conditions: (1) replicating the original setup as closely as possible with evaluators from AMT, and (2) replicating the original human evaluation but this time, utilising evaluators with a background in academia. Our experiments show that there is a loss of statistical significance between the original and reproduction studies, i.e. the human evaluation results are not reproducible. In addition, we found that employing local participants led to more robust results. We finally discuss lessons learned, addressing the challenges and best practices for ensuring reproducibility in NLG human evaluations.
+ 2023.humeval-1.6
+ watson-gkatzia-2023-unveiling
+
+
+ How reproducible is best-worst scaling for human evaluation? A reproduction of ‘Data-to-text Generation with Macro Planning’
+ Emielvan Miltenburg
+ AnouckBraggaar
+ NadineBraun
+ DebbyDamen
+ MartijnGoudbeek
+ Chrisvan der Lee
+ FrédéricTomas
+ EmielKrahmer
+ 75–88
+ This paper is part of the larger ReproHum project, where different teams of researchers aim to reproduce published experiments from the NLP literature. Specifically, ReproHum focuses on the reproducibility of human evaluation studies, where participants indicate the quality of different outputs of Natural Language Generation (NLG) systems. This is necessary because without reproduction studies, we do not know how reliable earlier results are. This paper aims to reproduce the second human evaluation study of Puduppully & Lapata (2021), while another lab is attempting to do the same. This experiment uses best-worst scaling to determine the relative performance of different NLG systems. We found that the worst performing system in the original study is now in fact the best performing system across the board. This means that we cannot fully reproduce the original results. We also carry out alternative analyses of the data, and discuss how our results may be combined with the other reproduction study that is carried out in parallel with this paper.
+ 2023.humeval-1.7
+ van-miltenburg-etal-2023-reproducible
+
+
+ Human Evaluation Reproduction Report for Data-to-text Generation with Macro Planning
+ MohammadArvan
+ NatalieParde
+ 89–96
+ This paper presents a partial reproduction study of Data-to-text Generation with Macro Planning by Puduppully et al. (2021). This work was conducted as part of the ReproHum project, a multi-lab effort to reproduce the results of NLP papers incorporating human evaluations. We follow the same instructions provided by the authors and the ReproHum team to the best of our abilities. We collect preference ratings for the following evaluation criteria in order: conciseness, coherence, and grammaticality. Our results are highly correlated with the original experiment. Nonetheless, we believe the presented results are insufficient to conclude that the Macro system proposed and developed by the original paper is superior compared to other systems. We suspect combining our results with the three other reproductions of this paper through the ReproHum project will paint a clearer picture. Overall, we hope that our work is a step towards a more transparent and reproducible research landscape.
+ 2023.humeval-1.8
+ arvan-parde-2023-human
+
+
+ Challenges in Reproducing Human Evaluation Results for Role-Oriented Dialogue Summarization
+ TakumiIto
+ QixiangFang
+ PabloMosteiro
+ AlbertGatt
+ Keesvan Deemter
+ 97–123
+ There is a growing concern regarding the reproducibility of human evaluation studies in NLP. As part of the ReproHum campaign, we conducted a study to assess the reproducibility of a recent human evaluation study in NLP. Specifically, we attempted to reproduce a human evaluation of a novel approach to enhance Role-Oriented Dialogue Summarization by considering the influence of role interactions. Despite our best efforts to adhere to the reported setup, we were unable to reproduce the statistical results as presented in the original paper. While no contradictory evidence was found, our study raises questions about the validity of the reported statistical significance results, and/or the comprehensiveness with which the original study was reported. In this paper, we provide a comprehensive account of our reproduction study, detailing the methodologies employed, data collection, and analysis procedures. We discuss the implications of our findings for the broader issue of reproducibility in NLP research. Our findings serve as a cautionary reminder of the challenges in conducting reproducible human evaluations and prompt further discussions within the NLP community.
+ 2023.humeval-1.9
+ ito-etal-2023-challenges
+
+
+ A Reproduction Study of the Human Evaluation of Role-Oriented Dialogue Summarization Models
+ MingqiGao
+ JieRuan
+ XiaojunWan
+ 124–129
+ This paper reports a reproduction study of the human evaluation of role-oriented dialogue summarization models, as part of the ReproNLP Shared Task 2023 on Reproducibility of Evaluations in NLP. We outline the disparities between the original study’s experimental design and our reproduction study, along with the outcomes obtained. The inter-annotator agreement within the reproduction study is observed to be lower, measuring 0.40 as compared to the original study’s 0.48. Among the six conclusions drawn in the original study, four are validated in our reproduction study. We confirm the effectiveness of the proposed approach on the overall metric, albeit with slightly poorer relative performance compared to the original study. Furthermore, we raise an open-ended inquiry: how can subjective practices in the original study be identified and addressed when conducting reproduction studies?
+ 2023.humeval-1.10
+ gao-etal-2023-reproduction
+
+
+ h_da@ReproHumn – Reproduction of Human Evaluation and Technical Pipeline
+ MargotMieskes
+ Jacob GeorgBenz
+ 130–135
+ How reliable are human evaluation results? Is it possible to replicate human evaluation? This work takes a closer look at the evaluation of the output of a Text-to-Speech (TTS) system. Unfortunately, our results indicate that human evaluation is not as straightforward to replicate as expected. Additionally, we also present results on reproducing the technical background of the TTS system and discuss potential reasons for the reproduction failure.
+ 2023.humeval-1.11
+ mieskes-benz-2023-h
+
+
+ Reproducing a Comparative Evaluation of German Text-to-Speech Systems
+ ManuelaHürlimann
+ MarkCieliebak
+ 136–144
+ This paper describes the reproduction of a human evaluation in Language-Agnostic Meta-Learning for Low-Resource Text-to-Speech with Articulatory Features reported in Lux and Vu (2022). It is a contribution to the ReproNLP 2023 Shared Task on Reproducibility of Evaluations in NLP. The original evaluation assessed the naturalness of audio generated by different Text-to-Speech (TTS) systems for German, and our goal was to repeat the experiment with a different set of evaluators. We reproduced the evaluation based on data and instructions provided by the original authors, with some uncertainty concerning the randomisation of question order. Evaluators were recruited via email to relevant mailing lists and we received 157 responses over the course of three weeks. Our initial results show low reproducibility, but when we assume that the systems of the original and repeat evaluation experiment have been transposed, the reproducibility assessment improves markedly. We do not know if and at what point such a transposition happened; however, an initial analysis of our audio and video files provides some evidence that the system assignment in our repeat experiment is correct.
+ 2023.humeval-1.12
+ hurlimann-cieliebak-2023-reproducing
+
+
+ With a Little Help from the Authors: Reproducing Human Evaluation of an MT Error Detector
+ OndrejPlatek
+ MateuszLango
+ OndrejDusek
+ 145–152
+ This work presents our efforts to reproduce the results of the human evaluation experiment presented in the paper of Vamvas and Sennrich (2022), which evaluated an automatic system detecting over- and undertranslations (translations containing more or less information than the original) in machine translation (MT) outputs. Despite the high quality of the documentation and code provided by the authors, we discuss some problems we found in reproducing the exact experimental setup and offer recommendations for improving reproducibility. Our replicated results generally confirm the conclusions of the original study, but in some cases statistically significant differences were observed, suggesting a high variability of human annotation.
+ 2023.humeval-1.13
+ platek-etal-2023-little
+
+
+ HumEval’23 Reproduction Report for Paper 0040: Human Evaluation of Automatically Detected Over- and Undertranslations
+ FilipKlubička
+ John D.Kelleher
+ 153–189
+ This report describes a reproduction of a human evaluation study evaluating automatically detected over- and undertranslations obtained using neural machine translation approaches. While the scope of the original study is much broader, a human evaluation is included as part of its system evaluation. We attempt an exact reproduction of this human evaluation, pertaining to translations on the English-German language pair. While we encountered minor logistical challenges, all the source material was publicly available and, with some additional instructions provided by the original authors, we were able to reproduce the original experiment with only minor differences in the results.
+ 2023.humeval-1.14
+ klubicka-kelleher-2023-humeval23
+
+
+ Same Trends, Different Answers: Insights from a Replication Study of Human Plausibility Judgments on Narrative Continuations
+ YiruLi
+ HuiyuanLai
+ AntonioToral
+ MalvinaNissim
+ 190–203
+ We reproduced the human-based evaluation of the continuation of narratives task presented by Chakrabarty et al. (2022). This experiment is performed as part of the ReproNLP Shared Task on Reproducibility of Evaluations in NLP (Track C). Our main goal is to reproduce the original study under conditions as similar as possible. Specifically, we follow the original experimental design and perform human evaluations of the data from the original study, while describing the differences between the two studies. We then present the results of these two studies together with an analysis of similarities between them. Inter-annotator agreement (Krippendorff’s alpha) in the reproduction study is lower than in the original study, while the human evaluation results of both studies have the same trends, that is, our results support the findings in the original study.
+ 2023.humeval-1.15
+ li-etal-2023-trends
+
+
+ Reproduction of Human Evaluations in: “It’s not Rocket Science: Interpreting Figurative Language in Narratives”
+ SaadMahamood
+ 204–209
+ We describe in this paper an attempt to reproduce some of the human evaluation results from the paper “It’s not Rocket Science: Interpreting Figurative Language in Narratives”. In particular, we describe the methodology used to reproduce the chosen human evaluation, the challenges faced, and the results that were gathered. We also make some recommendations based on the lessons learned from this reproduction attempt and what improvements are needed to enable more robust reproductions of future NLP human evaluations.
+ 2023.humeval-1.16
+ mahamood-2023-reproduction
+
+
+
diff --git a/data/xml/2023.isa.xml b/data/xml/2023.isa.xml
index a5cba5dee8..93cba75e2d 100644
--- a/data/xml/2023.isa.xml
+++ b/data/xml/2023.isa.xml
@@ -37,7 +37,7 @@
11–17
The paper presents the work on the selection, semantic annotation and classification of a group of verbs from WordNet, characterized with the semantic primitive ‘verbs of contact’ that belong to the common Bulgarian lexis. The selection of the verb set using both different criteria: statistical information from corpora, WordNet Base concepts and AoA as a criterion, is described. The focus of the work is on the process of the verbs’ of contact semantic annotation using the combined information from two language resources - WordNet and FrameNet. The verbs of contact from WordNet are assigmed semantic frames from FrameNet and then grouped in semantic subclasses using both their place in the WordNet hierarchy, the semantic restrictions on their frame elements and the corresponding syntactic realization. At the end we offer some conclusions on the classification of ‘verbs of contact’ in semantic subtypes.
2023.isa-1.2
- todorova-2023-semantic
+ todorova-2023-semantic-annotation
Appraisal Theory and the Annotation of Speaker-Writer Engagement
diff --git a/data/xml/2023.ltedi.xml b/data/xml/2023.ltedi.xml
new file mode 100644
index 0000000000..9fec8e64b4
--- /dev/null
+++ b/data/xml/2023.ltedi.xml
@@ -0,0 +1,533 @@
+
+
+
+
+ Proceedings of the Third Workshop on Language Technology for Equality, Diversity and Inclusion
+ Bharathi R.Chakravarthi
+ B.Bharathi
+ JosephineGriffith
+ KalikaBali
+ PaulBuitelaar
+ INCOMA Ltd., Shoumen, Bulgaria
+ Varna, Bulgaria
+ September
+ 2023
+ 2023.ltedi-1
+ ltedi
+ ws
+
+
+ 2023.ltedi-1.0
+ ltedi-2023-language
+
+
+ An Exploration of Zero-Shot Natural Language Inference-Based Hate Speech Detection
+ NersesYuzbashyan
+ NikolayBanar
+ IliaMarkov
+ WalterDaelemans
+ 1–9
+ Conventional techniques for detecting online hate speech rely on the availability of a sufficient number of annotated instances, which can be costly and time consuming. For this reason, zero-shot or few-shot detection can offer an attractive alternative. In this paper, we explore a zero-shot detection approach based on natural language inference (NLI) models. Since the performance of the models in this approach depends heavily on the choice of a hypothesis, our goal is to determine which factors affect the quality of detection. We conducted a set of experiments with three NLI models and four hate speech datasets. We demonstrate that a zero-shot NLI-based approach is competitive with approaches that require supervised learning, yet it is highly sensitive to the choice of hypothesis. In addition, our experiments indicate that the results for a set of hypotheses on different model-data pairs are positively correlated, and that the correlation is higher for different datasets when using the same model than it is for different models when using the same dataset. These results suggest that if we find a hypothesis that works well for a specific model and domain or for a specific type of hate speech, we can use that hypothesis with the same model also within a different domain, while another model might require different hypotheses in order to demonstrate high performance.
+ 2023.ltedi-1.1
+ yuzbashyan-etal-2023-exploration
+
+
+ English2BSL: A Rule-Based System for Translating English into British Sign Language
+ Phoebe AlexandraPinney
+ RizaBatista-Navarro
+ 10–16
+ British Sign Language (BSL) is a complex language with its own vocabulary and grammatical structure, separate from English. Despite its long-standing and widespread use by Deaf communities within the UK, thus far, there have been no effective tools for translating written English into BSL. This overt lack of available resources made learning the language highly inaccessible for most people, exacerbating the communication barrier between hearing and Deaf individuals. This paper introduces a rule-based translation system, designed with the ambitious aim of creating the first web application that is not only able to translate sentences in written English into a BSL video output, but can also serve as a learning aid to empower the development of BSL proficiency.
+ 2023.ltedi-1.2
+ pinney-batista-navarro-2023-english2bsl
+
+
+ Multilingual Models for Sentiment and Abusive Language Detection for Dravidian Languages
+ Anand KumarM
+ 17–24
+ This paper presents TF-IDF-based LSTM and Hierarchical Attention Network (HAN) models for code-mixed abusive comment detection and sentiment analysis for Dravidian languages. The traditional TF-IDF-based techniques have outperformed the Hierarchical Attention models in both the sentiment analysis and abusive language detection tasks. The Tulu sentiment analysis system demonstrated better performance for the Positive and Neutral classes, whereas the Tamil sentiment analysis system exhibited lower performance overall. This highlights the need for more balanced datasets and additional research to enhance the accuracy of sentiment analysis in the Tamil language. In terms of abusive language detection, the TF-IDF-LSTM models generally outperformed the Hierarchical Attention models. However, the mixed models displayed better performance for specific classes such as “Homophobia” and “Xenophobia.” This implies that considering both code-mixed and original script data can offer a different perspective for research in social media analysis.
+ 2023.ltedi-1.3
+ m-2023-multilingual
+
+
+ Overview of the shared task on Detecting Signs of Depression from Social Media Text
+ KayalvizhiS
+ ThenmozhiD.
+ Bharathi RajaChakravarthi
+ Jerin MahibhaC
+ KogilavaniS V
+ Pratik AnilRahood
+ 25–30
+ Social media has become a vital platform for personal communication. Its widespread use as a primary means of public communication offers an exciting opportunity for early detection and management of mental health issues. People often share their emotions on social media, but understanding the true depth of their feelings can be challenging. Depression, a prevalent problem among young people, is of particular concern due to its link with rising suicide rates. Identifying depression levels in social media texts is crucial for timely support and prevention of negative outcomes. However, it’s a complex task because human emotions are dynamic and can change significantly over time. The DepSign-LT-EDI@RANLP 2023 shared task aims to classify social media text into three depression levels: “Not Depressed,” “Moderately Depressed,” and “Severely Depressed.” This overview covers task details, dataset, methodologies used, and results analysis. RoBERTa-based models emerged as top performers, with the best result achieving an impressive macro F1-score of 0.584 among 31 participating teams.
+ 2023.ltedi-1.4
+ s-etal-2023-overview-shared
+
+
+ Overview of the Second Shared Task on Speech Recognition for Vulnerable Individuals in Tamil
+ BharathiB
+ Bharathi RajaChakravarthi
+ SubalalithaCn
+ SripriyaNatarajan
+ RajeswariNatarajan
+ SSuhasini
+ SwethaValli
+ 31–37
+ This paper presents the overview of the shared task on Speech Recognition for Vulnerable Individuals in Tamil (LT-EDI-ACL2023). The task provided a Tamil dataset collected from elderly people of three different genders: male, female, and transgender. The audio samples were recorded in public locations such as hospitals, markets, and vegetable shops. The dataset was released in two phases, training and testing. The participants were asked to use different models and methods to handle the audio signals and to submit transcriptions of the given test samples. The submitted results were evaluated using WER (Word Error Rate). The participants used transformer-based models for automatic speech recognition. The results and the different pre-trained transformer-based models used by the participants are discussed in this overview paper.
+ 2023.ltedi-1.5
+ b-etal-2023-overview
+
+
+ Overview of Second Shared Task on Homophobia and Transphobia Detection in Social Media Comments
+ Bharathi RajaChakravarthi
+ RahulPonnusamy
+ MalligaS
+ PaulBuitelaar
+ Miguel AngelGarcía-Cumbreras
+ Salud MaríaJimenez-Zafra
+ Jose AntonioGarcia-Diaz
+ RafaelValencia-Garcia
+ NiteshJindal
+ 38–46
+ We present an overview of the second shared task on homophobia/transphobia detection in social media comments. Given a comment, a system must predict whether or not it contains any form of homophobia/transphobia. The shared task included five languages: English, Spanish, Tamil, Hindi, and Malayalam. Data was provided for two tasks: Task A used three labels, and Task B used seven fine-grained labels. In total, 75 teams enrolled for the shared task in Codalab. For Task A, 12 teams submitted systems for English, eight teams for Tamil, eight teams for Spanish, and seven teams for Hindi. For Task B, nine teams submitted for English, seven teams for Tamil, and six teams for Malayalam. We present and analyze all submissions in this paper.
+ 2023.ltedi-1.6
+ chakravarthi-etal-2023-overview
+
+
+ Overview of the Shared Task on Hope Speech Detection for Equality, Diversity, and Inclusion
+ Prasanna KumarKumaresan
+ Bharathi RajaChakravarthi
+ SubalalithaCn
+ Miguel ÁngelGarcía-Cumbreras
+ Salud MaríaJiménez Zafra
+ José AntonioGarcía-Díaz
+ RafaelValencia-García
+ MomchilHardalov
+ IvanKoychev
+ PreslavNakov
+ DanielGarcía-Baena
+ Kishore KumarPonnusamy
+ 47–53
+ Hope serves as a powerful driving force that encourages individuals to persevere in the face of the unpredictable nature of human existence. It instills motivation within us to remain steadfast in our pursuit of important goals, regardless of the uncertainties that lie ahead. In today’s digital age, platforms such as Facebook, Twitter, Instagram, and YouTube have emerged as prominent social media outlets where people freely express their views and opinions. These platforms have also become crucial for marginalized individuals seeking online assistance and support[1][2][3]. The outbreak of the pandemic has exacerbated people’s fears around the world, as they grapple with the possibility of losing loved ones and the lack of access to essential services such as schools, hospitals, and mental health facilities.
+ 2023.ltedi-1.7
+ kumaresan-etal-2023-overview
+
+
+ Computer, enhence: POS-tagging improvements for nonbinary pronoun use in Swedish
+ HenrikBjörklund
+ HannahDevinney
+ 54–61
+ Part of Speech (POS) taggers for Swedish routinely fail for the third person gender-neutral pronoun “hen”, despite the fact that it has been a well-established part of the Swedish language since at least 2014. In addition to simply being a form of gender bias, this failure can have negative effects on other tasks relying on POS information. We demonstrate the usefulness of semi-synthetic augmented datasets in a case study, retraining a POS tagger to correctly recognize “hen” as a personal pronoun. We evaluate our retrained models for both tag accuracy and on a downstream task (dependency parsing) in a classical NLP pipeline. Our results show that adding such data works to correct for the disparity in performance. The accuracy rate for identifying “hen” as a pronoun can be brought up to acceptable levels with only minor adjustments to the tagger’s vocabulary files. Performance parity to gendered pronouns can be reached after retraining with only a few hundred examples. This increase in POS tag accuracy also results in improvements for dependency parsing of sentences containing “hen”.
+ 2023.ltedi-1.8
+ bjorklund-devinney-2023-computer
+
+
+ Evaluating the Impact of Stereotypes and Language Combinations on Gender Bias Occurrence in NMT Generic Systems
+ BertilleTriboulet
+ PierretteBouillon
+ 62–70
+ Machine translation, and more specifically neural machine translation (NMT), has been proven to be subject to gender bias in recent years. Many studies have focused on evaluating and reducing this phenomenon, mainly through the analysis of occupational nouns’ translation for the same types of language combinations. In this paper, we reproduce a test set similar to those in previous studies to investigate the influence of stereotypes and of the nature of language combinations (formed with English, French and Italian) on gender bias occurrence in NMT. Similarly to previous studies, we confirm stereotypes as a major source of gender bias, especially in female contexts, while observing bias even in language combinations traditionally less examined.
+ 2023.ltedi-1.9
+ triboulet-bouillon-2023-evaluating
+
+
+ KaustubhSharedTask@LT-EDI 2023: Homophobia-Transphobia Detection in Social Media Comments with NLPAUG-driven Data Augmentation
+ KaustubhLande
+ RahulPonnusamy
+ Prasanna KumarKumaresan
+ Bharathi RajaChakravarthi
+ 71–77
+ Our research in Natural Language Processing (NLP) aims to detect hate speech comments specifically targeted at the LGBTQ+ community on the YouTube platform, as part of the shared task conducted by the LT-EDI workshop. The dataset provided by the organizers exhibited a high degree of class imbalance, and to mitigate this, we employed NLPAUG, a data augmentation library. We employed several classification methods and reported the results using recall, precision, and F1-score metrics. The classification models discussed in this paper include a Bidirectional Long Short-Term Memory (BiLSTM) model trained with Word2Vec embeddings, a BiLSTM model trained with Twitter GloVe embeddings, and transformer models such as BERT, DistilBERT, RoBERTa, and XLM-RoBERTa, all of which were trained and fine-tuned. We achieved a weighted F1-score of 0.699 on the test data and secured fifth place in Task B with 7 classes for the English language.
+ 2023.ltedi-1.10
+ lande-etal-2023-kaustubhsharedtask
+
+
+ JudithJeyafreeda@LT-EDI-2023: Using GPT model for recognition of Homophobia/Transphobia detection from social media
+ Judith JeyafreedaAndrew
+ 78–82
+ Homophobia and transphobia are defined as hatred of, or discomfort towards, gay, lesbian, transgender or bisexual people. With the growth of social media, communication has become free and easy. This also means that people can express hatred and discomfort towards others. Studies have shown that such comments can cause mental health issues. Thus, detection and masking/removal of these comments from social media platforms can help with understanding and improving the mental health of LGBTQ+ people. In this paper, GPT2 is used to detect homophobic and/or transphobic comments in social media. The comments used in this paper are in five languages (English, Spanish, Tamil, Malayalam and Hindi). The results show that detecting such comments in English is easier than in the other languages.
+ 2023.ltedi-1.11
+ andrew-2023-judithjeyafreeda-lt
+
+
+ iicteam@LT-EDI-2023: Leveraging pre-trained Transformers for Fine-Grained Depression Level Detection in Social Media
+ VajratiyaVajrobol
+ NitishaAggarwal
+ KaranpreetSingh
+ 83–88
+ Depression is a prevalent mental illness characterized by feelings of sadness and a lack of interest in daily activities. Early detection of depression is crucial to prevent severe consequences, making it essential to observe and treat the condition at its onset. At ACL-2022, the DepSign-LT-EDI project aimed to identify signs of depression in individuals based on their social media posts, where people often share their emotions and feelings. Using social media postings in English, the system categorized depression signs into three labels: “not depressed,” “moderately depressed,” and “severely depressed.” To achieve this, our team applied MentalRoBERTa, a model trained on a large corpus of mental health data. The test results indicated a macro F1-score of 0.439, ranking fourth in the shared task.
+ 2023.ltedi-1.12
+ vajrobol-etal-2023-iicteam
+
+
+ JA-NLP@LT-EDI-2023: Empowering Mental Health Assessment: A RoBERTa-Based Approach for Depression Detection
+ JyotiKumari
+ AbhinavKumar
+ 89–96
+ Depression, a widespread mental health disorder, affects a significant portion of the global population. Timely identification and intervention play a crucial role in ensuring effective treatment and support. Therefore, this research paper proposes a fine-tuned RoBERTa-based model for identifying depression in social media posts. In addition to the proposed model, Sentence-BERT is employed to encode social media posts into vector representations. These encoded vectors are then utilized in eight different popular classical machine learning models. The proposed fine-tuned RoBERTa model achieved a best macro F1-score of 0.55 for the development dataset and a comparable score of 0.41 for the testing dataset. Additionally, combining Sentence-BERT with Naive Bayes (S-BERT + NB) outperformed the fine-tuned RoBERTa model, achieving a slightly higher macro F1-score of 0.42. This demonstrates the effectiveness of the approach in detecting depression from social media posts.
+ 2023.ltedi-1.13
+ kumari-kumar-2023-ja
+
+
+ Team-KEC@LT-EDI: Detecting Signs of Depression from Social Media Text
+ MalligaS
+ KogilavaniShanmugavadivel
+ ArunaaS
+ GokulkrishnaR
+ ChandramukhiiA
+ 97–102
+ The rise of social media has led to a drastic surge in the dissemination of hostile and toxic content, fostering an alarming proliferation of hate speech, inflammatory remarks, and abusive language. The study utilized different techniques to represent the text data in a numerical format. Word embedding techniques aim to capture the semantic and syntactic information of the text data, which is essential in text classification tasks. The study utilized various techniques such as CNN, BERT, and N-gram to classify social media posts into depression and non-depression categories. Text classification tasks often rely on deep learning techniques such as Convolutional Neural Networks (CNN), while the BERT model, which is pre-trained, has shown exceptional performance in a range of natural language processing tasks. To assess the effectiveness of the suggested approaches, the research employed multiple metrics, including accuracy, precision, recall, and F1-score. The outcomes of the investigation indicate that the suggested techniques can identify symptoms of depression with an average accuracy rate of 56%.
+ 2023.ltedi-1.14
+ s-etal-2023-team
+
+
+ cantnlp@LT-EDI-2023: Homophobia/Transphobia Detection in Social Media Comments using Spatio-Temporally Retrained Language Models
+ SidneyWong
+ MatthewDurward
+ BenjaminAdams
+ JonathanDunn
+ 103–108
+ This paper describes our multiclass classification system developed as part of the LT-EDI@RANLP-2023 shared task. We used a BERT-based language model to detect homophobic and transphobic content in social media comments across five language conditions: English, Spanish, Hindi, Malayalam, and Tamil. We retrained a transformer-based cross-language pretrained language model, XLM-RoBERTa, with spatially and temporally relevant social media language data. We found the inclusion of this spatio-temporal data improved the classification performance for all language and task conditions when compared with the baseline. We also retrained a subset of models with simulated script-mixed social media language data with varied performance. The results from the current study suggest that transformer-based language classification systems are sensitive to register-specific and language-specific retraining.
+ 2023.ltedi-1.15
+ wong-etal-2023-cantnlp
+
+
+ NLP_CHRISTINE@LT-EDI-2023: RoBERTa & DeBERTa Fine-tuning for Detecting Signs of Depression from Social Media Text
+ ChristinaChristodoulou
+ 109–116
+ The paper describes the system for the 4th Shared Task on “Detecting Signs of Depression from Social Media Text” at LT-EDI@RANLP 2023, which aimed to identify signs of depression in English social media texts. The solution comprised data cleaning and pre-processing, the use of additional data, a method to deal with data imbalance, as well as fine-tuning of two transformer-based pre-trained language models, RoBERTa-Large and DeBERTa-V3-Large. Four model architectures were developed by leveraging different word embedding pooling methods, namely a RoBERTa-Large bidirectional GRU model using GRU pooling and three DeBERTa models using CLS pooling, mean pooling and max pooling, respectively. Although ensemble learning of DeBERTa’s pooling methods through majority voting was employed for better performance, the RoBERTa bidirectional GRU model achieved 8th place out of 31 submissions with a 0.42 macro F1-score.
+ 2023.ltedi-1.16
+ christodoulou-2023-nlp-christine
+
+
+ IIITDWD@LT-EDI-2023 Unveiling Depression: Using pre-trained language models for Harnessing Domain-Specific Features and Context Information
+ ShankarBiradar
+ SunilSaumya
+ SanjanaKavatagi
+ 117–123
+ Depression has become a common health problem impacting millions of individuals globally. Workplace stress and an unhealthy lifestyle have increased in recent years, leading to an increase in the number of people experiencing depressive symptoms. The spread of the epidemic has further exacerbated the problem. Early detection and precise prediction of depression are critical for early intervention and support for individuals at risk. However, due to the social stigma associated with the illness, many people are afraid to consult healthcare specialists, making early detection practically impossible. As a result, alternative strategies for depression prediction are being investigated, one of which is analyzing users’ social media posting behaviour. The organizers of LT-EDI@RANLP carried out a shared task to encourage research in this area. Our team participated in the shared task and secured 21st rank with a macro F1 score of 0.36. This article provides a summary of the model presented in the shared task.
+ 2023.ltedi-1.17
+ biradar-etal-2023-iiitdwd
+
+
+ CIMAT-NLP@LT-EDI-2023: Finegrain Depression Detection by Multiple Binary Problems Approach
+ María de JesúsGarcía Santiago
+ FernandoSánchez Vega
+ Adrián PastorLópez Monroy
+ 124–130
+ This paper describes the work of team CIMAT-NLP on the Shared Task on Detecting Signs of Depression from Social Media Text at LT-EDI@RANLP 2023, which consists of classifying depression in social media text at three levels: “not depression”, “moderate” depression and “severe” depression. In this work, we proposed two approaches: (1) a transformer model which can handle long text without truncation, and (2) an ensemble of six binary Bag-of-Words models. Our team placed fourth in the competition and found that models trained with our approaches could place second.
+ 2023.ltedi-1.18
+ garcia-santiago-etal-2023-cimat
+
+
+ SIS@LT-EDI-2023: Detecting Signs of Depression from Social Media Text
+ SulakshaB K
+ Shruti KrishnaveniS
+ IvanaSteeve
+ Monica JeneferB
+ 131–137
+ Various biological, genetic, psychological or social factors that feature a target-oriented life with chronic stress and frequent traumatic experiences lead to pessimism and apathy. The massive scale of depression should be dealt with as a disease rather than a ‘phase’ that is neglected by the majority. However, not a lot of people are aware of depression and its impact. Depression is a serious issue that should be treated in the right way. Many people dealing with depression do not realize that they have it due to the lack of awareness. This paper aims to address this issue with a tool built on the blocks of machine learning. This model analyzes public social media texts and detects the signs of depression under three labels, namely “not depressed”, “moderately depressed”, and “severely depressed”, with high accuracy. The ensemble model uses three learners, namely Multi-Layered Perceptron, Support Vector Machine and Multinomial Naive Bayes Classifier. The distinctive feature of this model is that it uses Artificial Neural Networks, Classifiers, Regression and Voting Classifiers to compute the final result or output.
+ 2023.ltedi-1.19
+ b-k-etal-2023-sis
+
+
+ TEAM BIAS BUSTERS@LT-EDI-2023: Detecting Signs of Depression with Generative Pretrained Transformers
+ AndrewNedilko
+ 138–143
+ This paper describes our methodology adopted to participate in the multi-class classification task under the auspices of the Third Workshop on Language Technology for Equality, Diversity, Inclusion (LT-EDI) in the Recent Advances in Natural Language Processing (RANLP) 2023 conference. The overall objective was to employ ML algorithms to detect signs of depression in English social media content, classifying each post into one of three categories: no depression, moderate depression, and severe depression. To accomplish this we utilized generative pretrained transformers (GPTs), leveraging the full-scale OpenAI API. Our strategy incorporated prompt engineering for zero-shot and few-shot learning scenarios with ChatGPT and fine-tuning a GPT-3 model. The latter approach yielded the best results which allowed us to outperform our benchmark XGBoost classifier based on character-level features on the dev set and score a macro F1 score of 0.419 on the final blind test set.
+ 2023.ltedi-1.20
+ nedilko-2023-team
+
+
+ RANGANAYAKI@LT-EDI: Hope Speech Detection using Capsule Networks
+ RanganayakiEm
+ AbiramiMurugappan
+ LysaPackiam R S
+ DeivamaniM
+ 144–148
+ Hope speeches convey uplifting and motivating messages that help enhance mental health and general well-being. Hope speech detection has gained popularity in the field of natural language processing as it gives people the motivation they need to face challenges in life. The momentum behind this technology has been fueled by the demand for encouraging reinforcement online. In this paper, a deep learning approach is proposed in which four different word embedding techniques are used in combination with capsule networks, and a comparative analysis is performed to obtain results. Oversampling is used to address the class imbalance problem. The dataset used in this paper is part of the LT-EDI RANLP 2023 Hope Speech Detection shared task. The approach proposed in this paper achieved Macro Average F1 scores of 0.49 and 0.62 on the English and Hindi-English code-mixed test data, which secured 2nd and 3rd rank respectively in the above-mentioned shared task.
+ 2023.ltedi-1.21
+ em-etal-2023-ranganayaki
+
+
+ TechSSN1 at LT-EDI-2023: Depression Detection and Classification using BERT Model for Social Media Texts
+ Venkatasai OjusYenumulapalli
+ Vijai AravindhR
+ RajalakshmiSivanaiah
+ Angel DeborahS
+ 149–154
+ Depression is a severe mental health disorder characterized by persistent feelings of sadness and anxiety and a decline in cognitive functioning, resulting in drastic changes in a person’s psychological and physical well-being. However, depression is completely curable when treated at a suitable time, with treatment resulting in the rejuvenation of the individual. The objective of this paper is to devise a technique for detecting signs of depression from English social media comments and classifying them based on their intensity into severe, moderate, and not depressed categories. The paper illustrates three approaches developed while working on the problem. Of these, the BERT model proved to be the most suitable, with a macro F1 score of 0.407, which gave us the 11th rank overall.
+ 2023.ltedi-1.22
+ yenumulapalli-etal-2023-techssn1
+
+
+ SANBAR@LT-EDI-2023: Automatic Speech Recognition: vulnerable old-aged and transgender people in Tamil
+ SaranyaS
+ BharathiB
+ 155–160
+ Automatic Speech Recognition systems for Tamil are designed to convert spoken language or speech signals into written Tamil text. Seniors go to banks, clinics and administrative workplaces to address their regular necessities. A lot of older people are not aware of how to use the facilities available in public places or offices; they need a person to help them. Likewise, transgender people are deprived of primary education because of social stigma, so speaking is the only way to help them meet their needs. In order to build speech-enabled systems, spontaneous speech data is collected from seniors and transgender people who are deprived of using these facilities for their own benefit. The proposed system is developed with two pretrained models: the IIT Madras transformer ASR model and the akashsivanandan/wav2vec2-large-xls-r-300m-tamil model. Both pretrained models are used to evaluate the test speech utterances, and obtained WERs of 37.7144% and 40.55% respectively.
+ 2023.ltedi-1.23
+ s-b-2023-sanbar
+
+
+ ASR_SSN_CSE@LTEDI-2023: Pretrained Transformer based Automatic Speech Recognition system for Elderly People
+ SuhasiniS
+ BharathiB
+ 161–165
+ This paper describes the results submitted to the Shared Task on Speech Recognition for Vulnerable Individuals in Tamil at LT-EDI-2023. The task is to develop an automatic speech recognition system for the Tamil language. The dataset provided in the task is collected from elderly people who converse in Tamil. The proposed ASR system is designed with a pre-trained model, which is fine-tuned with the Tamil Common Voice dataset. The test data released for the task is given to the proposed system, transcriptions are generated for the test samples, and the generated transcriptions are submitted to the task. The submitted result is evaluated by the task organizers using Word Error Rate (WER) as the evaluation metric. Our proposed system attained a WER of 39.8091%.
+ 2023.ltedi-1.24
+ s-b-2023-asr
+
+
+ SSNTech2@LT-EDI-2023: Homophobia/Transphobia Detection in Social Media Comments Using Linear Classification Techniques
+ VaidhegiD
+ PriyaM
+ RajalakshmiSivanaiah
+ Angel DeborahS
+ MirnalineeThankaNadar
+ 166–171
+ The abusive content on social media networks is causing destructive effects on the mental well-being of online users. Homophobia refers to fear, negative attitudes and feelings towards homosexuality. Transphobia refers to negative attitudes, hatred and prejudice towards transsexual people. Even though some parts of society have started to accept homosexuality and transsexuality, there is still a large part of the population opposing it. Hate speech targeting LGBTQ+ individuals, known as homophobic/transphobic speech, has become a growing concern. This has led to a toxic and unwelcoming environment for LGBTQ+ people on online platforms. This poses a significant societal issue, hindering the progress of equality, diversity, and inclusion. The identification of homophobic and transphobic comments on social media platforms plays a crucial role in creating a safer environment for all social media users. In order to accomplish this, we built a machine learning model using SGD and SVM classifiers. Our approach yielded promising results, with a weighted F1-score of 0.95 on the English dataset, securing 4th rank in this task.
+ 2023.ltedi-1.25
+ d-etal-2023-ssntech2
+
+
+ IJS@LT-EDI : Ensemble Approaches to Detect Signs of Depression from Social Media Text
+ JayaCaporusso
+ Thi Hong HanhTran
+ SenjaPollak
+ 172–178
+ This paper presents our ensembling solutions for detecting signs of depression in social media text, as part of the Shared Task at LT-EDI@RANLP 2023. By leveraging social media posts in English, the task involves the development of a system to accurately classify them as presenting signs of depression at one of three levels: “severe”, “moderate”, and “not depressed”. We verify the hypothesis that combining contextual information from a language model with local domain-specific features can improve the classifier’s performance. We do so by evaluating: (1) two global classifiers (support vector machine and logistic regression); (2) contextual information from language models; and (3) the ensembling results.
+ 2023.ltedi-1.26
+ caporusso-etal-2023-ijs
+
+
+ VEL@LT-EDI-2023: Automatic Detection of Hope Speech in Bulgarian Language using Embedding Techniques
+ RahulPonnusamy
+ MalligaS
+ SajeethaThavareesan
+ RubaPriyadharshini
+ Bharathi RajaChakravarthi
+ 179–184
+ Many people may find motivation in their lives by spreading content on social media that is encouraging or hopeful. Creating an effective model that helps in accurately predicting the target class is a challenging task. The problem of hope speech identification is dealt with in this work using machine learning and deep learning methods. This paper presents the description of the system submitted by our team (VEL) to the Hope Speech Detection for Equality, Diversity, and Inclusion (HSD-EDI) shared task at LT-EDI-RANLP 2023 for the Bulgarian language. The main goal of this shared task is to classify the given text into the Hope speech or Non-Hope speech category. The proposed method used the H2O deep learning model with MPNet embeddings and achieved second rank for the Bulgarian language with a Macro F1 score of 0.69.
+ 2023.ltedi-1.27
+ ponnusamy-etal-2023-vel-lt
+
+
+ Cordyceps@LT-EDI: Patching Language-Specific Homophobia/Transphobia Classifiers with a Multilingual Understanding
+ DeanNinalga
+ 185–191
+ Detecting transphobia, homophobia, and various other forms of hate speech is difficult. Signals can vary depending on factors such as language, culture, geographical region, and the particular online platform. Here, we present a joint multilingual (M-L) and language-specific (L-S) approach to homophobia and transphobic hate speech detection (HSD). M-L models are needed to catch words, phrases, and concepts that are less common or missing in a particular language and subsequently overlooked by L-S models. Nonetheless, L-S models are better situated to understand the cultural and linguistic context of the users who typically write in a particular language. Here we construct a simple and successful way to merge the M-L and L-S approaches through simple weight interpolation in such a way that is interpretable and data-driven. We demonstrate our system on task A of the “Shared Task on Homophobia/Transphobia Detection in social media comments” dataset for homophobia and transphobic HSD. Our system achieves the best results in three of five languages and achieves a 0.997 macro average F1-score on Malayalam texts.
+ 2023.ltedi-1.28
+ ninalga-2023-cordyceps
+
+
+ Cordyceps@LT-EDI : Depression Detection with Reddit and Self-training
+ DeanNinalga
+ 192–197
+ Depression is debilitating, and not uncommon. Indeed, studies of excessive social media users show correlations with depression, ADHD, and other mental health concerns. Given that a large number of people use social media excessively, there is a significant population of potentially undiagnosed users and of the posts they create. In this paper, we propose a depression detection system using a semi-supervised learning technique. Namely, we use a trained model to classify a large number of unlabelled social media posts from Reddit, then use these generated labels to train a more powerful classifier. We demonstrate our framework on the Detecting Signs of Depression from Social Media Text shared task at LT-EDI@RANLP 2023, where our framework ranks 3rd overall.
+ 2023.ltedi-1.29
+ ninalga-2023-cordyceps-lt
+
+
+ TechWhiz@LT-EDI-2023: Transformer Models to Detect Levels of Depression from Social Media Text
+ MadhumithaM
+ Jerin MahibhaC
+ ThenmozhiD.
+ 198–203
+ Depression is a mental health disorder marked by persistent feelings of unhappiness and emptiness and a loss of interest in activities. It can influence different facets of one’s life, including one’s hopes, sympathy, and nature. Depression can stem from a variety of determinants, such as genetic predisposition, life events, and social circumstances. In recent years, the influence of social media on mental health has become an increasing concern. Excessive use of social media, and the negative facets that accompany it, can exacerbate or cause feelings of distress. The constant exposure to carefully curated lives, social comparison, cyberbullying, and the pressure to meet unrealistic standards can impact an individual’s pride, social connections, and overall well-being. We participated in the shared task at DepSignLT-EDI@RANLP 2023 and have proposed a model that identifies the levels of depression from social media text using the data set shared for the task. Different transformer models like ALBERT and RoBERTa are used by the proposed model for implementing the task. The macro F1 scores obtained by the ALBERT model and the RoBERTa model are 0.258 and 0.143 respectively.
+ 2023.ltedi-1.30
+ m-etal-2023-techwhiz
+
+
+ CSE_SPEECH@LT-EDI-2023: Automatic Speech Recognition: vulnerable old-aged and transgender people in Tamil
+ VarshaBalaji
+ ArchanaJp
+ BharathiB
+ 204–208
+ This paper centers on utilizing Automatic Speech Recognition (ASR) for vulnerable old-aged and transgender people in Tamil. The Amrrs/wav2vec2-large-xlsr-53-tamil model achieves a Word Error Rate (WER) of 40%. By leveraging this model, ASR technology enhances accessibility and inclusivity, helping those with speech impairments, hearing impairments, and cognitive disabilities. Further refinements are vital to reduce errors and improve the user experience. This work emphasizes the significance of ASR, particularly the Amrrs/wav2vec2-large-xlsr-53-tamil model, in facilitating effective communication and accessibility for vulnerable populations in Tamil.
+ 2023.ltedi-1.31
+ balaji-etal-2023-cse
+
+
+ VTUBGM@LT-EDI-2023: Hope Speech Identification using Layered Differential Training of ULMFit
+ Sanjana M.Kavatagi
+ Rashmi R.Rachh
+ Shankar S.Biradar
+ 209–213
+ Hope speech embodies optimistic and uplifting sentiments, aiming to inspire individuals to maintain faith in positive progress and actively contribute to a better future. In this article, we outline the model presented by our team, VTUBGM, for the shared task “Hope Speech Detection for Equality, Diversity, and Inclusion” at LT-EDI-RANLP 2023. This task entails classifying YouTube comments, which is a classification problem at the comment level. The task was conducted in four different languages: Bulgarian, English, Hindi, and Spanish. VTUBGM submitted a model developed through layered differential training of the ULMFit model. As a result, a macro F1 score of 0.48 was obtained, which ranked 3rd in the competition.
+ 2023.ltedi-1.32
+ kavatagi-etal-2023-vtubgm
+
+
+ ML&AI_IIITRanchi@LT-EDI-2023: Identification of Hope Speech of YouTube comments in Mixed Languages
+ KirtiKumari
+ Shirish ShekharJha
+ Zarikunte KunalDayanand
+ PraneeshSharma
+ 214–222
+ Hope speech analysis refers to the examination and evaluation of speeches or messages that aim to instill hope, inspire optimism, and motivate individuals or communities. It involves analyzing the content, language, rhetorical devices, and delivery techniques used in a speech to understand how it conveys hope and its potential impact on the audience. The objective of this study is to classify the given text comments as Hope Speech or Not Hope Speech. The provided dataset consists of YouTube comments in four languages: English, Hindi, Spanish, Bulgarian; with pre-defined classifications. Our approach involved pre-processing the dataset and using the TF-IDF (Term Frequency-Inverse Document Frequency) method.
+ 2023.ltedi-1.33
+ kumari-etal-2023-ml-ai-iiitranchi
+
+
+ ML&AI_IIITRanchi@LT-EDI-2023: Hybrid Model for Text Classification for Identification of Various Types of Depression
+ KirtiKumari
+ Shirish ShekharJha
+ Zarikunte KunalDayanand
+ PraneeshSharma
+ 223–232
+ DepSign–LT–EDI@RANLP–2023 is a dedicated task that addresses the crucial issue of identifying indications of depression in individuals through their social media posts, which serve as a platform for expressing their emotions and sentiments. The primary objective revolves around accurately classifying the signs of depression into three distinct categories: “not depressed,” “moderately depressed,” and “severely depressed.” Our study entailed the utilization of machine learning algorithms, coupled with a diverse range of features such as sentence embeddings, TF-IDF, and Bag-of-Words. Remarkably, the adoption of hybrid models yielded promising outcomes, culminating in a 10th rank achievement, supported by a macro F1-score of 0.408. This research underscores the effectiveness and potential of employing advanced text classification methodologies to discern and identify signs of depression within social media data. The findings hold implications for the development of mental health monitoring systems and support mechanisms, contributing to the well-being of individuals in need.
+ 2023.ltedi-1.34
+ kumari-etal-2023-ml-ai-iiitranchi-lt
+
+
+ VEL@LT-EDI: Detecting Homophobia and Transphobia in Code-Mixed Spanish Social Media Comments
+ Prasanna KumarKumaresan
+ Kishore KumarPonnusamy
+ KogilavaniS V
+ SubalalithaCn
+ RubaPriyadharshini
+ Bharathi RajaChakravarthi
+ 233–238
+ Our research aims to address the task of detecting homophobia and transphobia in social media code-mixed comments written in Spanish. Code-mixed text in social media often violates strict grammar rules and incorporates non-native scripts, posing challenges for identification. To tackle this problem, we perform pre-processing by removing unnecessary content and establishing a baseline for detecting homophobia and transphobia. Furthermore, we explore the effectiveness of various traditional machine-learning models with feature extraction and pre-trained transformer model techniques. Our best configurations achieve macro F1 scores of 0.84 on the test set and 0.82 on the development set for Spanish, demonstrating promising results in detecting instances of homophobia and transphobia in code-mixed comments.
+ 2023.ltedi-1.35
+ kumaresan-etal-2023-vel
+
+
+ TechSSN4@LT-EDI-2023: Depression Sign Detection in Social Media Postings using DistilBERT Model
+ Krupa ElizabethThannickal
+ SanmatiP
+ RajalakshmiSivanaiah
+ Angel DeborahS
+ 239–243
+ As the world population increases, more people are living to the age when depression or Major Depressive Disorder (MDD) commonly occurs. Consequently, the number of those who suffer from such disorders is rising. There is a pressing need for faster and more reliable diagnosis methods. This paper proposes a method to analyse text input from social media posts of subjects to determine the severity class of depression. We have used the DistilBERT transformer to process these texts and classify the individuals across three severity labels - ‘not depression’, ‘moderate’ and ‘severe’. The results showed a macro F1-score of 0.437 when the model was trained for 5 epochs, with comparative performance across the labels. The team acquired 6th rank, while the top team scored a macro F1-score of 0.470. We hope that this system will support further research into the early identification of depression in individuals to promote effective medical research and related treatments.
+ 2023.ltedi-1.36
+ thannickal-etal-2023-techssn4
+
+
+ The Mavericks@LT-EDI-2023: Detection of Signs of Depression from Social Media Texts using a Naive Bayes Approach
+ SathvikaV S
+ VaishnaviVaishnavi S
+ Angel DeborahS
+ RajalakshmiSivanaiah
+ MirnalineeThankaNadar
+ 244–249
+ Social media platforms have revolutionized the landscape of communication, providing individuals with an outlet to express their thoughts, emotions, and experiences openly. This paper focuses on the development of a model to determine whether individuals exhibit signs of depression based on their social media texts. With the aim of optimizing performance and accuracy, a Naive Bayes approach was chosen for the detection task. The Naive Bayes algorithm, a probabilistic classifier, was applied to extract features and classify the texts. The model leveraged linguistic patterns, sentiment analysis, and other relevant features to capture indicators of depression within the texts. Preprocessing techniques, including tokenization, stemming, and stop-word removal, were employed to enhance the quality of the input data. The performance of the Naive Bayes model was evaluated using standard metrics such as accuracy, precision, recall, and F1-score; it achieved a macro-averaged F1 score of 0.263.
+ 2023.ltedi-1.37
+ v-s-etal-2023-mavericks
+
+
+ hate-alert@LT-EDI-2023: Hope Speech Detection Using Transformer-Based Models
+ MithunDas
+ ShubhankarBarman
+ SubhadeepChatterjee
+ 250–256
+ Social media platforms have become integral to our daily lives, facilitating instant sharing of thoughts and ideas. While these platforms often host inspiring, motivational, and positive content, the research community has recognized the significance of such messages by labeling them as “hope speech”. In light of this, we delve into the detection of hope speech on social media platforms. Specifically, we explore various transformer-based model setups for the LT-EDI shared task at RANLP 2023. We observe that the performance of the models varies across languages. Overall, the finetuned m-BERT model showcases the best performance among all the models across languages. Our models secured the first position in Bulgarian and Hindi languages and achieved the third position for the Spanish language in the respective task.
+ 2023.ltedi-1.38
+ das-etal-2023-hate
+
+
+ TERCET@LT-EDI-2023: Hope Speech Detection for Equality, Diversity, and Inclusion
+ PriyadharshiniThandavamurthi
+ SamyuktaaSivakumar
+ ShwethaSureshnathan
+ ThenmozhiD.
+ BharathiB
+ GayathriGl
+ 257–261
+ Hope is a cheerful and optimistic state of mind which has its basis in the expectation of positive outcomes. Hope speech reflects the same, as it consists of positive words that can motivate and encourage a person to do better. Non-hope speech reflects the exact opposite: it is meant to ridicule or put down someone and affect the person negatively. The shared task on Hope Speech Detection for Equality, Diversity, and Inclusion at LT-EDI - RANLP 2023 was created with data sets in English, Spanish, Bulgarian and Hindi. The purpose of this task is to classify human-generated comments on the platform YouTube as Hope speech or non-Hope speech. We employed multiple traditional models such as SVM (support vector machine), Random Forest classifier, Naive Bayes and Logistic Regression. Support Vector Machine gave the highest macro average F1 score of 0.49 on the training data set and a macro average F1 score of 0.50 on the test data set.
+ 2023.ltedi-1.39
+ thandavamurthi-etal-2023-tercet
+
+
+ Interns@LT-EDI : Detecting Signs of Depression from Social Media Text
+ KoushikL
+ Hariharan R.L
+ Anand KumarM
+ 262–265
+ This submission presents our approach for depression detection in social media text. The methodology includes data collection, preprocessing - SMOTE, feature extraction/selection - TF-IDF and GloVe, model development - SVM, CNN and Bi-LSTM, training, evaluation, optimisation, and validation. The proposed methodology aims to contribute to the accurate detection of depression.
+ 2023.ltedi-1.40
+ l-etal-2023-interns
+
+
+ Tercet@LT-EDI-2023: Homophobia/Transphobia Detection in social media comments
+ ShwethaSureshnathan
+ SamyuktaaSivakumar
+ PriyadharshiniThandavamurthi
+ ThenmozhiD.
+ BharathiB
+ KiruthikaChandrasekaran
+ 266–271
+ The advent of social media platforms has revolutionized the way we interact, share, learn, express and build our views and ideas. One major challenge of social media is hate speech. Homophobia and transphobia encompass a range of negative attitudes and feelings towards people based on their sexual orientation or gender identity. Homophobia refers to the fear, hatred, or prejudice against homosexuality, while transphobia involves discrimination against transgender individuals. Natural Language Processing can be used to identify homophobic and transphobic texts and help make social media a safer place. In this paper, we explore using Support Vector Machine, Random Forest Classifier and BERT models for homophobia and transphobia detection. The best model was a combination of LaBSE and SVM that achieved a weighted F1 score of 0.95.
+ 2023.ltedi-1.41
+ sureshnathan-etal-2023-tercet
+
+
+ DeepLearningBrasil@LT-EDI-2023: Exploring Deep Learning Techniques for Detecting Depression in Social Media Text
+ EduardoGarcia
+ JulianaGomes
+ Adalberto FerreiraBarbosa Junior
+ Cardeque Henrique Bittes de AlvarengaBorges
+ Nadia Félix Felipeda Silva
+ 272–278
+ In this paper, we delineate the strategy employed by our team, DeepLearningBrasil, which secured us first place in the shared task DepSign-LT-EDI@RANLP-2023 by a margin of 2.4%. The task was to classify social media texts into three distinct levels of depression - “not depressed,” “moderately depressed,” and “severely depressed.” Leveraging the power of the RoBERTa and DeBERTa models, we further pre-trained them on a collected Reddit dataset, specifically curated from mental health-related Reddit communities (subreddits), leading to an enhanced understanding of nuanced mental health discourse. To address lengthy textual data, we introduced truncation techniques that retained the essence of the content by focusing on its beginnings and endings. Our model was made robust against unbalanced data by incorporating sample weights into the loss. Cross-validation and ensemble techniques were then employed to combine our k-fold trained models, delivering an optimal solution. The accompanying code is made available for transparency and further development.
+ 2023.ltedi-1.42
+ garcia-etal-2023-deeplearningbrasil
+
+
+ MUCS@LT-EDI2023: Learning Approaches for Hope Speech Detection in Social Media Text
+ AshaHegde
+ KavyaG
+ SharalCoelho
+ Hosahalli LakshmaiahShashirekha
+ 279–286
+ Hope plays a significant role in shaping human thoughts and actions, yet hope content has received limited attention in the realm of social media data analysis. The exploration of hope content helps to uncover valuable insights into users’ aspirations, expectations, and emotional states. By delving into the analysis of hope content on social media platforms, researchers and analysts can gain a deeper understanding of how hope influences individuals’ behaviors, decisions, and overall well-being in the digital age. However, this area is rarely explored, even for high-resource languages. To address the identification of hope text on social media platforms, this paper describes the models submitted by team MUCS to the “Hope Speech Detection for Equality, Diversity, and Inclusion (LT-EDI)” shared task organized at Recent Advances in Natural Language Processing (RANLP) - 2023. This shared task aims to classify a comment/post in English and code-mixed texts in three languages, namely, Bulgarian, Spanish, and Hindi, into one of two predefined categories, namely, “Hope speech” and “Non Hope speech”. Two models are proposed for the shared task: i) Hope_BERT, a Linear Support Vector Classifier (LinearSVC) model trained by combining Bidirectional Encoder Representations from Transformers (BERT) embeddings and Term Frequency-Inverse Document Frequency (TF-IDF) of character n-grams with word boundary (char_wb) for English, and ii) Hope_mBERT, a LinearSVC model trained by combining Multilingual BERT (mBERT) embeddings and TF-IDF of char_wb for Bulgarian, Spanish, and Hindi code-mixed texts. The proposed models obtained 1st, 1st, 2nd, and 5th ranks for Spanish, Bulgarian, Hindi, and English texts respectively.
+ 2023.ltedi-1.43
+ hegde-etal-2023-mucs-lt
+
+
+ MUCS@LT-EDI2023: Homophobic/Transphobic Content Detection in Social Media Text using mBERT
+ AshaHegde
+ KavyaG
+ SharalCoelho
+ Hosahalli LakshmaiahShashirekha
+ 287–294
+ Homophobic/Transphobic (H/T) content includes hate speech, discrimination text, and abusive comments against Gay, Lesbian, Bisexual, Transgender, Queer, and Intersex (LGBTQ) individuals. With the increase in user generated text in social media, there has been an increase in code-mixed H/T content, which poses challenges for efficient analysis and detection of H/T content on social media. The complex nature of code-mixed text necessitates the development of advanced tools and techniques to effectively tackle this issue in social media platforms. To tackle this issue, in this paper, we - team MUCS, describe the transformer based models submitted to “Homophobia/Transphobia Detection in social media comments” shared task in Language Technology for Equality, Diversity and Inclusion (LT-EDI) at Recent Advances in Natural Language Processing (RANLP)-2023. The proposed methodology makes use of resampling the training data to handle the data imbalance and this resampled data is used to fine-tune the Multilingual Bidirectional Encoder Representations from Transformers (mBERT) models. These models obtained 11th, 5th, 3rd, 3rd, and 7th ranks for English, Tamil, Malayalam, Spanish, and Hindi respectively in Task A and 8th, 2nd, and 2nd ranks for English, Tamil, and Malayalam respectively in Task B.
+ 2023.ltedi-1.44
+ hegde-etal-2023-mucs-lt-edi2023
+
+
+ MUCS@LT-EDI2023: Detecting Signs of Depression in Social Media Text
+ SharalCoelho
+ AshaHegde
+ KavyaG
+ Hosahalli LakshmaiahShashirekha
+ 295–299
+ Depression can lead to significant changes in individuals’ posts on social media, which are important to identify. Automated techniques must be created for this identification task, as manually analyzing the growing volume of social media data is time-consuming. To address the detection of signs of depression in social media posts, in this paper, we - team MUCS - describe a Transfer Learning (TL) model and Machine Learning (ML) models submitted to the “Detecting Signs of Depression from Social Media Text” shared task organised by DepSign-LT-EDI@RANLP-2023. The TL model is trained on raw text using Bidirectional Encoder Representations from Transformers (BERT), and the ML models are trained using Term Frequency-Inverse Document Frequency (TF-IDF) features separately. Among these models, the TL model performed better, with a macro averaged F1-score of 0.361, placing 20th in the shared task.
+ 2023.ltedi-1.45
+ coelho-etal-2023-mucs-lt
+
+
+ KEC_AI_NLP_DEP @ LT-EDI : Detecting Signs of Depression From Social Media Texts
+ KogilavaniShanmugavadivel
+ MalligaSubramanian
+ VasantharanK
+ PrethishGa
+ SankarS
+ SabariS
+ 300–306
+ The goal of this study is to use machine learning approaches to detect depression indications in social media articles. Data gathering, pre-processing, feature extraction, model training, and performance evaluation are all aspects of the research. The collection consists of social media messages classified into three categories: not depressed, somewhat depressed, and severely depressed. The study contributes to the growing field of social media data-driven mental health analysis by stressing the use of feature extraction algorithms for obtaining relevant information from text data. The use of social media communications to detect depression has the potential to increase early intervention and help for people at risk. Several feature extraction approaches, such as TF-IDF, Count Vectorizer, and Hashing Vectorizer, are used to quantitatively represent textual data. These features are used to train and evaluate a wide range of machine learning models, including Logistic Regression, Random Forest, Decision Tree, Gaussian Naive Bayes, and Multinomial Naive Bayes. To assess the performance of the models, metrics such as accuracy, precision, recall, F1 score, and the confusion matrix are utilized. The Random Forest model with Count Vectorizer had the greatest accuracy on the development dataset, coming in at 92.99 percent. With a macro F1-score of 0.362, we placed 19th in the shared task. The findings show that machine learning is effective in detecting depression markers in social media articles.
+ 2023.ltedi-1.46
+ shanmugavadivel-etal-2023-kec-ai-nlp
+
+
+ Flamingos_python@LT-EDI-2023: An Ensemble Model to Detect Severity of Depression
+ AbiramiP S
+ AmrithaS
+ PavithraMeganathan
+ Jerin MahibhaC
+ 307–311
+ The prevalence of depression is increasing globally, and there is a need for effective screening and detection tools. Social media platforms offer a rich source of data for mental health research. The paper aims to detect the signs of depression of a person from their social media postings, wherein people share their feelings and emotions. The task is to create a system that, given social media posts in English, should classify the level of depression as ‘not depressed’, ‘moderately depressed’ or ‘severely depressed’. The paper presents the solution for the Shared Task on Detecting Signs of Depression from Social Media Text at LT-EDI@RANLP 2023. The proposed system aims to develop a machine learning model using algorithms such as SVM, Random Forest and Naive Bayes to detect signs of depression from social media text. The model is trained on a dataset of social media posts to detect the level of depression of the individuals as ‘not depressed’, ‘moderately depressed’ or ‘severely depressed’. The dataset is pre-processed to remove duplicates and irrelevant features, and then feature engineering techniques are used to extract meaningful features from the text data. The model is trained on these features to classify the text into the three categories. The performance of the model is evaluated using metrics such as accuracy, precision, recall, and F1-score. The ensemble model combining these algorithms gives an accuracy of 90.2% and an F1 score of 0.90. The results of the proposed approach could potentially aid in the early detection and prevention of depression for individuals who may be at risk.
+ 2023.ltedi-1.47
+ p-s-etal-2023-flamingos
+
+
+
diff --git a/data/xml/2023.nlp4tia.xml b/data/xml/2023.nlp4tia.xml
new file mode 100644
index 0000000000..9cfdc5c20a
--- /dev/null
+++ b/data/xml/2023.nlp4tia.xml
@@ -0,0 +1,106 @@
+
+
+
+
+ Proceedings of the First Workshop on NLP Tools and Resources for Translation and Interpreting Applications
+ Raquel LázaroGutiérrez
+ AntonioPareja
+ RuslanMitkov
+ INCOMA Ltd., Shoumen, Bulgaria
+ Varna, Bulgaria
+ September
+ 2023
+ 2023.nlp4tia-1
+ nlp4tia
+ ws
+
+
+ 2023.nlp4tia-1.0
+ nlp4tia-2023-nlp
+
+
+ Natural Language Processing tools and resources for translation and interpreting applications. Introduction
+ RaquelLazaro Gutierrez
+ 1–3
+
+ 2023.nlp4tia-1.1
+ lazaro-gutierrez-2023-natural
+
+
+ Machine translation, translation errors, and adequacy: Spanish-English vs. Spanish-Romanian
+ LauraMonguilod
+ BiancaVitalaru
+ 4–12
+ This paper has two objectives: 1. To analyse the adequacy of using neural machine translation (NMT) for the translation of health information (from Spanish into English and Romanian) used in Spanish public health campaigns; and 2. To compare results across these two language combinations. Results show that post-editing is essential to improve the quality of the translations for both language combinations, since they cannot be used as a primary resource for informing foreign users without post-editing. Moreover, Romanian translations require more post-editing. However, NMT for informative texts combined with human post-editing can be a strategy to benefit from the potential of MT while ensuring the quality of public service translations, depending on the language combination and the amount of time allotted for the task.
+ 2023.nlp4tia-1.2
+ monguilod-vitalaru-2023-machine
+
+
+ Cross-Lingual Idiom Sense Clustering in German and English
+ MohammedAbsar
+ 13–19
+ Idioms are expressions with non-literal and non-compositional meanings. For this reason, they pose a unique challenge for various NLP tasks including Machine Translation and Sentiment Analysis. In this paper, we propose an approach to clustering idioms in different languages by their sense. We leverage pre-trained cross-lingual transformer models and fine-tune them to produce cross-lingual vector representations of idioms according to their sense.
+ 2023.nlp4tia-1.3
+ absar-2023-cross
+
+
+ Performance Evaluation on Human-Machine Teaming Augmented Machine Translation Enabled by GPT-4
+ MingQian
+ 20–31
+ Translation has been modeled as a multiple-phase process in which pre-editing analyses guide meaning transfer and interlingual restructuring. Present-day machine translation (MT) tools provide no means for source text analyses. Generative AI with Large language modeling (LLM), equipped with prompt engineering and fine-tuning capabilities, can enable augmented MT solutions by explicitly including AI- or human-generated analyses/instruction, and/or human-generated reference translation, as pre-editing or interactive inputs. Using an English-to-Chinese translation piece that had been carefully studied during a translator slam event, four types of translation outputs on 20 text segments were evaluated: human-generated translation, Google Translate MT, instruction-augmented MT using GPT4-LLM, and Human-Machine-Teaming (HMT)-augmented translation based on both human reference translation and instruction using GPT4-LLM. While human translation had the best performance, both augmented MT approaches performed better than un-augmented MT. The HMT-augmented MT performed better than instruction-augmented MT because it combined the guidance and knowledge provided by both human reference translation and style instruction. However, since it is unrealistic to generate sentence-by-sentence human translation as MT input, better approaches to HMT-augmented MT need to be invented. The evaluation showed that generative AI with LLM can enable new MT workflows facilitating pre-editing analyses and interactive restructuring and achieving better performance.
+ 2023.nlp4tia-1.4
+ qian-2023-performance
+
+
+ The Interpretation System of African Languages in the Senegalese Parliament Debates
+ Jean ChristopheFaye
+ 32–38
+ The present work deals with the interpretation system for local languages in the Senegalese parliament. In other words, it is devoted to the implementation of the simultaneous interpretation system in the Senegalese parliament debates. The Senegalese parliament, in cooperation with the European Parliament and the European Union, implemented, some years ago, a system of interpretation devoted to translating (into) six local languages. But what does the interpretation system consist of? What motivates the choice of six local languages and not more or fewer than six? Why does the Senegalese parliament implement such a system in a country whose official language is French? What are the linguistic consequences of this interpretation system on the local and foreign languages spoken in the Senegalese parliament? How are interpreters recruited? To answer these questions, we have explored the documents and writings related to the implementation of the simultaneous interpretation system in the Senegalese parliament in particular, and of the interpretation system in general. Field surveys as well as interviews with some deputies, some interpreters and other people from the administration have also been organized and analyzed in this study. This research has enabled us to gather substantial information and collect data for the corpus. After the data collection, we moved on to data analysis and arrived at the results presented in the body of the text.
+ 2023.nlp4tia-1.5
+ faye-2023-interpretation
+
+
+ Ngambay-French Neural Machine Translation (sba-Fr)
+ Toadoum SariSakayo
+ AngelaFan
+ Lema LogamouSeknewna
+ 39–47
+ In Africa, and the world at large, there is an increasing focus on developing Neural Machine Translation (NMT) systems to overcome language barriers. NMT for low-resource languages is particularly compelling, as it involves learning with limited labelled data. However, obtaining a well-aligned parallel corpus for low-resource languages can be challenging. The disparity between the technological advancement of a few global languages and the lack of research on NMT for local languages in Chad is striking. End-to-end NMT trials on low-resource Chad languages have not been attempted. Additionally, there is a dearth of online, well-structured data gathered for Natural Language Processing research, unlike for some African languages. However, a guided approach to data gathering can produce bitext data for many Chadian language translation pairs with well-known languages that have ample data. In this project, we created the first sba-Fr dataset, a corpus of Ngambay-to-French translations, and fine-tuned three pre-trained models using this dataset. Our experiments show that the M2M100 model outperforms other models with high BLEU scores on both original and original+synthetic data. The publicly available bitext dataset can be used for research purposes.
+ 2023.nlp4tia-1.6
+ sakayo-etal-2023-ngambay
+
+
+ Machine Translation of literary texts: genres, times and systems
+ Ana IsabelCespedosa Vázquez
+ RuslanMitkov
+ 48–53
+ Machine Translation (MT) has taken off dramatically in recent years due to the advent of Deep Learning methods and Neural Machine Translation (NMT) has enhanced the quality of automatic translation significantly. While most work has covered the automatic translation of technical, legal and medical texts, the application of MT to literary texts and the human role in this process have been underexplored. In an effort to bridge the gap of this under-researched area, this paper presents the results of a study which seeks to evaluate the performance of three MT systems applied to two different literary genres, two novels (1984 by George Orwell and Pride and Prejudice by Jane Austen) and two poems (I Felt a Funeral in my Brain by Emily Dickinson and Siren Song by Margaret Atwood) representing different literary periods and timelines. The evaluation was conducted by way of the automatic evaluation metric BLEU to objectively assess the performance that the MT system shows on each genre. The limitations of this study are also outlined.
+ 2023.nlp4tia-1.7
+ cespedosa-vazquez-mitkov-2023-machine
+
+
+ sTMS Cloud – A Boutique Translation Project Management System
+ NenadAngelov
+ 54–56
+ Demonstration of a Cloud-based Translation Project Management System, called sTMS, developed with the financial support of Operational Programme “Innovation and Competitiveness” 2014-2020 (OPIC), focusing on enhancing the operational activities of LSPs and MLPs. The idea behind it was to concentrate mainly on the management processes and not to integrate CAT or MT tools, because we believe that the more functional such systems become, the harder they are to support technically and to operate. The key features sTMS provides were developed as a result of the broad experience of Project Managers, the increased requirements of our customers, the digital capabilities of our vendors and, lastly, the need to meet the constantly changing environment of the translation industry.
+ 2023.nlp4tia-1.8
+ angelov-2023-stms
+
+
+ Leveraging Large Language Models to Extract Terminology
+ JulieGiguere
+ 57–60
+ Large Language Models (LLMs) have brought us efficient tools for various natural language processing (NLP) tasks. This paper explores the application of LLMs for extracting domain-specific terms from textual data. We will present the advantages and limitations of using LLMs for this task and will highlight the significant improvements they offer over traditional terminology extraction methods such as rule-based and statistical approaches.
+ 2023.nlp4tia-1.9
+ giguere-2023-leveraging
+
+
+ ChatGPT for translators: a survey
+ ConstantinOrăsan
+ 61–63
+ This article surveys the most important ways in which translators can use ChatGPT. The focus is on scenarios where ChatGPT supports the work of translators, rather than tries to replace them. A discussion of issues that translators need to consider when using large language models, and ChatGPT in particular, is also provided.
+ 2023.nlp4tia-1.10
+ orasan-2023-chatgpt
+
+
+
diff --git a/data/xml/2023.ranlp.xml b/data/xml/2023.ranlp.xml
new file mode 100644
index 0000000000..2b91af74c8
--- /dev/null
+++ b/data/xml/2023.ranlp.xml
@@ -0,0 +1,1560 @@
+
+
+
+
+ Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing
+ RuslanMitkov
+ GaliaAngelova
+ INCOMA Ltd., Shoumen, Bulgaria
+ Varna, Bulgaria
+ September
+ 2023
+ 2023.ranlp-1
+ ranlp
+
+
+ 2023.ranlp-1.0
+ ranlp-2023-international
+
+
+ Bipol: Multi-Axes Evaluation of Bias with Explainability in Benchmark Datasets
+ TosinAdewumi
+ IsabellaSödergren
+ LamaAlkhaled
+ SanaAl-azzawi
+ FoteiniSimistira Liwicki
+ MarcusLiwicki
+ 1–10
+ We investigate five English NLP benchmark datasets (on the SuperGLUE leaderboard) and two Swedish datasets for bias, along multiple axes. The datasets are the following: Boolean Question (Boolq), CommitmentBank (CB), Winograd Schema Challenge (WSC), Winogender diagnostic (AXg), Recognising Textual Entailment (RTE), Swedish CB, and SWEDN. Bias can be harmful, and it is known to be common in the data from which ML models learn. In order to mitigate bias in data, it is crucial to be able to estimate it objectively. We use bipol, a novel multi-axes bias metric with explainability, to estimate and explain how much bias exists in these datasets. Multilingual, multi-axes bias evaluation is not very common. Hence, we also contribute a new, large Swedish bias-labelled dataset (of 2 million samples), translated from the English version, and train the SotA mT5 model on it. In addition, we contribute new multi-axes lexica for bias detection in Swedish. We make the codes, model, and new dataset publicly available.
+ 2023.ranlp-1.1
+ adewumi-etal-2023-bipol
+
+
+ Automatically Generating Hindi Wikipedia Pages Using Wikidata as a Knowledge Graph: A Domain-Specific Template Sentences Approach
+ AdityaAgarwal
+ RadhikaMamidi
+ 11–21
+ This paper presents a method for automatically generating Wikipedia articles in the Hindi language, using Wikidata as a knowledge base. Our method extracts structured information from Wikidata, such as the names of entities, their properties, and their relationships, and then uses this information to generate natural language text that conforms to a set of templates designed for the domain of interest. We evaluate our method by generating articles about scientists, and we compare the resulting articles to machine-translated articles. Our results show that more than 70% of the generated articles using our method are better in terms of coherence, structure, and readability. Our approach has the potential to significantly reduce the time and effort required to create Wikipedia articles in Hindi and could be extended to other languages and domains as well.
+ 2023.ranlp-1.2
+ agarwal-mamidi-2023-automatically
+
+
+ Cross-lingual Classification of Crisis-related Tweets Using Machine Translation
+ ShareefaAl Amer
+ MarkLee
+ PhillipSmith
+ 22–31
+ Utilisation of multilingual language models such as mBERT and XLM-RoBERTa has increasingly gained attention in recent work by exploiting the multilingualism of such models in different downstream tasks across different languages. However, performance degradation is expected in transfer learning across languages compared to monolingual performance although it is an acceptable trade-off considering the sparsity of resources and lack of available training data in low-resource languages. In this work, we study the effect of machine translation on the cross-lingual transfer learning in a crisis event classification task. Our experiments include measuring the effect of machine-translating the target data into the source language and vice versa. We evaluated and compared the performance in terms of accuracy and F1-Score. The results show that translating the source data into the target language improves the prediction accuracy by 14.8% and the Weighted Average F1-Score by 19.2% when compared to zero-shot transfer to an unseen language.
+ 2023.ranlp-1.3
+ al-amer-etal-2023-cross
+
+
+ Lexicon-Driven Automatic Sentence Generation for the Skills Section in a Job Posting
+ VeraAleksic
+ MonaBrems
+ AnnaMathes
+ TheresaBertele
+ 32–40
+ This paper presents a sentence generation pipeline as implemented on the online job board Stepstone. The goal is to automatically create a set of sentences for the candidate profile and the task description sections in a job ad, related to a given input skill. They must cover two different “tone of voice” variants in German (Du, Sie), three experience levels (junior, mid, senior), and two optionality values (skill is mandatory or optional/nice to have). The generation process considers the difference between soft skills, natural language competencies and hard skills, as well as more specific sub-categories such as IT skills, programming languages and similar. To create grammatically consistent text, morphosyntactic features from the proprietary skill ontology and lexicon are consulted. The approach is a lexicon-driven generation process that compares all lexical features of the new input skills with the ones already added to the sentence database and creates new sentences according to the corresponding templates.
+ 2023.ranlp-1.4
+ aleksic-etal-2023-lexicon
+
+
+ Multilingual Racial Hate Speech Detection Using Transfer Learning
+ Abinew AliAyele
+ SkadiDinter
+ Seid MuhieYimam
+ ChrisBiemann
+ 41–48
+ The rise of social media eases the spread of hateful content, especially racist content with severe consequences. In this paper, we analyze the tweets targeting the death of George Floyd in May 2020 as the event accelerated debates on racism globally. We focus on the tweets published in French for a period of one month since the death of Floyd. Using the Yandex Toloka platform, we annotate the tweets into categories as hate, offensive or normal. Tweets that are offensive or hateful are further annotated as racial or non-racial. We build French hate speech detection models based on the multilingual BERT and CamemBERT and apply transfer learning by fine-tuning the HateXplain model. We compare different approaches to resolve annotation ties and find that the detection model based on CamemBERT yields the best results in our experiments.
+ 2023.ranlp-1.5
+ ayele-etal-2023-multilingual
+
+
+ Exploring Amharic Hate Speech Data Collection and Classification Approaches
+ Abinew AliAyele
+ Seid MuhieYimam
+ Tadesse DestawBelay
+ TesfaAsfaw
+ ChrisBiemann
+ 49–59
+ In this paper, we present a study of efficient data selection and annotation strategies for Amharic hate speech. We also build various classification models and investigate the challenges of hate speech data selection, annotation, and classification for the Amharic language. From a total of over 18 million tweets in our Twitter corpus, 15.1k tweets are annotated by two independent native speakers, and a Cohen’s kappa score of 0.48 is achieved. A third annotator, a curator, is also employed to decide on the final gold labels. We employ both classical machine learning and deep learning approaches, which include fine-tuning AmFLAIR and AmRoBERTa contextual embedding models. Among all the models, AmFLAIR achieves the best performance with an F1-score of 72%. We publicly release the annotation guidelines, keywords/lexicon entries, datasets, models, and associated scripts with a permissive license.
+ 2023.ranlp-1.6
+ ayele-etal-2023-exploring
+
+
+ Bhojpuri WordNet: Problems in Translating Hindi Synsets into Bhojpuri
+ ImranAli
+ PraveenGatla
+ 60–68
+ Today, artificial intelligence systems are incredibly intelligent; however, they lack the human-like capacity for understanding. In this context, sense-based lexical resources become a requirement for artificially intelligent machines. Lexical resources like wordnets have received scholarly attention because they are considered crucial sense-based resources in the field of natural language understanding. They can help in knowing the intended meaning of the communicated texts, as they are focused on the concept rather than the words. Wordnets are available for only 18 Indian languages. Keeping this in mind, we have initiated the development of a comprehensive wordnet for Bhojpuri. The present paper describes the creation of the synsets of Bhojpuri and discusses the problems that we faced while translating Hindi synsets into Bhojpuri: lexical anomalies, lexical mismatch words, synthesized forms, lack of technical words, etc. Nearly 4000 Hindi synsets were mapped to their equivalent synsets in Bhojpuri following the expansion approach. We have also worked on the language-specific synsets, which are unique to Bhojpuri. This resource is useful in machine translation, sentiment analysis, word sense disambiguation, cross-lingual references among Indian languages, and Bhojpuri language teaching and learning.
+ 2023.ranlp-1.7
+ ali-gatla-2023-bhojpuri
+
+
+ 3D-EX: A Unified Dataset of Definitions and Dictionary Examples
+ FatemahAlmeman
+ HadiSheikhi
+ LuisEspinosa Anke
+ 69–79
+ Definitions are a fundamental building block in lexicography, linguistics and computational semantics. In NLP, they have been used for retrofitting word embeddings or augmenting contextual representations in language models. However, lexical resources containing definitions exhibit a wide range of properties, which has implications in the behaviour of models trained and evaluated on them. In this paper, we introduce 3D-EX, a dataset that aims to fill this gap by combining well-known English resources into one centralized knowledge repository in the form of <term, definition, example> triples. 3D-EX is a unified evaluation framework with carefully pre-computed train/validation/test splits to prevent memorization. We report experimental results that suggest that this dataset could be effectively leveraged in downstream NLP tasks. Code and data are available at https://github.com/F-Almeman/3D-EX.
+ 2023.ranlp-1.8
+ almeman-etal-2023-3d
+
+
+ Are You Not moved? Incorporating Sensorimotor Knowledge to Improve Metaphor Detection
+ GhadiAlnafesah
+ PhillipSmith
+ MarkLee
+ 80–89
+ Metaphors use words from one domain of knowledge to describe another, which can make the meaning less clear and require human interpretation to understand. This makes it difficult for automated models to detect metaphorical usage. The objective of the experiments in the paper is to enhance the ability of deep learning models to detect metaphors automatically. This is achieved by using two elements of semantic richness, sensory experience and body-object interaction, as the main lexical features, combined with the contextual information present in the metaphorical sentences. The tests were conducted using classification and sequence labelling models for metaphor detection on the three metaphorical corpora VUAMC, MOH-X, and TroFi. The sensory experience led to significant improvements in the classification and sequence labelling models across all datasets. The highest gains were seen on the VUAMC dataset: recall increased by 20.9% and F1 by 7.5% for the classification model, and recall increased by 11.66% and F1 by 3.69% for the sequence labelling model. Body-object interaction also showed a positive impact on the three datasets.
+ 2023.ranlp-1.9
+ alnafesah-etal-2023-moved
+
+
+ HAQA and QUQA: Constructing Two Arabic Question-Answering Corpora for the Quran and Hadith
+ SarahAlnefaie
+ EricAtwell
+ Mohammad AmmarAlsalka
+ 90–97
+ It is neither possible nor fair to compare the performance of question-answering systems for the Holy Quran and Hadith Sharif in Arabic due to both the absence of a golden test dataset on the Hadith Sharif and the small size and easy questions of the newly created golden test dataset on the Holy Quran. This article presents two question–answer datasets: Hadith Question–Answer pairs (HAQA) and Quran Question–Answer pairs (QUQA). HAQA is the first Arabic Hadith question–answer dataset available to the research community, while the QUQA dataset is regarded as the more challenging and the most extensive collection of Arabic question–answer pairs on the Quran. HAQA was designed and its data collected from several expert sources, while QUQA went through several steps in the construction phase; that is, it was designed and then integrated with existing datasets in different formats, after which the datasets were enlarged with the addition of new data from books by experts. The HAQA corpus consists of 1598 question–answer pairs, and that of QUQA contains 3382. They may be useful as gold-standard datasets for the evaluation process, as training datasets for language models with question-answering tasks and for other uses in artificial intelligence.
+ 2023.ranlp-1.10
+ alnefaie-etal-2023-haqa
+
+
+ ConfliBERT-Arabic: A Pre-trained Arabic Language Model for Politics, Conflicts and Violence
+ SultanAlsarra
+ LuayAbdeljaber
+ WooseongYang
+ NiamatZawad
+ LatifurKhan
+ PatrickBrandt
+ JavierOsorio
+ VitoD’Orazio
+ 98–108
+ This study investigates the use of Natural Language Processing (NLP) methods to analyze politics, conflicts and violence in the Middle East using domain-specific pre-trained language models. We introduce ConfliBERT-Arabic, a pre-trained language model that can efficiently analyze political, conflict- and violence-related Arabic texts. Our technique hones a pre-trained model using a corpus of Arabic texts about regional politics and conflicts. The performance of our models is compared to baseline BERT models. Our findings show that the performance of NLP models for Middle Eastern politics and conflict analysis is enhanced by the use of domain-specific pre-trained local language models. This study offers political and conflict analysts, including policymakers, scholars, and practitioners, new approaches and tools for deciphering the intricate dynamics of local politics and conflicts directly in Arabic.
+ 2023.ranlp-1.11
+ alsarra-etal-2023-conflibert
+
+
+ A Review in Knowledge Extraction from Knowledge Bases
+ FabioYanez
+ AndrésMontoyo
+ YoanGutierrez
+ RafaelMuñoz
+ ArmandoSuarez
+ 109–116
+ Generative language models achieve the state of the art in many tasks within natural language processing (NLP). Although these models correctly capture syntactic information, they fail to interpret knowledge (semantics). Moreover, the lack of interpretability of these models promotes the use of other technologies as a replacement or complement to generative language models. This is the case with research focused on incorporating knowledge by resorting to knowledge bases mainly in the form of graphs. The generation of large knowledge graphs is carried out with unsupervised or semi-supervised techniques, which promotes the validation of this knowledge with the same type of techniques due to the size of the generated databases. In this review, we will explain the different techniques used to test and infer knowledge from graph structures with machine learning algorithms. The motivation of validating and inferring knowledge is to use correct knowledge in subsequent tasks with improved embeddings.
+ 2023.ranlp-1.12
+ yanez-etal-2023-review
+
+
+ Evaluating of Large Language Models in Relationship Extraction from Unstructured Data: Empirical Study from Holocaust Testimonies
+ IsuriAnuradha
+ Le AnHa
+ RuslanMitkov
+ VinitaNahar
+ 117–123
+ Relationship extraction from unstructured data remains one of the most challenging tasks in the field of Natural Language Processing (NLP). The complexity of relationship extraction arises from the need to comprehend the underlying semantics, syntactic structures, and contextual dependencies within the text. Unstructured data poses challenges with diverse linguistic patterns, implicit relationships, and contextual nuances, complicating accurate relationship identification and extraction. The emergence of Large Language Models (LLMs), such as GPT (Generative Pre-trained Transformer), has indeed marked a significant advancement in the field of NLP. In this work, we assess and evaluate the effectiveness of LLMs for relationship extraction from Holocaust testimonies within the historical domain. By delving into this domain-specific context, we aim to gain deeper insights into the performance and capabilities of LLMs in accurately capturing and extracting relationships within the Holocaust domain, developing a novel knowledge graph to visualise the relationships of the Holocaust. To the best of our knowledge, there is no existing study which discusses relationship extraction in Holocaust testimonies. The majority of current approaches for Information Extraction (IE) in historic documents are either manual or OCR-based. Moreover, in this study, we found that Subject-Object-Verb extraction using GPT3-based relations produced more meaningful results compared to Semantic Role Labelling-based triple extraction.
+ 2023.ranlp-1.13
+ anuradha-etal-2023-evaluating
+
+
+ Impact of Emojis on Automatic Analysis of Individual Emotion Categories
+ RatchakritArreerard
+ ScottPiao
+ 124–131
+ Automatic emotion analysis is a highly challenging task for Natural Language Processing, which has so far mainly relied on textual contents to determine the emotion of text. However, words are not the only media that carry emotional information. In social media, people also use emojis to convey their feelings. Recently, researchers have studied emotional aspects of emojis, and use emoji information to improve the emotion detection and classification, but many issues remain to be addressed. In this study, we examine the impact of emoji embedding on emotion classification and intensity prediction on four individual emotion categories, including anger, fear, joy, and sadness, in order to investigate how emojis affect the automatic analysis of individual emotion categories and intensity. We conducted a comparative study by testing five machine learning models with and without emoji embeddings involved. Our experiment demonstrates that emojis have varying impact on different emotion categories, and there is potential that emojis can be used to enhance emotion information processing.
+ 2023.ranlp-1.14
+ arreerard-piao-2023-impact
+
+
+ Was That a Question? Automatic Classification of Discourse Meaning in Spanish
+ SantiagoArróniz
+ SandraKübler
+ 132–142
+ This paper examines the effectiveness of different feature representations of audio data in accurately classifying discourse meaning in Spanish. The task involves determining whether an utterance is a declarative sentence, an interrogative, an imperative, etc. We explore how pitch contour can be represented for a discourse-meaning classification task, employing three different audio features: MFCCs, Mel-scale spectrograms, and chromagrams. We also determine if utilizing means is more effective in representing the speech signal, given the large number of coefficients produced during the feature extraction process. Finally, we evaluate whether these feature representation techniques are sensitive to speaker information. Our results show that a recurrent neural network architecture in conjunction with all three feature sets yields the best results for the task.
+ 2023.ranlp-1.15
+ arroniz-kubler-2023-question
+
+
+ Designing the LECOR Learner Corpus for Romanian
+ Ana MariaBarbu
+ ElenaIrimia
+ Carmen MîrzeaVasile
+ VasilePăiș
+ 143–152
+ This article presents a work-in-progress project, which aims to build and utilize a corpus of Romanian texts written or spoken by non-native students of different nationalities, who learn Romanian as a foreign language in the one-year, intensive academic program organized by the University of Bucharest. This corpus, called LECOR – Learner Corpus for Romanian – is made up of pairs of texts: the student’s version and the teacher’s corrected one. Each version is automatically annotated with lemma and POS-tag; the two versions are then compared, and the differences are marked as errors at this stage. The corpus also contains sets of metadata files about students and their samples. In this article, the conceptual framework for building and utilizing the corpus is presented, including the acquisition and organization phases of the primary material, the annotation process, and the first attempts to adapt the NoSketch Engine query interface to the project’s objectives. The article concludes by outlining the next steps in the development of the corpus, aimed at quantitative growth, refinement of the error correction process, and complex error annotation.
+ 2023.ranlp-1.16
+ barbu-etal-2023-designing
+
+
+ Non-Parametric Memory Guidance for Multi-Document Summarization
+ FlorianBaud
+ AlexAussem
+ 153–158
+ Multi-document summarization (MDS) is a difficult task in Natural Language Processing, aiming to summarize information from several documents. However, the source documents are often insufficient to obtain a qualitative summary. We propose a retriever-guided model combined with non-parametric memory for summary generation. This model retrieves relevant candidates from a database and then generates the summary considering the candidates with a copy mechanism and the source documents. The retriever is implemented with Approximate Nearest Neighbor Search (ANN) to search large databases. Our method is evaluated on the MultiXScience dataset which includes scientific articles. Finally, we discuss our results and possible directions for future work.
+ 2023.ranlp-1.17
+ baud-aussem-2023-non
+
+
+ Beyond Information: Is ChatGPT Empathetic Enough?
+ AhmedBelkhir
+ FatihaSadat
+ 159–169
+ This paper aims to explore and enhance ChatGPT’s abilities to generate more human-like conversations by taking into account the emotional state of the user. To achieve this goal, prompt-driven emotional intelligence is applied using the empathetic dialogue dataset in order to propose a more empathetic conversational language model. We propose two altered versions of ChatGPT as follows: (1) an emotion-infused version which takes the user’s emotion as input before generating responses, using an emotion classifier based on ELECTRA; and (2) an emotion-adapting version that tries to accommodate how the user feels without any external component. By analyzing the responses of the two proposed altered versions and comparing them to the standard version of ChatGPT, we find that using the external emotion classifier leads to more frequent and pronounced use of positive emotions compared to the standard version. On the other hand, using simple prompt engineering to take the user’s emotion into consideration does the opposite. Finally, comparisons with state-of-the-art models highlight the potential of prompt engineering to enhance the emotional abilities of chatbots based on large language models.
+ 2023.ranlp-1.18
+ belkhir-sadat-2023-beyond
+
+
+ Using Wikidata for Enhancing Compositionality in Pretrained Language Models
+ MeriemBeloucif
+ MihirBansal
+ ChrisBiemann
+ 170–178
+ One of the many advantages of pre-trained language models (PLMs) such as BERT and RoBERTa is their flexibility and contextual nature. These features give PLMs strong capabilities for representing lexical semantics. However, PLMs seem incapable of capturing high-level semantics in terms of compositionality. We show that when augmented with the relevant semantic knowledge, PLMs learn to capture a higher degree of lexical compositionality. We annotate a large dataset from Wikidata highlighting a type of semantic inference that is easy for humans to understand but difficult for PLMs, like the correlation between age and date of birth. We use this resource for fine-tuning DistilBERT, BERT large and RoBERTa. Our results show that the performance of PLMs against the test data continuously improves when augmented with such a rich resource. Our results are corroborated by a consistent improvement over most GLUE benchmark natural language understanding tasks.
+ 2023.ranlp-1.19
+ beloucif-etal-2023-using
+
+
+ Multimodal Learning for Accurate Visual Question Answering: An Attention-Based Approach
+ JishnuBhardwaj
+ AnuragBalakrishnan
+ SatyamPathak
+ IshanUnnarkar
+ AniruddhaGawande
+ BenyaminAhmadnia
+ 179–186
+ This paper proposes an open-ended task for Visual Question Answering (VQA) that leverages the InceptionV3 Object Detection model and an attention-based Long Short-Term Memory (LSTM) network for question answering. Our proposed model provides accurate natural language answers to questions about an image, including those that require understanding contextual information and background details. Our findings demonstrate that the proposed approach can achieve high accuracy, even with complex and varied visual information. The proposed method can contribute to developing more advanced vision systems that can process and interpret visual information like humans.
+ 2023.ranlp-1.20
+ bhardwaj-etal-2023-multimodal
+
+
+ Generative Models For Indic Languages: Evaluating Content Generation Capabilities
+ SavitaBhat
+ VasudevaVarma
+ NiranjanPedanekar
+ 187–195
+ Large language models (LLMs) and generative AI have emerged as the most important areas in the field of natural language processing (NLP). LLMs are considered to be a key component in several NLP tasks, such as summarization, question-answering, sentiment classification, and translation. Newer LLMs, such as ChatGPT, BLOOMZ, and several such variants, are known to train on multilingual training data and hence are expected to process and generate text in multiple languages. Considering the widespread use of LLMs, evaluating their efficacy in multilingual settings is imperative. In this work, we evaluate the newest generative models (ChatGPT, mT0, and BLOOMZ) in the context of Indic languages. Specifically, we consider natural language generation (NLG) applications such as summarization and question-answering in monolingual and cross-lingual settings. We observe that current generative models have limited capability for generating text in Indic languages in a zero-shot setting. In contrast, generative models perform consistently better on manual quality-based evaluation in both Indic languages and English language generation. Considering this limited generation performance, we argue that these LLMs are not suited to zero-shot use in downstream applications.
+ 2023.ranlp-1.21
+ bhat-etal-2023-generative
+
+
+ Measuring Spurious Correlation in Classification: “Clever Hans” in Translationese
+ AnganaBorah
+ DariaPylypenko
+ CristinaEspaña-Bonet
+ Josefvan Genabith
+ 196–206
+ Recent work has shown evidence of “Clever Hans” behavior in high-performance neural translationese classifiers, where BERT-based classifiers capitalize on spurious correlations, in particular topic information, between data and target classification labels, rather than genuine translationese signals. Translationese signals are subtle (especially for professional translation) and compete with many other signals in the data such as genre, style, author, and, in particular, topic. This raises the general question of how much of the performance of a classifier is really due to spurious correlations in the data versus the signals actually targeted by the classifier, especially for subtle target signals and in challenging (low-resource) data settings. We focus on topic-based spurious correlation and approach the question from two directions: (i) where we have no knowledge about spurious topic information and its distribution in the data, (ii) where we have some indication about the nature of spurious topic correlations. For (i) we develop a measure from first principles capturing alignment of unsupervised topics with target classification labels as an indication of spurious topic information in the data. We show that our measure is the same as purity in clustering and propose a “topic floor” (as in a “noise floor”) for classification. For (ii) we investigate masking of known spurious topic carriers in classification. Both (i) and (ii) contribute to quantifying and (ii) to mitigating spurious correlations.
+ 2023.ranlp-1.22
+ borah-etal-2023-measuring
+
+
+ WIKITIDE: A Wikipedia-Based Timestamped Definition Pairs Dataset
+ HsuvasBorkakoty
+ LuisEspinosa Anke
+ 207–216
+ A fundamental challenge in the current NLP context, dominated by language models, comes from the inflexibility of current architectures to “learn” new information. While model-centric solutions like continual learning or parameter-efficient fine-tuning are available, the question still remains of how to reliably identify changes in language or in the world. In this paper, we propose WikiTiDe, a dataset derived from pairs of timestamped definitions extracted from Wikipedia. We argue that such resources can be helpful for accelerating diachronic NLP, specifically, for training models able to scan knowledge resources for core updates concerning a concept, an event, or a named entity. Our proposed end-to-end method is fully automatic and leverages a bootstrapping algorithm for gradually creating a high-quality dataset. Our results suggest that bootstrapping the seed version of WikiTiDe leads to better fine-tuned models. We also leverage fine-tuned models in a number of downstream tasks, showing promising results with respect to competitive baselines.
+ 2023.ranlp-1.23
+ borkakoty-espinosa-anke-2023-wikitide
+
+
+ BERTabaporu: Assessing a Genre-Specific Language Model for Portuguese NLP
+ Pablo BottonCosta
+ Matheus CamasmiePavan
+ Wesley RamosSantos
+ Samuel CaetanoSilva
+ IvandréParaboni
+ 217–223
+ Transformer-based language models such as Bidirectional Encoder Representations from Transformers (BERT) are now mainstream in the NLP field, but extensions to languages other than English, to new domains and/or to more specific text genres are still in demand. In this paper we introduce BERTabaporu, a BERT language model that has been pre-trained on Twitter data in the Brazilian Portuguese language. The model is shown to outperform the best-known general-purpose model for this language in three Twitter-related NLP tasks, making it a potentially useful resource for Portuguese NLP in general.
+ 2023.ranlp-1.24
+ costa-etal-2023-bertabaporu
+
+
+ Comparison of Multilingual Entity Linking Approaches
+ IvelinaBozhinova
+ AndreyTagarev
+ 224–233
+ Despite rapid developments in the field of Natural Language Processing (NLP) in the past few years, the task of Multilingual Entity Linking (MEL) and especially its end-to-end formulation remains challenging. In this paper we aim to evaluate solutions for general end-to-end multilingual entity linking by conducting experiments using both existing complete approaches and novel combinations of pipelines for solving the task. The results identify the best performing current solutions and suggest some directions for further research.
+ 2023.ranlp-1.25
+ bozhinova-tagarev-2023-comparison
+
+
+ Automatic Extraction of the Romanian Academic Word List: Data and Methods
+ Ana-MariaBucur
+ AndreeaDincă
+ MadalinaChitez
+ RoxanaRogobete
+ 234–241
+ This paper presents the methodology and data used for the automatic extraction of the Romanian Academic Word List (Ro-AWL). Academic Word Lists are useful in both L2 and L1 teaching contexts. For the Romanian language, no such resource exists so far. Ro-AWL has been generated by combining methods from corpus and computational linguistics with L2 academic writing approaches. We use two types of data: (a) existing data, such as the Romanian Frequency List based on the ROMBAC corpus, and (b) self-compiled data, such as the expert academic writing corpus EXPRES. For constructing the academic word list, we follow the methodology for building the Academic Vocabulary List for the English language. The distribution of Ro-AWL features (general distribution, POS distribution) into four disciplinary datasets is in line with previous research. Ro-AWL is freely available and can be used for teaching, research and NLP applications.
+ 2023.ranlp-1.26
+ bucur-etal-2023-automatic
+
+
+ Stance Prediction from Multimodal Social Media Data
+ Lais Carraro LemeCavalheiro
+ Matheus CamasmiePavan
+ IvandréParaboni
+ 242–248
+ Stance prediction - the computational task of inferring attitudes towards a given target topic of interest - relies heavily on text data provided by social media or similar sources, but it may also benefit from non-text information such as demographics (e.g., users’ gender, age, etc.), network structure (e.g., friends, followers, etc.), interactions (e.g., mentions, replies, etc.) and other non-text properties (e.g., time information, etc.). However, so-called hybrid (or in some cases multimodal) approaches to stance prediction have only been developed for a small set of target languages, and often making use of count-based text models (e.g., bag-of-words) and time-honoured classification methods (e.g., support vector machines). As a means to further research in the field, in this work we introduce a number of text- and non-text models for stance prediction in the Portuguese language, which make use of more recent methods based on BERT and an ensemble architecture, and ask whether a BERT stance classifier may be enhanced with different kinds of network-related information.
+ 2023.ranlp-1.27
+ cavalheiro-etal-2023-stance
+
+
+ From Stigma to Support: A Parallel Monolingual Corpus and NLP Approach for Neutralizing Mental Illness Bias
+ MasonChoey
+ 249–254
+ Negative attitudes and perceptions towards mental illness continue to be pervasive in our society. One of the factors contributing to and reinforcing this stigma is the usage of language that is biased against mental illness. Identifying biased language and replacing it with person-first, neutralized language is a first step towards eliminating harmful stereotypes and creating a supportive and inclusive environment for those living with mental illness. This paper presents a novel Natural Language Processing (NLP) system that aims to automatically identify biased text related to mental illness and suggest neutral language replacements without altering the original text’s meaning. Building on previous work in the field, this paper presents the Mental Illness Neutrality Corpus (MINC) comprising over 5500 mental illness-biased text and neutralized sentence pairs (in English), which is used to fine-tune a CONCURRENT model system developed by Pryzant et al. (2020). After evaluation, the model demonstrates high proficiency in neutralizing mental illness bias with an accuracy of 98.7%. This work contributes a valuable resource for reducing mental illness bias in text and has the potential for further research in tackling more complex nuances and multilingual biases.
+ 2023.ranlp-1.28
+ choey-2023-stigma
+
+
+ BB25HLegalSum: Leveraging BM25 and BERT-Based Clustering for the Summarization of Legal Documents
+ Leonardode Andrade
+ KarinBecker
+ 255–263
+ Legal document summarization aims to provide a clear understanding of the main points and arguments in a legal document, contributing to the efficiency of the judicial system. In this paper, we propose BB25HLegalSum, a method that combines BERT clusters with the BM25 algorithm to summarize legal documents and present them to users with highlighted important information. The process involves selecting unique, relevant sentences from the original document, clustering them to find sentences about a similar subject, combining them to generate a summary according to three strategies, and highlighting them to the user in the original document. We outperformed baseline techniques using the BillSum dataset, a widely used benchmark in legal document summarization. Legal workers positively assessed the highlighted presentation.
+ 2023.ranlp-1.29
+ de-andrade-becker-2023-bb25hlegalsum
+
+
+ SSSD: Leveraging Pre-trained Models and Semantic Search for Semi-supervised Stance Detection
+ Andréde Sousa
+ KarinBecker
+ 264–273
+ Pre-trained models (PTMs) based on the Transformers architecture are trained on massive amounts of data and can capture nuances and complexities in linguistic expressions, making them a powerful tool for many natural language processing tasks. In this paper, we present SSSD (Semantic Similarity Stance Detection), a semi-supervised method for stance detection on Twitter that automatically labels a large, domain-related corpus for training a stance classification model. The method assumes as input a domain set of tweets about a given target and a labeled query set of tweets of representative arguments related to the stances. It scales the automatic labeling of a large number of tweets, and improves classification accuracy by leveraging the power of PTMs and semantic search to capture context and meaning. We largely outperformed all baselines in experiments using the Semeval benchmark.
+ 2023.ranlp-1.30
+ de-sousa-becker-2023-sssd
+
+
+ Detecting Text Formality: A Study of Text Classification Approaches
+ DarynaDementieva
+ NikolayBabakov
+ AlexanderPanchenko
+ 274–284
+ Formality is one of the important characteristics of text documents. The automatic detection of the formality level of a text is potentially beneficial for various natural language processing tasks. Previously, two large-scale datasets featuring formality annotation were introduced for multiple languages—GYAFC and X-FORMAL. However, they were primarily used for the training of style transfer models. At the same time, the detection of text formality on its own may also be a useful application. This work proposes the first, to our knowledge, systematic study of formality detection methods based on statistical, neural-based, and Transformer-based machine learning methods and delivers the best-performing models for public usage. We conducted three types of experiments – monolingual, multilingual, and cross-lingual. The study shows that a Char BiLSTM model outperforms Transformer-based ones on the monolingual and multilingual formality classification tasks, while Transformer-based classifiers are more robust in cross-lingual knowledge transfer.
+ 2023.ranlp-1.31
+ dementieva-etal-2023-detecting
+
+
+ Developing a Multilingual Corpus of Wikipedia Biographies
+ HannahDevinney
+ AntonEklund
+ IgorRyazanov
+ JingwenCai
+ 285–294
+ For many languages, Wikipedia is the most accessible source of biographical information. Studying how Wikipedia describes the lives of people can provide insights into societal biases, as well as cultural differences more generally. We present a method for extracting datasets of Wikipedia biographies. The accompanying codebase is adapted to English, Swedish, Russian, Chinese, and Farsi, and is extendable to other languages. We present an exploratory analysis of biographical topics and gendered patterns in four languages using topic modelling and embedding clustering. We find similarities across languages in the types of categories present, with the distribution of biographies concentrated in the language’s core regions. Masculine terms are over-represented and spread out over a wide variety of topics. Feminine terms are less frequent and linked to more constrained topics. Non-binary terms are nearly non-represented.
+ 2023.ranlp-1.32
+ devinney-etal-2023-developing
+
+
+ A Computational Analysis of the Voices of Shakespeare’s Characters
+ Liviu P.Dinu
+ Ana SabinaUban
+ 295–300
+ In this paper we propose a study of a relatively novel problem in authorship attribution research: that of classifying the stylome of characters in a literary work. We choose as a case study the plays of William Shakespeare, presumably the most renowned and respected dramatist in the history of literature. Previous research in the field of authorship attribution has shown that the writing style of an author can be characterized and distinguished from that of other authors automatically. The question we propose to answer is a related but different one: can the styles of different characters be distinguished? We aim to verify in this way if an author managed to create believable characters with individual styles, and focus on Shakespeare’s iconic characters. We present our experiments using various features and models, including an SVM and a neural network, and show that characters in Shakespeare’s plays can be classified with up to 50% accuracy.
+ 2023.ranlp-1.33
+ dinu-uban-2023-computational
+
+
+ Source Code Plagiarism Detection with Pre-Trained Model Embeddings and Automated Machine Learning
+ FahadEbrahim
+ MikeJoy
+ 301–309
+ Source code plagiarism is a critical ethical issue in computer science education where students use someone else’s work as their own. It can be treated as a binary classification problem where the output can be either: yes (plagiarism found) or no (plagiarism not found). In this research, we have taken the open-source dataset ‘SOCO’, which contains two programming languages (PLs), namely Java and C/C++ (although our method could be applied to any PL). Source codes should be converted to vector representations that capture both the syntax and semantics of the text, known as contextual embeddings. These embeddings would be generated using source code pre-trained models (CodePTMs). The cosine similarity scores of three different CodePTMs were selected as features. The classifier selection and parameter tuning were conducted with the assistance of Automated Machine Learning (AutoML). The selected classifiers were tested, initially on Java, and the proposed approach produced average to high results compared to other published research, and surpassed the baseline (the JPlag plagiarism detection tool). For C/C++, the approach outperformed other research work and produced the highest ranking score.
+ 2023.ranlp-1.34
+ ebrahim-joy-2023-source
+
+
+ Identifying Semantic Argument Types in Predication and Copredication Contexts: A Zero-Shot Cross-Lingual Approach
+ Deniz EkinYavas
+ LauraKallmeyer
+ RainerOsswald
+ ElisabettaJezek
+ MartaRicchiardi
+ LongChen
+ 310–320
+ Identifying semantic argument types in predication contexts is not a straightforward task for several reasons, such as inherent polysemy, coercion, and copredication phenomena. In this paper, we train monolingual and multilingual classifiers with a zero-shot cross-lingual approach to identify semantic argument types in predications using pre-trained language models as feature extractors. We train classifiers for different semantic argument types and for both verbal and adjectival predications. Furthermore, we propose a method to detect copredication using these classifiers through identifying the argument semantic type targeted in different predications over the same noun in a sentence. We evaluate the performance of the method on copredication test data with Food•Event nouns for 5 languages.
+ 2023.ranlp-1.35
+ yavas-etal-2023-identifying
+
+
+ A Review of Research-Based Automatic Text Simplification Tools
+ IsabelEspinosa-Zaragoza
+ JoséAbreu-Salas
+ ElenaLloret
+ PalomaMoreda
+ ManuelPalomar
+ 321–330
+ In the age of knowledge, the democratisation of information facilitated through the Internet may not be as pervasive if written language poses challenges to particular sectors of the population. The objective of this paper is to present an overview of research-based automatic text simplification tools. Consequently, we describe aspects such as the language, language phenomena, language levels simplified, approaches, specific target populations these tools are created for (e.g. individuals with cognitive impairment, attention deficit, elderly people, children, language learners), and accessibility and availability considerations. The review of existing studies covering automatic text simplification tools is undertaken by searching two databases: Web of Science and Scopus. The eligibility criteria involve text simplification tools with a scientific background in order to ascertain how they operate. This methodology yielded 27 text simplification tools that are further analysed. Some of the main conclusions reached with this review are the lack of resources accessible to the public; the need for customisation that fosters the individual’s independence, by allowing users to select what they find challenging to understand without limiting their capabilities; and the need for more simplification tools in languages other than English, to mention a few.
+ 2023.ranlp-1.36
+ espinosa-zaragoza-etal-2023-review
+
+
+ Vocab-Expander: A System for Creating Domain-Specific Vocabularies Based on Word Embeddings
+ MichaelFaerber
+ NicholasPopovic
+ 331–335
+ In this paper, we propose Vocab-Expander at https://vocab-expander.com, an online tool that enables end-users (e.g., technology scouts) to create and expand a vocabulary of their domain of interest. It utilizes an ensemble of state-of-the-art word embedding techniques based on web text and ConceptNet, a common-sense knowledge base, to suggest related terms for already given terms. The system has an easy-to-use interface that allows users to quickly confirm or reject term suggestions. Vocab-Expander offers a variety of potential use cases, such as improving concept-based information retrieval in technology and innovation management, enhancing communication and collaboration within organizations or interdisciplinary projects, and creating vocabularies for specific courses in education.
+ 2023.ranlp-1.37
+ faerber-popovic-2023-vocab
+
+
+ On the Generalization of Projection-Based Gender Debiasing in Word Embedding
+ ElisabettaFersini
+ AntonioCandelieri
+ LorenzoPastore
+ 336–343
+ Gender bias estimation and mitigation techniques in word embeddings lack an understanding of their generalization capabilities. In this work, we complement prior research by comparing in a systematic way four gender bias metrics (Word Embedding Association Test, Relative Negative Sentiment Bias, Embedding Coherence Test and Bias Analogy Test) and two types of projection-based gender mitigation strategies (hard- and soft-debiasing) on three well-known word embedding representations (Word2Vec, FastText and GloVe). The experiments have shown that the considered word embeddings are consistent with each other, but the debiasing techniques are inconsistent across the different metrics, also highlighting the potential risk of unintended bias after the mitigation strategies.
+ 2023.ranlp-1.38
+ fersini-etal-2023-generalization
+
+
+ Mapping Explicit and Implicit Discourse Relations between the RST-DT and the PDTB 3.0
+ Nelson FilipeCosta
+ NadiaSheikh
+ LeilaKosseim
+ 344–352
+ In this paper we propose a first empirical mapping between the RST-DT and the PDTB 3.0. We provide an original algorithm which allowed the mapping of 6,510 (80.0%) explicit and implicit discourse relations between the overlapping articles of the RST-DT and PDTB 3.0 discourse annotated corpora. Results of the mapping show that while it is easier to align segments of implicit discourse relations, the mapping obtained between the aligned explicit discourse relations is less ambiguous.
+ 2023.ranlp-1.39
+ costa-etal-2023-mapping
+
+
+ Bigfoot in Big Tech: Detecting Out of Domain Conspiracy Theories
+ MatthewFort
+ ZuoyuTian
+ ElizabethGabel
+ NinaGeorgiades
+ NoahSauer
+ DanielDakota
+ SandraKübler
+ 353–363
+ We investigate approaches to classifying texts into either conspiracy theory or mainstream using the Language Of Conspiracy (LOCO) corpus. Since conspiracy theories are not monolithic constructs, we need to identify approaches that robustly work in an out-of-domain setting (i.e., across conspiracy topics). We investigate whether optimal in-domain settings can be transferred to out-of-domain settings, and we investigate different methods for bleaching to steer classifiers away from words typical for an individual conspiracy theory. We find that BART works better than an SVM, that we can successfully classify out-of-domain, but there are no clear trends in how to choose the best source training domains. Additionally, bleaching only topic words works better than bleaching all content words or completely delexicalizing texts.
+ 2023.ranlp-1.40
+ fort-etal-2023-bigfoot
+
+
+ Deep Learning Approaches to Detecting Safeguarding Concerns in Schoolchildren’s Online Conversations
+ EmmaFranklin
+ TharinduRanasinghe
+ 364–372
+ For school teachers and Designated Safeguarding Leads (DSLs), computers and other school-owned communication devices are both indispensable and deeply worrisome. For their education, children require access to the Internet, as well as a standard institutional ICT infrastructure, including e-mail and other forms of online communication technology. Given the sheer volume of data being generated and shared on a daily basis within schools, most teachers and DSLs can no longer monitor the safety and wellbeing of their students without the use of specialist safeguarding software. In this paper, we experiment with the use of state-of-the-art neural network models on the modelling of a dataset of almost 9,000 anonymised child-generated chat messages on the Microsoft Teams platform. The data was manually classified into eight fine-grained classes of safeguarding concerns (or false alarms) that a monitoring program would be interested in, and these were further split into two binary classes: true positives (real safeguarding concerns) and false positives (false alarms). For the fine-grained classification, our models achieved a macro F1 score of 73.56, while for the binary classification, we achieved a macro F1 score of 87.32. This first experiment into the use of Deep Learning for detecting safeguarding concerns represents an important step towards achieving high-accuracy and reliable monitoring information for busy teachers and safeguarding leads.
+ 2023.ranlp-1.41
+ franklin-ranasinghe-2023-deep
+
+
+ On the Identification and Forecasting of Hate Speech in Inceldom
+ PaoloGajo
+ AriannaMuti
+ KaterinaKorre
+ SilviaBernardini
+ AlbertoBarrón-Cedeño
+ 373–384
+ Spotting hate speech in social media posts is crucial to increase the civility of the Web and has been thoroughly explored in the NLP community. For the first time, we introduce a multilingual corpus for the analysis and identification of hate speech in the domain of inceldom, built from incel Web forums in English and Italian, including expert annotation at the post level for two kinds of hate speech: misogyny and racism. This resource paves the way for the development of mono- and cross-lingual models for (a) the identification of hateful (misogynous and racist) posts and (b) the forecasting of the amount of hateful responses that a post is likely to trigger. Our experiments aim at improving the performance of Transformer-based models using masked language modeling pre-training and dataset merging. The results show that these strategies boost the models’ performance in all settings (binary classification, multi-label classification and forecasting), especially in the cross-lingual scenarios.
+ 2023.ranlp-1.42
+ gajo-etal-2023-identification
+
+
+ T2KG: Transforming Multimodal Document to Knowledge Graph
+ SantiagoGaliano
+ RafaelMuñoz
+ YoanGutiérrez
+ AndrésMontoyo
+ Jose IgnacioAbreu
+ Luis AlfonsoUreña
+ 385–391
+ The large amount of information in digital format that exists today makes it unfeasible to use manual means to acquire the knowledge contained in these documents. Therefore, it is necessary to develop tools that allow us to incorporate this knowledge into a structure that is easy to use by both machines and humans. This paper presents a system that can incorporate the relevant information from a document in any format, structured or unstructured, into a semantic network that represents the existing knowledge in the document. The system processes documents ranging from structured documents, based on their annotation scheme, to unstructured documents written in natural language, for which it uses a set of sensors that identify the relevant information and subsequently incorporate it to enrich the semantic network, which is created by linking all the information based on the knowledge discovered.
+ 2023.ranlp-1.43
+ galiano-etal-2023-t2kg
+
+
+ !Translate: When You Cannot Cook Up a Translation, Explain
+ FedericoGarcea
+ MargheritaMartinelli
+ MajaMilicević Petrović
+ AlbertoBarrón-Cedeño
+ 392–398
+ In the domain of cuisine, both dishes and ingredients tend to be heavily rooted in the local context they belong to. As a result, the associated terms are often realia tied to specific cultures and languages. This causes difficulties for non-speakers of the local language and machine translation (MT) systems alike, as it implies a lack of the concept and/or of a plausible translation. MT typically opts for one of two alternatives: keeping the source language terms untranslated or relying on a hyperonym/near-synonym in the target language, provided one exists. !Translate proposes a better alternative: explaining. Given a cuisine entry such as a restaurant menu item, we identify culture-specific terms and enrich the output of the MT system with automatically retrieved definitions of the non-translatable terms in the target language, making the translation more actionable for the final user.
+ 2023.ranlp-1.44
+ garcea-etal-2023-translate
+
+
+ An Evaluation of Source Factors in Concatenation-Based Context-Aware Neural Machine Translation
+ HarritxuGete
+ ThierryEtchegoyhen
+ 399–407
+ We explore the use of source factors in context-aware neural machine translation, specifically concatenation-based models, to improve the translation quality of inter-sentential phenomena. Context sentences are typically concatenated to the sentence to be translated, with string-based markers to separate the latter from the former. Although previous studies have measured the impact of prefixes to identify and mark context information, the use of learnable factors has only been marginally explored. In this study, we evaluate the impact of single and multiple source context factors in English-German and Basque-Spanish contextual translation. We show that this type of factors can significantly enhance translation accuracy for phenomena such as gender and register coherence in Basque-Spanish, while also improving BLEU results in some scenarios. These results demonstrate the potential of factor-based context identification to improve context-aware machine translation in future research.
+ 2023.ranlp-1.45
+ gete-etchegoyhen-2023-evaluation
+
+
+ Lessons Learnt from Linear Text Segmentation: a Fair Comparison of Architectural and Sentence Encoding Strategies for Successful Segmentation
+ IacopoGhinassi
+ LinWang
+ ChrisNewell
+ MatthewPurver
+ 408–418
+ Recent works on linear text segmentation have shown new state-of-the-art results nearly every year. Most times, however, these recent advances include a variety of different elements which makes it difficult to evaluate which individual components of the proposed methods bring about improvements for the task and, more generally, what actually works for linear text segmentation. Moreover, evaluating text segmentation is notoriously difficult and the use of a metric such as Pk, which is widely used in existing literature, presents specific problems that complicate a fair comparison between segmentation models. In this work, then, we draw from a number of existing works to assess what the state of the art in linear text segmentation is, investigating what architectures and features work best for the task. To do so, we present three models representative of a variety of approaches, we compare them to existing methods and we inspect elements composing them, so as to give a more complete picture of which technique is more successful and why that might be the case. At the same time, we highlight a specific feature of Pk which can bias the results and we report our results using different settings, so as to give future literature a more comprehensive set of baseline results for future developments. We then hope that this work can serve as a solid foundation to foster research in the area, overcoming task-specific difficulties such as evaluation setting and providing new state-of-the-art results.
+ 2023.ranlp-1.46
+ ghinassi-etal-2023-lessons
+
+
+ Student’s t-Distribution: On Measuring the Inter-Rater Reliability When the Observations are Scarce
+ SergeGladkoff
+ LifengHan
+ GoranNenadic
+ 419–428
+ In natural language processing (NLP) we always rely on human judgement as the golden quality evaluation method. However, there has been an ongoing debate on how to better evaluate inter-rater reliability (IRR) levels for certain evaluation tasks, such as translation quality evaluation (TQE), especially when the data samples (observations) are very scarce. In this work, we first present a study on how to estimate the confidence interval for the measurement value when only one data (evaluation) point is available. Then, this leads to our example with two human-generated observational scores, for which we introduce the “Student’s t-Distribution” method and explain how to use it to measure the IRR score using only these two data points, as well as the confidence intervals (CIs) of the quality evaluation. We give a quantitative analysis of how the evaluation confidence can be greatly improved by introducing more observations, even with only one extra observation. We encourage researchers to report their IRR scores by all possible means, e.g. using the Student’s t-Distribution method whenever possible; thus making the NLP evaluation more meaningful, transparent, and trustworthy. This t-Distribution method can also be used outside of NLP fields to measure the IRR level for trustworthy evaluation of experimental investigations, whenever the observational data is scarce.
+ 2023.ranlp-1.47
+ gladkoff-etal-2023-students
+
+
+ Data Augmentation for Fake News Detection by Combining Seq2seq and NLI
+ AnnaGlazkova
+ 429–439
+ State-of-the-art data augmentation methods help improve the generalization of deep learning models. However, these methods often generate examples that contradict the preserved class labels. This is crucial for some natural language processing tasks, such as fake news detection. In this work, we combine sequence-to-sequence and natural language inference models for data augmentation in the fake news detection domain using short news texts, such as tweets and news titles. This approach allows us to generate new training examples that do not contradict facts from the original texts. We use the non-entailment probability for the pair of the original and generated texts as a loss function for a transformer-based sequence-to-sequence model. The proposed approach demonstrates its effectiveness on three classification benchmarks in fake news detection in terms of the F1-score macro and ROC AUC. Moreover, we show that our approach retains the class label of the original text more accurately than other transformer-based methods.
+ 2023.ranlp-1.48
+ glazkova-2023-data
+
+
+ Exploring Unsupervised Semantic Similarity Methods for Claim Verification in Health Care News Articles
+ VishwaniGupta
+ AstridViciano
+ HolgerWormer
+ NajmehsadatMousavinezhad
+ 440–447
+ In the 21st century, the proliferation of fake information has emerged as a significant threat to society. Particularly, healthcare medical reporters face challenges when verifying claims related to treatment effects, side effects, and risks mentioned in news articles, relying on scientific publications for accuracy. The accurate communication of scientific information in news articles has long been a crucial concern in the scientific community, as the dissemination of misinformation can have dire consequences in the healthcare domain. Healthcare medical reporters would greatly benefit from efficient methods to retrieve evidence from scientific publications supporting specific claims. This paper delves into the application of unsupervised semantic similarity models to facilitate claim verification for medical reporters, thereby expediting the process. We explore unsupervised multilingual evidence retrieval techniques aimed at reducing the time required to obtain evidence from scientific studies. Instead of employing content classification, we propose an approach that retrieves relevant evidence from scientific publications for claim verification within the healthcare domain. Given a claim and a set of scientific publications, our system generates a list of the most similar paragraphs containing supporting evidence. Furthermore, we evaluate the performance of state-of-the-art unsupervised semantic similarity methods in this task. As the claim and evidence are present in a cross-lingual space, we find that the XLM-RoBERTa model exhibits high accuracy in achieving our objective. Through this research, we contribute to enhancing the efficiency and reliability of claim verification for healthcare medical reporters, enabling them to accurately source evidence from scientific publications in a timely manner.
+ 2023.ranlp-1.49
+ gupta-etal-2023-exploring
+
+
+ AlphaMWE-Arabic: Arabic Edition of Multilingual Parallel Corpora with Multiword Expression Annotations
+ NajetHadj Mohamed
+ MalakRassem
+ LifengHan
+ GoranNenadic
+ 448–457
+ Multiword Expressions (MWEs) have been a bottleneck for Natural Language Understanding (NLU) and Natural Language Generation (NLG) tasks due to their idiomaticity, ambiguity, and non-compositionality. Bilingual parallel corpora with MWE annotations are very scarce, which sets another challenge for current Natural Language Processing (NLP) systems, especially in a multilingual setting. This work presents AlphaMWE-Arabic, an Arabic edition of the AlphaMWE parallel corpus with MWE annotations. We describe how we created this corpus, including machine translation (MT), post-editing, and annotations for both standard and dialectal varieties, i.e. Tunisian and Egyptian Arabic. We analyse the MT errors when they meet MWE-related content, both quantitatively using the human-in-the-loop metric HOPE and qualitatively. We report that current state-of-the-art MT systems are far from reaching human parity performance. We expect our bilingual English-Arabic corpus will be an asset for multilingual research on MWEs such as translation and localisation, as well as for monolingual settings including the study of Arabic-specific lexicography and phrasal verbs as MWEs. Our corpus and experimental data are available at https://github.com/aaronlifenghan/AlphaMWE.
+ 2023.ranlp-1.50
+ hadj-mohamed-etal-2023-alphamwe
+
+
+ Performance Analysis of Arabic Pre-trained Models on Named Entity Recognition Task
+ Abdelhalim HafedhDahou
+ Mohamed AmineCheragui
+ AhmedAbdelali
+ 458–467
+ Named Entity Recognition (NER) is a crucial task within natural language processing (NLP) that entails the identification and classification of entities, such as person, organization and location. This study delves into NER specifically in the Arabic language, focusing on the Algerian dialect. While previous research in NER has primarily concentrated on Modern Standard Arabic (MSA), the advent of social media has prompted a need to address the variations found in different Arabic dialects. Moreover, given the notable achievements of Large-scale pre-trained models (PTMs) based on the BERT architecture, this paper aims to evaluate Arabic pre-trained models using an Algerian dataset that covers different domains and writing styles. Additionally, an error analysis is conducted to identify PTMs’ limitations, and an investigation is carried out to assess the performance of trained MSA models on the Algerian dialect. The experimental results and subsequent analysis shed light on the complexities of NER in Arabic, offering valuable insights for future research endeavors.
+ 2023.ranlp-1.51
+ dahou-etal-2023-performance
+
+
+ Discourse Analysis of Argumentative Essays of English Learners Based on CEFR Level
+ BlaiseHanel
+ LeilaKosseim
+ 468–474
+ In this paper, we investigate the relationship between the use of discourse relations and the CEFR-level of argumentative English learner essays. Using both the Rhetorical Structure Theory (RST) and the Penn Discourse TreeBank (PDTB) frameworks, we analyze essays from The International Corpus Network of Asian Learners (ICNALE), and the Corpus and Repository of Writing (CROW). Results show that the use of the RST relations of Explanation and Background, as well as the first-level PDTB sense of Contingency, are influenced by the English proficiency level of the writer.
+ 2023.ranlp-1.52
+ hanel-kosseim-2023-discourse
+
+
+ Improving Translation Quality for Low-Resource Inuktitut with Various Preprocessing Techniques
+ Mathias Hans ErikStenlund
+ MathildeNanni
+ MicaellaBruton
+ MeriemBeloucif
+ 475–479
+ Neural machine translation has been shown to outperform all other machine translation paradigms when trained in a high-resource setting. However, it still performs poorly when dealing with low-resource languages, for which parallel data for training is scarce. This is especially the case for morphologically complex languages such as Turkish, Tamil, Uyghur, etc. In this paper, we investigate various preprocessing methods for Inuktitut, a low-resource indigenous language from North America, without a morphological analyzer. On both the original and romanized scripts, we test various preprocessing techniques such as Byte-Pair Encoding, random stemming, and data augmentation using Hungarian for the Inuktitut-to-English translation task. We found that there are benefits to retaining the original script as it helps to achieve higher BLEU scores than the romanized models.
+ 2023.ranlp-1.53
+ stenlund-etal-2023-improving
+
+
+ Enriched Pre-trained Transformers for Joint Slot Filling and Intent Detection
+ MomchilHardalov
+ IvanKoychev
+ PreslavNakov
+ 480–493
+ Detecting the user’s intent and finding the corresponding slots among the utterance’s words are important tasks in natural language understanding. Their interconnected nature makes their joint modeling a standard part of training such models. Moreover, data scarceness and specialized vocabularies pose additional challenges. Recently, the advances in pre-trained language models, namely contextualized models such as ELMo and BERT have revolutionized the field by tapping the potential of training very large models with just a few steps of fine-tuning on a task-specific dataset. Here, we leverage such models, and we design a novel architecture on top of them. Moreover, we propose an intent pooling attention mechanism, and we reinforce the slot filling task by fusing intent distributions, word features, and token representations. The experimental results on standard datasets show that our model outperforms both the current non-BERT state of the art as well as stronger BERT-based baselines.
+ 2023.ranlp-1.54
+ hardalov-etal-2023-enriched
+
+
+ Unimodal Intermediate Training for Multimodal Meme Sentiment Classification
+ MuzhaffarHazman
+ SusanMcKeever
+ JosephineGriffith
+ 494–506
+ Internet Memes remain a challenging form of user-generated content for automated sentiment classification. The limited availability of labelled memes is a barrier to developing sentiment classifiers of multimodal memes. To address the shortage of labelled memes, we propose to supplement the training of a multimodal meme classifier with unimodal (image-only and text-only) data. In this work, we present a novel variant of supervised intermediate training that uses relatively abundant sentiment-labelled unimodal data. Our results show a statistically significant performance improvement from the incorporation of unimodal text data. Furthermore, we show that the training set of labelled memes can be reduced by 40% without reducing the performance of the downstream model.
+ 2023.ranlp-1.55
+ hazman-etal-2023-unimodal
+
+
+ Explainable Event Detection with Event Trigger Identification as Rationale Extraction
+ HansiHettiarachchi
+ TharinduRanasinghe
+ 507–518
+ Most event detection methods act at the sentence-level and focus on identifying sentences related to a particular event. However, identifying certain parts of a sentence that act as event triggers is also important and more challenging, especially when dealing with limited training data. Previous event detection attempts have considered these two tasks separately and have developed different methods. We hypothesise that similar to humans, successful sentence-level event detection models rely on event triggers to predict sentence-level labels. By exploring feature attribution methods that assign relevance scores to the inputs to explain model predictions, we study the behaviour of state-of-the-art sentence-level event detection models and show that explanations (i.e. rationales) extracted from these models can indeed be used to detect event triggers. We, therefore, (i) introduce a novel weakly-supervised method for event trigger detection; and (ii) propose to use event triggers as an explainable measure in sentence-level event detection. To the best of our knowledge, this is the first explainable machine learning approach to event trigger identification.
+ 2023.ranlp-1.56
+ hettiarachchi-ranasinghe-2023-explainable
+
+
+ Clinical Text Classification to SNOMED CT Codes Using Transformers Trained on Linked Open Medical Ontologies
+ AntonHristov
+ PetarIvanov
+ AnnaAksenova
+ TsvetanAsamov
+ PavlinGyurov
+ TodorPrimov
+ SvetlaBoytcheva
+ 519–526
+ We present an approach for medical text coding with SNOMED CT. Our approach uses publicly available linked open data from terminologies and ontologies as training data for the algorithms. We claim that even small training corpora made of short text snippets can be used to train models for the given task. We propose a method based on transformers enhanced with clustering and filtering of the candidates. Further, we adopt a classical machine learning approach - support vector classification (SVC) using transformer embeddings. The resulting approach proves to be more accurate than the predictions given by Large Language Models. We evaluate on a dataset generated from linked open data for SNOMED codes related to morphology and topography for four use cases. Our transformers-based approach achieves an F1-score of 0.82 for morphology and 0.99 for topography codes. Further, we validate the applicability of our approach in a clinical context using labelled real clinical data that are not used for model training.
+ 2023.ranlp-1.57
+ hristov-etal-2023-clinical
+
+
+ Towards a Consensus Taxonomy for Annotating Errors in Automatically Generated Text
+ RudaliHuidrom
+ AnyaBelz
+ 527–540
+ Error analysis aims to provide insights into system errors at different levels of granularity. NLP as a field has a long-standing tradition of analysing and reporting errors which is generally considered good practice. There are existing error taxonomies tailored for different types of NLP tasks. In this paper, we report our work reviewing existing research on meaning/content error types in generated text, attempt to identify emerging consensus among existing meaning/content error taxonomies, and propose a standardised error taxonomy on this basis. We find that there is virtually complete agreement at the highest taxonomic level where errors of meaning/content divide into (1) Content Omission, (2) Content Addition, and (3) Content Substitution. Consensus in the lower levels is less pronounced, but a compact standardised consensus taxonomy can nevertheless be derived that works across generation tasks and application domains.
+ 2023.ranlp-1.58
+ huidrom-belz-2023-towards
+
+
+ Uncertainty Quantification of Text Classification in a Multi-Label Setting for Risk-Sensitive Systems
+ JinhaHwang
+ CarolGudumotu
+ BenyaminAhmadnia
+ 541–547
+ This paper addresses the challenge of uncertainty quantification in text classification for medical purposes and provides a three-fold approach to support robust and trustworthy decision-making by medical practitioners. Also, we address the challenge of imbalanced datasets in the medical domain by utilizing the Mondrian Conformal Predictor with a Naïve Bayes classifier.
+ 2023.ranlp-1.59
+ hwang-etal-2023-uncertainty
+
+
+ Pretraining Language- and Domain-Specific BERT on Automatically Translated Text
+ TatsuyaIshigaki
+ YuiUehara
+ GoranTopić
+ HiroyaTakamura
+ 548–555
+ Domain-specific pretrained language models such as SciBERT are effective for various tasks involving text in specific domains. However, pretraining BERT requires a large-scale language resource, which is not necessarily available in fine-grained domains, especially in non-English languages. In this study, we focus on a setting with no available domain-specific text for pretraining. To this end, we propose a simple framework that trains a BERT on text in the target language automatically translated from a resource-rich language, e.g., English. In this paper, we particularly focus on the materials science domain in Japanese. Our experiments pertain to the task of entity and relation extraction for this domain and language. The experiments demonstrate that the various models pretrained on translated texts consistently perform better than the general BERT in terms of F1 scores although the domain-specific BERTs do not use any human-authored domain-specific text. These results imply that BERTs for various low-resource domains can be successfully trained on texts automatically translated from resource-rich languages.
+ 2023.ranlp-1.60
+ ishigaki-etal-2023-pretraining
+
+
+ Categorising Fine-to-Coarse Grained Misinformation: An Empirical Study of the COVID-19 Infodemic
+ YeJiang
+ XingyiSong
+ CarolinaScarton
+ IknoorSingh
+ AhmetAker
+ KalinaBontcheva
+ 556–567
+ The spread of COVID-19 misinformation on social media became a major challenge for citizens, with negative real-life consequences. Prior research focused on detection and/or analysis of COVID-19 misinformation. However, fine-grained classification of misinformation claims has been largely overlooked. The novel contribution of this paper is in introducing a new dataset which makes fine-grained distinctions between statements that assert, comment or question on false COVID-19 claims. This new dataset not only enables social behaviour analysis but also enables us to address both evidence-based and non-evidence-based misinformation classification tasks. Lastly, through leave claim out cross-validation, we demonstrate that classifier performance on unseen COVID-19 misinformation claims is significantly different, as compared to performance on topics present in the training data.
+ 2023.ranlp-1.61
+ jiang-etal-2023-categorising
+
+
+ Bridging the Gap between Subword and Character Segmentation in Pretrained Language Models
+ ShunKiyono
+ ShoTakase
+ ShengzheLi
+ ToshinoriSato
+ 568–577
+ Pretrained language models require the use of consistent segmentation (e.g., subword- or character-level segmentation) in pretraining and finetuning. In NLP, many tasks are modeled by subword-level segmentation better than by character-level segmentation. However, because of their format, several tasks require the use of character-level segmentation. Thus, in order to tackle both types of NLP tasks, language models must be independently pretrained for both subword- and character-level segmentation. However, this is an inefficient and costly procedure. Instead, this paper proposes a method for training a language model with unified segmentation. This means that the trained model can be finetuned on both subword- and character-level segmentation. The principle of the method is to apply the subword regularization technique to generate a mixture of subword- and character-level segmentation. Through experiments on BERT models, we demonstrate that our method can halve the computational cost of pretraining.
+ 2023.ranlp-1.62
+ kiyono-etal-2023-bridging
+
+
+ Evaluating Data Augmentation for Medication Identification in Clinical Notes
+ JordanKoontz
+ MaiteOronoz
+ AliciaPérez
+ 578–585
+ We evaluate the effectiveness of using data augmentation to improve the generalizability of a Named Entity Recognition model for the task of medication identification in clinical notes. We compare disparate data augmentation methods, namely mention-replacement and a generative model, for creating synthetic training examples. Through experiments on the n2c2 2022 Track 1 Contextualized Medication Event Extraction data set, we show that data augmentation with supplemental examples created with GPT-3 can boost the performance of a transformer-based model for small training sets.
+ 2023.ranlp-1.63
+ koontz-etal-2023-evaluating
+
+
+ Advancing Topical Text Classification: A Novel Distance-Based Method with Contextual Embeddings
+ AndriyKosar
+ GuyDe Pauw
+ WalterDaelemans
+ 586–597
+ This study introduces a new method for distance-based unsupervised topical text classification using contextual embeddings. The method applies and tailors sentence embeddings for distance-based topical text classification. This is achieved by leveraging the semantic similarity between topic labels and text content, and reinforcing the relationship between them in a shared semantic space. The proposed method outperforms a wide range of existing sentence embeddings on average by 35%. Presenting an alternative to the commonly used transformer-based zero-shot general-purpose classifiers for multiclass text classification, the method demonstrates significant advantages in terms of computational efficiency and flexibility, while maintaining comparable or improved classification results.
+ 2023.ranlp-1.64
+ kosar-etal-2023-advancing
+
+
+ Taxonomy-Based Automation of Prior Approval Using Clinical Guidelines
+ SaranyaKrishnamoorthy
+ AyushSingh
+ 598–607
+ Performing prior authorization on patients in a medical facility is a time-consuming and challenging task for insurance companies. Automating the clinical decisions that lead to authorization can reduce the time that staff spend executing such procedures. To better facilitate such critical decision making, we present an automated approach to predict one of the challenging tasks in the process called primary clinical indicator prediction, which is the outcome of this procedure. The proposed solution is to create a taxonomy to capture the main categories in primary clinical indicators. Our approach involves an important step of selecting what is known as the “primary indicator” – one of the several heuristics based on clinical guidelines that are published and publicly available. A taxonomy-based PI classification system was created to help in the recognition of PIs from free text in electronic health records (EHRs). This taxonomy includes comprehensive explanations of each PI, as well as examples of free text that could be used to detect each PI. The major contribution of this work is to introduce a taxonomy created by three professional nurses with many years of experience. We experiment with several state-of-the-art supervised and unsupervised techniques with a focus on prior approval for spinal imaging. The results indicate that the proposed taxonomy is capable of increasing the performance of unsupervised approaches by up to 10 F1 points. Further, in the supervised setting, we achieve an F1 score of 0.61 using a conventional technique based on term frequency–inverse document frequency that outperforms other deep-learning approaches.
+ 2023.ranlp-1.65
+ krishnamoorthy-singh-2023-taxonomy
+
+
+ Simultaneous Interpreting as a Noisy Channel: How Much Information Gets Through
+ MariaKunilovskaya
+ HeikePrzybyl
+ EkaterinaLapshinova-Koltunski
+ ElkeTeich
+ 608–618
+ We explore the relationship between information density/surprisal of source and target texts in translation and interpreting in the language pair English-German, looking at the specific properties of translation (“translationese”). Our data comes from two bidirectional English-German subcorpora representing written and spoken mediation modes collected from European Parliament proceedings. Within each language, we (a) compare original speeches to their translated or interpreted counterparts, and (b) explore the association between segment-aligned sources and targets in each translation direction. As additional variables, we consider source delivery mode (read-out, impromptu) and source speech rate in interpreting. We use language modelling to measure the information rendered by words in a segment and to characterise the cross-lingual transfer of information under various conditions. Our approach is based on statistical analyses of surprisal values, extracted from n-gram models of our dataset. The analysis reveals that while there is a considerable positive correlation between the average surprisal of source and target segments in both modes, information output in interpreting is lower than in translation, given the same amount of input. Significantly lower information density in spoken mediated production compared to non-mediated speech in the same language can indicate a possible simplification effect in interpreting.
+ 2023.ranlp-1.66
+ kunilovskaya-etal-2023-simultaneous
+
+
+ Challenges of GPT-3-Based Conversational Agents for Healthcare
+ FabianLechner
+ AllisonLahnala
+ CharlesWelch
+ LucieFlek
+ 619–630
+ The potential of medical domain dialogue agents lies in their ability to provide patients with faster information access while enabling medical specialists to concentrate on critical tasks. However, the integration of large-language models (LLMs) into these agents presents certain limitations that may result in serious consequences. This paper investigates the challenges and risks of using GPT-3-based models for medical question-answering (MedQA). We perform several evaluations contextualized in terms of standard medical principles. We provide a procedure for manually designing patient queries to stress-test high-risk limitations of LLMs in MedQA systems. Our analysis reveals that LLMs fail to respond adequately to these queries, generating erroneous medical information, unsafe recommendations, and content that may be considered offensive.
+ 2023.ranlp-1.67
+ lechner-etal-2023-challenges
+
+
+ Noisy Self-Training with Data Augmentations for Offensive and Hate Speech Detection Tasks
+ JoãoLeite
+ CarolinaScarton
+ DiegoSilva
+ 631–640
+ Online social media is rife with offensive and hateful comments, prompting the need for their automatic detection given the sheer amount of posts created every second. Creating high-quality human-labelled datasets for this task is difficult and costly, especially because non-offensive posts are significantly more frequent than offensive ones. However, unlabelled data is abundant, easier, and cheaper to obtain. In this scenario, self-training methods, using weakly-labelled examples to increase the amount of training data, can be employed. Recent “noisy” self-training approaches incorporate data augmentation techniques to ensure prediction consistency and increase robustness against noisy data and adversarial attacks. In this paper, we experiment with default and noisy self-training using three different textual data augmentation techniques across five different pre-trained BERT architectures varying in size. We evaluate our experiments on two offensive/hate-speech datasets and demonstrate that (i) self-training consistently improves performance regardless of model size, resulting in up to +1.5% F1-macro on both datasets, and (ii) noisy self-training with textual data augmentations, despite being successfully applied in similar settings, decreases performance on offensive and hate-speech domains when compared to the default method, even with state-of-the-art augmentations such as backtranslation.
+ 2023.ranlp-1.68
+ leite-etal-2023-noisy
+
+
+ A Practical Survey on Zero-Shot Prompt Design for In-Context Learning
+ YinhengLi
+ 641–647
+ The remarkable advancements in large language models (LLMs) have brought about significant improvements in Natural Language Processing (NLP) tasks. This paper presents a comprehensive review of in-context learning techniques, focusing on different types of prompts, including discrete, continuous, few-shot, and zero-shot, and their impact on LLM performance. We explore various approaches to prompt design, such as manual design, optimization algorithms, and evaluation methods, to optimize LLM performance across diverse tasks. Our review covers key research studies in prompt engineering, discussing their methodologies and contributions to the field. We also delve into the challenges faced in evaluating prompt performance, given the absence of a single “best” prompt and the importance of considering multiple metrics. In conclusion, the paper highlights the critical role of prompt design in harnessing the full potential of LLMs and provides insights into the combination of manual design, optimization techniques, and rigorous evaluation for more effective and efficient use of LLMs in various NLP tasks.
+ 2023.ranlp-1.69
+ li-2023-practical
+
+
+ Classifying COVID-19 Vaccine Narratives
+ YueLi
+ CarolinaScarton
+ XingyiSong
+ KalinaBontcheva
+ 648–657
+ Vaccine hesitancy is widespread, despite the government’s information campaigns and the efforts of the World Health Organisation (WHO). Categorising the topics within vaccine-related narratives is crucial to understand the concerns expressed in discussions and identify the specific issues that contribute to vaccine hesitancy. This paper addresses the need for monitoring and analysing vaccine narratives online by introducing a novel vaccine narrative classification task, which categorises COVID-19 vaccine claims into one of seven categories. Following a data augmentation approach, we first construct a novel dataset for this new classification task, focusing on the minority classes. We also make use of fact-checker annotated data. The paper also presents a neural vaccine narrative classifier that achieves an accuracy of 84% under cross-validation. The classifier is publicly available for researchers and journalists.
+ 2023.ranlp-1.70
+ li-etal-2023-classifying
+
+
+ Sign Language Recognition and Translation: A Multi-Modal Approach Using Computer Vision and Natural Language Processing
+ JackyLi
+ JarenGerdes
+ JamesGojit
+ AustinTao
+ SamyakKatke
+ KateNguyen
+ BenyaminAhmadnia
+ 658–665
+ Sign-to-Text (S2T) is a hand gesture recognition program in the American Sign Language (ASL) domain. The primary objective of S2T is to classify standard ASL alphabets and custom signs and convert the classifications into a stream of text using neural networks. This paper addresses the shortcomings of pure Computer Vision techniques and applies Natural Language Processing (NLP) as an additional layer of complexity to increase S2T’s robustness.
+ 2023.ranlp-1.71
+ li-etal-2023-sign
+
+
+ Classification-Aware Neural Topic Model Combined with Interpretable Analysis - for Conflict Classification
+ TianyuLiang
+ YidaMu
+ SoonhoKim
+ DarlineKuate
+ JulieLang
+ RobVos
+ XingyiSong
+ 666–672
+ A large number of conflict events are affecting the world all the time. In order to analyse such conflict events effectively, this paper presents a Classification-Aware Neural Topic Model (CANTM-IA) for Conflict Information Classification and Topic Discovery. The model provides a reliable interpretation of classification results and discovered topics by introducing interpretability analysis. At the same time, interpretation is introduced into the model architecture to improve the classification performance of the model and to allow interpretation to focus further on the details of the data. Finally, the model architecture is optimised to reduce the complexity of the model.
+ 2023.ranlp-1.72
+ liang-etal-2023-classification
+
+
+ Data Augmentation for Fake Reviews Detection
+ MingLiu
+ MassimoPoesio
+ 673–680
+ In this research, we studied the relationship between data augmentation and model accuracy for the task of fake review detection. We used data generation methods to augment two different fake review datasets and compared the performance of models trained with the original data and with the augmented data. Our results show that the accuracy of our fake review detection model can be improved by 0.31 percentage points on DeRev Test and by 7.65 percentage points on Amazon Test by using the augmented datasets.
+ 2023.ranlp-1.73
+ liu-poesio-2023-data
+
+
+ Coherent Story Generation with Structured Knowledge
+ CongdaMa
+ KotaroFunakoshi
+ KiyoakiShirai
+ ManabuOkumura
+ 681–690
+ The emergence of pre-trained language models has taken story generation, which is the task of automatically generating a comprehensible story from limited information, to a new stage. Although generated stories from the language models are fluent and grammatically correct, the lack of coherence affects their quality. We propose a knowledge-based multi-stage model that incorporates the schema, a kind of structured knowledge, to guide coherent story generation. Our framework includes a schema acquisition module, a plot generation module, and a surface realization module. In the schema acquisition module, highly relevant structured knowledge pieces are selected as a schema. In the plot generation module, a coherent plot plan is navigated by the schema. In the surface realization module, conditioned by the generated plot, a story is generated. Evaluations show that our methods can generate more comprehensible stories than strong baselines, especially with higher global coherence and less repetition.
+ 2023.ranlp-1.74
+ ma-etal-2023-coherent
+
+
+ Studying Common Ground Instantiation Using Audio, Video and Brain Behaviours: The BrainKT Corpus
+ EliotMaës
+ ThierryLegou
+ LeonorBecerra
+ PhilippeBlache
+ 691–702
+ An increasing amount of multimodal recordings has been paving the way for the development of a more automatic way to study language and conversational interactions. However, this data largely comprises audio and video recordings, leaving aside other modalities that might complement this external view of the conversation but might be more difficult to collect in naturalistic setups, such as participants’ brain activity. In this context, we present BrainKT, a natural conversational corpus with audio, video and neuro-physiological signals, collected with the aim of studying information exchanges and common ground instantiation in conversation in a new, more in-depth way. We recorded conversations from 28 dyads (56 participants) during 30-minute experiments where subjects were first tasked to collaborate on a joint information game, then freely drifted to the topic of their choice. During each session, audio and video were captured, along with the participants’ neural signal (EEG with Biosemi 64) and their electro-physiological activity (with Empatica-E4). The paper situates this new type of resources in the literature, presents the experimental setup and describes the different kinds of annotations considered for the corpus.
+ 2023.ranlp-1.75
+ maes-etal-2023-studying
+
+
+ Reading between the Lines: Information Extraction from Industry Requirements
+ Ole MagnusHolter
+ BasilEll
+ 703–711
+ Industry requirements describe the qualities that a project or a service must provide. Most requirements are, however, only available in natural language format and are embedded in textual documents. To be machine-understandable, a requirement needs to be represented in a logical format. We consider that a requirement consists of a scope, which is the requirement’s subject matter, a condition, which is any condition that must be fulfilled for the requirement to be relevant, and a demand, which is what is required. We introduce a novel task, the identification of the semantic components scope, condition, and demand in a requirement sentence, and establish baselines using sequence labelling and few-shot learning. One major challenge with this task is the implicit nature of the scope, often not stated in the sentence. By including document context information, we improved the average performance for scope detection. Our study provides insights into the difficulty of machine understanding of industry requirements and suggests strategies for addressing this challenge.
+ 2023.ranlp-1.76
+ holter-ell-2023-reading
+
+
+ Transformer-Based Language Models for Bulgarian
+ IvaMarinova
+ KirilSimov
+ PetyaOsenova
+ 712–720
+ This paper presents an approach for training lightweight and robust language models for Bulgarian that mitigate gender, political, racial, and other biases in the data. Our method involves scraping content from major Bulgarian online media providers using a specialized procedure for source filtering, topic selection, and lexicon-based removal of inappropriate language during the pre-training phase. We continuously improve the models by incorporating new data from various domains, including social media, books, scientific literature, and linguistically modified corpora. Our motivation is to provide a solution that is sufficient for all natural language processing tasks in Bulgarian, and to address the lack of existing procedures for guaranteeing the robustness of such models.
+ 2023.ranlp-1.77
+ marinova-etal-2023-transformer
+
+
+ Multi-task Ensemble Learning for Fake Reviews Detection and Helpfulness Prediction: A Novel Approach
+ AlimuddinMelleng
+ AnnaJurek-Loughrey
+ DeepakP
+ 721–729
+ Research on fake reviews detection and review helpfulness prediction is prevalent, yet most studies tend to focus solely on either fake reviews detection or review helpfulness prediction, considering them separate research tasks. In contrast to this prevailing pattern, we address both challenges concurrently by employing a multi-task learning approach. We posit that undertaking these tasks simultaneously can enhance the performance of each task through shared information among features. We utilize pre-trained RoBERTa embeddings with a document-level data representation. This is coupled with an array of deep learning and neural network models, including Bi-LSTM, LSTM, GRU, and CNN. Additionally, we employ ensemble learning techniques to integrate these models, with the objective of enhancing overall prediction accuracy and mitigating the risk of overfitting. The findings of this study offer valuable insights to the fields of natural language processing and machine learning and present a novel perspective on leveraging multi-task learning for the twin challenges of fake reviews detection and review helpfulness prediction.
+ 2023.ranlp-1.78
+ melleng-etal-2023-multi
+
+
+ Data Fusion for Better Fake Reviews Detection
+ AlimuddinMelleng
+ AnnaJurek-Loughrey
+ DeepakP
+ 730–738
+ Online reviews have become critical in informing purchasing decisions, making the detection of fake reviews a crucial challenge to tackle. Many different Machine Learning based solutions have been proposed, using various data representations such as n-grams or document embeddings. In this paper, we first explore the effectiveness of different data representations, including emotion, document embedding, n-grams, and noun phrases in embedding format, for fake reviews detection. We evaluate these representations with various state-of-the-art deep learning models, such as BILSTM, LSTM, GRU, CNN, and MLP. Following this, we propose to incorporate different data representations and classification models using early and late data fusion techniques in order to improve the prediction performance. The experiments are conducted on four datasets: Hotel, Restaurant, Amazon, and Yelp. The results demonstrate that combinations of different data representations significantly outperform any of the single data representations.
+ 2023.ranlp-1.79
+ melleng-etal-2023-data
+
+
+ Dimensions of Quality: Contrasting Stylistic vs. Semantic Features for Modelling Literary Quality in 9,000 Novels
+ PascaleMoreira
+ YuriBizzoni
+ 739–747
+ In computational literary studies, the challenging task of predicting quality or reader-appreciation of narrative texts is confounded by volatile definitions of quality and the vast feature space that may be considered in modeling. In this paper, we explore two different types of feature sets: stylistic features on one hand, and semantic features on the other. We conduct experiments on a corpus of 9,089 English language literary novels published in the 19th and 20th century, using GoodReads’ ratings as a proxy for reader-appreciation. Examining the potential of both approaches, we find that some types of books are more predictable in one model than in the other, which may indicate that texts have different prominent characteristics (stylistic complexity, a certain narrative progression at the sentiment-level).
+ 2023.ranlp-1.80
+ moreira-bizzoni-2023-dimensions
+
+
+ BanglaBait: Semi-Supervised Adversarial Approach for Clickbait Detection on Bangla Clickbait Dataset
+ Md. MotaharMahtab
+ MonirulHaque
+ MehediHasan
+ FarigSadeque
+ 748–758
+ Intentionally luring readers to click on a particular content by exploiting their curiosity defines a title as clickbait. Although several studies focused on detecting clickbait titles in English articles, a low-resource language like Bangla has not been given adequate attention. To tackle clickbait titles in Bangla, we have constructed the first Bangla clickbait detection dataset containing 15,056 labeled news articles and 65,406 unlabelled news articles extracted from clickbait-dense news sites. Each article has been labeled by three expert linguists and includes an article’s title, body, and other metadata. By incorporating labeled and unlabelled data, we finetune a pre-trained Bangla transformer model in an adversarial fashion using Semi-Supervised Generative Adversarial Networks (SS-GANs). The proposed model acts as a good baseline for this dataset, outperforming traditional neural network models (LSTM, GRU, CNN) and linguistic feature-based models. We expect that this dataset and the detailed analysis and comparison of these clickbait detection models will provide a fundamental basis for future research into detecting clickbait titles in Bengali articles.
+ 2023.ranlp-1.81
+ mahtab-etal-2023-banglabait
+
+
+ TreeSwap: Data Augmentation for Machine Translation via Dependency Subtree Swapping
+ AttilaNagy
+ DorinaLakatos
+ BotondBarta
+ JuditÁcs
+ 759–768
+ Data augmentation methods for neural machine translation are particularly useful when a limited amount of training data is available, which is often the case when dealing with low-resource languages. We introduce a novel augmentation method, which generates new sentences by swapping objects and subjects across bisentences. This is performed simultaneously based on the dependency parse trees of the source and target sentences. We name this method TreeSwap. Our results show that TreeSwap achieves consistent improvements over baseline models in 4 language pairs in both directions on resource-constrained datasets. We also explore domain-specific corpora, but find that our method does not make significant improvements on law, medical and IT data. We report the scores of similar augmentation methods and find that TreeSwap performs comparably. We also analyze the generated sentences qualitatively and find that the augmentation produces a correct translation in most cases. Our code is available on GitHub.
+ 2023.ranlp-1.82
+ nagy-etal-2023-treeswap
+
+
+ Automatic Assessment Of Spoken English Proficiency Based on Multimodal and Multitask Transformers
+ KamelNebhi
+ GyörgySzaszák
+ 769–776
+ This paper describes technology developed to automatically grade students on their English spontaneous spoken language proficiency with a Common European Framework of Reference for Languages (CEFR) level. Our automated assessment system contains two tasks: elicited imitation and spontaneous speech assessment. Spontaneous speech assessment is a challenging task that requires evaluating various aspects of speech quality, content, and coherence. In this paper, we propose a multimodal and multitask transformer model that leverages both audio and text features to perform three tasks: scoring, coherence modeling, and prompt relevancy scoring. Our model uses a fusion of multiple features and multiple modality attention to capture the interactions between audio and text modalities and learn from different sources of information.
+ 2023.ranlp-1.83
+ nebhi-szaszak-2023-automatic
+
+
+ Medical Concept Mention Identification in Social Media Posts Using a Small Number of Sample References
+ VasudevanNedumpozhimana
+ SnehaRautmare
+ MeeganGower
+ NishthaJain
+ MajaPopović
+ PatriciaBuffini
+ JohnKelleher
+ 777–784
+ Identification of mentions of medical concepts in social media text can provide useful information for caseload prediction of diseases like Covid-19 and Measles. We propose a simple model for the automatic identification of the medical concept mentions in the social media text. We validate the effectiveness of the proposed model on Twitter, Reddit, and News/Media datasets.
+ 2023.ranlp-1.84
+ nedumpozhimana-etal-2023-medical
+
+
+ Context-Aware Module Selection in Modular Dialog Systems
+ JanNehring
+ René MarcelBerk
+ StefanHillmann
+ 785–791
+ In modular dialog systems, a dialog system consists of multiple conversational agents. The task “module selection” selects the appropriate sub-dialog system for an incoming user utterance. Current models for module selection use features derived from the current user turn only, such as the utterance’s text or confidence values of the natural language understanding systems of the individual conversational agents, or they perform text classification on the user utterance. However, dialogs often span multiple turns, and turns are embedded into a context. Therefore, looking at the current user turn only is a source of error in certain situations. This work proposes four models for module selection that include the dialog history and the current user turn into module selection. We show that these models surpass the current state of the art in module selection.
+ 2023.ranlp-1.85
+ nehring-etal-2023-context
+
+
+ Human Value Detection from Bilingual Sensory Product Reviews
+ BoyuNiu
+ CélineManetta
+ FrédériqueSegond
+ 792–802
+ We applied text classification methods on a corpus of product reviews we created with the help of a questionnaire. We found that for certain values, “traditional” deep neural networks like CNN can give promising results compared to the baseline. We propose some ideas to improve the results in the future. The bilingual corpus we created, which contains more than 16,000 consumer reviews associated with the human value profile of the authors, can be used for different marketing purposes.
+ 2023.ranlp-1.86
+ niu-etal-2023-human
+
+
+ Word Sense Disambiguation for Automatic Translation of Medical Dialogues into Pictographs
+ MagaliNorré
+ RémiCardon
+ VincentVandeghinste
+ ThomasFrançois
+ 803–812
+ Word sense disambiguation is an NLP task embedded in different applications. We propose to evaluate its contribution to the automatic translation of French texts into pictographs, in the context of communication between doctors and patients with an intellectual disability. Different general and/or medical language models (Word2Vec, fastText, CamemBERT, FlauBERT, DrBERT, and CamemBERT-bio) are tested in order to choose semantically correct pictographs leveraging the synsets in the French WordNets (WOLF and WoNeF). The results of our automatic evaluations show that our method based on Word2Vec and fastText significantly improves the precision of medical translations into pictographs. We also present an evaluation corpus adapted to this task.
+ 2023.ranlp-1.87
+ norre-etal-2023-word
+
+
+ A Research-Based Guide for the Creation and Deployment of a Low-Resource Machine Translation System
+ John E.Ortega
+ KennethChurch
+ 813–823
+ The machine translation (MT) field seems to focus heavily on English and other high-resource languages. However, low-resource MT (LRMT) is receiving more attention than in the past. Successful LRMT systems (LRMTS) should make a compelling business case in terms of demand, cost and quality in order to be viable for end users. When used by communities where low-resource languages are spoken, LRMT quality should not only be determined by the use of traditional metrics like BLEU, but it should also take into account other factors in order to be inclusive and not risk overall rejection by the community. MT systems based on neural methods tend to perform better with high volumes of training data, but they may be unrealistic and even harmful for LRMT. It is obvious that for research purposes, the development and creation of LRMTS is necessary. However, in this article, we argue that two main workarounds could be considered by companies that are considering deployment of LRMTS in the wild: human-in-the-loop and sub-domains.
+ 2023.ranlp-1.88
+ ortega-church-2023-research
+
+
+ MQDD: Pre-training of Multimodal Question Duplicity Detection for Software Engineering Domain
+ JanPasek
+ JakubSido
+ MiloslavKonopik
+ OndrejPrazak
+ 824–835
+ This work proposes a new pipeline for leveraging data collected on the Stack Overflow website for pre-training a multimodal model for searching duplicates on question answering websites. Our multimodal model is trained on question descriptions and source codes in multiple programming languages. We design two new learning objectives to improve duplicate detection capabilities. The result of this work is a mature, fine-tuned Multimodal Question Duplicity Detection (MQDD) model, ready to be integrated into a Stack Overflow search system, where it can help users find answers for already answered questions. Alongside the MQDD model, we release two datasets related to the software engineering domain. The first Stack Overflow Dataset (SOD) represents a massive corpus of paired questions and answers. The second Stack Overflow Duplicity Dataset (SODD) contains data for training duplicate detection models.
+ 2023.ranlp-1.89
+ pasek-etal-2023-mqdd
+
+
+ Forming Trees with Treeformers
+ NilayPatel
+ JeffreyFlanigan
+ 836–845
+ Human language is known to exhibit a nested, hierarchical structure, allowing us to form complex sentences out of smaller pieces. However, many state-of-the-art neural network models such as Transformers have no explicit hierarchical structure in their architecture—that is, they don’t have an inductive bias toward hierarchical structure. Additionally, Transformers are known to perform poorly on compositional generalization tasks which require such structures. In this paper, we introduce Treeformer, a general-purpose encoder module inspired by the CKY algorithm which learns a composition operator and pooling function to construct hierarchical encodings for phrases and sentences. Our extensive experiments demonstrate the benefits of incorporating hierarchical structure into the Transformer and show significant improvements in compositional generalization as well as in downstream tasks such as machine translation, abstractive summarization, and various natural language understanding tasks.
+ 2023.ranlp-1.90
+ patel-flanigan-2023-forming
+
+
+ Evaluating Unsupervised Hierarchical Topic Models Using a Labeled Dataset
+ JudicaelPoumay
+ AshwinIttoo
+ 846–853
+ Topic modeling is a commonly used method for identifying and extracting topics from a corpus of documents. While several evaluation techniques, such as perplexity and topic coherence, have been developed to assess the quality of extracted topics, they fail to determine whether all topics have been identified and to what extent they have been represented. Additionally, hierarchical topic models have been proposed, but the quality of the hierarchy produced has not been adequately evaluated. This study proposes a novel approach to evaluating topic models that supplements existing methods. Using a labeled dataset, we trained hierarchical topic models in an unsupervised manner and used the known labels to evaluate the accuracy of the results. Our findings indicate that labels encompassing a substantial number of documents achieve high accuracy of over 70%. Although there are 90 labels in the dataset, labels that cover only 1% of the data still achieve an average accuracy of 37.9%, demonstrating the effectiveness of hierarchical topic models even on smaller subsets. Furthermore, we demonstrate that these labels can be used to assess the quality of the topic tree and confirm that hierarchical topic models produce coherent taxonomies for the labels.
+ 2023.ranlp-1.91
+ poumay-ittoo-2023-evaluating
+
+
+ HTMOT: Hierarchical Topic Modelling over Time
+ JudicaelPoumay
+ AshwinIttoo
+ 854–863
+ Topic models provide an efficient way of extracting insights from text and supporting decision-making. Recently, novel methods have been proposed to model topic hierarchy or temporality. Modeling temporality provides more precise topics by separating topics that are characterized by similar words but located over distinct time periods. Conversely, modeling hierarchy provides a more detailed view of the content of a corpus by providing topics and sub-topics. However, no models have been proposed to incorporate both hierarchy and temporality which could be beneficial for applications such as environment scanning. Therefore, we propose a novel method to perform Hierarchical Topic Modelling Over Time (HTMOT). We evaluate the performance of our approach on a corpus of news articles using the Word Intrusion task. Results demonstrate that our model produces topics that elegantly combine a hierarchical structure and a temporal aspect. Furthermore, our proposed Gibbs sampling implementation shows competitive performance compared to previous state-of-the-art methods.
+ 2023.ranlp-1.92
+ poumay-ittoo-2023-htmot
+
+
+ Multilingual Continual Learning Approaches for Text Classification
+ KaranPraharaj
+ IrinaMatveeva
+ 864–870
+ Multilingual continual learning is important for models that are designed to be deployed over long periods of time and are required to be updated when new data becomes available. Such models are continually applied to new unseen data that can be in any of the supported languages. One challenge in this scenario is to ensure consistent performance of the model throughout the deployment lifecycle, beginning from the moment of first deployment. We empirically assess the strengths and shortcomings of some continual learning methods in a multilingual setting across two tasks.
+ 2023.ranlp-1.93
+ praharaj-matveeva-2023-multilingual
+
+
+ Can Model Fusing Help Transformers in Long Document Classification? An Empirical Study
+ DamithPremasiri
+ TharinduRanasinghe
+ RuslanMitkov
+ 871–878
+ Text classification is an area of research which has been studied over the years in Natural Language Processing (NLP). Adapting NLP to multiple domains has introduced many new challenges for text classification and one of them is long document classification. While state-of-the-art transformer models provide excellent results in text classification, most of them have limitations in the maximum sequence length of the input sequence. The majority of the transformer models are limited to 512 tokens, and therefore, they struggle with long document classification problems. In this research, we explore employing Model Fusing for long document classification while comparing the results with well-known BERT and Longformer architectures.
+ 2023.ranlp-1.94
+ premasiri-etal-2023-model
+
+
+ Deep Learning Methods for Identification of Multiword Flower and Plant Names
+ DamithPremasiri
+ AmalHaddad Haddad
+ TharinduRanasinghe
+ RuslanMitkov
+ 879–887
+ Multiword Terms (MWTs) are domain-specific Multiword Expressions (MWE) where two or more lexemes converge to form a new unit of meaning. The task of processing MWTs is crucial in many Natural Language Processing (NLP) applications, including Machine Translation (MT) and terminology extraction. However, the automatic detection of those terms is a difficult task and more research is still required to give more insightful and useful results in this field. In this study, we seek to fill this gap using state-of-the-art transformer models. We evaluate both BERT-like discriminative transformer models and generative pre-trained transformer (GPT) models on this task, and we show that discriminative models perform better than current GPT models on the multiword term identification task for flower and plant names in English and Spanish. The best discriminative models achieve F1 scores of 94.3127% and 82.1733% on the English and Spanish data, respectively, while ChatGPT achieves only 63.3183% and 47.7925%, respectively.
+ 2023.ranlp-1.95
+ premasiri-etal-2023-deep
+
+
+ Improving Aspect-Based Sentiment with End-to-End Semantic Role Labeling Model
+ PavelPřibáň
+ OndrejPrazak
+ 888–897
+ This paper presents a series of approaches aimed at enhancing the performance of Aspect-Based Sentiment Analysis (ABSA) by utilizing extracted semantic information from a Semantic Role Labeling (SRL) model. We propose a novel end-to-end Semantic Role Labeling model that effectively captures most of the structured semantic information within the Transformer hidden state. We believe that this end-to-end model is well-suited for our newly proposed models that incorporate semantic information. We evaluate the proposed models in two languages, English and Czech, employing ELECTRA-small models. Our combined models improve ABSA performance in both languages. Moreover, we achieved new state-of-the-art results on the Czech ABSA.
+ 2023.ranlp-1.96
+ priban-prazak-2023-improving
+
+
+ huPWKP: A Hungarian Text Simplification Corpus
+ NoémiPrótár
+ Dávid MárkNemeskey
+ 898–907
+ In this article we introduce huPWKP, the first parallel corpus consisting of Hungarian standard language-simplified sentence pairs. As Hungarian is a quite low-resource language with regard to text simplification, we opted for translating an already existing corpus, PWKP (Zhu et al., 2010), on which we performed some cleaning in order to improve its quality. We evaluated the corpus both with the help of human evaluators and by training a seq2seq model on both the Hungarian corpus and the original (cleaned) English corpus. The Hungarian model performed slightly worse in terms of automatic metrics; however, the English model attains a SARI score close to the state of the art on the official PWKP set. According to the human evaluation, the corpus performs at around 3 on a scale ranging from 1 to 5 in terms of information retention and increase in simplification and around 3.7 in terms of grammaticality.
+ 2023.ranlp-1.97
+ protar-nemeskey-2023-hupwkp
+
+
+ Topic Modeling Using Community Detection on a Word Association Graph
+ Mahfuzur RahmanChowdhury
+ IntesurAhmed
+ FarigSadeque
+ MuhammadYanhaona
+ 908–917
+ Topic modeling of a text corpus is one of the most well-studied areas of information retrieval and knowledge discovery. Despite several decades of research in the area that begets an array of modeling tools, some common problems still obstruct automated topic modeling from matching users’ expectations. In particular, existing topic modeling solutions suffer when the distribution of words among the underlying topics is uneven or the topics are overlapped. Furthermore, many solutions ask the user to provide a topic count estimate as input, which limits their usefulness in modeling a corpus where such information is unavailable. We propose a new topic modeling approach that overcomes these shortcomings by formulating the topic modeling problem as a community detection problem in a word association graph/network that we generate from the text corpus. Experimental evaluation using multiple data sets of three different types of text corpora shows that our approach is superior to prominent topic modeling alternatives in most cases. This paper describes our approach and discusses the experimental findings.
+ 2023.ranlp-1.98
+ chowdhury-etal-2023-topic
+
+
+ Exploring Techniques to Detect and Mitigate Non-Inclusive Language Bias in Marketing Communications Using a Dictionary-Based Approach
+ Bharathi RajaChakravarthi
+ Prasanna KumarKumaresan
+ RahulPonnusamy
+ John P.McCrae
+ MichaelaComerford
+ JayMegaro
+ DenizKeles
+ LastFeremenga
+ 918–925
+ We propose a new dataset for detecting non-inclusive language in sentences in English. These sentences were gathered from public sites, explaining what is inclusive and what is non-inclusive. We also extracted potentially non-inclusive keywords/phrases from the guidelines from business websites. A phrase dictionary was created by using an automatic extension with a word embedding trained on a massive corpus of general English text. In the end, a phrase dictionary was constructed by hand-editing the previous one to exclude inappropriate expansions and add the keywords from the guidelines. In a business context, the words individuals use can significantly impact the culture of inclusion and the quality of interactions with clients and prospects. Knowing the right words to avoid helps customers of different backgrounds and historically excluded groups feel included. They can make it easier to have productive, engaging, and positive communications. You can find the dictionaries, the code, and the method for making requests for the corpus at (we will release the link for data and code once the paper is accepted).
+ 2023.ranlp-1.99
+ chakravarthi-etal-2023-exploring
+
+
+ Does the “Most Sinfully Decadent Cake Ever” Taste Good? Answering Yes/No Questions from Figurative Contexts
+ GeetanjaliRakshit
+ JeffreyFlanigan
+ 926–936
+ Figurative language is commonplace in natural language, and while making communication memorable and creative, can be difficult to understand. In this work, we investigate the robustness of Question Answering (QA) models on figurative text. Yes/no questions, in particular, are a useful probe of figurative language understanding capabilities of large language models. We propose FigurativeQA, a set of 1000 yes/no questions with figurative and non-figurative contexts, extracted from the domains of restaurant and product reviews. We show that state-of-the-art BERT-based QA models exhibit an average performance drop of up to 15% points when answering questions from figurative contexts, as compared to non-figurative ones. While models like GPT-3 and ChatGPT are better at handling figurative texts, we show that further performance gains can be achieved by automatically simplifying the figurative contexts into their non-figurative (literal) counterparts. We find that the best overall model is ChatGPT with chain-of-thought prompting to generate non-figurative contexts. Our work provides a promising direction for building more robust QA models with figurative language understanding capabilities.
+ 2023.ranlp-1.100
+ rakshit-flanigan-2023-sinfully
+
+
+ Modeling Easiness for Training Transformers with Curriculum Learning
+ LeonardoRanaldi
+ GiuliaPucci
+ Fabio MassimoZanzotto
+ 937–948
+ Directly learning from complex examples is generally problematic for humans and machines. Indeed, a better strategy is exposing learners to examples in a reasonable, pedagogically-motivated order. Curriculum Learning (CL) has been proposed to import this strategy when training machine learning models. In this paper, building on Curriculum Learning, we propose a novel, linguistically motivated measure to determine example complexity for organizing examples during learning. Our complexity measure - LRC- is based on length, rarity, and comprehensibility. Our resulting learning model is CL-LRC, that is, CL with LRC. Experiments on downstream tasks show that CL-LRC outperforms existing CL and non-CL methods for training BERT and RoBERTa from scratch. Furthermore, we analyzed different measures, including perplexity, loss, and learning curve of different models pre-trained from scratch, showing that CL-LRC performs better than the state-of-the-art.
+ 2023.ranlp-1.101
+ ranaldi-etal-2023-modeling
+
+
+ The Dark Side of the Language: Pre-trained Transformers in the DarkNet
+ LeonardoRanaldi
+ AriaNourbakhsh
+ Elena SofiaRuzzetti
+ AriannaPatrizi
+ DarioOnorati
+ MicheleMastromattei
+ FrancescaFallucchi
+ Fabio MassimoZanzotto
+ 949–960
+ Pre-trained Transformers are challenging human performances in many Natural Language Processing tasks. The massive datasets used for pre-training seem to be the key to their success on existing tasks. In this paper, we explore how a range of pre-trained natural language understanding models performs on definitely unseen sentences provided by classification tasks over a DarkNet corpus. Surprisingly, results show that syntactic and lexical neural networks perform on par with pre-trained Transformers even after fine-tuning. Only after what we call extreme domain adaptation, that is, retraining with the masked language model task on all the novel corpus, pre-trained Transformers reach their standard high results. This suggests that huge pre-training corpora may give Transformers unexpected help since they are exposed to many of the possible sentences.
+ 2023.ranlp-1.102
+ ranaldi-etal-2023-dark
+
+
+ PreCog: Exploring the Relation between Memorization and Performance in Pre-trained Language Models
+ LeonardoRanaldi
+ Elena SofiaRuzzetti
+ Fabio MassimoZanzotto
+ 961–967
+ Large Language Models (LLMs) are impressive machines with the ability to memorize, possibly generalized learning examples. We present here a small, focused contribution to the analysis of the interplay between memorization and performance of BERT in downstream tasks. We propose PreCog, a measure for evaluating memorization from pre-training, and we analyze its correlation with the BERT’s performance. Our experiments show that highly memorized examples are better classified, suggesting memorization is an essential key to success for BERT.
+ 2023.ranlp-1.103
+ ranaldi-etal-2023-precog
+
+
+ Publish or Hold? Automatic Comment Moderation in Luxembourgish News Articles
+ TharinduRanasinghe
+ AlistairPlum
+ ChristophPurschke
+ MarcosZampieri
+ 968–978
+ Recently, the internet has emerged as the primary platform for accessing news. In the majority of these news platforms, the users now have the ability to post comments on news articles and engage in discussions on various social media. While these features promote healthy conversations among users, they also serve as a breeding ground for spreading fake news, toxic discussions and hate speech. Moderating or removing such content is paramount to avoid unwanted consequences for the readers. However, apart from a few notable exceptions, most research on automatic moderation of news article comments has dealt with English and other high resource languages. This leaves under-represented or low-resource languages at a loss. Addressing this gap, we perform the first large-scale qualitative analysis of more than one million Luxembourgish comments posted over the course of 14 years. We evaluate the performance of state-of-the-art transformer models in Luxembourgish news article comment moderation. Furthermore, we analyse how the language of Luxembourgish news article comments has changed over time. We observe that machine learning models trained on old comments do not perform well on recent data. The findings in this work will be beneficial in building news comment moderation systems for many low-resource languages.
+ 2023.ranlp-1.104
+ ranasinghe-etal-2023-publish
+
+
+ Cross-Lingual Speaker Identification for Indian Languages
+ AmaanRizvi
+ AnupamJamatia
+ DwijenRudrapal
+ KunalChakma
+ BjörnGambäck
+ 979–987
+ The paper introduces a cross-lingual speaker identification system for Indian languages, utilising a Long Short-Term Memory dense neural network (LSTM-DNN). The system was trained on audio recordings in English and evaluated on data from Hindi, Kannada, Malayalam, Tamil, and Telugu, with a view to how factors such as phonetic similarity and native accent affect performance. The model was fed with MFCC (mel-frequency cepstral coefficient) features extracted from the audio file. For comparison, the corresponding mel-spectrogram images were also used as input to a ResNet-50 model, while the raw audio was used to train a Siamese network. The LSTM-DNN model outperformed the other two models as well as two more traditional baseline speaker identification models, showing that deep learning models are superior to probabilistic models for capturing low-level speech features and learning speaker characteristics.
+ 2023.ranlp-1.105
+ rizvi-etal-2023-cross
+
+
+ ‘ChemXtract’ A System for Extraction of Chemical Events from Patent Documents
+ PattabhiRK Rao
+ SobhaLalitha Devi
+ 988–995
+ ChemXtract's main goal is to extract the chemical events from patent documents. Event extraction requires that we first identify the names of chemical compounds involved in the events. Thus, in this work two extractions are done and they are (a) names of chemical compounds and (b) event that identify the specific involvement of the chemical compounds in a chemical reaction. Extraction of essential elements of a chemical reaction, generally known as Named Entity Recognition (NER), extracts the compounds, condition and yields, their specific role in reaction and assigns a label according to the role it plays within a chemical reaction. Whereas event extraction identifies the chemical event relations between the chemical compounds identified. Here in this work we have used Neural Conditional Random Fields (NCRF), which combines the power of artificial neural network (ANN) and CRFs. Different levels of features that include linguistic, orthographical and lexical clues are used. The results obtained are encouraging.
+ 2023.ranlp-1.106
+ rk-rao-lalitha-devi-2023-chemxtract
+
+
+ Mind the User! Measures to More Accurately Evaluate the Practical Value of Active Learning Strategies
+ JuliaRomberg
+ 996–1006
+ One solution to limited annotation budgets is active learning (AL), a collaborative process of human and machine to strategically select a small but informative set of examples. While current measures optimize AL from a pure machine learning perspective, we argue that for a successful transfer into practice, additional criteria must target the second pillar of AL, the human annotator. In text classification, e.g., where practitioners regularly encounter datasets with an increased number of imbalanced classes, measures like F1 fall short when finding all classes or identifying rare cases is required. We therefore introduce four measures that reflect class-related demands that users place on data acquisition. In a comprehensive comparison of uncertainty-based, diversity-based, and hybrid query strategies on six different datasets, we find that strong F1 performance is not necessarily associated with full class coverage. Uncertainty sampling outperforms diversity sampling in selecting minority classes and covering classes more efficiently, while diversity sampling excels in selecting less monotonous batches. Our empirical findings emphasize that a holistic view is essential when evaluating AL approaches to ensure their usefulness in practice - the actual, but often overlooked, goal of development. To this end, standard measures for assessing the performance of text classification need to be complemented by such that more appropriately reflect user needs.
+ 2023.ranlp-1.107
+ romberg-2023-mind
+
+
+ Event Annotation and Detection in Kannada-English Code-Mixed Social Media Data
+ SumukhS
+ AbhinavAppidi
+ ManishShrivastava
+ 1007–1014
+ Code-mixing (CM) is a frequently observed phenomenon on social media platforms in multilingual societies such as India. While the increase in code-mixed content on these platforms provides a good amount of data for studying various aspects of code-mixing, the lack of automated text analysis tools makes such studies difficult. To overcome the same, tools such as language identifiers, Parts-of-Speech (POS) taggers and Named Entity Recognition (NER) for analysing code-mixed data have been developed. One such important tool is Event Detection, an important information retrieval task which can be used to identify critical facts occurring in the vast streams of unstructured text data available. While event detection from text is a hard problem on its own, social media data adds to it with its informal nature, and code-mixed (Kannada-English) data further complicates the problem due to its word-level mixing, lack of structure and incomplete information. In this work, we have tried to address this problem. We have proposed guidelines for the annotation of events in Kannada-English CM data and provided some baselines for the same with careful feature selection.
+ 2023.ranlp-1.108
+ s-etal-2023-event
+
+
+ Three Approaches to Client Email Topic Classification
+ BranislavaŠandrih Todorović
+ KatarinaJosipović
+ JurijKodre
+ 1015–1022
+ This paper describes a use case that was implemented and is currently running in production at the Nova Ljubljanska Banka, that involves classifying incoming client emails in the Slovenian language according to their topics and priorities. Since the proposed approach relies only on the Named Entity Recogniser (NER) of personal names as a language-dependent resource (for the purpose of anonymisation), that is the only prerequisite for applying the approach to any other language.
+ 2023.ranlp-1.109
+ sandrih-todorovic-etal-2023-three
+
+
+ Exploring Abstractive Text Summarisation for Podcasts: A Comparative Study of BART and T5 Models
+ ParthSaxena
+ MoEl-Haj
+ 1023–1033
+ Podcasts have become increasingly popular in recent years, resulting in a massive amount of audio content being produced every day. Efficient summarisation of podcast episodes can enable better content management and discovery for users. In this paper, we explore the use of abstractive text summarisation methods to generate high-quality summaries of podcast episodes. We use pre-trained models, BART and T5, to fine-tune on a dataset of Spotify’s 100K podcast. We evaluate our models using automated metrics and human evaluation, and find that the BART model fine-tuned on the podcast dataset achieved a higher ROUGE-1 and ROUGE-L score compared to other models, while the T5 model performed better in terms of semantic meaning. The human evaluation indicates that both models produced high-quality summaries that were well received by participants. Our study demonstrates the effectiveness of abstractive summarisation methods for podcast episodes and offers insights for improving the summarisation of audio content.
+ 2023.ranlp-1.110
+ saxena-el-haj-2023-exploring
+
+
+ Exploring the Landscape of Natural Language Processing Research
+ TimSchopf
+ KarimArabi
+ FlorianMatthes
+ 1034–1045
+ As an efficient approach to understand, generate, and process natural language texts, research in natural language processing (NLP) has exhibited a rapid spread and wide adoption in recent years. Given the increasing research work in this area, several NLP-related approaches have been surveyed in the research community. However, a comprehensive study that categorizes established topics, identifies trends, and outlines areas for future research remains absent. Contributing to closing this gap, we have systematically classified and analyzed research papers in the ACL Anthology. As a result, we present a structured overview of the research landscape, provide a taxonomy of fields of study in NLP, analyze recent developments in NLP, summarize our findings, and highlight directions for future work.
+ 2023.ranlp-1.111
+ schopf-etal-2023-exploring
+
+
+ Efficient Domain Adaptation of Sentence Embeddings Using Adapters
+ TimSchopf
+ Dennis N.Schneider
+ FlorianMatthes
+ 1046–1053
+ Sentence embeddings enable us to capture the semantic similarity of short texts. Most sentence embedding models are trained for general semantic textual similarity tasks. Therefore, to use sentence embeddings in a particular domain, the model must be adapted to it in order to achieve good results. Usually, this is done by fine-tuning the entire sentence embedding model for the domain of interest. While this approach yields state-of-the-art results, all of the model’s weights are updated during fine-tuning, making this method resource-intensive. Therefore, instead of fine-tuning entire sentence embedding models for each target domain individually, we propose to train lightweight adapters. These domain-specific adapters do not require fine-tuning all underlying sentence embedding model parameters. Instead, we only train a small number of additional parameters while keeping the weights of the underlying sentence embedding model fixed. Training domain-specific adapters allows always using the same base model and only exchanging the domain-specific adapters to adapt sentence embeddings to a specific domain. We show that using adapters for parameter-efficient domain adaptation of sentence embeddings yields competitive performance within 1% of a domain-adapted, entirely fine-tuned sentence embedding model while only training approximately 3.6% of the parameters.
+ 2023.ranlp-1.112
+ schopf-etal-2023-efficient
+
+
+ AspectCSE: Sentence Embeddings for Aspect-Based Semantic Textual Similarity Using Contrastive Learning and Structured Knowledge
+ TimSchopf
+ EmanuelGerber
+ MalteOstendorff
+ FlorianMatthes
+ 1054–1065
+ Generic sentence embeddings provide coarse-grained approximation of semantic textual similarity, but ignore specific aspects that make texts similar. Conversely, aspect-based sentence embeddings provide similarities between texts based on certain predefined aspects. Thus, similarity predictions of texts are more targeted to specific requirements and more easily explainable. In this paper, we present AspectCSE, an approach for aspect-based contrastive learning of sentence embeddings. Results indicate that AspectCSE achieves an average improvement of 3.97% on information retrieval tasks across multiple aspects compared to the previous best results. We also propose the use of Wikidata knowledge graph properties to train models of multi-aspect sentence embeddings in which multiple specific aspects are simultaneously considered during similarity predictions. We demonstrate that multi-aspect embeddings outperform even single-aspect embeddings on aspect-specific information retrieval tasks. Finally, we examine the aspect-based sentence embedding space and demonstrate that embeddings of semantically similar aspect labels are often close, even without explicit similarity training between different aspect labels.
+ 2023.ranlp-1.113
+ schopf-etal-2023-aspectcse
+
+
+ Tackling the Myriads of Collusion Scams on YouTube Comments of Cryptocurrency Videos
+ SadatShahriar
+ ArjunMukherjee
+ 1066–1075
+ Despite repeated measures, YouTube’s comment section has been a fertile ground for scammers. With the growth of the cryptocurrency market and obscurity around it, a new form of scam, namely “Collusion Scam” has emerged as a dominant force within YouTube’s comment space. Unlike typical scams and spams, collusion scams employ a cunning persuasion strategy, using the facade of genuine social interactions within comment threads to create an aura of trust and success to entrap innocent users. In this research, we collect 1,174 such collusion scam threads and perform a detailed analysis, which is tailored towards the successful detection of these scams. We find that utilization of the collusion dynamics can provide an accuracy of 96.67% and an F1-score of 93.04%. Furthermore, we demonstrate the robust predictive power of metadata associated with these threads and user channels, which act as compelling indicators of collusion scams. Finally, we show that modern LLMs, like ChatGPT, can effectively detect collusion scams without the need for any training.
+ 2023.ranlp-1.114
+ shahriar-mukherjee-2023-tackling
+
+
+ Exploring Deceptive Domain Transfer Strategies: Mitigating the Differences among Deceptive Domains
+ SadatShahriar
+ ArjunMukherjee
+ OmprakashGnawali
+ 1076–1084
+ Deceptive text poses a significant threat to users, resulting in widespread misinformation and disorder. While researchers have created numerous cutting-edge techniques for detecting deception in domain-specific settings, whether there is a generic deception pattern so that deception-related knowledge in one domain can be transferred to the other remains mostly unexplored. Moreover, the disparities in textual expression across these many mediums pose an additional obstacle for generalization. To this end, we present a Multi-Task Learning (MTL)-based deception generalization strategy to reduce the domain-specific noise and facilitate a better understanding of deception via a generalized training. As deceptive domains, we use News (fake news), Tweets (rumors), and Reviews (fake reviews) and employ LSTM and BERT model to incorporate domain transfer techniques. Our proposed architecture for the combined approach of domain-independent and domain-specific training improves the deception detection performance by up to 5.28% in F1-score.
+ 2023.ranlp-1.115
+ shahriar-etal-2023-exploring
+
+
+ Party Extraction from Legal Contract Using Contextualized Span Representations of Parties
+ SanjeepanSivapiran
+ CharanganVasantharajan
+ UthayasankerThayasivam
+ 1085–1094
+ Extracting legal entities from legal documents, particularly legal parties in contract documents, poses a significant challenge for legal assistive software. Many existing party extraction systems tend to generate numerous false positives due to the complex structure of the legal text. In this study, we present a novel and accurate method for extracting parties from legal contract documents by leveraging contextual span representations. To facilitate our approach, we have curated a large-scale dataset comprising 1000 contract documents with party annotations. Our method incorporates several enhancements to the SQuAD 2.0 question-answering system, specifically tailored to handle the intricate nature of the legal text. These enhancements include modifications to the activation function, an increased number of encoder layers, and the addition of normalization and dropout layers stacked on top of the output encoder layer. Baseline experiments reveal that our model, fine-tuned on our dataset, outperforms the current state-of-the-art model. Furthermore, we explore various combinations of the aforementioned techniques to further enhance the accuracy of our method. By employing a hybrid approach that combines 24 encoder layers with normalization and dropout layers, we achieve the best results, exhibiting an exact match score of 0.942 (+6.2% improvement).
+ 2023.ranlp-1.116
+ sivapiran-etal-2023-party
+
+
+ From Fake to Hyperpartisan News Detection Using Domain Adaptation
+ Răzvan-AlexandruSmădu
+ Sebastian-VasileEchim
+ Dumitru-ClementinCercel
+ IulianaMarin
+ FlorinPop
+ 1095–1109
+ Unsupervised Domain Adaptation (UDA) is a popular technique that aims to reduce the domain shift between two data distributions. It was successfully applied in computer vision and natural language processing. In the current work, we explore the effects of various unsupervised domain adaptation techniques between two text classification tasks: fake and hyperpartisan news detection. We investigate the knowledge transfer from fake to hyperpartisan news detection without involving target labels during training. Thus, we evaluate UDA, cluster alignment with a teacher, and cross-domain contrastive learning. Extensive experiments show that these techniques improve performance, while including data augmentation further enhances the results. In addition, we combine clustering and topic modeling algorithms with UDA, resulting in improved performances compared to the initial UDA setup.
+ 2023.ranlp-1.117
+ smadu-etal-2023-fake
+
+
+ Prompt-Based Approach for Czech Sentiment Analysis
+ JakubŠmíd
+ PavelPřibáň
+ 1110–1120
+ This paper introduces the first prompt-based methods for aspect-based sentiment analysis and sentiment classification in Czech. We employ the sequence-to-sequence models to solve the aspect-based tasks simultaneously and demonstrate the superiority of our prompt-based approach over traditional fine-tuning. In addition, we conduct zero-shot and few-shot learning experiments for sentiment classification and show that prompting yields significantly better results with limited training examples compared to traditional fine-tuning. We also demonstrate that pre-training on data from the target domain can lead to significant improvements in a zero-shot scenario.
+ 2023.ranlp-1.118
+ smid-priban-2023-prompt
+
+
+ Measuring Gender Bias in Natural Language Processing: Incorporating Gender-Neutral Linguistic Forms for Non-Binary Gender Identities in Abusive Speech Detection
+ NasimSobhani
+ KinshukSengupta
+ Sarah JaneDelany
+ 1121–1131
+ Predictions from machine learning models can reflect bias in the data on which they are trained. Gender bias has been shown to be prevalent in natural language processing models. The research into identifying and mitigating gender bias in these models predominantly considers gender as binary, male and female, neglecting the fluidity and continuity of gender as a variable. In this paper, we present an approach to evaluate gender bias in a prediction task, which recognises the non-binary nature of gender. We gender-neutralise a random subset of existing real-world hate speech data. We extend the existing template approach for measuring gender bias to include test examples that are gender-neutral. Measuring the bias across a selection of hate speech datasets we show that the bias for the gender-neutral data is closer to that seen for test instances that identify as male than those that identify as female.
+ 2023.ranlp-1.119
+ sobhani-etal-2023-measuring
+
+
+ LeSS: A Computationally-Light Lexical Simplifier for Spanish
+ SanjaStajner
+ DanielIbanez
+ HoracioSaggion
+ 1132–1142
+ Due to having knowledge of only basic vocabulary, many people cannot understand up-to-date written information and thus cannot make informed decisions or fully participate in society. We propose LeSS, a modular lexical simplification architecture that outperforms state-of-the-art lexical simplification systems for Spanish. In addition to its state-of-the-art performance, LeSS is computationally light, using much less disk space, CPU and GPU, and having faster loading and execution time than the transformer-based lexical simplification models which are predominant in the field.
+ 2023.ranlp-1.120
+ stajner-etal-2023-less
+
+
+ Hindi to Dravidian Language Neural Machine Translation Systems
+ VijaySundar Ram
+ SobhaLalitha Devi
+ 1143–1150
+ Neural machine translation (NMT) has achieved state-of-art performance in high-resource language pairs, but the performance of NMT drops in low-resource conditions. Morphologically rich languages are yet another challenge in NMT. The common strategy to handle this issue is to apply sub-word segmentation. In this work, we compare the morphologically inspired segmentation methods against the Byte Pair Encoding (BPE) in processing the input for building NMT systems for Hindi to Malayalam and Hindi to Tamil, where Hindi is an Indo-Aryan language and Malayalam and Tamil are south Dravidian languages. These two languages are low resource, morphologically rich and agglutinative. Malayalam is more agglutinative than Tamil. We show that for both the language pairs, the morphological segmentation algorithm out-performs BPE. We also present an elaborate analysis on translation outputs from both the NMT systems.
+ 2023.ranlp-1.121
+ sundar-ram-lalitha-devi-2023-hindi
+
+
+ Looking for Traces of Textual Deepfakes in Bulgarian on Social Media
+ IrinaTemnikova
+ IvaMarinova
+ SilviaGargova
+ RuslanaMargova
+ IvanKoychev
+ 1151–1161
+ Textual deepfakes can cause harm, especially on social media. At the moment, there are models trained to detect deepfake messages mainly for the English language, but no research or datasets currently exist for detecting them in most low-resource languages, such as Bulgarian. To address this gap, we explore three approaches. First, we machine translate an English-language social media dataset with bot messages into Bulgarian. However, the translation quality is unsatisfactory, leading us to create a new Bulgarian-language dataset with real social media messages and those generated by two language models (a new Bulgarian GPT-2 model – GPT-WEB-BG, and ChatGPT). We machine translate it into English and test existing English GPT-2 and ChatGPT detectors on it, achieving only 0.44-0.51 accuracy. Next, we train our own classifiers on the Bulgarian dataset, obtaining an accuracy of 0.97. Additionally, we apply the classifier with the highest results to a recently released Bulgarian social media dataset with manually fact-checked messages, which successfully identifies some of the messages as generated by Language Models (LM). Our results show that the use of machine translation is not suitable for textual deepfakes detection. We conclude that combining LM text detection with fact-checking is the most appropriate method for this task, and that identifying Bulgarian textual deepfakes is indeed possible.
+ 2023.ranlp-1.122
+ temnikova-etal-2023-looking
+
+
+ Propaganda Detection in Russian Telegram Posts in the Scope of the Russian Invasion of Ukraine
+ NataliaVanetik
+ MarinaLitvak
+ EgorReviakin
+ MargaritaTiamanova
+ 1162–1170
+ The emergence of social media has made it more difficult to recognize and analyze misinformation efforts. Popular messaging software Telegram has developed into a medium for disseminating political messages and misinformation, particularly in light of the conflict in Ukraine. In this paper, we introduce a sizable corpus of Telegram posts containing pro-Russian propaganda and benign political texts. We evaluate the corpus by applying natural language processing (NLP) techniques to the task of text classification in this corpus. Our findings indicate that, with an overall accuracy of over 96% for confirmed sources as propagandists and oppositions and 92% for unconfirmed sources, our method can successfully identify and categorize pro-Russian propaganda posts. We highlight the consequences of our research for comprehending political communications and propaganda on social media.
+ 2023.ranlp-1.123
+ vanetik-etal-2023-propaganda
+
+
+ Auto-Encoding Questions with Retrieval Augmented Decoding for Unsupervised Passage Retrieval and Zero-Shot Question Generation
+ StalinVaranasi
+ Muhammad Umer TariqButt
+ GuenterNeumann
+ 1171–1179
+ Dense passage retrieval models have become state-of-the-art for information retrieval on many Open-domain Question Answering (ODQA) datasets. However, most of these models rely on supervision obtained from the ODQA datasets, which hinders their performance in a low-resource setting. Recently, retrieval-augmented language models have been proposed to improve both zero-shot and supervised information retrieval. However, these models have pre-training tasks that are agnostic to the target task of passage retrieval. In this work, we propose Retrieval Augmented Auto-encoding of Questions for zero-shot dense information retrieval. Unlike other pre-training methods, our pre-training method is built for target information retrieval, thereby making the pre-training more efficient. Our method consists of a dense IR model for encoding questions and retrieving documents during training and a conditional language model that maximizes the question’s likelihood by marginalizing over retrieved documents. As a by-product, we can use this conditional language model for zero-shot question generation from documents. We show that the IR model obtained through our method improves the current state-of-the-art of zero-shot dense information retrieval, and we improve the results even further by training on a synthetic corpus created by zero-shot question generation.
+ 2023.ranlp-1.124
+ varanasi-etal-2023-auto
+
+
+ NoHateBrazil: A Brazilian Portuguese Text Offensiveness Analysis System
+ FrancielleVargas
+ IsabelleCarvalho
+ WolfgangSchmeisser-Nieto
+ FabrícioBenevenuto
+ ThiagoPardo
+ 1180–1186
+ Hate speech is a surely relevant problem in Brazil. Nevertheless, its regulation is not effective due to the difficulty to identify, quantify and classify offensive comments. Here, we introduce a novel system for offensive comment analysis in Brazilian Portuguese. The system titled “NoHateBrazil” recognizes explicit and implicit offensiveness in context at a fine-grained level. Specifically, we propose a framework for data collection, human annotation and machine learning models that were used to build the system. In addition, we assess the potential of our system to reflect stereotypical beliefs against marginalized groups by contrasting them with counter-stereotypes. As a result, a friendly web application was implemented, which besides presenting relevant performance, showed promising results towards mitigation of the risk of reinforcing social stereotypes. Lastly, new measures were proposed to improve the explainability of offensiveness classification and reliability of the model’s predictions.
+ 2023.ranlp-1.125
+ vargas-etal-2023-nohatebrazil
+
+
+ Socially Responsible Hate Speech Detection: Can Classifiers Reflect Social Stereotypes?
+ FrancielleVargas
+ IsabelleCarvalho
+ AliHürriyetoğlu
+ ThiagoPardo
+ FabrícioBenevenuto
+ 1187–1196
+ Recent studies have shown that hate speech technologies may propagate social stereotypes against marginalized groups. Nevertheless, there has been a lack of realistic approaches to assess and mitigate biased technologies. In this paper, we introduce a new approach to analyze the potential of hate-speech classifiers to reflect social stereotypes through the investigation of stereotypical beliefs by contrasting them with counter-stereotypes. We empirically measure the distribution of stereotypical beliefs by analyzing the distinctive classification of tuples containing stereotypes versus counter-stereotypes in machine learning models and datasets. Experiment results show that hate speech classifiers attribute unreal or negligent offensiveness to social identity groups by reflecting and reinforcing stereotypical beliefs regarding minorities. Furthermore, we also found that models that embed expert and context information from offensiveness markers present promising results to mitigate social stereotype bias towards socially responsible hate speech detection.
+ 2023.ranlp-1.126
+ vargas-etal-2023-socially
+
+
+ Predicting Sentence-Level Factuality of News and Bias of Media Outlets
+ FrancielleVargas
+ KokilJaidka
+ ThiagoPardo
+ FabrícioBenevenuto
+ 1197–1206
+ Automated news credibility and fact-checking at scale require accurate prediction of news factuality and media bias. This paper introduces a large sentence-level dataset, titled “FactNews”, composed of 6,191 sentences expertly annotated according to factuality and media bias definitions proposed by AllSides. We use FactNews to assess the overall reliability of news sources by formulating two text classification problems for predicting sentence-level factuality of news reporting and bias of media outlets. Our experiments demonstrate that biased sentences present a higher number of words compared to factual sentences, besides having a predominance of emotions. Hence, the fine-grained analysis of subjectivity and impartiality of news articles showed promising results for predicting the reliability of entire media outlets. Finally, due to the severity of fake news and political polarization in Brazil, and the lack of research for Portuguese, both dataset and baseline were proposed for Brazilian Portuguese.
+ 2023.ranlp-1.127
+ vargas-etal-2023-predicting
+
+
+ Classification of US Supreme Court Cases Using BERT-Based Techniques
+ ShubhamVatsal
+ AdamMeyers
+ John E.Ortega
+ 1207–1215
+ Models based on bidirectional encoder representations from transformers (BERT) produce state of the art (SOTA) results on many natural language processing (NLP) tasks such as named entity recognition (NER), part-of-speech (POS) tagging etc. An interesting phenomenon occurs when classifying long documents such as those from the US supreme court where BERT-based models can be considered difficult to use on a first-pass or out-of-the-box basis. In this paper, we experiment with several BERT-based classification techniques for US supreme court decisions or supreme court database (SCDB) and compare them with the previous SOTA results. We then compare our results specifically with SOTA models for long documents. We compare our results for two classification tasks: (1) a broad classification task with 15 categories and (2) a fine-grained classification task with 279 categories. Our best result produces an accuracy of 80% on the 15 broad categories and 60% on the fine-grained 279 categories which marks an improvement of 8% and 28% respectively from previously reported SOTA results.
+ 2023.ranlp-1.128
+ vatsal-etal-2023-classification
+
+
+ Kāraka-Based Answer Retrieval for Question Answering in Indic Languages
+ DevikaVerma
+ Ramprasad S.Joshi
+ Aiman A.Shivani
+ Rohan D.Gupta
+ 1216–1224
+ Kārakas from ancient Paninian grammar form a concise set of semantic roles that capture a crucial aspect of sentence meaning pivoted on the action verb. In this paper, we propose employing a kāraka-based approach for retrieving answers in Indic question-answering systems. To study and evaluate this novel approach, empirical experiments are conducted over large benchmark corpora in Hindi and Marathi. The results obtained demonstrate the effectiveness of the proposed method. Additionally, we explore the varying impact of two approaches for extracting kārakas. The literature surveyed and experiments conducted encourage hope that kāraka annotation can improve communication with machines using natural languages, particularly in low-resource languages.
+ 2023.ranlp-1.129
+ verma-etal-2023-karaka
+
+
+ Comparative Analysis of Named Entity Recognition in the Dungeons and Dragons Domain
+ GayashanWeerasundara
+ Nisansade Silva
+ 1225–1233
+ Some Natural Language Processing (NLP) tasks that are in the sufficiently solved state for general domain English still struggle to attain the same level of performance in specific domains. Named Entity Recognition (NER), which aims to find and categorize entities in text is such a task met with difficulties in adapting to domain specificity. This paper compares the performance of 10 NER models on 7 adventure books from the Dungeons and Dragons (D&D) domain which is a subdomain of fantasy literature. Fantasy literature, being rich and diverse in vocabulary, poses considerable challenges for conventional NER. In this study, we use open-source Large Language Models (LLM) to annotate the named entities and character names in each number of official D&D books and evaluate the precision and distribution of each model. The paper aims to identify the challenges and opportunities for improving NER in fantasy literature. Our results show that even in the off-the-shelf configuration, Flair, Trankit, and Spacy achieve better results for identifying named entities in the D&D domain compared to their peers.
+ 2023.ranlp-1.130
+ weerasundara-de-silva-2023-comparative
+
+
+ Comparative Analysis of Anomaly Detection Algorithms in Text Data
+ YizhouXu
+ KataGábor
+ JérômeMilleret
+ FrédériqueSegond
+ 1234–1245
+ Text anomaly detection (TAD) is a crucial task that aims to identify texts that deviate significantly from the norm within a corpus. Despite its importance in various domains, TAD remains relatively underexplored in natural language processing. This article presents a systematic evaluation of 22 TAD algorithms on 17 corpora using multiple text representations, including monolingual and multilingual SBERT. The performance of the algorithms is compared based on three criteria: degree of supervision, theoretical basis, and architecture used. The results demonstrate that semi-supervised methods utilizing weak labels outperform both unsupervised methods and semi-supervised methods using only negative samples for training. Additionally, we explore the application of TAD techniques in hate speech detection. The results provide valuable insights for future TAD research and guide the selection of suitable algorithms for detecting text anomalies in different contexts.
+ 2023.ranlp-1.131
+ xu-etal-2023-comparative
+
+
+ Poetry Generation Combining Poetry Theme Labels Representations
+ YingyuYan
+ DongzhenWen
+ LiangYang
+ DongyuZhang
+ HongfeiLin
+ 1246–1255
+ Ancient Chinese poetry is the earliest literary genre that took shape in Chinese literature and has a dissemination effect, showing China’s profound cultural heritage. At the same time, the generation of ancient poetry is an important task in the field of digital humanities, which is of great significance to the inheritance of national culture and the education of ancient poetry. The current work in the field of poetry generation is mainly aimed at improving the fluency and structural accuracy of words and sentences, ignoring the theme unity of poetry generation results. In order to solve this problem, this paper proposes a graph neural network poetry theme representation model based on label embedding. On the basis of the network representation of poetry, the topic feature representation of poetry is constructed and learned from the granularity of words. Then, the features of the poetry theme representation model are combined with the autoregressive language model to construct a theme-oriented ancient Chinese poetry generation model TLPG (Poetry Generation with Theme Label). Through machine evaluation and evaluation by experts in related fields, the model proposed in this paper has significantly improved the topic consistency of poetry generation compared with existing work on the premise of ensuring the fluency and format accuracy of poetry.
+ 2023.ranlp-1.132
+ yan-etal-2023-poetry
+
+
+ Evaluating Generative Models for Graph-to-Text Generation
+ ShuzhouYuan
+ MichaelFaerber
+ 1256–1264
+ Large language models (LLMs) have been widely employed for graph-to-text generation tasks. However, the process of finetuning LLMs requires significant training resources and annotation work. In this paper, we explore the capability of generative models to generate descriptive text from graph data in a zero-shot setting. Specifically, we evaluate GPT-3 and ChatGPT on two graph-to-text datasets and compare their performance with that of finetuned LLM models such as T5 and BART. Our results demonstrate that generative models are capable of generating fluent and coherent text, achieving BLEU scores of 10.57 and 11.08 for the AGENDA and WebNLG datasets, respectively. However, our error analysis reveals that generative models still struggle with understanding the semantic relations between entities, and they also tend to generate text with hallucinations or irrelevant information. As a part of error analysis, we utilize BERT to detect machine-generated text and achieve high macro-F1 scores. We have made the text generated by generative models publicly available.
+ 2023.ranlp-1.133
+ yuan-faerber-2023-evaluating
+
+
+ Microsyntactic Unit Detection Using Word Embedding Models: Experiments on Slavic Languages
+ IuliiaZaitova
+ IrinaStenger
+ TaniaAvgustinova
+ 1265–1273
+ Microsyntactic units have been defined as language-specific transitional entities between lexicon and grammar, whose idiomatic properties are closely tied to syntax. These units are typically described based on individual constructions, making it difficult to understand them comprehensively as a class. This study proposes a novel approach to detect microsyntactic units using Word Embedding Models (WEMs) trained on six Slavic languages, namely Belarusian, Bulgarian, Czech, Polish, Russian, and Ukrainian, and evaluates how well these models capture the nuances of syntactic non-compositionality. To evaluate the models, we develop a cross-lingual inventory of microsyntactic units using the lists of microsyntactic units available at the Russian National Corpus. Our results demonstrate the effectiveness of WEMs in capturing microsyntactic units across all six Slavic languages under analysis. Additionally, we find that WEMs tailored for syntax-based tasks consistently outperform other WEMs at the task. Our findings contribute to the theory of microsyntax by providing insights into the detection of microsyntactic units and their cross-linguistic properties.
+ 2023.ranlp-1.134
+ zaitova-etal-2023-microsyntactic
+
+
+ Systematic TextRank Optimization in Extractive Summarization
+ MorrisZieve
+ AnthonyGregor
+ Frederik JuulStokbaek
+ HunterLewis
+ Ellis MarieMendoza
+ BenyaminAhmadnia
+ 1274–1281
+ With the ever-growing amount of textual data, extractive summarization has become increasingly crucial for efficiently processing information. The TextRank algorithm, a popular unsupervised method, offers excellent potential for this task. In this paper, we aim to optimize the performance of TextRank by systematically exploring and verifying the best preprocessing and fine-tuning techniques. We extensively evaluate text preprocessing methods, such as tokenization, stemming, and stopword removal, to identify the most effective combination with TextRank. Additionally, we examine fine-tuning strategies, including parameter optimization and incorporation of domain-specific knowledge, to achieve superior summarization quality.
+ 2023.ranlp-1.135
+ zieve-etal-2023-systematic
+
+
+
+
+ Proceedings of the 8th Student Research Workshop associated with the International Conference Recent Advances in Natural Language Processing
+ MomchilHardalov
+ ZaraKancheva
+ BorisVelichkov
+ IvelinaNikolova-Koleva
+ MilenaSlavcheva
+ INCOMA Ltd., Shoumen, Bulgaria
+ Varna, Bulgaria
+ September
+ 2023
+ 2023.ranlp-stud
+ ranlp
+
+
+ 2023.ranlp-stud.0
+ ranlp-2023-student
+
+
+ Detecting ChatGPT: A Survey of the State of Detecting ChatGPT-Generated Text
+ MahdiDhaini
+ WesselPoelman
+ EgeErdogan
+ 1–12
+ While recent advancements in the capabilities and widespread accessibility of generative language models, such as ChatGPT (OpenAI, 2022), have brought about various benefits by generating fluent human-like text, the task of distinguishing between human- and large language model (LLM) generated text has emerged as a crucial problem. These models can potentially deceive by generating artificial text that appears to be human-generated. This issue is particularly significant in domains such as law, education, and science, where ensuring the integrity of text is of the utmost importance. This survey provides an overview of the current approaches employed to differentiate between texts generated by humans and ChatGPT. We present an account of the different datasets constructed for detecting ChatGPT-generated text, the various methods utilized, what qualitative analyses into the characteristics of human versus ChatGPT-generated text have been performed, and finally, summarize our findings into general insights.
+ 2023.ranlp-stud.1
+ dhaini-etal-2023-detecting
+
+
+ Unsupervised Calibration through Prior Adaptation for Text Classification using Large Language Models
+ LautaroEstienne
+ 13–22
+ A wide variety of natural language tasks are currently being addressed with large-scale language models (LLMs). These models are usually trained with a very large amount of unsupervised text data and adapted to perform a downstream natural language task using methods like fine-tuning, calibration or in-context learning. In this work, we propose an approach to adapt the prior class distribution to perform text classification tasks without the need for labelled samples and only a few in-domain sample queries. The proposed approach treats the LLM as a black box, adding a stage where the model posteriors are calibrated to the task. Results show that these methods outperform the un-adapted model for different number of training shots in the prompt and a previous approach where calibration is performed without using any adaptation data.
+ 2023.ranlp-stud.2
+ 2023.ranlp-stud.2.OptionalSupplementaryMaterial.pdf
+ estienne-2023-unsupervised
+
+
+ Controllable Active-Passive Voice Generation using Prefix Tuning
+ ValentinKnappich
+ Timo PierreSchrader
+ 23–32
+ The prompting paradigm is an uprising trend in the field of Natural Language Processing (NLP) that aims to learn tasks by finding appropriate prompts rather than fine-tuning the model weights. Such prompts can express an intention, e.g., they can instruct a language model to generate a summary of a given event. In this paper, we study how to influence (”control”) the language generation process such that the outcome fulfills a requested linguistic property. More specifically, we look at controllable active-passive (AP) voice generation, i.e., we require the model to generate a sentence in the requested voice. We build upon the prefix tuning approach and introduce control tokens that are trained on controllable AP generation. We create an AP subset of the WebNLG dataset to fine-tune these control tokens. Among four different models, the one trained with a contrastive learning approach yields the best results in terms of AP accuracy ( 95%) but at the cost of decreased performance on the original WebNLG task.
+ 2023.ranlp-stud.3
+ knappich-schrader-2023-controllable
+
+
+ Age-Specific Linguistic Features of Depression via Social Media
+ CharlotteRosario
+ 33–43
+ Social media data has become a crucial resource for understanding and detecting mental health challenges. However, there is a significant gap in our understanding of age-specific linguistic markers associated with classifying depression. This study bridges the gap by analyzing 25,241 text samples from 15,156 Reddit users with self-reported depression across two age groups: adolescents (13-20 year olds) and adults (21+). Through a quantitative exploratory analysis using LIWC, topic modeling, and data visualization, distinct patterns and topical differences emerged in the language of depression for adolescents and adults, including social concerns, temporal focuses, emotions, and cognition. These findings enhance our understanding of how depression is expressed on social media, bearing implications for accurate classification and tailored interventions across different age groups.
+ 2023.ranlp-stud.4
+ rosario-2023-age
+
+
+ Trigger Warnings: A Computational Approach to Understanding User-Tagged Trigger Warnings
+ SarthakTyagi
+ AdwitaArora
+ KrishChopra
+ MananSuri
+ 44–54
+ Content and trigger warnings give information about the content of material prior to receiving it and are used by social media users to tag their content when discussing sensitive topics. Trigger warnings are known to yield benefits in terms of an increased individual agency to make an informed decision about engaging with content. At the same time, some studies contest the benefits of trigger warnings suggesting that they can induce anxiety and reinforce the traumatic experience of specific identities. Our study involves the analysis of the nature and implications of the usage of trigger warnings by social media users using empirical methods and machine learning. Further, we aim to study the community interactions associated with trigger warnings in online communities, precisely the diversity and content of responses and inter-user interactions. The domains of trigger warnings covered will include self-harm, drug abuse, suicide, and depression. The analysis of the above domains will assist in a better understanding of online behaviour associated with them and help in developing domain-specific datasets for further research.
+ 2023.ranlp-stud.5
+ tyagi-etal-2023-trigger
+
+
+ Evaluating Hallucinations in Large Language Models for Bulgarian Language
+ MelaniaBerbatova
+ YoanSalambashev
+ 55–63
+ In this short paper, we introduce the task of evaluating the hallucination of large language models for the Bulgarian language. We first give definitions of what is a hallucination in large language models and what evaluation methods for measuring hallucinations exist. Next, we give an overview of the multilingual evaluation of the latest large language models, focusing on the evaluation of the performance in Bulgarian on tasks, related to hallucination. We then present a method to evaluate the level of hallucination in a given language with no reference data, and provide some initial experiments with this method in Bulgarian. Finally, we provide directions for future research on the topic.
+ 2023.ranlp-stud.6
+ berbatova-salambashev-2023-evaluating
+
+
+ Leveraging Probabilistic Graph Models in Nested Named Entity Recognition for Polish
+ JędrzejJamnicki
+ 64–67
+ This paper presents ongoing work on leveraging probabilistic graph models, specifically conditional random fields and hidden Markov models, in nested named entity recognition for the Polish language. NER is a crucial task in natural language processing that involves identifying and classifying named entities in text documents. Nested NER deals with recognizing hierarchical structures of entities that overlap with one another, presenting additional challenges. The paper discusses the methodologies and approaches used in nested NER, focusing on CRF and HMM. Related works and their contributions are reviewed, and experiments using the KPWr dataset are conducted, particularly with the BiLSTM-CRF model and Word2Vec and HerBERT embeddings. The results show promise in addressing nested NER for Polish, but further research is needed to develop robust and accurate models for this complex task.
+ 2023.ranlp-stud.7
+ jamnicki-2023-leveraging
+
+
+ Crowdsourcing Veridicality Annotations in Spanish: Can Speakers Actually Agree?
+ TeresaMartín Soeder
+ 68–77
+ In veridicality studies, an area of research of Natural Language Inference (NLI), the factuality of different contexts is evaluated. This task, known to be a difficult one since often it is not clear what the interpretation should be (Uma et al., 2021), is key for building any Natural Language Understanding (NLU) system that aims at making the right inferences. Here the results of a study that analyzes the veridicality of mood alternation and specificity in Spanish, and whose labels are based on those of Saurí and Pustejovsky (2009) are presented. It has an inter-annotator agreement of AC2 = 0.114, considerably lower than that of de Marneffe et al. (2012) (κ = 0.53), a main reference to this work; and a couple of mood-related significant effects. Due to this strong lack of agreement, an analysis of what factors cause disagreement is presented together with a discussion based on the work of de Marneffe et al. (2012) and Pavlick and Kwiatkowski (2019) about the quality of the annotations gathered and whether other types of analysis like entropy distribution could better represent this corpus. The annotations collected are available at https://github.com/narhim/veridicality_spanish.
+ 2023.ranlp-stud.8
+ martin-soeder-2023-crowdsourcing
+
+
+ Weakly Supervised Learning for Aspect Based Sentiment Analysis of Urdu Tweets
+ ZoyaMaqsood
+ 78–86
+ Aspect-based sentiment analysis (ABSA) is vital for text comprehension which benefits applications across various domains. This field involves the two main sub-tasks including aspect extraction and sentiment classification. Existing methods to tackle this problem normally address only one sub-task or utilize topic models that may result in overlapping concepts. Moreover, such algorithms often rely on extensive labeled data and external language resources, making their application costly and time-consuming in new domains and especially for resource-poor languages like Urdu. The lack of aspect mining studies in Urdu literature further exacerbates the inapplicability of existing methods for Urdu language. The primary challenge lies in the preprocessing of data to ensure its suitability for language comprehension by the model, as well as the availability of appropriate pre-trained models, domain embeddings, and tools. This paper implements an ABSA model (CITATION) for unlabeled Urdu tweets with minimal user guidance, utilizing a small set of seed words for each aspect and sentiment class. The model first learns sentiment and aspect joint topic embeddings in the word embedding space with regularization to encourage topic distinctiveness. Afterwards, it employs deep neural models for pre-training with embedding-based predictions and self-training on unlabeled data. Furthermore, we optimize the model for improved performance by substituting the CNN with the BiLSTM classifier for sentence-level sentiment and aspect classification. Our optimized model achieves significant improvements over baselines in aspect and sentiment classification for Urdu tweets with accuracy of 64.8% and 72.8% respectively, demonstrating its effectiveness in generating joint topics and addressing existing limitations in Urdu ABSA.
+ 2023.ranlp-stud.9
+ maqsood-2023-weakly
+
+
+ Exploring Low-resource Neural Machine Translation for Sinhala-Tamil Language Pair
+ AshmariPramodya
+ 87–97
+ At present, Neural Machine Translation is a promising approach for machine translation. Transformer-based deep learning architectures in particular show a substantial performance increase in translating between various language pairs. However, many low-resource language pairs still struggle to lend themselves to Neural Machine Translation due to their data-hungry nature. In this article, we investigate methods of expanding the parallel corpus to enhance translation quality within a model training pipeline, starting from the initial collection of parallel data to the training process of baseline models. Grounded on state-of-the-art Neural Machine Translation approaches such as hyper-parameter tuning, and data augmentation with forward and backward translation, we define a set of best practices for improving Tamil-to-Sinhala machine translation and empirically validate our methods using standard evaluation metrics. Our results demonstrate that the Neural Machine Translation models trained on larger amounts of back-translated data outperform other synthetic data generation approaches in Transformer base training settings. We further demonstrate that, even for language pairs with limited resources, Transformer models are able to tune to outperform existing state-of-the-art Statistical Machine Translation models by as much as 3.28 BLEU points in the Tamil to Sinhala translation scenarios.
+ 2023.ranlp-stud.10
+ pramodya-2023-exploring
+
+
+ Prompting ChatGPT to Draw Morphological Connections for New Word Comprehension
+ Bianca-MadalinaZgreaban
+ RishabhSuresh
+ 98–107
+ Though more powerful, Large Language Models need to be periodically retrained for updated information, consuming resources and energy. In this respect, prompt engineering can prove a possible solution to re-training. To explore this line of research, this paper uses a case study, namely, finding the best prompting strategy for asking ChatGPT to define new words based on morphological connections. To determine the best prompting strategy, each definition provided by the prompt was ranked in terms of plausibility and humanlikeness criteria. The findings of this paper show that adding contextual information, operationalised as the keywords ‘new’ and ‘morpheme’, significantly improves the performance of the model for any prompt. While no single prompt significantly outperformed all others, there were differences between performances on the two criteria for most prompts. ChatGPT also provided the most correct definitions with a persona-type prompt.
+ 2023.ranlp-stud.11
+ 2023.ranlp-stud.11.OptionalSupplementaryMaterial.zip
+ zgreaban-suresh-2023-prompting
+
+
+
+
+ 2023.alp-1
+ 2023.contents-1
+ 2023.humeval-1
+ 2023.nlp4tia-1
+ 2023.tsar-1
+ 2023.case-1
+ 2023.dravidianlangtech-1
+ 2023.ltedi-1
+
+
+
diff --git a/data/xml/2023.tsar.xml b/data/xml/2023.tsar.xml
new file mode 100644
index 0000000000..0bf892026f
--- /dev/null
+++ b/data/xml/2023.tsar.xml
@@ -0,0 +1,161 @@
+
+
+
+
+ Proceedings of the Second Workshop on Text Simplification, Accessibility and Readability
+ SanjaŠtajner
+ HoracioSaggion
+ MatthewShardlow
+ FernandoAlva-Manchego
+ INCOMA Ltd., Shoumen, Bulgaria
+ Varna, Bulgaria
+ September
+ 2023
+ 2023.tsar-1
+ tsar
+ ws
+
+
+ 2023.tsar-1.0
+ tsar-2023-text
+
+
+ Using ChatGPT as a CAT tool in Easy Language translation
+ SilvanaDeilen
+ SergioHernández Garrido
+ EkaterinaLapshinova-Koltunski
+ ChristianeMaaß
+ 1–10
+ This study sets out to investigate the feasibility of using ChatGPT to translate citizen-oriented administrative texts into German Easy Language, a simplified, rule-based language variety that is adapted to the needs of people with reading impairments. We use ChatGPT to translate selected texts from websites of German public authorities using two strategies, i.e. linguistic and holistic. We analyse the quality of the generated texts based on different criteria, such as correctness, readability, and syntactic complexity. The results indicated that the generated texts are easier than the standard texts, but that they still do not fully meet the established Easy Language standards. Additionally, the content is not always rendered correctly.
+ 2023.tsar-1.1
+ deilen-etal-2023-using
+
+
+ Context-aware Swedish Lexical Simplification
+ EmilGraichen
+ ArneJonsson
+ 11–20
+ We present results from the development and evaluation of context-aware lexical simplification (LS) systems for the Swedish language. Three versions of LS models, LäsBERT, LäsBERT-baseline, and LäsGPT, were created and evaluated on a newly constructed Swedish LS evaluation dataset. The LS systems demonstrated promising potential in aiding audiences with reading difficulties by providing context-aware word replacements. While there were areas for improvement, particularly in complex word identification, the systems showed agreement with human annotators on word replacements.
+ 2023.tsar-1.2
+ graichen-jonsson-2023-context
+
+
+ TextSimplifier: A Modular, Extensible, and Context Sensitive Simplification Framework for Improved Natural Language Understanding
+ SandaruSeneviratne
+ EleniDaskalaki
+ HannaSuominen
+ 21–32
+ Natural language understanding is fundamental to knowledge acquisition in today’s information society. However, natural language is often ambiguous with frequent occurrences of complex terms, acronyms, and abbreviations that require substitution and disambiguation, for example, by “translation” from complex to simpler text for better understanding. These tasks are usually difficult for people with limited reading skills, second language learners, and non-native speakers. Hence, the development of text simplification systems that are capable of simplifying complex text is of paramount importance. Thus, we conducted a user study to identify which components are essential in a text simplification system. Based on our findings, we proposed an improved text simplification framework, covering a broader range of aspects related to lexical simplification — from complexity identification to lexical substitution and disambiguation — while supplementing the simplified outputs with additional information for better understandability. Based on the improved framework, we developed TextSimplifier, a modularised, context-sensitive, end-to-end simplification framework, and engineered its web implementation. This system targets lexical simplification that identifies complex terms and acronyms followed by their simplification through substitution and disambiguation for better understanding of complex language.
+ 2023.tsar-1.3
+ seneviratne-etal-2023-textsimplifier
+
+
+ Cross-lingual Mediation: Readability Effects
+ MariaKunilovskaya
+ RuslanMitkov
+ EvelineWandl-Vogt
+ 33–43
+ This paper explores the readability of translated and interpreted texts compared to the original source texts and target language texts in the same domain. It was shown in the literature that translated and interpreted texts could exhibit lexical and syntactic properties that make them simpler, and hence, easier to process than their sources or comparable non-translations. In translation, this effect is attributed to the tendency to simplify and disambiguate the message. In interpreting, it can be enhanced by the temporal and cognitive constraints. We use readability annotations from the Newsela corpus to formulate a number of classification and regression tasks and fine-tune a multilingual pre-trained model on these tasks, obtaining models that can differentiate between complex and simple sentences. Then, the models are applied to predict the readability of sources, targets, and comparable target language originals in a zero-shot manner. Our test data – parallel and comparable – come from English-German bidirectional interpreting and translation subsets from the Europarl corpus. The results confirm the difference in readability between translated/interpreted targets against sentences in standard originally-authored source and target languages. Besides, we find consistent differences between the translation directions in the English-German language pair.
+ 2023.tsar-1.4
+ kunilovskaya-etal-2023-cross
+
+
+ Simplification by Lexical Deletion
+ MatthewShardlow
+ PiotrPrzybyła
+ 44–50
+ Lexical simplification traditionally focuses on the replacement of tokens with simpler alternatives. However, in some cases the goal of this task (simplifying the form while preserving the meaning) may be better served by removing a word rather than replacing it. In fact, we show that existing datasets rely heavily on the deletion operation. We propose supervised and unsupervised solutions for lexical deletion based on classification, end-to-end simplification systems and custom language models. We contribute a new silver-standard corpus of lexical deletions (called SimpleDelete), which we mine from simple English Wikipedia edit histories and use to evaluate approaches to detecting superfluous words. The results show that even unsupervised approaches (TerseBERT) can achieve good performance in this new task. Deletion is one part of the wider lexical simplification puzzle, which we show can be isolated and investigated.
+ 2023.tsar-1.5
+ shardlow-przybyla-2023-simplification
+
+
+ Comparing Generic and Expert Models for Genre-Specific Text Simplification
+ ZihaoLi
+ MatthewShardlow
+ FernandoAlva-Manchego
+ 51–67
+ We investigate how text genre influences the performance of models for controlled text simplification. Treating datasets from Wikipedia and PubMed as two different genres, we compare the performance of genre-specific models trained by transfer learning with that of prompt-only GPT-like large language models. Our experiments showed that: (1) the performance loss of genre-specific models on general tasks can be limited to 2%, (2) transfer learning can improve performance on genre-specific datasets by up to 10% in SARI score over the base model without transfer learning, and (3) simplifications generated by the smaller but more customized models show similar simplicity and better meaning preservation than those of the larger generic models in both automatic and human evaluations.
+ 2023.tsar-1.6
+ li-etal-2023-comparing-generic
+
+
+ Automatic Text Simplification for People with Cognitive Disabilities: Resource Creation within the ClearText Project
+ IsabelEspinosa-Zaragoza
+ JoséAbreu-Salas
+ PalomaMoreda
+ ManuelPalomar
+ 68–77
+ This paper presents the ongoing work conducted within the ClearText project, specifically focusing on resource creation for the simplification of Spanish for people with cognitive disabilities. These resources include the CLEARSIM corpus and the Simple.Text tool. On the one hand, the corpus compilation process, carried out with the help of APSA, is described, along with information on whether these texts are bronze, silver or gold standard simplification versions of the original text. The goal is to reach 18,000 texts in total by the end of the project. On the other hand, we aim to explore Large Language Models (LLMs) in a sequence-to-sequence setup for text simplification at the document level. Therefore, the tool’s objectives, technical aspects, and the preliminary results derived from early experimentation are also presented. The initial results are subject to improvement, given that experimentation is at a very preliminary stage. Despite showcasing flaws inherent to generative models (e.g. hallucinations, repetitive text), we examine the resolutions (or lack thereof) of complex linguistic phenomena that can be learned from the corpus. These issues will be addressed throughout the remainder of this project. The expected positive results of this project that will impact society are three-fold in nature: scientific-technical, social, and economic.
+ 2023.tsar-1.7
+ espinosa-zaragoza-etal-2023-automatic
+
+
+ Towards Sentence-level Text Readability Assessment for French
+ Duy VanNgo
+ YannickParmentier
+ 78–84
+ In this paper, we report on some experiments aimed at exploring the relation between document-level and sentence-level readability assessment for French. These were run on an open-source tailored corpus, which was automatically created by aggregating various sources from children’s literature. On top of providing the research community with a freely available corpus, we report on sentence readability scores obtained when applying both classical approaches (aka readability formulas) and state-of-the-art deep learning techniques (e.g. fine-tuning of large language models). Results show a relatively strong correlation between document-level and sentence-level readability, suggesting ways to reduce the cost of building annotated sentence-level readability datasets.
+ 2023.tsar-1.8
+ ngo-parmentier-2023-towards
+
+
+ Document-level Text Simplification with Coherence Evaluation
+ LauraVásquez-Rodríguez
+ MatthewShardlow
+ PiotrPrzybyła
+ SophiaAnaniadou
+ 85–101
+ We present a coherence-aware evaluation of document-level Text Simplification (TS), an approach that has not been considered in TS so far. We improve current TS sentence-based models to support a multi-sentence setting and the implementation of a state-of-the-art neural coherence model for simplification quality assessment. We enhanced English sentence simplification neural models for document-level simplification using 136,113 paragraph-level samples from both the general and medical domains to generate multiple sentences. Additionally, we use document-level simplification, readability and coherence metrics for evaluation. Our contributions include the introduction of coherence assessment into simplification evaluation with the automatic evaluation of 34,052 simplifications, a fine-tuned state-of-the-art model for document-level simplification, a coherence-based analysis of our results and a human evaluation of 300 samples that demonstrates the challenges encountered when moving towards document-level simplification.
+ 2023.tsar-1.9
+ vasquez-rodriguez-etal-2023-document
+
+
+ LSLlama: Fine-Tuned LLaMA for Lexical Simplification
+ AnthonyBaez
+ HoracioSaggion
+ 102–108
+ Generative Large Language Models (LLMs), such as GPT-3, have become increasingly effective and versatile in natural language processing (NLP) tasks. One such task is Lexical Simplification (LS), where state-of-the-art methods involve complex, multi-step pipelines that can combine both deep learning and non-deep learning components. LLaMA, an LLM with full research access, holds unique potential for the adaptation of the entire LS pipeline. This paper details the process of fine-tuning LLaMA to create LSLlama, which performs comparably to the previous LS baseline models LSBert and UniHD.
+ 2023.tsar-1.10
+ baez-saggion-2023-lsllama
+
+
+ LC-Score: Reference-less estimation of Text Comprehension Difficulty
+ PaulTardy
+ CharlotteRoze
+ PaulPoupet
+ 109–115
+ Being able to read and understand written text is critical in a digital era. However, studies show that a large fraction of the population experiences comprehension issues. In this context, further accessibility initiatives are required to improve audience text comprehension. However, writers are hardly assisted or encouraged to produce easy-to-understand content. Moreover, Automatic Text Simplification (ATS) model development suffers from the lack of metrics to accurately estimate comprehension difficulty. We present LC-SCORE, a simple approach for training a reference-less text comprehension metric, i.e. predicting how easy to understand a given text is on a [0, 100] scale. Our objective with this scale is to quantitatively capture the extent to which a text conforms to the Langage Clair (LC, Clear Language) guidelines, a French initiative closely related to English Plain Language. We explore two approaches: (i) using linguistically motivated indicators to train statistical models, and (ii) neural learning directly from text leveraging pre-trained language models. We introduce a simple proxy task that frames comprehension difficulty training as a classification task. To evaluate our models, we run two distinct human annotation experiments, and find that both approaches (indicator-based and neural) outperform commonly used readability and comprehension metrics such as FKGL.
+ 2023.tsar-1.11
+ tardy-etal-2023-lc
+
+
+ On Operations in Automatic Text Simplification
+ RémiCardon
+ AdrienBibal
+ 116–130
+ This paper explores the literature of automatic text simplification (ATS) centered on the notion of operations. Operations are the processes of applying certain modifications to a given text in order to transform it. In ATS, the intent of the transformation is to simplify the text. This paper overviews and structures the domain by showing how operations are defined and how they are exploited. We extensively discuss the most recent works on this notion and perform preliminary experiments to automate operation recognition with large language models (LLMs). Through our overview of the literature and the preliminary experiments with LLMs, this paper provides insights on the topic that can help lead to new directions in ATS research.
+ 2023.tsar-1.12
+ cardon-bibal-2023-operations
+
+
+ An automated tool with human supervision to adapt difficult texts into Plain Language
+ PaulPoupet
+ MorganeHauguel
+ ErwanBoehm
+ CharlotteRoze
+ PaulTardy
+ 131–133
+ In this paper, we present an automated tool with human supervision to write in plain language or to adapt difficult texts into plain language. It is available as a web version and as a plugin for Word/Outlook. At the time of publication, it is only available for French. The tool has been under development for 3 years and has been used by 400 users from private companies and public administrations. Text simplification is performed automatically, with the user's manual approval, at the lexical, syntactic, and discursive levels. A screencast of the demo can be found at the following link: https://www.youtube.com/watch?v=wXVtjfKO9FI.
+ 2023.tsar-1.13
+ poupet-etal-2023-automated
+
+
+ Beyond Vocabulary: Capturing Readability from Children’s Difficulty
+ ArifAhmed
+ 134–141
+ Readability formulae targeting children have been developed, but their appropriateness can still be improved, for example by taking suffixation into account. Literacy research has identified that suffixation makes children’s reading difficult, so we analyze the effectiveness of suffixation within the context of readability. Our analysis finds that suffixation is potentially effective for readability assessment. Moreover, we find that existing readability formulae fail to discern lower grade levels for texts from different existing corpora.
+ 2023.tsar-1.14
+ ahmed-2023-beyond
+
+
+
diff --git a/data/yaml/venues/alp.yaml b/data/yaml/venues/alp.yaml
new file mode 100644
index 0000000000..fffd6a36dd
--- /dev/null
+++ b/data/yaml/venues/alp.yaml
@@ -0,0 +1,2 @@
+acronym: ALP
+name: Workshop on Ancient Language Processing
diff --git a/data/yaml/venues/contents.yaml b/data/yaml/venues/contents.yaml
new file mode 100644
index 0000000000..610203b74b
--- /dev/null
+++ b/data/yaml/venues/contents.yaml
@@ -0,0 +1,3 @@
+acronym: ConTeNTS
+name: Workshop on Computational Terminology in NLP and Translation Studies (ConTeNTS)
+ Incorporating the 16th Workshop on Building and Using Comparable Corpora (BUCC)
diff --git a/data/yaml/venues/nlp4tia.yaml b/data/yaml/venues/nlp4tia.yaml
new file mode 100644
index 0000000000..61c33e434d
--- /dev/null
+++ b/data/yaml/venues/nlp4tia.yaml
@@ -0,0 +1,2 @@
+acronym: NLP4TIA
+name: First Workshop on NLP Tools and Resources for Translation and Interpreting Applications
From 0a35fde24b34287afa0b01c57569ce3c15da4db3 Mon Sep 17 00:00:00 2001
From: anthology-assist <126604033+anthology-assist@users.noreply.github.com>
Date: Tue, 14 Nov 2023 20:56:19 -0600
Subject: [PATCH 04/12] October Corrections (#2838)
* Paper pdf correction for 2021.emnlp-main.235, closes #2821.
* Paper pdf correction for 2023.clasp-1.2, closes #2803.
* Author name meta correction for 2023.latechclfl-1.2, closes #2809.
* Author name meta correction for 2022.eval4nlp-1.2, closes #2801.
* Author name meta correction for 2023.dstc-1.14, closes #2837.
* Abstract meta data correction for 2023.acl-short.122, closes #2833.
* Author meta correction for 2023.icard-1.3, closes #2830.
* Author meta correction for 2023.sigdial-1.5, closes #2829.
* Paper pdf correction for 2023.eacl-demo.11, closes #2828.
* Editor order update for 2023.sigdial-1, closes #2825.
* Paper pdf correction for 2023.acl-short.59, closes #2839.
* Paper pdf correction for 2022.findings-emnlp.439, closes #2844.
* Author name correction for 2023.inlg-main.10, closes #2842.
* Author order meta correction for 2022.semeval-1.31, closes #2848.
* Paper pdf correction for 2023.acl-long.594, closes #2851.
* Author meta data correction for 2023.finnlp-1.9, closes #2856.
* Author meta data correction for 2023.finnlp-1.8, closes #2855.
* Author meta data correction for 2023.finnlp-1.7, closes #2854.
* Author meta data correction for 2023.finnlp-1.6, closes #2853.
* Author meta data correction for 2023.finnlp-1.6 again.
---------
Co-authored-by: Daniel Gildea
---
data/xml/2018.icon.xml | 2 +-
data/xml/2021.emnlp.xml | 32 +++++++++------
data/xml/2022.eval4nlp.xml | 2 +-
data/xml/2022.findings.xml | 34 ++++++++++------
data/xml/2022.semeval.xml | 6 +--
data/xml/2023.acl.xml | 76 ++++++++++++++++++++++++------------
data/xml/2023.ccl.xml | 3 +-
data/xml/2023.clasp.xml | 4 +-
data/xml/2023.dstc.xml | 10 ++---
data/xml/2023.eacl.xml | 28 ++++++++-----
data/xml/2023.finnlp.xml | 37 +++++++++---------
data/xml/2023.icard.xml | 4 +-
data/xml/2023.inlg.xml | 2 +-
data/xml/2023.latechclfl.xml | 2 +-
data/xml/2023.sigdial.xml | 4 +-
15 files changed, 153 insertions(+), 93 deletions(-)
diff --git a/data/xml/2018.icon.xml b/data/xml/2018.icon.xml
index 5543f7dd37..a5d9d1f9ad 100644
--- a/data/xml/2018.icon.xml
+++ b/data/xml/2018.icon.xml
@@ -190,7 +190,7 @@
Improving Computer Generated Dialog with Auxiliary Loss Functions and Custom Evaluation Metrics
ThomasConley
- Jack St.Clair
+ JackSt. Clair
JugalKalita
138–144
2018.icon-1.20
diff --git a/data/xml/2021.emnlp.xml b/data/xml/2021.emnlp.xml
index 7e5d2c234a..768486a3ee 100644
--- a/data/xml/2021.emnlp.xml
+++ b/data/xml/2021.emnlp.xml
@@ -3683,13 +3683,14 @@
YoheiOseki
2964–2973
In computational linguistics, it has been shown that hierarchical structures make language models (LMs) more human-like. However, the previous literature has been agnostic about a parsing strategy of the hierarchical models. In this paper, we investigated whether hierarchical structures make LMs more human-like, and if so, which parsing strategy is most cognitively plausible. In order to address this question, we evaluated three LMs against human reading times in Japanese with head-final left-branching structures: Long Short-Term Memory (LSTM) as a sequential model and Recurrent Neural Network Grammars (RNNGs) with top-down and left-corner parsing strategies as hierarchical models. Our computational modeling demonstrated that left-corner RNNGs outperformed top-down RNNGs and LSTM, suggesting that hierarchical and left-corner architectures are more cognitively plausible than top-down or sequential architectures. In addition, the relationships between the cognitive plausibility and (i) perplexity, (ii) parsing, and (iii) beam size will also be discussed.
- 2021.emnlp-main.235
+ 2021.emnlp-main.235
yoshida-etal-2021-modeling
10.18653/v1/2021.emnlp-main.235
Fixed a typo and added a footnote.
- osekilab/rnng-eyetrack
+ osekilab/rnng-lc
+ Updated code link.
A Simple and Effective Positional Encoding for Transformers
@@ -5835,7 +5836,8 @@
ThaiHoang
Dat QuocNguyen
4495–4503
- We introduce a high-quality and large-scale Vietnamese-English parallel dataset of 3.02M sentence pairs, which is 2.9M pairs larger than the benchmark Vietnamese-English machine translation corpus IWSLT15. We conduct experiments comparing strong neural baselines and well-known automatic translation engines on our dataset and find that in both automatic and human evaluations: the best performance is obtained by fine-tuning the pre-trained sequence-to-sequence denoising auto-encoder mBART. To our best knowledge, this is the first large-scale Vietnamese-English machine translation study. We hope our publicly available dataset and study can serve as a starting point for future research and applications on Vietnamese-English machine translation. We release our dataset at: https://github.com/VinAIResearch/PhoMT
+ We introduce a high-quality and large-scale Vietnamese-English parallel dataset of 3.02M sentence pairs, which is 2.9M pairs larger than the benchmark Vietnamese-English machine translation corpus IWSLT15. We conduct experiments comparing strong neural baselines and well-known automatic translation engines on our dataset and find that in both automatic and human evaluations: the best performance is obtained by fine-tuning the pre-trained sequence-to-sequence denoising auto-encoder mBART. To our best knowledge, this is the first large-scale Vietnamese-English machine translation study. We hope our publicly available dataset and study can serve as a starting point for future research and applications on Vietnamese-English machine translation. We release our dataset at: https://github.com/VinAIResearch/PhoMT
+
2021.emnlp-main.369
doan-etal-2021-phomt
10.18653/v1/2021.emnlp-main.369
@@ -8223,7 +8225,8 @@
WilkerAziz
IvanTitov
6491–6506
- The factual knowledge acquired during pre-training and stored in the parameters of Language Models (LMs) can be useful in downstream tasks (e.g., question answering or textual inference). However, some facts can be incorrectly induced or become obsolete over time. We present KnowledgeEditor, a method which can be used to edit this knowledge and, thus, fix ‘bugs’ or unexpected predictions without the need for expensive re-training or fine-tuning. Besides being computationally efficient, KnowledgeEditordoes not require any modifications in LM pre-training (e.g., the use of meta-learning). In our approach, we train a hyper-network with constrained optimization to modify a fact without affecting the rest of the knowledge; the trained hyper-network is then used to predict the weight update at test time. We show KnowledgeEditor’s efficacy with two popular architectures and knowledge-intensive tasks: i) a BERT model fine-tuned for fact-checking, and ii) a sequence-to-sequence BART model for question answering. With our method, changing a prediction on the specific wording of a query tends to result in a consistent change in predictions also for its paraphrases. We show that this can be further encouraged by exploiting (e.g., automatically-generated) paraphrases during training. Interestingly, our hyper-network can be regarded as a ‘probe’ revealing which components need to be changed to manipulate factual knowledge; our analysis shows that the updates tend to be concentrated on a small subset of components. Source code available at https://github.com/nicola-decao/KnowledgeEditor
+ The factual knowledge acquired during pre-training and stored in the parameters of Language Models (LMs) can be useful in downstream tasks (e.g., question answering or textual inference). However, some facts can be incorrectly induced or become obsolete over time. We present KnowledgeEditor, a method which can be used to edit this knowledge and, thus, fix ‘bugs’ or unexpected predictions without the need for expensive re-training or fine-tuning. Besides being computationally efficient, KnowledgeEditordoes not require any modifications in LM pre-training (e.g., the use of meta-learning). In our approach, we train a hyper-network with constrained optimization to modify a fact without affecting the rest of the knowledge; the trained hyper-network is then used to predict the weight update at test time. We show KnowledgeEditor’s efficacy with two popular architectures and knowledge-intensive tasks: i) a BERT model fine-tuned for fact-checking, and ii) a sequence-to-sequence BART model for question answering. With our method, changing a prediction on the specific wording of a query tends to result in a consistent change in predictions also for its paraphrases. We show that this can be further encouraged by exploiting (e.g., automatically-generated) paraphrases during training. Interestingly, our hyper-network can be regarded as a ‘probe’ revealing which components need to be changed to manipulate factual knowledge; our analysis shows that the updates tend to be concentrated on a small subset of components. Source code available at https://github.com/nicola-decao/KnowledgeEditor
+
2021.emnlp-main.522
de-cao-etal-2021-editing
10.18653/v1/2021.emnlp-main.522
@@ -9052,7 +9055,8 @@
SreyasMohan
Mitesh M.Khapra
7219–7234
- Natural Language Generation (NLG) evaluation is a multifaceted task requiring assessment of multiple desirable criteria, e.g., fluency, coherency, coverage, relevance, adequacy, overall quality, etc. Across existing datasets for 6 NLG tasks, we observe that the human evaluation scores on these multiple criteria are often not correlated. For example, there is a very low correlation between human scores on fluency and data coverage for the task of structured data to text generation. This suggests that the current recipe of proposing new automatic evaluation metrics for NLG by showing that they correlate well with scores assigned by humans for a single criteria (overall quality) alone is inadequate. Indeed, our extensive study involving 25 automatic evaluation metrics across 6 different tasks and 18 different evaluation criteria shows that there is no single metric which correlates well with human scores on all desirable criteria, for most NLG tasks. Given this situation, we propose CheckLists for better design and evaluation of automatic metrics. We design templates which target a specific criteria (e.g., coverage) and perturb the output such that the quality gets affected only along this specific criteria (e.g., the coverage drops). We show that existing evaluation metrics are not robust against even such simple perturbations and disagree with scores assigned by humans to the perturbed output. The proposed templates thus allow for a fine-grained assessment of automatic evaluation metrics exposing their limitations and will facilitate better design, analysis and evaluation of such metrics. Our templates and code are available at https://iitmnlp.github.io/EvalEval/
+ Natural Language Generation (NLG) evaluation is a multifaceted task requiring assessment of multiple desirable criteria, e.g., fluency, coherency, coverage, relevance, adequacy, overall quality, etc. Across existing datasets for 6 NLG tasks, we observe that the human evaluation scores on these multiple criteria are often not correlated. For example, there is a very low correlation between human scores on fluency and data coverage for the task of structured data to text generation. This suggests that the current recipe of proposing new automatic evaluation metrics for NLG by showing that they correlate well with scores assigned by humans for a single criteria (overall quality) alone is inadequate. Indeed, our extensive study involving 25 automatic evaluation metrics across 6 different tasks and 18 different evaluation criteria shows that there is no single metric which correlates well with human scores on all desirable criteria, for most NLG tasks. Given this situation, we propose CheckLists for better design and evaluation of automatic metrics. We design templates which target a specific criteria (e.g., coverage) and perturb the output such that the quality gets affected only along this specific criteria (e.g., the coverage drops). We show that existing evaluation metrics are not robust against even such simple perturbations and disagree with scores assigned by humans to the perturbed output. The proposed templates thus allow for a fine-grained assessment of automatic evaluation metrics exposing their limitations and will facilitate better design, analysis and evaluation of such metrics. Our templates and code are available at https://iitmnlp.github.io/EvalEval/
+
2021.emnlp-main.575
sai-etal-2021-perturbation
10.18653/v1/2021.emnlp-main.575
@@ -9520,7 +9524,8 @@
WilkerAziz
IvanTitov
7662–7669
- Generative approaches have been recently shown to be effective for both Entity Disambiguation and Entity Linking (i.e., joint mention detection and disambiguation). However, the previously proposed autoregressive formulation for EL suffers from i) high computational cost due to a complex (deep) decoder, ii) non-parallelizable decoding that scales with the source sequence length, and iii) the need for training on a large amount of data. In this work, we propose a very efficient approach that parallelizes autoregressive linking across all potential mentions and relies on a shallow and efficient decoder. Moreover, we augment the generative objective with an extra discriminative component, i.e., a correction term which lets us directly optimize the generator’s ranking. When taken together, these techniques tackle all the above issues: our model is >70 times faster and more accurate than the previous generative method, outperforming state-of-the-art approaches on the standard English dataset AIDA-CoNLL. Source code available at https://github.com/nicola-decao/efficient-autoregressive-EL
+ Generative approaches have been recently shown to be effective for both Entity Disambiguation and Entity Linking (i.e., joint mention detection and disambiguation). However, the previously proposed autoregressive formulation for EL suffers from i) high computational cost due to a complex (deep) decoder, ii) non-parallelizable decoding that scales with the source sequence length, and iii) the need for training on a large amount of data. In this work, we propose a very efficient approach that parallelizes autoregressive linking across all potential mentions and relies on a shallow and efficient decoder. Moreover, we augment the generative objective with an extra discriminative component, i.e., a correction term which lets us directly optimize the generator’s ranking. When taken together, these techniques tackle all the above issues: our model is >70 times faster and more accurate than the previous generative method, outperforming state-of-the-art approaches on the standard English dataset AIDA-CoNLL. Source code available at https://github.com/nicola-decao/efficient-autoregressive-EL
+
2021.emnlp-main.604
de-cao-etal-2021-highly
10.18653/v1/2021.emnlp-main.604
@@ -10067,7 +10072,8 @@
FedericoLiberatore
JoseCamacho-Collados
8089–8103
- Term weighting schemes are widely used in Natural Language Processing and Information Retrieval. In particular, term weighting is the basis for keyword extraction. However, there are relatively few evaluation studies that shed light about the strengths and shortcomings of each weighting scheme. In fact, in most cases researchers and practitioners resort to the well-known tf-idf as default, despite the existence of other suitable alternatives, including graph-based models. In this paper, we perform an exhaustive and large-scale empirical comparison of both statistical and graph-based term weighting methods in the context of keyword extraction. Our analysis reveals some interesting findings such as the advantages of the less-known lexical specificity with respect to tf-idf, or the qualitative differences between statistical and graph-based methods. Finally, based on our findings we discuss and devise some suggestions for practitioners. Source code to reproduce our experimental results, including a keyword extraction library, are available in the following repository: https://github.com/asahi417/kex
+ Term weighting schemes are widely used in Natural Language Processing and Information Retrieval. In particular, term weighting is the basis for keyword extraction. However, there are relatively few evaluation studies that shed light about the strengths and shortcomings of each weighting scheme. In fact, in most cases researchers and practitioners resort to the well-known tf-idf as default, despite the existence of other suitable alternatives, including graph-based models. In this paper, we perform an exhaustive and large-scale empirical comparison of both statistical and graph-based term weighting methods in the context of keyword extraction. Our analysis reveals some interesting findings such as the advantages of the less-known lexical specificity with respect to tf-idf, or the qualitative differences between statistical and graph-based methods. Finally, based on our findings we discuss and devise some suggestions for practitioners. Source code to reproduce our experimental results, including a keyword extraction library, are available in the following repository: https://github.com/asahi417/kex
+
2021.emnlp-main.638
ushio-etal-2021-back
10.18653/v1/2021.emnlp-main.638
@@ -11199,7 +11205,8 @@
JoseCamacho-Collados
StevenSchockaert
9044–9062
- Pre-trained language models have been found to capture a surprisingly rich amount of lexical knowledge, ranging from commonsense properties of everyday concepts to detailed factual knowledge about named entities. Among others, this makes it possible to distill high-quality word vectors from pre-trained language models. However, it is currently unclear to what extent it is possible to distill relation embeddings, i.e. vectors that characterize the relationship between two words. Such relation embeddings are appealing because they can, in principle, encode relational knowledge in a more fine-grained way than is possible with knowledge graphs. To obtain relation embeddings from a pre-trained language model, we encode word pairs using a (manually or automatically generated) prompt, and we fine-tune the language model such that relationally similar word pairs yield similar output vectors. We find that the resulting relation embeddings are highly competitive on analogy (unsupervised) and relation classification (supervised) benchmarks, even without any task-specific fine-tuning. Source code to reproduce our experimental results and the model checkpoints are available in the following repository: https://github.com/asahi417/relbert
+ Pre-trained language models have been found to capture a surprisingly rich amount of lexical knowledge, ranging from commonsense properties of everyday concepts to detailed factual knowledge about named entities. Among others, this makes it possible to distill high-quality word vectors from pre-trained language models. However, it is currently unclear to what extent it is possible to distill relation embeddings, i.e. vectors that characterize the relationship between two words. Such relation embeddings are appealing because they can, in principle, encode relational knowledge in a more fine-grained way than is possible with knowledge graphs. To obtain relation embeddings from a pre-trained language model, we encode word pairs using a (manually or automatically generated) prompt, and we fine-tune the language model such that relationally similar word pairs yield similar output vectors. We find that the resulting relation embeddings are highly competitive on analogy (unsupervised) and relation classification (supervised) benchmarks, even without any task-specific fine-tuning. Source code to reproduce our experimental results and the model checkpoints are available in the following repository: https://github.com/asahi417/relbert
+
2021.emnlp-main.712
ushio-etal-2021-distilling
10.18653/v1/2021.emnlp-main.712
@@ -11245,7 +11252,8 @@
MicheleBevilacqua
RobertoNavigli
9092–9098
- Neural Word Sense Disambiguation (WSD) has recently been shown to benefit from the incorporation of pre-existing knowledge, such as that coming from the WordNet graph. However, state-of-the-art approaches have been successful in exploiting only the local structure of the graph, with only close neighbors of a given synset influencing the prediction. In this work, we improve a classification model by recomputing logits as a function of both the vanilla independently produced logits and the global WordNet graph. We achieve this by incorporating an online neural approximated PageRank, which enables us to refine edge weights as well. This method exploits the global graph structure while keeping space requirements linear in the number of edges. We obtain strong improvements, matching the current state of the art. Code is available at https://github.com/SapienzaNLP/neural-pagerank-wsd
+ Neural Word Sense Disambiguation (WSD) has recently been shown to benefit from the incorporation of pre-existing knowledge, such as that coming from the WordNet graph. However, state-of-the-art approaches have been successful in exploiting only the local structure of the graph, with only close neighbors of a given synset influencing the prediction. In this work, we improve a classification model by recomputing logits as a function of both the vanilla independently produced logits and the global WordNet graph. We achieve this by incorporating an online neural approximated PageRank, which enables us to refine edge weights as well. This method exploits the global graph structure while keeping space requirements linear in the number of edges. We obtain strong improvements, matching the current state of the art. Code is available at https://github.com/SapienzaNLP/neural-pagerank-wsd
+
2021.emnlp-main.715
el-sheikh-etal-2021-integrating
10.18653/v1/2021.emnlp-main.715
@@ -12232,7 +12240,8 @@
MarcoDi Giovanni
MarcoBrambilla
9902–9910
- Semantic sentence embeddings are usually supervisedly built minimizing distances between pairs of embeddings of sentences labelled as semantically similar by annotators. Since big labelled datasets are rare, in particular for non-English languages, and expensive, recent studies focus on unsupervised approaches that require not-paired input sentences. We instead propose a language-independent approach to build large datasets of pairs of informal texts weakly similar, without manual human effort, exploiting Twitter’s intrinsic powerful signals of relatedness: replies and quotes of tweets. We use the collected pairs to train a Transformer model with triplet-like structures, and we test the generated embeddings on Twitter NLP similarity tasks (PIT and TURL) and STSb. We also introduce four new sentence ranking evaluation benchmarks of informal texts, carefully extracted from the initial collections of tweets, proving not only that our best model learns classical Semantic Textual Similarity, but also excels on tasks where pairs of sentences are not exact paraphrases. Ablation studies reveal how increasing the corpus size influences positively the results, even at 2M samples, suggesting that bigger collections of Tweets still do not contain redundant information about semantic similarities. Code available at https://github.com/marco-digio/Twitter4SSE
+ Semantic sentence embeddings are usually supervisedly built minimizing distances between pairs of embeddings of sentences labelled as semantically similar by annotators. Since big labelled datasets are rare, in particular for non-English languages, and expensive, recent studies focus on unsupervised approaches that require not-paired input sentences. We instead propose a language-independent approach to build large datasets of pairs of informal texts weakly similar, without manual human effort, exploiting Twitter’s intrinsic powerful signals of relatedness: replies and quotes of tweets. We use the collected pairs to train a Transformer model with triplet-like structures, and we test the generated embeddings on Twitter NLP similarity tasks (PIT and TURL) and STSb. We also introduce four new sentence ranking evaluation benchmarks of informal texts, carefully extracted from the initial collections of tweets, proving not only that our best model learns classical Semantic Textual Similarity, but also excels on tasks where pairs of sentences are not exact paraphrases. Ablation studies reveal how increasing the corpus size influences positively the results, even at 2M samples, suggesting that bigger collections of Tweets still do not contain redundant information about semantic similarities. Code available at https://github.com/marco-digio/Twitter4SSE
+
2021.emnlp-main.780
di-giovanni-brambilla-2021-exploiting
10.18653/v1/2021.emnlp-main.780
@@ -13640,7 +13649,8 @@
StefanJänicke
MartinPotthast
185–194
- This paper introduces Summary Explorer, a new tool to support the manual inspection of text summarization systems by compiling the outputs of 55 state-of-the-art single document summarization approaches on three benchmark datasets, and visually exploring them during a qualitative assessment. The underlying design of the tool considers three well-known summary quality criteria (coverage, faithfulness, and position bias), encapsulated in a guided assessment based on tailored visualizations. The tool complements existing approaches for locally debugging summarization models and improves upon them. The tool is available at https://tldr.webis.de/
+ This paper introduces Summary Explorer, a new tool to support the manual inspection of text summarization systems by compiling the outputs of 55 state-of-the-art single document summarization approaches on three benchmark datasets, and visually exploring them during a qualitative assessment. The underlying design of the tool considers three well-known summary quality criteria (coverage, faithfulness, and position bias), encapsulated in a guided assessment based on tailored visualizations. The tool complements existing approaches for locally debugging summarization models and improves upon them. The tool is available at https://tldr.webis.de/
+
2021.emnlp-demo.22
syed-etal-2021-summary
10.18653/v1/2021.emnlp-demo.22
diff --git a/data/xml/2022.eval4nlp.xml b/data/xml/2022.eval4nlp.xml
index d77f875999..cd74349737 100644
--- a/data/xml/2022.eval4nlp.xml
+++ b/data/xml/2022.eval4nlp.xml
@@ -35,7 +35,7 @@
Assessing Resource-Performance Trade-off of Natural Language Models using Data Envelopment Analysis
- ShoheiZhou
+ ZacharyZhou
AlishaZachariah
DevinConathan
JefferyKline
diff --git a/data/xml/2022.findings.xml b/data/xml/2022.findings.xml
index 358bde788c..1089be3ad4 100644
--- a/data/xml/2022.findings.xml
+++ b/data/xml/2022.findings.xml
@@ -1096,7 +1096,8 @@
ArdaGoktogan
DenizYuret
846-863
- Having sufficient resources for language X lifts it from the under-resourced languages class, but not necessarily from the under-researched class. In this paper, we address the problem of the absence of organized benchmarks in the Turkish language. We demonstrate that languages such as Turkish are left behind the state-of-the-art in NLP applications. As a solution, we present Mukayese, a set of NLP benchmarks for the Turkish language that contains several NLP tasks. We work on one or more datasets for each benchmark and present two or more baselines. Moreover, we present four new benchmarking datasets in Turkish for language modeling, sentence segmentation, and spell checking. All datasets and baselines are available under: https://github.com/alisafaya/mukayese
+ Having sufficient resources for language X lifts it from the under-resourced languages class, but not necessarily from the under-researched class. In this paper, we address the problem of the absence of organized benchmarks in the Turkish language. We demonstrate that languages such as Turkish are left behind the state-of-the-art in NLP applications. As a solution, we present Mukayese, a set of NLP benchmarks for the Turkish language that contains several NLP tasks. We work on one or more datasets for each benchmark and present two or more baselines. Moreover, we present four new benchmarking datasets in Turkish for language modeling, sentence segmentation, and spell checking. All datasets and baselines are available under: https://github.com/alisafaya/mukayese
+
2022.findings-acl.69
safaya-etal-2022-mukayese
10.18653/v1/2022.findings-acl.69
@@ -3583,7 +3584,8 @@
MiriamWanner
AntoniosAnastasopoulos
2925-2934
- Recent work by Søgaard (2020) showed that, treebank size aside, overlap between training and test graphs (termed leakage) explains more of the observed variation in dependency parsing performance than other explanations. In this work we revisit this claim, testing it on more models and languages. We find that it only holds for zero-shot cross-lingual settings. We then propose a more fine-grained measure of such leakage which, unlike the original measure, not only explains but also correlates with observed performance variation. Code and data are available here: https://github.com/miriamwanner/reu-nlp-project
+ Recent work by Søgaard (2020) showed that, treebank size aside, overlap between training and test graphs (termed leakage) explains more of the observed variation in dependency parsing performance than other explanations. In this work we revisit this claim, testing it on more models and languages. We find that it only holds for zero-shot cross-lingual settings. We then propose a more fine-grained measure of such leakage which, unlike the original measure, not only explains but also correlates with observed performance variation. Code and data are available here: https://github.com/miriamwanner/reu-nlp-project
+
2022.findings-acl.230
krasner-etal-2022-revisiting
10.18653/v1/2022.findings-acl.230
@@ -4061,7 +4063,8 @@
StefanosAngelidis
YoshihikoSuhara
3307-3324
- Opinion summarization focuses on generating summaries that reflect popular subjective information expressed in multiple online reviews. While generated summaries offer general and concise information about a particular hotel or product, the information may be insufficient to help the user compare multiple different choices. Thus, the user may still struggle with the question “Which one should I pick?” In this paper, we propose the comparative opinion summarization task, which aims at generating two contrastive summaries and one common summary from two different candidate sets of reviews. We develop a comparative summarization framework CoCoSum, which consists of two base summarization models that jointly generate contrastive and common summaries. Experimental results on a newly created benchmark CoCoTrip show that CoCoSum can produce higher-quality contrastive and common summaries than state-of-the-art opinion summarization models. The dataset and code are available at https://github.com/megagonlabs/cocosum
+ Opinion summarization focuses on generating summaries that reflect popular subjective information expressed in multiple online reviews. While generated summaries offer general and concise information about a particular hotel or product, the information may be insufficient to help the user compare multiple different choices. Thus, the user may still struggle with the question “Which one should I pick?” In this paper, we propose the comparative opinion summarization task, which aims at generating two contrastive summaries and one common summary from two different candidate sets of reviews. We develop a comparative summarization framework CoCoSum, which consists of two base summarization models that jointly generate contrastive and common summaries. Experimental results on a newly created benchmark CoCoTrip show that CoCoSum can produce higher-quality contrastive and common summaries than state-of-the-art opinion summarization models. The dataset and code are available at https://github.com/megagonlabs/cocosum
+
2022.findings-acl.261
iso-etal-2022-comparative
10.18653/v1/2022.findings-acl.261
@@ -6264,7 +6267,8 @@
ZechenLi
AndersSøgaard
980-996
- Synthetic datasets have successfully been used to probe visual question-answering datasets for their reasoning abilities. CLEVR (John- son et al., 2017), for example, tests a range of visual reasoning abilities. The questions in CLEVR focus on comparisons of shapes, colors, and sizes, numerical reasoning, and existence claims. This paper introduces a minimally biased, diagnostic visual question-answering dataset, QLEVR, that goes beyond existential and numerical quantification and focus on more complex quantifiers and their combinations, e.g., asking whether there are more than two red balls that are smaller than at least three blue balls in an image. We describe how the dataset was created and present a first evaluation of state-of-the-art visual question-answering models, showing that QLEVR presents a formidable challenge to our current models. Code and Dataset are available at https://github.com/zechenli03/QLEVR
+ Synthetic datasets have successfully been used to probe visual question-answering datasets for their reasoning abilities. CLEVR (Johnson et al., 2017), for example, tests a range of visual reasoning abilities. The questions in CLEVR focus on comparisons of shapes, colors, and sizes, numerical reasoning, and existence claims. This paper introduces a minimally biased, diagnostic visual question-answering dataset, QLEVR, that goes beyond existential and numerical quantification and focus on more complex quantifiers and their combinations, e.g., asking whether there are more than two red balls that are smaller than at least three blue balls in an image. We describe how the dataset was created and present a first evaluation of state-of-the-art visual question-answering models, showing that QLEVR presents a formidable challenge to our current models. Code and Dataset are available at https://github.com/zechenli03/QLEVR
+
2022.findings-naacl.73
li-sogaard-2022-qlevr
10.18653/v1/2022.findings-naacl.73
@@ -9020,7 +9024,8 @@
LaureBerti-Equille
KalyanVeeramachaneni
438–452
- Adversarial examples are helpful for analyzing and improving the robustness of text classifiers. Generating high-quality adversarial examples is a challenging task as it requires generating fluent adversarial sentences that are semantically similar to the original sentences and preserve the original labels, while causing the classifier to misclassify them. Existing methods prioritize misclassification by maximizing each perturbation’s effectiveness at misleading a text classifier; thus, the generated adversarial examples fall short in terms of fluency and similarity. In this paper, we propose a rewrite and rollback (R&R) framework for adversarial attack. It improves the quality of adversarial examples by optimizing a critique score which combines the fluency, similarity, and misclassification metrics. R&R generates high-quality adversarial examples by allowing exploration of perturbations that do not have immediate impact on the misclassification metric but can improve fluency and similarity metrics. We evaluate our method on 5 representative datasets and 3 classifier architectures. Our method outperforms current state-of-the-art in attack success rate by +16.2%, +12.8%, and +14.0% on the classifiers respectively. Code is available at https://github.com/DAI-Lab/fibber
+ Adversarial examples are helpful for analyzing and improving the robustness of text classifiers. Generating high-quality adversarial examples is a challenging task as it requires generating fluent adversarial sentences that are semantically similar to the original sentences and preserve the original labels, while causing the classifier to misclassify them. Existing methods prioritize misclassification by maximizing each perturbation’s effectiveness at misleading a text classifier; thus, the generated adversarial examples fall short in terms of fluency and similarity. In this paper, we propose a rewrite and rollback (R&R) framework for adversarial attack. It improves the quality of adversarial examples by optimizing a critique score which combines the fluency, similarity, and misclassification metrics. R&R generates high-quality adversarial examples by allowing exploration of perturbations that do not have immediate impact on the misclassification metric but can improve fluency and similarity metrics. We evaluate our method on 5 representative datasets and 3 classifier architectures. Our method outperforms current state-of-the-art in attack success rate by +16.2%, +12.8%, and +14.0% on the classifiers respectively. Code is available at https://github.com/DAI-Lab/fibber
+
2022.findings-aacl.41
xu-etal-2022-r
@@ -9311,7 +9316,8 @@
XipengQiu
ZhengZhang
223-237
- Dialogue meaning representation formulates natural language utterance semantics in their conversational context in an explicit and machine-readable form. Previous work typically follows the intent-slot framework, which is easy for annotation yet limited in scalability for complex linguistic expressions. A line of works alleviates the representation issue by introducing hierarchical structures but challenging to express complex compositional semantics, such as negation and coreference. We propose Dialogue Meaning Representation (DMR), a pliable and easily extendable representation for task-oriented dialogue. Our representation contains a set of nodes and edges to represent rich compositional semantics. Moreover, we propose an inheritance hierarchy mechanism focusing on domain extensibility. Additionally, we annotated DMR-FastFood, a multi-turn dialogue dataset with more than 70k utterances, with DMR. We propose two evaluation tasks to evaluate different dialogue models and a novel coreference resolution model GNNCoref for the graph-based coreference resolution task. Experiments show that DMR can be parsed well with pre-trained Seq2Seq models, and GNNCoref outperforms the baseline models by a large margin. The dataset and code are available at https://github.com/amazon-research/dialogue-meaning-representation
+ Dialogue meaning representation formulates natural language utterance semantics in their conversational context in an explicit and machine-readable form. Previous work typically follows the intent-slot framework, which is easy for annotation yet limited in scalability for complex linguistic expressions. A line of works alleviates the representation issue by introducing hierarchical structures but challenging to express complex compositional semantics, such as negation and coreference. We propose Dialogue Meaning Representation (DMR), a pliable and easily extendable representation for task-oriented dialogue. Our representation contains a set of nodes and edges to represent rich compositional semantics. Moreover, we propose an inheritance hierarchy mechanism focusing on domain extensibility. Additionally, we annotated DMR-FastFood, a multi-turn dialogue dataset with more than 70k utterances, with DMR. We propose two evaluation tasks to evaluate different dialogue models and a novel coreference resolution model GNNCoref for the graph-based coreference resolution task. Experiments show that DMR can be parsed well with pre-trained Seq2Seq models, and GNNCoref outperforms the baseline models by a large margin. The dataset and code are available at https://github.com/amazon-research/dialogue-meaning-representation
+
2022.findings-emnlp.17
hu-etal-2022-dialogue
10.18653/v1/2022.findings-emnlp.17
@@ -9346,7 +9352,8 @@
SonalGupta
Wen-tauYih
250-262
- Despite their recent popularity and well-known advantages, dense retrievers still lag behind sparse methods such as BM25 in their ability to reliably match salient phrases and rare entities in the query and to generalize to out-of-domain data. It has been argued that this is an inherent limitation of dense models. We rebut this claim by introducing the Salient Phrase Aware Retriever (SPAR), a dense retriever with the lexical matching capacity of a sparse model. We show that a dense Lexical Model Λ can be trained to imitate a sparse one, and SPAR is built by augmenting a standard dense retriever with Λ. Empirically, SPAR shows superior performance on a range of tasks including five question answering datasets, MS MARCO passage retrieval, as well as the EntityQuestions and BEIR benchmarks for out-of-domain evaluation, exceeding the performance of state-of-the-art dense and sparse retrievers. The code and models of SPAR are available at: https://github.com/facebookresearch/dpr-scale/tree/main/spar
+ Despite their recent popularity and well-known advantages, dense retrievers still lag behind sparse methods such as BM25 in their ability to reliably match salient phrases and rare entities in the query and to generalize to out-of-domain data. It has been argued that this is an inherent limitation of dense models. We rebut this claim by introducing the Salient Phrase Aware Retriever (SPAR), a dense retriever with the lexical matching capacity of a sparse model. We show that a dense Lexical Model Λ can be trained to imitate a sparse one, and SPAR is built by augmenting a standard dense retriever with Λ. Empirically, SPAR shows superior performance on a range of tasks including five question answering datasets, MS MARCO passage retrieval, as well as the EntityQuestions and BEIR benchmarks for out-of-domain evaluation, exceeding the performance of state-of-the-art dense and sparse retrievers. The code and models of SPAR are available at: https://github.com/facebookresearch/dpr-scale/tree/main/spar
+
2022.findings-emnlp.19
chen-etal-2022-salient
@@ -11601,7 +11608,8 @@ Faster and Smaller Speech Translation without Quality Compromise
Md AzamHossain
Abu Raihan MostofaKamal
2518-2532
- High-resource languages, such as English, have access to a plethora of datasets with various question-answer types resembling real-world reading comprehension. However, there is a severe lack of diverse and comprehensive question-answering datasets in under-resourced languages like Bangla. The ones available are either translated versions of English datasets with a niche answer format or created by human annotations focusing on a specific domain, question type, or answer type. To address these limitations, this paper introduces BanglaRQA, a reading comprehension-based Bangla question-answering dataset with various question-answer types. BanglaRQA consists of 3,000 context passages and 14,889 question-answer pairs created from those passages. The dataset comprises answerable and unanswerable questions covering four unique categories of questions and three types of answers. In addition, this paper also implemented four different Transformer models for question-answering on the proposed dataset. The best-performing model achieved an overall 62.42% EM and 78.11% F1 score. However, detailed analyses showed that the performance varies across question-answer types, leaving room for substantial improvement of the model performance. Furthermore, we demonstrated the effectiveness of BanglaRQA as a training resource by showing strong results on the bn_squad dataset. Therefore, BanglaRQA has the potential to contribute to the advancement of future research by enhancing the capability of language models. The dataset and codes are available at https://github.com/sartajekram419/BanglaRQA
+ High-resource languages, such as English, have access to a plethora of datasets with various question-answer types resembling real-world reading comprehension. However, there is a severe lack of diverse and comprehensive question-answering datasets in under-resourced languages like Bangla. The ones available are either translated versions of English datasets with a niche answer format or created by human annotations focusing on a specific domain, question type, or answer type. To address these limitations, this paper introduces BanglaRQA, a reading comprehension-based Bangla question-answering dataset with various question-answer types. BanglaRQA consists of 3,000 context passages and 14,889 question-answer pairs created from those passages. The dataset comprises answerable and unanswerable questions covering four unique categories of questions and three types of answers. In addition, this paper also implemented four different Transformer models for question-answering on the proposed dataset. The best-performing model achieved an overall 62.42% EM and 78.11% F1 score. However, detailed analyses showed that the performance varies across question-answer types, leaving room for substantial improvement of the model performance. Furthermore, we demonstrated the effectiveness of BanglaRQA as a training resource by showing strong results on the bn_squad dataset. Therefore, BanglaRQA has the potential to contribute to the advancement of future research by enhancing the capability of language models. The dataset and codes are available at https://github.com/sartajekram419/BanglaRQA
+
2022.findings-emnlp.186
ekram-etal-2022-banglarqa
@@ -13801,7 +13809,8 @@ Faster and Smaller Speech Translation without Quality Compromise
MobashirSadat
CorneliaCaragea
4763-4776
- Natural Language Inference (NLI) or Recognizing Textual Entailment (RTE) aims at predicting the relation between a pair of sentences (premise and hypothesis) as entailment, contradiction or semantic independence. Although deep learning models have shown promising performance for NLI in recent years, they rely on large scale expensive human-annotated datasets. Semi-supervised learning (SSL) is a popular technique for reducing the reliance on human annotation by leveraging unlabeled data for training. However, despite its substantial success on single sentence classification tasks where the challenge in making use of unlabeled data is to assign “good enough” pseudo-labels, for NLI tasks, the nature of unlabeled data is more complex: one of the sentences in the pair (usually the hypothesis) along with the class label are missing from the data and require human annotations, which makes SSL for NLI more challenging. In this paper, we propose a novel way to incorporate unlabeled data in SSL for NLI where we use a conditional language model, BART to generate the hypotheses for the unlabeled sentences (used as premises). Our experiments show that our SSL framework successfully exploits unlabeled data and substantially improves the performance of four NLI datasets in low-resource settings. We release our code here: https://github.com/msadat3/SSL_for_NLI
+ Natural Language Inference (NLI) or Recognizing Textual Entailment (RTE) aims at predicting the relation between a pair of sentences (premise and hypothesis) as entailment, contradiction or semantic independence. Although deep learning models have shown promising performance for NLI in recent years, they rely on large scale expensive human-annotated datasets. Semi-supervised learning (SSL) is a popular technique for reducing the reliance on human annotation by leveraging unlabeled data for training. However, despite its substantial success on single sentence classification tasks where the challenge in making use of unlabeled data is to assign “good enough” pseudo-labels, for NLI tasks, the nature of unlabeled data is more complex: one of the sentences in the pair (usually the hypothesis) along with the class label are missing from the data and require human annotations, which makes SSL for NLI more challenging. In this paper, we propose a novel way to incorporate unlabeled data in SSL for NLI where we use a conditional language model, BART to generate the hypotheses for the unlabeled sentences (used as premises). Our experiments show that our SSL framework successfully exploits unlabeled data and substantially improves the performance of four NLI datasets in low-resource settings. We release our code here: https://github.com/msadat3/SSL_for_NLI
+
2022.findings-emnlp.351
sadat-caragea-2022-learning
10.18653/v1/2022.findings-emnlp.351
@@ -14940,9 +14949,11 @@ Faster and Smaller Speech Translation without Quality Compromise
RuifengXu
5948-5958
We present MCPG: a simple and effective approach for controllable unsupervised paraphrase generation, which is also flexible to adapt to specific domains without extra training. MCPG is controllable in different levels: local lexicons, global semantics, and universal styles. The unsupervised paradigm of MCPG combines factual keywords and diversified semantic embeddings as local lexical and global semantic constraints. The semantic embeddings are diversified by standard dropout, which is exploited for the first time to increase inference diversity by us. Moreover, MCPG is qualified with good domain adaptability by adding a transfer vector as a universal style constraint, which is refined from the exemplars retrieved from the corpus of the target domain in a training-free way. Extensive experiments show that MCPG outperforms state-of-the-art unsupervised baselines by a margin. Meanwhile, our domain-adapted MCPG also achieves competitive performance with strong supervised baselines even without training.
- 2022.findings-emnlp.439
+ 2022.findings-emnlp.439
chen-etal-2022-mcpg
10.18653/v1/2022.findings-emnlp.439
+
+ Minor typo fix.
WordTies: Measuring Word Associations in Language Models via Constrained Sampling
@@ -16149,7 +16160,8 @@ Faster and Smaller Speech Translation without Quality Compromise
RadaMihalcea
BernhardSchoelkopf
7180-7198
- Reasoning is central to human intelligence. However, fallacious arguments are common, and some exacerbate problems such as spreading misinformation about climate change. In this paper, we propose the task of logical fallacy detection, and provide a new dataset (Logic) of logical fallacies generally found in text, together with an additional challenge set for detecting logical fallacies in climate change claims (LogicClimate). Detecting logical fallacies is a hard problem as the model must understand the underlying logical structure of the argument. We find that existing pretrained large language models perform poorly on this task. In contrast, we show that a simple structure-aware classifier outperforms the best language model by 5.46% F1 scores on Logic and 4.51% on LogicClimate. We encourage future work to explore this task since (a) it can serve as a new reasoning challenge for language models, and (b) it can have potential applications in tackling the spread of misinformation. Our dataset and code are available at https://github.com/causalNLP/logical-fallacy
+ Reasoning is central to human intelligence. However, fallacious arguments are common, and some exacerbate problems such as spreading misinformation about climate change. In this paper, we propose the task of logical fallacy detection, and provide a new dataset (Logic) of logical fallacies generally found in text, together with an additional challenge set for detecting logical fallacies in climate change claims (LogicClimate). Detecting logical fallacies is a hard problem as the model must understand the underlying logical structure of the argument. We find that existing pretrained large language models perform poorly on this task. In contrast, we show that a simple structure-aware classifier outperforms the best language model by 5.46% F1 scores on Logic and 4.51% on LogicClimate. We encourage future work to explore this task since (a) it can serve as a new reasoning challenge for language models, and (b) it can have potential applications in tackling the spread of misinformation. Our dataset and code are available at https://github.com/causalNLP/logical-fallacy
+
2022.findings-emnlp.532
jin-etal-2022-logical
diff --git a/data/xml/2022.semeval.xml b/data/xml/2022.semeval.xml
index c6d27e8460..d468d26908 100644
--- a/data/xml/2022.semeval.xml
+++ b/data/xml/2022.semeval.xml
@@ -412,12 +412,12 @@
RUG-1-Pegasussers at SemEval-2022 Task 3: Data Generation Methods to Improve Recognizing Appropriate Taxonomic Word Relations
- WesselPoelman
+ Frankvan den Berg
GijsDanoe
EstherPloeger
- Frankvan den Berg
- TommasoCaselli
+ WesselPoelman
LukasEdman
+ TommasoCaselli
247-254
This paper describes our system created for the SemEval 2022 Task 3: Presupposed Taxonomies - Evaluating Neural-network Semantics. This task is focused on correctly recognizing taxonomic word relations in English, French and Italian. We developed various data generation techniques that expand the originally provided train set and show that all methods increase the performance of models trained on these expanded datasets. Our final system outperformed the baseline system from the task organizers by achieving an average macro F1 score of 79.6 on all languages, compared to the baseline’s 67.4.
2022.semeval-1.31
diff --git a/data/xml/2023.acl.xml b/data/xml/2023.acl.xml
index 2135dc5f71..dbc701b574 100644
--- a/data/xml/2023.acl.xml
+++ b/data/xml/2023.acl.xml
@@ -488,7 +488,8 @@
AndréMartinsUnbabel, Instituto de Telecomunicacoes
GrahamNeubigCarnegie Mellon University
606-626
- Although proper handling of discourse significantly contributes to the quality of machine translation (MT), these improvements are not adequately measured in common translation quality metrics. Recent works in context-aware MT attempt to target a small set of discourse phenomena during evaluation, however not in a fully systematic way. In this paper, we develop the Multilingual Discourse-Aware (MuDA) benchmark, a series of taggers that identify and evaluate model performance on discourse phenomena in any given dataset. The choice of phenomena is inspired by a novel methodology to systematically identify translations that require context. This methodology confirms the difficulty of previously studied phenomena while uncovering others which were not previously addressed. We find that commonly studied context-aware MT models make only marginal improvements over context-agnostic models, which suggests these models do not handle these ambiguities effectively. We release code and data for 14 language pairs to encourage the MT community to focus on accurately capturing discourse phenomena. Code available at https://github.com/neulab/contextual-mt
+ Although proper handling of discourse significantly contributes to the quality of machine translation (MT), these improvements are not adequately measured in common translation quality metrics. Recent works in context-aware MT attempt to target a small set of discourse phenomena during evaluation, however not in a fully systematic way. In this paper, we develop the Multilingual Discourse-Aware (MuDA) benchmark, a series of taggers that identify and evaluate model performance on discourse phenomena in any given dataset. The choice of phenomena is inspired by a novel methodology to systematically identify translations that require context. This methodology confirms the difficulty of previously studied phenomena while uncovering others which were not previously addressed. We find that commonly studied context-aware MT models make only marginal improvements over context-agnostic models, which suggests these models do not handle these ambiguities effectively. We release code and data for 14 language pairs to encourage the MT community to focus on accurately capturing discourse phenomena. Code available at https://github.com/neulab/contextual-mt
+
2023.acl-long.36
fernandes-etal-2023-translation
10.18653/v1/2023.acl-long.36
@@ -527,7 +528,8 @@
YuxinHuangZhejiang Lab
TaihaoLizhejianglab
658-670
- Multi-modal emotion recognition has gained increasing attention in recent years due to its widespread applications and the advances in multi-modal learning approaches. However, previous studies primarily focus on developing models that exploit the unification of multiple modalities. In this paper, we propose that maintaining modality independence is beneficial for the model performance. According to this principle, we construct a dataset, and devise a multi-modal transformer model. The new dataset, CHinese Emotion Recognition dataset with Modality-wise Annotions, abbreviated as CHERMA, provides uni-modal labels for each individual modality, and multi-modal labels for all modalities jointly observed. The model consists of uni-modal transformer modules that learn representations for each modality, and a multi-modal transformer module that fuses all modalities. All the modules are supervised by their corresponding labels separately, and the forward information flow is uni-directionally from the uni-modal modules to the multi-modal module. The supervision strategy and the model architecture guarantee each individual modality learns its representation independently, and meanwhile the multi-modal module aggregates all information. Extensive empirical results demonstrate that our proposed scheme outperforms state-of-the-art alternatives, corroborating the importance of modality independence in multi-modal emotion recognition. The dataset and codes are availabel at https://github.com/sunjunaimer/LFMIM
+ Multi-modal emotion recognition has gained increasing attention in recent years due to its widespread applications and the advances in multi-modal learning approaches. However, previous studies primarily focus on developing models that exploit the unification of multiple modalities. In this paper, we propose that maintaining modality independence is beneficial for the model performance. According to this principle, we construct a dataset, and devise a multi-modal transformer model. The new dataset, CHinese Emotion Recognition dataset with Modality-wise Annotations, abbreviated as CHERMA, provides uni-modal labels for each individual modality, and multi-modal labels for all modalities jointly observed. The model consists of uni-modal transformer modules that learn representations for each modality, and a multi-modal transformer module that fuses all modalities. All the modules are supervised by their corresponding labels separately, and the forward information flow is uni-directionally from the uni-modal modules to the multi-modal module. The supervision strategy and the model architecture guarantee each individual modality learns its representation independently, and meanwhile the multi-modal module aggregates all information. Extensive empirical results demonstrate that our proposed scheme outperforms state-of-the-art alternatives, corroborating the importance of modality independence in multi-modal emotion recognition. The dataset and codes are available at https://github.com/sunjunaimer/LFMIM
+
2023.acl-long.39
sun-etal-2023-layer
@@ -1631,7 +1633,8 @@
YunboCaoTencent Corporation
ZhifangSuiPeking University
2231-2243
- Video multimodal fusion aims to integrate multimodal signals in videos, such as visual, audio and text, to make a complementary prediction with multiple modalities contents. However, unlike other image-text multimodal tasks, video has longer multimodal sequences with more redundancy and noise in both visual and audio modalities. Prior denoising methods like forget gate are coarse in the granularity of noise filtering. They often suppress the redundant and noisy information at the risk of losing critical information. Therefore, we propose a denoising bottleneck fusion (DBF) model for fine-grained video multimodal fusion. On the one hand, we employ a bottleneck mechanism to filter out noise and redundancy with a restrained receptive field. On the other hand, we use a mutual information maximization module to regulate the filter-out module to preserve key information within different modalities. Our DBF model achieves significant improvement over current state-of-the-art baselines on multiple benchmarks covering multimodal sentiment analysis and multimodal summarization tasks. It proves that our model can effectively capture salient features from noisy and redundant video, audio, and text inputs. The code for this paper will be publicly available at https://github.com/WSXRHFG/DBF
+ Video multimodal fusion aims to integrate multimodal signals in videos, such as visual, audio and text, to make a complementary prediction with multiple modalities contents. However, unlike other image-text multimodal tasks, video has longer multimodal sequences with more redundancy and noise in both visual and audio modalities. Prior denoising methods like forget gate are coarse in the granularity of noise filtering. They often suppress the redundant and noisy information at the risk of losing critical information. Therefore, we propose a denoising bottleneck fusion (DBF) model for fine-grained video multimodal fusion. On the one hand, we employ a bottleneck mechanism to filter out noise and redundancy with a restrained receptive field. On the other hand, we use a mutual information maximization module to regulate the filter-out module to preserve key information within different modalities. Our DBF model achieves significant improvement over current state-of-the-art baselines on multiple benchmarks covering multimodal sentiment analysis and multimodal summarization tasks. It proves that our model can effectively capture salient features from noisy and redundant video, audio, and text inputs. The code for this paper will be publicly available at https://github.com/WSXRHFG/DBF
+
2023.acl-long.124
wu-etal-2023-denoising
10.18653/v1/2023.acl-long.124
@@ -1872,7 +1875,8 @@
Yong-BinKangSwinburne University of Technology
RifatShahriyarBangladesh University of Engineering and Technology
2541-2564
- We present CrossSum, a large-scale cross-lingual summarization dataset comprising 1.68 million article-summary samples in 1,500+ language pairs. We create CrossSum by aligning parallel articles written in different languages via cross-lingual retrieval from a multilingual abstractive summarization dataset and perform a controlled human evaluation to validate its quality. We propose a multistage data sampling algorithm to effectively train a cross-lingual summarization model capable of summarizing an article in any target language. We also introduce LaSE, an embedding-based metric for automatically evaluating model-generated summaries. LaSE is strongly correlated with ROUGE and, unlike ROUGE, can be reliably measured even in the absence of references in the target language. Performance on ROUGE and LaSE indicate that our proposed model consistently outperforms baseline models. To the best of our knowledge, CrossSum is the largest cross-lingual summarization dataset and the first ever that is not centered around English. We are releasing the dataset, training and evaluation scripts, and models to spur future research on cross-lingual summarization. The resources can be found at https://github.com/csebuetnlp/CrossSum
+ We present CrossSum, a large-scale cross-lingual summarization dataset comprising 1.68 million article-summary samples in 1,500+ language pairs. We create CrossSum by aligning parallel articles written in different languages via cross-lingual retrieval from a multilingual abstractive summarization dataset and perform a controlled human evaluation to validate its quality. We propose a multistage data sampling algorithm to effectively train a cross-lingual summarization model capable of summarizing an article in any target language. We also introduce LaSE, an embedding-based metric for automatically evaluating model-generated summaries. LaSE is strongly correlated with ROUGE and, unlike ROUGE, can be reliably measured even in the absence of references in the target language. Performance on ROUGE and LaSE indicates that our proposed model consistently outperforms baseline models. To the best of our knowledge, CrossSum is the largest cross-lingual summarization dataset and the first ever that is not centered around English. We are releasing the dataset, training and evaluation scripts, and models to spur future research on cross-lingual summarization. The resources can be found at https://github.com/csebuetnlp/CrossSum
+
2023.acl-long.143
bhattacharjee-etal-2023-crosssum
10.18653/v1/2023.acl-long.143
@@ -2090,7 +2094,8 @@
JanghoonHanLG AI Research
KyominJungSeoul National University
2832-2846
- Despite the recent advances in dialogue state tracking (DST), the joint goal accuracy (JGA) of the existing methods on MultiWOZ 2.1 still remains merely 60%. In our preliminary error analysis, we find that beam search produces a pool of candidates that is likely to include the correct dialogue state. Motivated by this observation, we introduce a novel framework, called BREAK (Beam search and RE-rAnKing), that achieves outstanding performance on DST. BREAK performs DST in two stages: (i) generating k-best dialogue state candidates with beam search and (ii) re-ranking the candidates to select the correct dialogue state. This simple yet powerful framework shows state-of-the-art performance on all versions of MultiWOZ and M2M datasets. Most notably, we push the joint goal accuracy to 80-90% on MultiWOZ 2.1-2.4, which is an improvement of 23.6%, 26.3%, 21.7%, and 10.8% over the previous best-performing models, respectively. The data and code will be available at https://github.com/tony-won/DST-BREAK
+ Despite the recent advances in dialogue state tracking (DST), the joint goal accuracy (JGA) of the existing methods on MultiWOZ 2.1 still remains merely 60%. In our preliminary error analysis, we find that beam search produces a pool of candidates that is likely to include the correct dialogue state. Motivated by this observation, we introduce a novel framework, called BREAK (Beam search and RE-rAnKing), that achieves outstanding performance on DST. BREAK performs DST in two stages: (i) generating k-best dialogue state candidates with beam search and (ii) re-ranking the candidates to select the correct dialogue state. This simple yet powerful framework shows state-of-the-art performance on all versions of MultiWOZ and M2M datasets. Most notably, we push the joint goal accuracy to 80-90% on MultiWOZ 2.1-2.4, which is an improvement of 23.6%, 26.3%, 21.7%, and 10.8% over the previous best-performing models, respectively. The data and code will be available at https://github.com/tony-won/DST-BREAK
+
2023.acl-long.159
won-etal-2023-break
10.18653/v1/2023.acl-long.159
@@ -2681,7 +2686,7 @@
RyanCotterellETH Zürich
JasonEisnerJohns Hopkins University + Microsoft Corporation
3687-3713
- We present Earley’s (1970) context-free parsing algorithm as a deduction system, incorporating various known and new speed-ups. In particular, our presentation supports a known worst-case runtime improvement from Earley’s (1970) O(N3|G||R|), which is unworkable for the large grammars that arise in natural language processing, to O(N3|G|), which matches the complexity of CKY on a binarized version of the grammar G. Here N is the length of the sentence, |R| is the number of productions in G, and |G| is the total length of those productions. We also provide a version that achieves runtime of O(N3|M|) with |M| leq |G| when the grammar is represented compactly as a single finite-state automaton M (this is partly novel). We carefully treat the generalization to semiring-weighted deduction, preprocessing the grammar like Stolcke (1995) to eliminate the possibility of deduction cycles, and further generalize Stolcke’s method to compute the weights of sentence prefixes. We also provide implementation details for efficient execution, ensuring that on a preprocessed grammar, the semiring-weighted versions of our methods have the same asymptotic runtime and space requirements as the unweighted methods, including sub-cubic runtime on some grammars.
+ We present Earley’s (1970) context-free parsing algorithm as a deduction system, incorporating various known and new speed-ups. In particular, our presentation supports a known worst-case runtime improvement from Earley’s (1970) O(N^3|G||R|), which is unworkable for the large grammars that arise in natural language processing, to O(N^3|G|), which matches the complexity of CKY on a binarized version of the grammar G. Here N is the length of the sentence, |R| is the number of productions in G, and |G| is the total length of those productions. We also provide a version that achieves runtime of O(N^3|M|) with |M| \leq |G| when the grammar is represented compactly as a single finite-state automaton M (this is partly novel). We carefully treat the generalization to semiring-weighted deduction, preprocessing the grammar like Stolcke (1995) to eliminate the possibility of deduction cycles, and further generalize Stolcke’s method to compute the weights of sentence prefixes. We also provide implementation details for efficient execution, ensuring that on a preprocessed grammar, the semiring-weighted versions of our methods have the same asymptotic runtime and space requirements as the unweighted methods, including sub-cubic runtime on some grammars.
2023.acl-long.204
opedal-etal-2023-efficient
10.18653/v1/2023.acl-long.204
@@ -3709,7 +3714,8 @@
MeghThakkarBITS Pilani
ChiXuNational University of Defense Technology
5145-5165
- Large-scale pre-trained language models have shown outstanding performance in a variety of NLP tasks. However, they are also known to be significantly brittle against specifically crafted adversarial examples, leading to increasing interest in probing the adversarial robustness of NLP systems. We introduce RSMI, a novel two-stage framework that combines randomized smoothing (RS) with masked inference (MI) to improve the adversarial robustness of NLP systems. RS transforms a classifier into a smoothed classifier to obtain robust representations, whereas MI forces a model to exploit the surrounding context of a masked token in an input sequence. RSMI improves adversarial robustness by 2 to 3 times over existing state-of-the-art methods on benchmark datasets. We also perform in-depth qualitative analysis to validate the effectiveness of the different stages of RSMI and probe the impact of its components through extensive ablations. By empirically proving the stability of RSMI, we put it forward as a practical method to robustly train large-scale NLP models. Our code and datasets are available at https://github.com/Han8931/rsmi_nlp
+ Large-scale pre-trained language models have shown outstanding performance in a variety of NLP tasks. However, they are also known to be significantly brittle against specifically crafted adversarial examples, leading to increasing interest in probing the adversarial robustness of NLP systems. We introduce RSMI, a novel two-stage framework that combines randomized smoothing (RS) with masked inference (MI) to improve the adversarial robustness of NLP systems. RS transforms a classifier into a smoothed classifier to obtain robust representations, whereas MI forces a model to exploit the surrounding context of a masked token in an input sequence. RSMI improves adversarial robustness by 2 to 3 times over existing state-of-the-art methods on benchmark datasets. We also perform in-depth qualitative analysis to validate the effectiveness of the different stages of RSMI and probe the impact of its components through extensive ablations. By empirically proving the stability of RSMI, we put it forward as a practical method to robustly train large-scale NLP models. Our code and datasets are available at https://github.com/Han8931/rsmi_nlp
+
2023.acl-long.282
moon-etal-2023-randomized
10.18653/v1/2023.acl-long.282
@@ -3977,7 +3983,8 @@
AshishSabharwalAllen Institute for AI (AI2)
KyleRichardsonthe Allen Institute for Artificial Intelligence
5514-5528
- Models trained with counterfactually augmented data learn representations of the causal structure of tasks, enabling robust generalization. However, high-quality counterfactual data is scarce for most tasks and not easily generated at scale. When crowdsourced, such data is typically limited in scale and diversity; when generated using supervised methods, it is computationally expensive to extend to new counterfactual dimensions. In this work, we introduce DISCO (DIStilled COunterfactual Data), a new method for automatically generating high-quality counterfactual data at scale. DISCO engineers prompts to generate phrasal perturbations with a large general language model. Then, a task-specific teacher model filters these generations to distill high-quality counterfactual data. While task-agnostic, we apply our pipeline to the task of natural language inference (NLI) and find that on challenging evaluations such as the NLI stress test, comparatively smaller student models trained with DISCO generated counterfactuals are more robust (6% absolute) and generalize better across distributions (2%) compared to models trained without data augmentation. Furthermore, DISCO augmented models are 10% more consistent between counterfactual pairs on three evaluation sets, demonstrating that DISCO augmentation enables models to more reliably learn causal representations. Our repository are available at: https://github.com/eric11eca/disco
+ Models trained with counterfactually augmented data learn representations of the causal structure of tasks, enabling robust generalization. However, high-quality counterfactual data is scarce for most tasks and not easily generated at scale. When crowdsourced, such data is typically limited in scale and diversity; when generated using supervised methods, it is computationally expensive to extend to new counterfactual dimensions. In this work, we introduce DISCO (DIStilled COunterfactual Data), a new method for automatically generating high-quality counterfactual data at scale. DISCO engineers prompts to generate phrasal perturbations with a large general language model. Then, a task-specific teacher model filters these generations to distill high-quality counterfactual data. While task-agnostic, we apply our pipeline to the task of natural language inference (NLI) and find that on challenging evaluations such as the NLI stress test, comparatively smaller student models trained with DISCO generated counterfactuals are more robust (6% absolute) and generalize better across distributions (2%) compared to models trained without data augmentation. Furthermore, DISCO augmented models are 10% more consistent between counterfactual pairs on three evaluation sets, demonstrating that DISCO augmentation enables models to more reliably learn causal representations. Our repository is available at: https://github.com/eric11eca/disco
+
2023.acl-long.302
chen-etal-2023-disco
10.18653/v1/2023.acl-long.302
@@ -4157,7 +4164,7 @@
RogerLevyMassachusetts Institute of Technology
YoonKimMIT
5747-5766
- We study grammar induction with mildly context-sensitive grammars for unsupervised discontinuous parsing. Using the probabilistic linear context-free rewriting system (LCFRS) formalism, our approach fixes the rule structure in advance and focuses on parameter learning with maximum likelihood. To reduce the computational complexity of both parsing and parameter estimation, we restrict the grammar formalism to LCFRS-2 (i.e., binary LCFRS with fan-out two) and further discard rules that require O(l6) time to parse, reducing inference to O(l5). We find that using a large number of nonterminals is beneficial and thus make use of tensor decomposition-based rank-space dynamic programming with an embedding-based parameterization of rule probabilities to scale up the number of nonterminals. Experiments on German and Dutch show that our approach is able to induce linguistically meaningful trees with continuous and discontinuous structures.
+ We study grammar induction with mildly context-sensitive grammars for unsupervised discontinuous parsing. Using the probabilistic linear context-free rewriting system (LCFRS) formalism, our approach fixes the rule structure in advance and focuses on parameter learning with maximum likelihood. To reduce the computational complexity of both parsing and parameter estimation, we restrict the grammar formalism to LCFRS-2 (i.e., binary LCFRS with fan-out two) and further discard rules that require O(l^6) time to parse, reducing inference to O(l^5). We find that using a large number of nonterminals is beneficial and thus make use of tensor decomposition-based rank-space dynamic programming with an embedding-based parameterization of rule probabilities to scale up the number of nonterminals. Experiments on German and Dutch show that our approach is able to induce linguistically meaningful trees with continuous and discontinuous structures.
2023.acl-long.316
yang-etal-2023-unsupervised
10.18653/v1/2023.acl-long.316
@@ -5706,7 +5713,8 @@
AlonHalevyFacebook AI
DiyiYangStanford University
7756-7776
- We present NormBank, a knowledge bank of 155k situational norms. This resource is designed to ground flexible normative reasoning for interactive, assistive, and collaborative AI systems. Unlike prior commonsense resources, NormBank grounds each inference within a multivalent sociocultural frame, which includes the setting (e.g., restaurant), the agents’ contingent roles (waiter, customer), their attributes (age, gender), and other physical, social, and cultural constraints (e.g., the temperature or the country of operation). In total, NormBank contains 63k unique constraints from a taxonomy that we introduce and iteratively refine here. Constraints then apply in different combinations to frame social norms. Under these manipulations, norms are non-monotonic — one can cancel an inference by updating its frame even slightly. Still, we find evidence that neural models can help reliably extend the scope and coverage of NormBank. We further demonstrate the utility of this resource with a series of transfer experiments. For data and code, see https://github.com/SALT-NLP/normbank
+ We present NormBank, a knowledge bank of 155k situational norms. This resource is designed to ground flexible normative reasoning for interactive, assistive, and collaborative AI systems. Unlike prior commonsense resources, NormBank grounds each inference within a multivalent sociocultural frame, which includes the setting (e.g., restaurant), the agents’ contingent roles (waiter, customer), their attributes (age, gender), and other physical, social, and cultural constraints (e.g., the temperature or the country of operation). In total, NormBank contains 63k unique constraints from a taxonomy that we introduce and iteratively refine here. Constraints then apply in different combinations to frame social norms. Under these manipulations, norms are non-monotonic — one can cancel an inference by updating its frame even slightly. Still, we find evidence that neural models can help reliably extend the scope and coverage of NormBank. We further demonstrate the utility of this resource with a series of transfer experiments. For data and code, see https://github.com/SALT-NLP/normbank
+
2023.acl-long.429
ziems-etal-2023-normbank
10.18653/v1/2023.acl-long.429
@@ -7145,7 +7153,8 @@
ChangLiuPeking University
DongyanZhaopku.edu.cn
9631-9646
- Event temporal relation extraction (ETRE) is usually formulated as a multi-label classification task, where each type of relation is simply treated as a one-hot label. This formulation ignores the meaning of relations and wipes out their intrinsic dependency. After examining the relation definitions in various ETRE tasks, we observe that all relations can be interpreted using the start and end time points of events. For example, relation Includes could be interpreted as event 1 starting no later than event 2 and ending no earlier than event 2. In this paper, we propose a unified event temporal relation extraction framework, which transforms temporal relations into logical expressions of time points and completes the ETRE by predicting the relations between certain time point pairs. Experiments on TB-Dense and MATRES show significant improvements over a strong baseline and outperform the state-of-the-art model by 0.3% on both datasets. By representing all relations in a unified framework, we can leverage the relations with sufficient data to assist the learning of other relations, thus achieving stable improvement in low-data scenarios. When the relation definitions are changed, our method can quickly adapt to the new ones by simply modifying the logic expressions that map time points to new event relations. The code is released at https://github.com/AndrewZhe/A-Unified-Framework-for-ETRE
+ Event temporal relation extraction (ETRE) is usually formulated as a multi-label classification task, where each type of relation is simply treated as a one-hot label. This formulation ignores the meaning of relations and wipes out their intrinsic dependency. After examining the relation definitions in various ETRE tasks, we observe that all relations can be interpreted using the start and end time points of events. For example, relation Includes could be interpreted as event 1 starting no later than event 2 and ending no earlier than event 2. In this paper, we propose a unified event temporal relation extraction framework, which transforms temporal relations into logical expressions of time points and completes the ETRE by predicting the relations between certain time point pairs. Experiments on TB-Dense and MATRES show significant improvements over a strong baseline and outperform the state-of-the-art model by 0.3% on both datasets. By representing all relations in a unified framework, we can leverage the relations with sufficient data to assist the learning of other relations, thus achieving stable improvement in low-data scenarios. When the relation definitions are changed, our method can quickly adapt to the new ones by simply modifying the logic expressions that map time points to new event relations. The code is released at https://github.com/AndrewZhe/A-Unified-Framework-for-ETRE
+
2023.acl-long.536
huang-etal-2023-classification
10.18653/v1/2023.acl-long.536
@@ -7918,9 +7927,11 @@
JieZhouTencent Inc.
10641-10658
Injecting external knowledge can improve the performance of pre-trained language models (PLMs) on various downstream NLP tasks. However, massive retraining is required to deploy new knowledge injection methods or knowledge bases for downstream tasks. In this work, we are the first to study how to improve the flexibility and efficiency of knowledge injection by reusing existing downstream models. To this end, we explore a new paradigm plug-and-play knowledge injection, where knowledge bases are injected into frozen existing downstream models by a knowledge plugin. Correspondingly, we propose a plug-and-play injection method map-tuning, which trains a mapping of knowledge embeddings to enrich model inputs with mapped embeddings while keeping model parameters frozen. Experimental results on three knowledge-driven NLP tasks show that existing injection methods are not suitable for the new paradigm, while map-tuning effectively improves the performance of downstream models. Moreover, we show that a frozen downstream model can be well adapted to different domains with different mapping networks of domain knowledge. Our code and models are available at https://github.com/THUNLP/Knowledge-Plugin.
- 2023.acl-long.594
+ 2023.acl-long.594
zhang-etal-2023-plug
10.18653/v1/2023.acl-long.594
+
+ Corrected the coding error in Figure 2.
Two Birds One Stone: Dynamic Ensemble for OOD Intent Classification
@@ -11725,7 +11736,8 @@
StevenBethardUniversity of Arizona
GuerganaSavovaBoston Children’s Hospital and Harvard Medical School
15746-15761
- The bias-variance tradeoff is the idea that learning methods need to balance model complexity with data size to minimize both under-fitting and over-fitting. Recent empirical work and theoretical analysis with over-parameterized neural networks challenges the classic bias-variance trade-off notion suggesting that no such trade-off holds: as the width of the network grows, bias monotonically decreases while variance initially increases followed by a decrease. In this work, we first provide a variance decomposition-based justification criteria to examine whether large pretrained neural models in a fine-tuning setting are generalizable enough to have low bias and variance. We then perform theoretical and empirical analysis using ensemble methods explicitly designed to decrease variance due to optimization. This results in essentially a two-stage fine-tuning algorithm that first ratchets down bias and variance iteratively, and then uses a selected fixed-bias model to further reduce variance due to optimization by ensembling. We also analyze the nature of variance change with the ensemble size in low- and high-resource classes. Empirical results show that this two-stage method obtains strong results on SuperGLUE tasks and clinical information extraction tasks. Code and settings are available: https://github.com/christa60/bias-var-fine-tuning-plms.git
+ The bias-variance tradeoff is the idea that learning methods need to balance model complexity with data size to minimize both under-fitting and over-fitting. Recent empirical work and theoretical analysis with over-parameterized neural networks challenges the classic bias-variance trade-off notion suggesting that no such trade-off holds: as the width of the network grows, bias monotonically decreases while variance initially increases followed by a decrease. In this work, we first provide a variance decomposition-based justification criteria to examine whether large pretrained neural models in a fine-tuning setting are generalizable enough to have low bias and variance. We then perform theoretical and empirical analysis using ensemble methods explicitly designed to decrease variance due to optimization. This results in essentially a two-stage fine-tuning algorithm that first ratchets down bias and variance iteratively, and then uses a selected fixed-bias model to further reduce variance due to optimization by ensembling. We also analyze the nature of variance change with the ensemble size in low- and high-resource classes. Empirical results show that this two-stage method obtains strong results on SuperGLUE tasks and clinical information extraction tasks. Code and settings are available: https://github.com/christa60/bias-var-fine-tuning-plms.git
+
2023.acl-long.877
wang-etal-2023-two
10.18653/v1/2023.acl-long.877
@@ -11884,7 +11896,8 @@
GauravVermaGeorgia Institute of Technology
SrijanKumarGeorgia Institute of Technology
15974-15990
- The robustness of multimodal deep learning models to realistic changes in the input text is critical for applicability on important tasks such as text-to-image retrieval and cross-modal entailment. To measure robustness, several existing approaches edit the text data, but without leveraging the cross-modal information present in multimodal data. Such information from the visual modality, such as color, size, and shape, provides additional attributes that users can include in their inputs. Thus, we propose cross-modal attribute insertions as a realistic perturbation strategy for vision-and-language data that inserts visual attributes of the objects in the image into the corresponding text (e.g., “girl on a chair” to “little girl on a wooden chair”). Our proposed approach for cross-modal attribute insertions is modular, controllable, and task-agnostic. We find that augmenting input text using cross-modal insertions causes state-of-the-art approaches for text-to-image retrieval and cross-modal entailment to perform poorly, resulting in relative drops of ~15% in MRR and ~20% in F1 score, respectively. Crowd-sourced annotations demonstrate that cross-modal insertions lead to higher quality augmentations for multimodal data than augmentations using text-only data, and are equivalent in quality to original examples. We release the code to encourage robustness evaluations of deep vision-and-language models: https://github.com/claws-lab/multimodal-robustness-xmai
+ The robustness of multimodal deep learning models to realistic changes in the input text is critical for applicability on important tasks such as text-to-image retrieval and cross-modal entailment. To measure robustness, several existing approaches edit the text data, but without leveraging the cross-modal information present in multimodal data. Such information from the visual modality, such as color, size, and shape, provides additional attributes that users can include in their inputs. Thus, we propose cross-modal attribute insertions as a realistic perturbation strategy for vision-and-language data that inserts visual attributes of the objects in the image into the corresponding text (e.g., “girl on a chair” to “little girl on a wooden chair”). Our proposed approach for cross-modal attribute insertions is modular, controllable, and task-agnostic. We find that augmenting input text using cross-modal insertions causes state-of-the-art approaches for text-to-image retrieval and cross-modal entailment to perform poorly, resulting in relative drops of ~15% in MRR and ~20% in F1 score, respectively. Crowd-sourced annotations demonstrate that cross-modal insertions lead to higher quality augmentations for multimodal data than augmentations using text-only data, and are equivalent in quality to original examples. We release the code to encourage robustness evaluations of deep vision-and-language models: https://github.com/claws-lab/multimodal-robustness-xmai
+
2023.acl-long.890
ramshetty-etal-2023-cross
10.18653/v1/2023.acl-long.890
@@ -12222,7 +12235,8 @@
Back to Patterns: Efficient Japanese Morphological Analysis with Feature-Sequence Trie
NaokiYoshinagaInstitute of Industrial Science, The University of Tokyo
13-23
- Accurate neural models are much less efficient than non-neural models and are useless for processing billions of social media posts or handling user queries in real time with a limited budget. This study revisits the fastest pattern-based NLP methods to make them as accurate as possible, thus yielding a strikingly simple yet surprisingly accurate morphological analyzer for Japanese. The proposed method induces reliable patterns from a morphological dictionary and annotated data. Experimental results on two standard datasets confirm that the method exhibits comparable accuracy to learning-based baselines, while boasting a remarkable throughput of over 1,000,000 sentences per second on a single modern CPU. The source code is available at https://www.tkl.iis.u-tokyo.ac.jp/ynaga/jagger/
+ Accurate neural models are much less efficient than non-neural models and are useless for processing billions of social media posts or handling user queries in real time with a limited budget. This study revisits the fastest pattern-based NLP methods to make them as accurate as possible, thus yielding a strikingly simple yet surprisingly accurate morphological analyzer for Japanese. The proposed method induces reliable patterns from a morphological dictionary and annotated data. Experimental results on two standard datasets confirm that the method exhibits comparable accuracy to learning-based baselines, while boasting a remarkable throughput of over 1,000,000 sentences per second on a single modern CPU. The source code is available at https://www.tkl.iis.u-tokyo.ac.jp/ynaga/jagger/
+
2023.acl-short.2
yoshinaga-2023-back
10.18653/v1/2023.acl-short.2
@@ -12263,7 +12277,7 @@
FranzNowakETH Zurich
RyanCotterellETH Zürich
57-69
- Multiple algorithms are known for efficiently calculating the prefix probability of a string under a probabilistic context-free grammar (PCFG). Good algorithms for the problem have a runtime cubic in the length of the input string. However, some proposed algorithms are suboptimal with respect to the size of the grammar. This paper proposes a new speed-up of Jelinek and Lafferty’s (1991) algorithm, which runs in O(n3|N|3 + |N|4), where n is the input length and |N| is the number of non-terminals in the grammar. In contrast, our speed-up runs in O(n2|N|3 + n3|N|2).
+ Multiple algorithms are known for efficiently calculating the prefix probability of a string under a probabilistic context-free grammar (PCFG). Good algorithms for the problem have a runtime cubic in the length of the input string. However, some proposed algorithms are suboptimal with respect to the size of the grammar. This paper proposes a new speed-up of Jelinek and Lafferty’s (1991) algorithm, which runs in O(n^3|N|^3 + |N|^4), where n is the input length and |N| is the number of non-terminals in the grammar. In contrast, our speed-up runs in O(n^2|N|^3 + n^3|N|^2).
2023.acl-short.6
nowak-cotterell-2023-fast
10.18653/v1/2023.acl-short.6
@@ -12910,9 +12924,11 @@
YadollahYaghoobzadehUniversity of Tehran
670-681
In recent years, there has been significant progress in developing pre-trained language models for NLP. However, these models often struggle when fine-tuned on small datasets. To address this issue, researchers have proposed various adaptation approaches. Prompt-based tuning is arguably the most common way, especially for larger models. Previous research shows that adding contrastive learning to prompt-based fine-tuning is effective as it helps the model generate embeddings that are more distinguishable between classes, and it can also be more sample-efficient as the model learns from positive and negative examples simultaneously. One of the most important components of contrastive learning is data augmentation, but unlike computer vision, effective data augmentation for NLP is still challenging. This paper proposes LM-CPPF, Contrastive Paraphrasing-guided Prompt-based Fine-tuning of Language Models, which leverages prompt-based few-shot paraphrasing using generative language models, especially large language models such as GPT-3 and OPT-175B, for data augmentation. Our experiments on multiple text classification benchmarks show that this augmentation method outperforms other methods, such as easy data augmentation, back translation, and multiple templates.
- 2023.acl-short.59
+ 2023.acl-short.59
abaskohi-etal-2023-lm
10.18653/v1/2023.acl-short.59
+
+ Updated Figure 1.
Considerations for meaningful sign language machine translation based on glosses
@@ -13343,7 +13359,8 @@
AlonLavieUnbabel/Carnegie Mellon University
AndréMartinsUnbabel, Instituto de Telecomunicacoes
1089-1105
- Neural metrics for machine translation evaluation, such as COMET, exhibit significant improvements in their correlation with human judgments, as compared to traditional metrics based on lexical overlap, such as BLEU. Yet, neural metrics are, to a great extent, “black boxes” returning a single sentence-level score without transparency about the decision-making process. In this work, we develop and compare several neural explainability methods and demonstrate their effectiveness for interpreting state-of-the-art fine-tuned neural metrics. Our study reveals that these metrics leverage token-level information that can be directly attributed to translation errors, as assessed through comparison of token-level neural saliency maps with Multidimensional Quality Metrics (MQM) annotations and with synthetically-generated critical translation errors. To ease future research, we release our code at: https://github.com/Unbabel/COMET/tree/explainable-metrics
+ Neural metrics for machine translation evaluation, such as COMET, exhibit significant improvements in their correlation with human judgments, as compared to traditional metrics based on lexical overlap, such as BLEU. Yet, neural metrics are, to a great extent, “black boxes” returning a single sentence-level score without transparency about the decision-making process. In this work, we develop and compare several neural explainability methods and demonstrate their effectiveness for interpreting state-of-the-art fine-tuned neural metrics. Our study reveals that these metrics leverage token-level information that can be directly attributed to translation errors, as assessed through comparison of token-level neural saliency maps with Multidimensional Quality Metrics (MQM) annotations and with synthetically-generated critical translation errors. To ease future research, we release our code at: https://github.com/Unbabel/COMET/tree/explainable-metrics
+
2023.acl-short.94
rei-etal-2023-inside
10.18653/v1/2023.acl-short.94
@@ -13698,7 +13715,7 @@
Jong-HyeokLeePohang University of Science and Technology
YunsuKimPOSTECH
1433-1441
- Automatic postediting (APE) is an automated process to refine a given machine translation (MT).Recent findings present that existing APE systems are not good at handling high-quality MTs even for a language pair with abundant data resources, English–German: the better the given MT is, the harder it is to decide what parts to edit and how to fix these errors. One possible solution to this problem is to instill deeper knowledge about the target language into the model. Thus, we propose a linguistically motivated method of regularization that is expected to enhance APE models’ understanding of the target language: a loss function that encourages symmetric self-attention on the given MT.Our analysis of experimental results demonstrates that the proposed method helps improving the state-of-the-art architecture’s APE quality for high-quality MTs.
+ Automatic postediting (APE) is an automated process to refine a given machine translation (MT). Recent findings present that existing APE systems are not good at handling high-quality MTs even for a language pair with abundant data resources, English–German: the better the given MT is, the harder it is to decide what parts to edit and how to fix these errors. One possible solution to this problem is to instill deeper knowledge about the target language into the model. Thus, we propose a linguistically motivated method of regularization that is expected to enhance APE models’ understanding of the target language: a loss function that encourages symmetric self-attention on the given MT. Our analysis of experimental results demonstrates that the proposed method helps improving the state-of-the-art architecture’s APE quality for high-quality MTs.
2023.acl-short.122
jung-etal-2023-bring
10.18653/v1/2023.acl-short.122
@@ -14425,7 +14442,8 @@
LIDA: A Tool for Automatic Generation of Grammar-Agnostic Visualizations and Infographics using Large Language Models
VictorDibiaMicrosoft Research
113-126
- Systems that support users in the automatic creation of visualizations must address several subtasks - understand the semantics of data, enumerate relevant visualization goals and generate visualization specifications. In this work, we pose visualization generation as a multi-stage generation problem and argue that well-orchestrated pipelines based on large language models (LLMs) and image generation models (IGMs) are suitable to addressing these tasks. We present LIDA, a novel tool for generating grammar-agnostic visualizations and infographics. LIDA comprises of 4 modules - A SUMMARIZER that converts data into a rich but compact natural language summary, a GOAL EXPLORER that enumerates visualization goals given the data, a VISGENERATOR that generates, refines, executes and filters visualization code and an INFOGRAPHER module that yields data-faithful stylized graphics using IGMs. LIDA provides a python api, and a hybrid user interface (direct manipulation and multilingual natural language) for interactive chart, infographics and data story generation. Code and demo are available at this url - https://microsoft.github.io/lida/
+ Systems that support users in the automatic creation of visualizations must address several subtasks - understand the semantics of data, enumerate relevant visualization goals and generate visualization specifications. In this work, we pose visualization generation as a multi-stage generation problem and argue that well-orchestrated pipelines based on large language models (LLMs) and image generation models (IGMs) are suitable to addressing these tasks. We present LIDA, a novel tool for generating grammar-agnostic visualizations and infographics. LIDA comprises of 4 modules - A SUMMARIZER that converts data into a rich but compact natural language summary, a GOAL EXPLORER that enumerates visualization goals given the data, a VISGENERATOR that generates, refines, executes and filters visualization code and an INFOGRAPHER module that yields data-faithful stylized graphics using IGMs. LIDA provides a python api, and a hybrid user interface (direct manipulation and multilingual natural language) for interactive chart, infographics and data story generation. Code and demo are available at this url - https://microsoft.github.io/lida/
+
2023.acl-demo.11
dibia-2023-lida
10.18653/v1/2023.acl-demo.11
@@ -14471,7 +14489,8 @@
KevinDuhJohns Hopkins University
PaulMcNameeJohns Hopkins University
161-168
- Hyperparameter optimization is an important but often overlooked process in the research of deep learning technologies. To obtain a good model, one must carefully tune hyperparameters that determine the architecture and training algorithm. Insufficient tuning may result in poor results, while inequitable tuning may lead to exaggerated differences between models. We present a hyperparameter optimization toolkit for neural machine translation (NMT) to help researchers focus their time on the creative rather than the mundane. The toolkit is implemented as a wrapper on top of the open-source Sockeye NMT software. Using the Asynchronous Successive Halving Algorithm (ASHA), we demonstrate that it is possible to discover near-optimal models under a computational budget with little effort. Code: https://github.com/kevinduh/sockeye-recipes3Video demo: https://cs.jhu.edu/kevinduh/j/demo.mp4
+ Hyperparameter optimization is an important but often overlooked process in the research of deep learning technologies. To obtain a good model, one must carefully tune hyperparameters that determine the architecture and training algorithm. Insufficient tuning may result in poor results, while inequitable tuning may lead to exaggerated differences between models. We present a hyperparameter optimization toolkit for neural machine translation (NMT) to help researchers focus their time on the creative rather than the mundane. The toolkit is implemented as a wrapper on top of the open-source Sockeye NMT software. Using the Asynchronous Successive Halving Algorithm (ASHA), we demonstrate that it is possible to discover near-optimal models under a computational budget with little effort. Code: https://github.com/kevinduh/sockeye-recipes3 Video demo: https://cs.jhu.edu/kevinduh/j/demo.mp4
+
2023.acl-demo.15
zhang-etal-2023-hyperparameter
10.18653/v1/2023.acl-demo.15
@@ -14575,7 +14594,8 @@
HassanSajjadDalhousie University
NadirDurraniQCRI
226-234
- Neuron analysis provides insights into how knowledge is structured in representations and discovers the role of neurons in the network. In addition to developing an understanding of our models, neuron analysis enables various applications such as debiasing, domain adaptation and architectural search. We present NeuroX, a comprehensive open-source toolkit to conduct neuron analysis of natural language processing models. It implements various interpretation methods under a unified API, and provides a framework for data processing and evaluation, thus making it easier for researchers and practitioners to perform neuron analysis. The Python toolkit is available at https://www.github.com/fdalvi/NeuroX.Demo Video available at: https://youtu.be/mLhs2YMx4u8
+ Neuron analysis provides insights into how knowledge is structured in representations and discovers the role of neurons in the network. In addition to developing an understanding of our models, neuron analysis enables various applications such as debiasing, domain adaptation and architectural search. We present NeuroX, a comprehensive open-source toolkit to conduct neuron analysis of natural language processing models. It implements various interpretation methods under a unified API, and provides a framework for data processing and evaluation, thus making it easier for researchers and practitioners to perform neuron analysis. The Python toolkit is available at https://www.github.com/fdalvi/NeuroX. Demo Video available at: https://youtu.be/mLhs2YMx4u8
+
2023.acl-demo.21
dalvi-etal-2023-neurox
10.18653/v1/2023.acl-demo.21
@@ -14585,7 +14605,8 @@
NianlongGuETH Zurich
Richard H.R.HahnloserETH Zurich
235-246
- Scientific writing involves retrieving, summarizing, and citing relevant papers, which can be time-consuming processes. Although in many workflows these processes are serially linked, there are opportunities for natural language processing (NLP) to provide end-to-end assistive tools. We propose SciLit, a pipeline that automatically recommends relevant papers, extracts highlights, and suggests a reference sentence as a citation of a paper, taking into consideration the user-provided context and keywords. SciLit efficiently recommends papers from large databases of hundreds of millions of papers using a two-stage pre-fetching and re-ranking literature search system that flexibly deals with addition and removal of a paper database. We provide a convenient user interface that displays the recommended papers as extractive summaries and that offers abstractively-generated citing sentences which are aligned with the provided context and which mention the chosen keyword(s). Our assistive tool for literature discovery and scientific writing is available at https://scilit.vercel.app
+ Scientific writing involves retrieving, summarizing, and citing relevant papers, which can be time-consuming processes. Although in many workflows these processes are serially linked, there are opportunities for natural language processing (NLP) to provide end-to-end assistive tools. We propose SciLit, a pipeline that automatically recommends relevant papers, extracts highlights, and suggests a reference sentence as a citation of a paper, taking into consideration the user-provided context and keywords. SciLit efficiently recommends papers from large databases of hundreds of millions of papers using a two-stage pre-fetching and re-ranking literature search system that flexibly deals with addition and removal of a paper database. We provide a convenient user interface that displays the recommended papers as extractive summaries and that offers abstractively-generated citing sentences which are aligned with the provided context and which mention the chosen keyword(s). Our assistive tool for literature discovery and scientific writing is available at https://scilit.vercel.app
+
2023.acl-demo.22
gu-hahnloser-2023-scilit
10.18653/v1/2023.acl-demo.22
@@ -14668,7 +14689,8 @@
IliaKuznetsovUKP Lab, Technische Universität Darmstadt
IrynaGurevychUKP Lab, Technische Universität Darmstadt
291-303
- Recent years have seen impressive progress in AI-assisted writing, yet the developments in AI-assisted reading are lacking. We propose inline commentary as a natural vehicle for AI-based reading assistance, and present CARE: the first open integrated platform for the study of inline commentary and reading. CARE facilitates data collection for inline commentaries in a commonplace collaborative reading environment, and provides a framework for enhancing reading with NLP-based assistance, such as text classification, generation or question answering. The extensible behavioral logging allows unique insights into the reading and commenting behavior, and flexible configuration makes the platform easy to deploy in new scenarios. To evaluate CARE in action, we apply the platform in a user study dedicated to scholarly peer review. CARE facilitates the data collection and study of inline commentary in NLP, extrinsic evaluation of NLP assistance, and application prototyping. We invite the community to explore and build upon the open source implementation of CARE.Github Repository: https://github.com/UKPLab/CAREPublic Live Demo: https://care.ukp.informatik.tu-darmstadt.de
+ Recent years have seen impressive progress in AI-assisted writing, yet the developments in AI-assisted reading are lacking. We propose inline commentary as a natural vehicle for AI-based reading assistance, and present CARE: the first open integrated platform for the study of inline commentary and reading. CARE facilitates data collection for inline commentaries in a commonplace collaborative reading environment, and provides a framework for enhancing reading with NLP-based assistance, such as text classification, generation or question answering. The extensible behavioral logging allows unique insights into the reading and commenting behavior, and flexible configuration makes the platform easy to deploy in new scenarios. To evaluate CARE in action, we apply the platform in a user study dedicated to scholarly peer review. CARE facilitates the data collection and study of inline commentary in NLP, extrinsic evaluation of NLP assistance, and application prototyping. We invite the community to explore and build upon the open source implementation of CARE. Github Repository: https://github.com/UKPLab/CARE Public Live Demo: https://care.ukp.informatik.tu-darmstadt.de
+
2023.acl-demo.28
zyska-etal-2023-care
@@ -14905,7 +14927,8 @@
KunMaoHuawei Cloud Computing Technologies
YongZhangHuawei Technologies Canada Co., Ltd
471-478
- We demonstrate an interactive system to help operations research (OR) practitioners convert the mathematical formulation of optimization problems from TeX document format into the solver modeling language. In practice, a manual translation is cumbersome and time-consuming. Moreover, it requires an in-depth understanding of the problem description and a technical expertise to produce the modeling code. Thus, our proposed system TeX2Solver helps partially automate this conversion and help the users build optimization models more efficiently. In this paper, we describe its interface and the components of the hierarchical parsing system. A video demo walk-through is available online at http://bit.ly/3kuOm3x
+ We demonstrate an interactive system to help operations research (OR) practitioners convert the mathematical formulation of optimization problems from TeX document format into the solver modeling language. In practice, a manual translation is cumbersome and time-consuming. Moreover, it requires an in-depth understanding of the problem description and a technical expertise to produce the modeling code. Thus, our proposed system TeX2Solver helps partially automate this conversion and help the users build optimization models more efficiently. In this paper, we describe its interface and the components of the hierarchical parsing system. A video demo walk-through is available online at http://bit.ly/3kuOm3x
+
2023.acl-demo.45
ramamonjison-etal-2023-latex2solver
10.18653/v1/2023.acl-demo.45
@@ -15038,7 +15061,8 @@
PavelSamyginYandex School of Data Analysis
ColinRaffelUniversity of North Carolina/Hugging Face
558-568
- Many NLP tasks benefit from using large language models (LLMs) that often have more than 100 billion parameters. With the release of BLOOM-176B and OPT-175B, everyone can download pretrained models of this scale. Still, using these models requires high-end hardware unavailable to many researchers. In some cases, LLMs can be used more affordably via RAM offloading or hosted APIs. However, these techniques have innate limitations: offloading is too slow for interactive inference, while APIs are not flexible enough for research that requires access to weights, attention or logits. In this work, we propose Petals - a system for inference and fine-tuning of large models collaboratively by joining the resources of multiple parties. We demonstrate that this strategy outperforms offloading for very large models, running inference of BLOOM-176B on consumer GPUs with ≈1 step per second, which is enough for many interactive LLM applications. Unlike most inference APIs, Petals also natively exposes hidden states of served models, allowing to train and share custom model extensions based on efficient fine-tuning methods. The system, its source code, and documentation are available at https://petals.mlVideo (2 min): https://youtu.be/F4muLI-0hTE
+ Many NLP tasks benefit from using large language models (LLMs) that often have more than 100 billion parameters. With the release of BLOOM-176B and OPT-175B, everyone can download pretrained models of this scale. Still, using these models requires high-end hardware unavailable to many researchers. In some cases, LLMs can be used more affordably via RAM offloading or hosted APIs. However, these techniques have innate limitations: offloading is too slow for interactive inference, while APIs are not flexible enough for research that requires access to weights, attention or logits. In this work, we propose Petals - a system for inference and fine-tuning of large models collaboratively by joining the resources of multiple parties. We demonstrate that this strategy outperforms offloading for very large models, running inference of BLOOM-176B on consumer GPUs with ≈1 step per second, which is enough for many interactive LLM applications. Unlike most inference APIs, Petals also natively exposes hidden states of served models, allowing to train and share custom model extensions based on efficient fine-tuning methods. The system, its source code, and documentation are available at https://petals.ml Video (2 min): https://youtu.be/F4muLI-0hTE
+
2023.acl-demo.54
borzunov-etal-2023-petals
10.18653/v1/2023.acl-demo.54
diff --git a/data/xml/2023.ccl.xml b/data/xml/2023.ccl.xml
index 05676a9de7..b4b578d3ff 100644
--- a/data/xml/2023.ccl.xml
+++ b/data/xml/2023.ccl.xml
@@ -1468,7 +1468,8 @@
CunliangKong
LinerYang
YangErhong
- GaoqiSun, Maosong andRao
+ MaosongSun
+ GaoqiRao
RenfenHu
ZhenghaoLiu
鸿翔常
diff --git a/data/xml/2023.clasp.xml b/data/xml/2023.clasp.xml
index 11d94bb22b..757bf41151 100644
--- a/data/xml/2023.clasp.xml
+++ b/data/xml/2023.clasp.xml
@@ -36,8 +36,10 @@
JanSnajder
11–24
Developed to alleviate prohibitive labeling costs, active learning (AL) methods aim to reduce label complexity in supervised learning. While recent work has demonstrated the benefit of using AL in combination with large pre-trained language models (PLMs), it has often overlooked the practical challenges that hinder the effectiveness of AL. We address these challenges by leveraging representation smoothness analysis to ensure AL is feasible, that is, both effective and practicable. Firstly, we propose an early stopping technique that does not require a validation set – often unavailable in realistic AL conditions – and observe significant improvements over random sampling across multiple datasets and AL methods. Further, we find that task adaptation improves AL, whereas standard short fine-tuning in AL does not provide improvements over random sampling. Our work demonstrates the usefulness of representation smoothness analysis for AL and introduces an AL stopping criterion that reduces label complexity.
- 2023.clasp-1.2
+ 2023.clasp-1.2
jukic-snajder-2023-smooth
+
+ Corrected typo in Table 1.
Entrenchment Matters: Investigating Positional and Constructional Sensitivity in Small and Large Language Models
diff --git a/data/xml/2023.dstc.xml b/data/xml/2023.dstc.xml
index 1343da40c1..ab3abe939f 100644
--- a/data/xml/2023.dstc.xml
+++ b/data/xml/2023.dstc.xml
@@ -189,10 +189,10 @@
Three Ways of Using Large Language Models to Evaluate Chat
OndřejPlátekCharles University, Faculty of Mathematics and Physics Institute of Formal and Applied Linguistics
- OndrejDusekCharles University
- PatriciaSchmidtovaCharles Universit
VojtechHudecekCharles University, Czech Republic
- MatheuszLangoCharles University
+ PatriciaSchmidtovaCharles University
+ MateuszLangoCharles University
+ OndrejDusekCharles University
113-122
This paper describes the systems submitted by team6 for ChatEval, the DSTC 11 Track 4 competition. We present three different approaches to predicting turn-level qualities of chatbot responses based on large language models (LLMs). We report improvement over the baseline using dynamic few-shot examples from a vector store for the prompts for ChatGPT. We also analyze the performance of the other two approaches and report needed improvements for future work. We developed the three systems over just two weeks, showing the potential of LLMs for this task. An ablation study conducted after the challenge deadline shows that the new Llama 2 models are closing the performance gap between ChatGPT and open-source LLMs. However, we find that the Llama 2 models do not benefit from few-shot examples in the same way as ChatGPT.
2023.dstc-1.14
@@ -216,7 +216,7 @@
HelenaMonizINESC-ID
JoaoPaulo CarvalhoINESC-ID / Instituto Superior Técnico, University of Lisbon, Portugal
AlonLavieUnbabel
- IsabelM TrancosoIST / INESC-ID
+ IsabelTrancosoIST / INESC-ID
133-143
Despite significant research effort in the development of automatic dialogue evaluation metrics, little thought is given to evaluating dialogues other than in English. At the same time, ensuring metrics are invariant to semantically similar responses is also an overlooked topic. In order to achieve the desired properties of robustness and multilinguality for dialogue evaluation metrics, we propose a novel framework that takes advantage of the strengths of current evaluation models with the newly-established paradigm of prompting Large Language Models (LLMs). Empirical results show our framework achieves state of the art results in terms of mean Spearman correlation scores across several benchmarks and ranks first place on both the Robust and Multilingual tasks of the DSTC11 Track 4 “Automatic Evaluation Metrics for Open-Domain Dialogue Systems”, proving the evaluation capabilities of prompted LLMs.
2023.dstc-1.16
@@ -364,7 +364,7 @@ for Open-Domain Dialogue Systems at DSTC 11 Track 4SarikGhazarianISI USC
JoãoSedocNew York University
LuisFernando D’HaroSpeech Technology and Machine Learning Group - Universidad Politécnica de Madrid
- AlexanderI. RudnickyCarnegie Mellon University
+ Alexander I.RudnickyCarnegie Mellon University
260-273
The advent and fast development of neural networks have revolutionized the research on dialogue systems and subsequently have triggered various challenges regarding their automatic evaluation. Automatic evaluation of open-domain dialogue systems as an open challenge has been the center of the attention of many researchers. Despite the consistent efforts to improve automatic metrics’ correlations with human evaluation, there have been very few attempts to assess their robustness over multiple domains and dimensions. Also, their focus is mainly on the English language. All of these challenges prompt the development of automatic evaluation metrics that are reliable in various domains, dimensions, and languages. This track in the 11th Dialogue System Technology Challenge (DSTC11) is part of the ongoing effort to promote robust and multilingual automatic evaluation metrics. This article describes the datasets and baselines provided to participants and discusses the submission and result details of the two proposed subtasks.
2023.dstc-1.28
diff --git a/data/xml/2023.eacl.xml b/data/xml/2023.eacl.xml
index 98625b5471..0b48bf0a3d 100644
--- a/data/xml/2023.eacl.xml
+++ b/data/xml/2023.eacl.xml
@@ -1945,7 +1945,8 @@
Myung HeeKimDefence Science Technology Group
JenniferBiggsDefence Science and Technology Group
1960-1970
- The Appraisal framework in linguistics defines the framework for fine-grained evaluations and opinions and has contributed to sentiment analysis and opinion mining. As developing appraisal-annotated resources requires tagging of several dimensions with distinct semantic taxonomies, it has been primarily conducted manually by human experts through expensive and time-consuming processes. In this paper, we study how to automatically identify and annotate text segments for appraisal. We formulate the problem as a sequence tagging problem and propose novel task and sentiment adapters based on language models for appraisal tagging. Our model, named Adaptive Appraisal (Aˆ2), achieves superior performance than baseline adapter-based models and other neural classification models, especially for cross-domain and cross-language settings. Source code for Aˆ2 is available at: https://github.com/ltian678/AA-code.git
+ The Appraisal framework in linguistics provides a framework for fine-grained evaluations and opinions and has contributed to sentiment analysis and opinion mining. As developing appraisal-annotated resources requires tagging of several dimensions with distinct semantic taxonomies, it has been primarily conducted manually by human experts through expensive and time-consuming processes. In this paper, we study how to automatically identify and annotate text segments for appraisal. We formulate the problem as a sequence tagging problem and propose novel task and sentiment adapters based on language models for appraisal tagging. Our model, named Adaptive Appraisal (A^2), achieves superior performance to baseline adapter-based models and other neural classification models, especially in cross-domain and cross-language settings. Source code for A^2 is available at: https://github.com/ltian678/AA-code.git
+
2023.eacl-main.144
tian-etal-2023-task
@@ -2128,7 +2129,8 @@
LaxmidharBeheraIIT Mandi/ Kanpur
PawanGoyalIIT Kharagpur
2164-2171
- In this work, we focus on low-resource dependency parsing for multiple languages. Several strategies are tailored to enhance performance in low-resource scenarios. While these are well-known to the community, it is not trivial to select the best-performing combination of these strategies for a low-resource language that we are interested in, and not much attention has been given to measuring the efficacy of these strategies. We experiment with 5 low-resource strategies for our ensembled approach on 7 Universal Dependency (UD) low-resource languages. Our exhaustive experimentation on these languages supports the effective improvements for languages not covered in pretrained models. We show a successful application of the ensembled system on a truly low-resource language Sanskrit. The code and data are available at: https://github.com/Jivnesh/SanDP
+ In this work, we focus on low-resource dependency parsing for multiple languages. Several strategies are tailored to enhance performance in low-resource scenarios. While these are well-known to the community, it is not trivial to select the best-performing combination of these strategies for a low-resource language that we are interested in, and not much attention has been given to measuring the efficacy of these strategies. We experiment with 5 low-resource strategies for our ensembled approach on 7 Universal Dependency (UD) low-resource languages. Our exhaustive experimentation on these languages supports the effective improvements for languages not covered in pretrained models. We show a successful application of the ensembled system on a truly low-resource language Sanskrit. The code and data are available at: https://github.com/Jivnesh/SanDP
+
2023.eacl-main.158
2023.eacl-main.158.software.zip
sandhan-etal-2023-systematic
@@ -3784,7 +3786,8 @@
KatharinaKleinen-von KönigslöwUniversität Hamburg
ChrisBiemannUniversität Hamburg
11-17
- WebAnno is one of the most popular annotation tools that supports generic annotation types and distributive annotation with multiple user roles. However, WebAnno focuses on annotating span-level mentions and relations among them, making document-level annotation complicated. When it comes to the annotation and analysis of social science materials, it usually involves the creation of codes to categorize a given document. The codes, which are known as codebooks, are typically hierarchical, which enables to code the document either with a general category or more fine-grained subcategories. CodeAnno is forked from WebAnno and designed to solve the coding problems faced by many social science researchers with the following main functionalities. 1) Creation of hierarchical codebooks, with functionality to move and sort categories in the hierarchy 2) an interactive UI for codebook annotation 3) import and export of annotations in CSV format, hence being compatible with existing annotations conducted using spreadsheet applications 4) integration of an external automation component to facilitate coding using machine learning 5) project templating that allows duplicating a project structure without copying the actual documents. We present different use-cases to demonstrate the capability of CodeAnno. A shot demonstration video of the system is available here: https://www.youtube.com/watch?v=RmCdTghBe-s
+ WebAnno is one of the most popular annotation tools that supports generic annotation types and distributive annotation with multiple user roles. However, WebAnno focuses on annotating span-level mentions and relations among them, making document-level annotation complicated. When it comes to the annotation and analysis of social science materials, it usually involves the creation of codes to categorize a given document. The codes, which are known as codebooks, are typically hierarchical, which enables coding the document either with a general category or with more fine-grained subcategories. CodeAnno is forked from WebAnno and designed to solve the coding problems faced by many social science researchers with the following main functionalities: 1) creation of hierarchical codebooks, with functionality to move and sort categories in the hierarchy; 2) an interactive UI for codebook annotation; 3) import and export of annotations in CSV format, hence being compatible with existing annotations conducted using spreadsheet applications; 4) integration of an external automation component to facilitate coding using machine learning; 5) project templating that allows duplicating a project structure without copying the actual documents. We present different use-cases to demonstrate the capability of CodeAnno. A short demonstration video of the system is available here: https://www.youtube.com/watch?v=RmCdTghBe-s
+
2023.eacl-demo.2
schneider-etal-2023-codeanno
@@ -3828,7 +3831,8 @@
YujiMatsumotoRIKEN Center for Advanced Intelligence Project (AIP), Japan
MinhNguyenJapan Advanced Institute of Science and Technology
35-42
- In recent years, COVID-19 has impacted all aspects of human life. As a result, numerous publications relating to this disease have been issued. Due to the massive volume of publications, some retrieval systems have been developed to provide researchers with useful information. In these systems, lexical searching methods are widely used, which raises many issues related to acronyms, synonyms, and rare keywords. In this paper, we present a hybrid relation retrieval system, CovRelex-SE, based on embeddings to provide high-quality search results. Our system can be accessed through the following URL: https://www.jaist.ac.jp/is/labs/nguyen-lab/systems/covrelex-se/
+ In recent years, COVID-19 has impacted all aspects of human life. As a result, numerous publications relating to this disease have been issued. Due to the massive volume of publications, some retrieval systems have been developed to provide researchers with useful information. In these systems, lexical searching methods are widely used, which raises many issues related to acronyms, synonyms, and rare keywords. In this paper, we present a hybrid relation retrieval system, CovRelex-SE, based on embeddings to provide high-quality search results. Our system can be accessed through the following URL: https://www.jaist.ac.jp/is/labs/nguyen-lab/systems/covrelex-se/
+
2023.eacl-demo.5
do-etal-2023-covrelex
@@ -3900,7 +3904,8 @@
Mus’abHusainiQcri
UmmarAbbasQcri
75-83
- The proliferation of deep neural networks in various domains has seen an increased need for the interpretability of these models, especially in scenarios where fairness and trust are as important as model performance. A lot of independent work is being carried out to: i) analyze what linguistic and non-linguistic knowledge is learned within these models, and ii) highlight the salient parts of the input. We present NxPlain, a web-app that provides an explanation of a model’s prediction using latent concepts. NxPlain discovers latent concepts learned in a deep NLP model, provides an interpretation of the knowledge learned in the model, and explains its predictions based on the used concepts. The application allows users to browse through the latent concepts in an intuitive order, letting them efficiently scan through the most salient concepts with a global corpus-level view and a local sentence-level view. Our tool is useful for debugging, unraveling model bias, and for highlighting spurious correlations in a model. A hosted demo is available here: https://nxplain.qcri.org
+ The proliferation of deep neural networks in various domains has seen an increased need for the interpretability of these models, especially in scenarios where fairness and trust are as important as model performance. A lot of independent work is being carried out to: i) analyze what linguistic and non-linguistic knowledge is learned within these models, and ii) highlight the salient parts of the input. We present NxPlain, a web-app that provides an explanation of a model’s prediction using latent concepts. NxPlain discovers latent concepts learned in a deep NLP model, provides an interpretation of the knowledge learned in the model, and explains its predictions based on the used concepts. The application allows users to browse through the latent concepts in an intuitive order, letting them efficiently scan through the most salient concepts with a global corpus-level view and a local sentence-level view. Our tool is useful for debugging, unraveling model bias, and for highlighting spurious correlations in a model. A hosted demo is available here: https://nxplain.qcri.org
+
2023.eacl-demo.10
dalvi-etal-2023-nxplain
@@ -3915,10 +3920,12 @@
MartinPotthastLeipzig University
84-95
We introduce small-text, an easy-to-use active learning library, which offers pool-based active learning for single- and multi-label text classification in Python. It features numerous pre-implemented state-of-the-art query strategies, including some that leverage the GPU. Standardized interfaces allow the combination of a variety of classifiers, query strategies, and stopping criteria, facilitating a quick mix and match, and enabling a rapid development of both active learning experiments and applications. With the objective of making various classifiers and query strategies accessible for active learning, small-text integrates several well-known machine learning libraries, namely scikit-learn, Pytorch, and Hugging Face transformers. The latter integrations are optionally installable extensions, so GPUs can be used but are not required. Using this new library, we investigate the performance of the recently published SetFit training paradigm, which we compare to vanilla transformer fine-tuning, finding that it matches the latter in classification accuracy while outperforming it in area under the curve. The library is available under the MIT License at https://github.com/webis-de/small-text, in version 1.3.0 at the time of writing.
- 2023.eacl-demo.11
+ 2023.eacl-demo.11
schroder-etal-2023-small
10.18653/v1/2023.eacl-demo.11
+
+ Minor updates.
kogito: A Commonsense Knowledge Inference Toolkit
@@ -3943,7 +3950,8 @@
DipanjanDasGoogle Research
MirellaLapataSchool of Informatics, University of Edinburgh
105-116
- While conditional generation models can now generate natural language well enough to create fluent text, it is still difficult to control the generation process, leading to irrelevant, repetitive, and hallucinated content. Recent work shows that planning can be a useful intermediate step to render conditional generation less opaque and more grounded. We present a web browser-based demonstration for query-focused summarization that uses a sequence of question-answer pairs, as a blueprint plan for guiding text generation (i.e., what to say and in what order). We illustrate how users may interact with the generated text and associated plan visualizations, e.g., by editing and modifying the plan in order to improve or control the generated output.A short video demonstrating our system is available at https://goo.gle/text-blueprint-demo
+ While conditional generation models can now generate natural language well enough to create fluent text, it is still difficult to control the generation process, leading to irrelevant, repetitive, and hallucinated content. Recent work shows that planning can be a useful intermediate step to render conditional generation less opaque and more grounded. We present a web browser-based demonstration for query-focused summarization that uses a sequence of question-answer pairs as a blueprint plan for guiding text generation (i.e., what to say and in what order). We illustrate how users may interact with the generated text and associated plan visualizations, e.g., by editing and modifying the plan in order to improve or control the generated output. A short video demonstrating our system is available at https://goo.gle/text-blueprint-demo
+
2023.eacl-demo.13
huot-etal-2023-text
@@ -4126,7 +4134,8 @@
IainMarshallKing’s College London
ByronWallaceNortheastern University
236-247
- In this work we present TrialsSummarizer, a system that aims to automatically summarize evidence presented in the set of randomized controlled trials most relevant to a given query. Building on prior work, the system retrieves trial publications matching a query specifying a combination of condition, intervention(s), and outcome(s), and ranks these according to sample size and estimated study quality. The top-k such studies are passed through a neural multi-document summarization system, yielding a synopsis of these trials. We consider two architectures: A standard sequence-to-sequence model based on BART, and a multi-headed architecture intended to provide greater transparency and controllability to end-users. Both models produce fluent and relevant summaries of evidence retrieved for queries, but their tendency to introduce unsupported statements render them inappropriate for use in this domain at present. The proposed architecture may help users verify outputs allowing users to trace generated tokens back to inputs. The demonstration video can be found at https://vimeo.com/735605060The prototype, source code, and model weights are available at: https://sanjanaramprasad.github.io/trials-summarizer/
+ In this work we present TrialsSummarizer, a system that aims to automatically summarize evidence presented in the set of randomized controlled trials most relevant to a given query. Building on prior work, the system retrieves trial publications matching a query specifying a combination of condition, intervention(s), and outcome(s), and ranks these according to sample size and estimated study quality. The top-k such studies are passed through a neural multi-document summarization system, yielding a synopsis of these trials. We consider two architectures: a standard sequence-to-sequence model based on BART, and a multi-headed architecture intended to provide greater transparency and controllability to end-users. Both models produce fluent and relevant summaries of evidence retrieved for queries, but their tendency to introduce unsupported statements renders them inappropriate for use in this domain at present. The proposed architecture may help users verify outputs by allowing them to trace generated tokens back to inputs. The demonstration video can be found at https://vimeo.com/735605060. The prototype, source code, and model weights are available at: https://sanjanaramprasad.github.io/trials-summarizer/
+
2023.eacl-demo.27
ramprasad-etal-2023-automatically
@@ -4189,7 +4198,8 @@
GeonsikMoonNational University of Singapore
Hwee TouNgNational University of Singapore
298-306
- In this paper, we present ALLECS, a lightweight web application to serve grammatical error correction (GEC) systems so that they can be easily used by the general public. We design ALLECS to be accessible to as many users as possible, including users who have a slow Internet connection and who use mobile phones as their main devices to connect to the Internet. ALLECS provides three state-of-the-art base GEC systems using two approaches (sequence-to-sequence generation and sequence tagging), as well as two state-of-the-art GEC system combination methods using two approaches (edit-based and text-based). ALLECS can be accessed at https://sterling8.d2.comp.nus.edu.sg/gec-demo/
+ In this paper, we present ALLECS, a lightweight web application to serve grammatical error correction (GEC) systems so that they can be easily used by the general public. We design ALLECS to be accessible to as many users as possible, including users who have a slow Internet connection and who use mobile phones as their main devices to connect to the Internet. ALLECS provides three state-of-the-art base GEC systems using two approaches (sequence-to-sequence generation and sequence tagging), as well as two state-of-the-art GEC system combination methods using two approaches (edit-based and text-based). ALLECS can be accessed at https://sterling8.d2.comp.nus.edu.sg/gec-demo/
+
2023.eacl-demo.32
qorib-etal-2023-allecs
diff --git a/data/xml/2023.finnlp.xml b/data/xml/2023.finnlp.xml
index ee95b845e6..b72b212605 100644
--- a/data/xml/2023.finnlp.xml
+++ b/data/xml/2023.finnlp.xml
@@ -78,16 +78,27 @@
Using Deep Learning to Find the Next Unicorn: A Practical Synthesis on Optimization Target, Feature Selection, Data Split and Evaluation Strategy
- LefterisLoukas
- IliasStogiannidis
- ProdromosMalakasiotis
- StavrosVassos
+ LeleCao
+ Vilhelmvon Ehrenheim
+ SebastianStan
+ XiaoxueLi
+ AlexandraLutz
63–73
2023.finnlp-1.6
loukas-etal-2023-using
Breaking the Bank with ChatGPT: Few-Shot Text Classification for Finance
+ LefterisLoukas
+ IliasStogiannidis
+ ProdromosMalakasiotis
+ StavrosVassos
+ 74–80
+ 2023.finnlp-1.7
+ liang-etal-2023-breaking
+
+
+ DeRisk: An Effective Deep Learning Framework for Credit Risk Prediction over Real-World Financial Data
YanchengLiang
JiajieZhang
HuiLi
@@ -97,26 +108,16 @@
JiaoyaoZhang
YongyanLiu
YiWu
- 74–80
- 2023.finnlp-1.7
- liang-etal-2023-breaking
-
-
- DeRisk: An Effective Deep Learning Framework for Credit Risk Prediction over Real-World Financial Data
- Braulio BlancoLambruschini
- PatriciaBecerra-Sanchez
- MatsBrorsson
- MaciejZurad
81–93
2023.finnlp-1.8
lambruschini-etal-2023-derisk
Reducing tokenizer’s tokens per word ratio in Financial domain with T-MuFin BERT Tokenizer
- SeethalakshmiGopalakrishnan
- Victor ZitianChen
- WenwenDou
- WlodekZadrozny
+ Braulio BlancoLambruschini
+ PatriciaBecerra-Sanchez
+ MatsBrorsson
+ MaciejZurad
94–103
2023.finnlp-1.9
gopalakrishnan-etal-2023-reducing
diff --git a/data/xml/2023.icard.xml b/data/xml/2023.icard.xml
index 71b5cd08fa..4eb1669bb0 100644
--- a/data/xml/2023.icard.xml
+++ b/data/xml/2023.icard.xml
@@ -37,8 +37,8 @@
Computational Analysis of Backchannel Usage and Overlap Length in Autistic Children
- GraceLawley
- PeterHeeman
+ Grace O.Lawley
+ Peter A.Heeman
StevenBedrick
17-23
2023.icard-1.3
diff --git a/data/xml/2023.inlg.xml b/data/xml/2023.inlg.xml
index 0fbf14453f..673c53dca1 100644
--- a/data/xml/2023.inlg.xml
+++ b/data/xml/2023.inlg.xml
@@ -129,7 +129,7 @@
Claim Optimization in Computational Argumentation
GabriellaSkitalinskaya
- MaximilianSpliethöver
+ MaximilianSpliethöver
HenningWachsmuth
134–152
An optimal delivery of arguments is key to persuasion in any debate, both for humans and for AI systems. This requires the use of clear and fluent claims relevant to the given debate. Prior work has studied the automatic assessment of argument quality extensively. Yet, no approach actually improves the quality so far. To fill this gap, this paper proposes the task of claim optimization: to rewrite argumentative claims in order to optimize their delivery. As multiple types of optimization are possible, we approach this task by first generating a diverse set of candidate claims using a large language model, such as BART, taking into account contextual information. Then, the best candidate is selected using various quality metrics. In automatic and human evaluation on an English-language corpus, our quality-based candidate selection outperforms several baselines, improving 60% of all claims (worsening 16% only). Follow-up analyses reveal that, beyond copy editing, our approach often specifies claims with details, whereas it adds less evidence than humans do. Moreover, its capabilities generalize well to other domains, such as instructional texts.
diff --git a/data/xml/2023.latechclfl.xml b/data/xml/2023.latechclfl.xml
index e98cbfad7a..c3a75d732c 100644
--- a/data/xml/2023.latechclfl.xml
+++ b/data/xml/2023.latechclfl.xml
@@ -40,7 +40,7 @@
BastienBernathEpfl
EtienneBoissonEpfl
TeoFerrariHes-so
- XavierTheimer-lienhardEpfl
+ XavierTheimer-LienhardEpfl
GiorgosVernikosÉcole Polytechnique Fédérale de Lausanne
10-20
Poem generation with language models requires the modeling of rhyming patterns. We propose a novel solution for learning to rhyme, based on synthetic data generated with a rule-based rhyming algorithm. The algorithm and an evaluation metric use a phonetic dictionary and the definitions of perfect and assonant rhymes. We fine-tune a GPT-2 English model with 124M parameters on 142 MB of natural poems and find that this model generates consecutive rhymes infrequently (11%). We then fine-tune the model on 6 MB of synthetic quatrains with consecutive rhymes (AABB) and obtain nearly 60% of rhyming lines in samples generated by the model. Alternating rhymes (ABAB) are more difficult to model because of longer-range dependencies, but they are still learnable from synthetic data, reaching 45% of rhyming lines in generated samples.
diff --git a/data/xml/2023.sigdial.xml b/data/xml/2023.sigdial.xml
index c5c30603fd..0af4ce3cc2 100644
--- a/data/xml/2023.sigdial.xml
+++ b/data/xml/2023.sigdial.xml
@@ -3,9 +3,9 @@
Proceedings of the 24th Meeting of the Special Interest Group on Discourse and Dialogue
- DavidSchlangen
SvetlanaStoyanchev
ShafiqJoty
+ DavidSchlangen
OndrejDusek
CaseyKennington
MaliheAlikhani
@@ -63,7 +63,7 @@
A Statistical Approach for Quantifying Group Difference in Topic Distributions Using Clinical Discourse Samples
- GraceLawley
+ Grace O.Lawley
Peter A.Heeman
Jill K.Dolata
EricFombonne
From 5e005a2b336005ab722213024f1fcfcfb0628fd0 Mon Sep 17 00:00:00 2001
From: anthology-assist <126604033+anthology-assist@users.noreply.github.com>
Date: Tue, 14 Nov 2023 20:58:45 -0600
Subject: [PATCH 05/12] MT Summit 2023 ingestion (#2832)
---
data/xml/2023.alt.xml | 120 +++++++
data/xml/2023.mtsummit.xml | 688 +++++++++++++++++++++++++++++++++++++
data/xml/2023.wat.xml | 89 +++++
data/yaml/venues/alt.yaml | 2 +
4 files changed, 899 insertions(+)
create mode 100644 data/xml/2023.alt.xml
create mode 100644 data/xml/2023.mtsummit.xml
create mode 100644 data/xml/2023.wat.xml
create mode 100644 data/yaml/venues/alt.yaml
diff --git a/data/xml/2023.alt.xml b/data/xml/2023.alt.xml
new file mode 100644
index 0000000000..580d993b1b
--- /dev/null
+++ b/data/xml/2023.alt.xml
@@ -0,0 +1,120 @@
+
+
+
+
+ Proceedings of ALT2023: Ancient Language Translation Workshop
+ Asia-Pacific Association for Machine Translation
+ Macau SAR, China
+ September
+ 2023
+ 2023.alt-1
+ alt
+
+
+ 2023.alt-1.0
+ alt-2023-alt2023
+
+
+ EvaHan2023: Overview of the First International Ancient Chinese Translation Bakeoff
+ DongboWang
+ LitaoLin
+ ZhixiaoZhao
+ WenhaoYe
+ KaiMeng
+ WenlongSun
+ LianzhenZhao
+ XueZhao
+ SiShen
+ WeiZhang
+ BinLi
+ 1–14
+ This paper presents the results of the First International Ancient Chinese Translation Bakeoff (EvaHan), which is a shared task of the Ancient Language Translation Workshop (ALT2023) and a co-located event of the 19th Edition of the Machine Translation Summit 2023 (MTS 2023). We describe the motivation for holding an international shared contest, as well as the datasets and tracks. The contest consists of two modalities, closed and open. In the closed modality, the participants are only allowed to use the training data; the participating teams achieved the highest BLEU scores of 27.3315 and 1.1102 in the tasks of translating Ancient Chinese to Modern Chinese and translating Ancient Chinese to English, respectively. In the open modality, contestants can use any available data and models. The participating teams achieved the highest BLEU scores of 29.6832 and 6.5493 in the Ancient-Chinese-to-Modern-Chinese and Ancient-Chinese-to-English tasks, respectively.
+ 2023.alt-1.1
+ wang-etal-2023-evahan2023
+
+
+ The Ups and Downs of Training RoBERTa-based models on Smaller Datasets for Translation Tasks from Classical Chinese into Modern Standard Mandarin and Modern English
+ Stuart MichaelMcManus
+ RoslinLiu
+ YujiLi
+ LeoTam
+ StephanieQiu
+ LetianYu
+ 15–22
+ The paper presents an investigation into the effectiveness of pre-trained language models, Siku-RoBERTa and RoBERTa, for Classical Chinese to Modern Standard Mandarin and Classical Chinese to English translation tasks. The English translation model resulted in unsatisfactory performance due to the small dataset, while the Modern Standard Mandarin model gave reasonable results.
+ 2023.alt-1.2
+ mcmanus-etal-2023-ups
+
+
+ Pre-trained Model In Ancient-Chinese-to-Modern-Chinese Machine Translation
+ JiahuiWang
+ XuqinZhang
+ JiahuanLi
+ ShujianHuang
+ 23–28
+ This paper presents an analysis of a pre-trained Transformer-based neural machine translation (NMT) model on the Ancient-Chinese-to-Modern-Chinese machine translation task.
+ 2023.alt-1.3
+ wang-etal-2023-pre-trained
+
+
+ Some Trials on Ancient Modern Chinese Translation
+ LiLin
+ XinyuHu
+ 29–33
+ In this study, we explored various neural machine translation techniques for the task of translating ancient Chinese into modern Chinese. Our aim was to find an effective method for achieving accurate and reliable translation results. After experimenting with different approaches, we discovered that the method of concatenating adjacent sentences yielded the best performance among all the methods tested.
+ 2023.alt-1.4
+ lin-hu-2023-trials
+
+
+ Istic Neural Machine Translation System for EvaHan 2023
+ NingyuanDeng
+ ShuaoGuo
+ YanqingHe
+ 34–42
+ This paper presents the system architecture and the technical details adopted by the Institute of Scientific and Technical Information of China (ISTIC) in the EvaHan 2023 evaluation. In this evaluation, ISTIC participated in two tasks of Ancient Chinese machine translation: Ancient Chinese to Modern Chinese and Ancient Chinese to English. The paper mainly elaborates on the model framework and data processing methods adopted in ISTIC’s system. Finally, a comparison and analysis of different machine translation systems is also given.
+ 2023.alt-1.5
+ deng-etal-2023-istic
+
+
+ BIT-ACT: An Ancient Chinese Translation System Using Data Augmentation
+ LiZeng
+ YanzhiTian
+ YingyuShan
+ YuhangGuo
+ 43–47
+ This paper describes a translation model for ancient Chinese to modern Chinese and English for the EvaHan 2023 competition, a subtask of the Ancient Language Translation 2023 challenge. During the training of our model, we applied various data augmentation techniques and used SiKu-RoBERTa as part of our model architecture. The results indicate that back translation improves the model’s performance, but double back translation introduces noise and harms the model’s performance. Fine-tuning on the original dataset can be helpful in solving the issue.
+ 2023.alt-1.6
+ zeng-etal-2023-bit
+
+
+ Technical Report on Ancient Chinese Machine Translation Based on mRASP Model
+ WenjingLiu
+ JingXie
+ 48–54
+ Abstract: Objective: This paper aims to improve the performance of machine translation of ancient Chinese classics, which can better promote research on ancient books and the spread of Chinese culture. Methods: Based on the multilingual machine translation pre-trained model mRASP, the model was fine-tuned on the specific language pairs a2m and a2e, corresponding to the two downstream tasks of translating classical Chinese into modern Chinese and into English, using ancient-to-modern Chinese parallel corpora and the Pre-Qin+ZiZhiTongJian ancient-Chinese-to-English parallel corpus, and the translation performance of the fine-tuned model was evaluated with the BLEU metric. Results: The BLEU4 scores of the three downstream tasks 24_histories_a2m, Pre-Qin+ZiZhiTongJian_a2m, and Pre-Qin+ZiZhiTongJian_a2e were 17.38, 13.69, and 12.90, respectively.
+ 2023.alt-1.7
+ liu-xie-2023-technical
+
+
+ AnchiLm: An Effective Classical-to-Modern Chinese Translation Model Leveraging bpe-drop and SikuRoBERTa
+ JiahuiZhu
+ SizhouChen
+ 55–60
+ In this paper, we present our submitted model for translating ancient to modern texts, which ranked sixth in the closed track of ancient Chinese in the 2nd International Review of Automatic Analysis of Ancient Chinese (EvaHan). Specifically, we employed two strategies to improve the translation from ancient to modern texts. First, we used bpe-drop to enhance the parallel corpus. Second, we used SikuRoBERTa to simultaneously initialize the translation model’s encoder and decoder and reconstruct the bpe word list. In our experiments, we compare the baseline model, rdrop, pre-trained model, and parameter initialization methods. The experimental results show that the parameter initialization method in this paper significantly outperforms the baseline model in terms of performance, and its BLEU score reaches 21.75.
+ 2023.alt-1.8
+ zhu-chen-2023-anchilm
+
+
+ Translating Ancient Chinese to Modern Chinese at Scale: A Large Language Model-based Approach
+ JiahuanCao
+ DezhiPeng
+ YongxinShi
+ ZongyuanJiang
+ LianwenJin
+ 61–69
+ Recently, the emergence of large language models (LLMs) has provided powerful foundation models for a wide range of natural language processing (NLP) tasks. However, the vast majority of the pre-training corpus for most existing LLMs is in English, resulting in their Chinese proficiency falling far behind that of English. Furthermore, ancient Chinese has a much larger vocabulary and less available corpus than modern Chinese, which significantly challenges the generalization capacity of existing LLMs. In this paper, we investigate Ancient-Chinese-to-Modern-Chinese (A2M) translation using LLMs including LLaMA and Ziya. Specifically, to improve the understanding of Chinese texts, we explore vocabulary expansion and incremental pre-training methods based on existing pre-trained LLMs. Subsequently, a large-scale A2M translation dataset with 4M pairs is utilized to finetune the LLMs. Experimental results demonstrate the effectiveness of the proposed method, especially with Ziya-13B, in translating ancient Chinese to modern Chinese. Moreover, we deeply analyze the performance of various LLMs with different strategies, which we believe can benefit further research on LLM-based A2M approaches.
+ 2023.alt-1.9
+ cao-etal-2023-translating
+
+
+
diff --git a/data/xml/2023.mtsummit.xml b/data/xml/2023.mtsummit.xml
new file mode 100644
index 0000000000..2b6e4bba5a
--- /dev/null
+++ b/data/xml/2023.mtsummit.xml
@@ -0,0 +1,688 @@
+
+
+
+
+ Proceedings of Machine Translation Summit XIX, Vol. 1: Research Track
+ MasaoUtiyama
+ RuiWang
+ Asia-Pacific Association for Machine Translation
+ Macau SAR, China
+ September
+ 2023
+ 2023.mtsummit-research
+ mtsummit
+
+
+ 2023.mtsummit-research.0
+ mtsummit-2023-machine
+
+
+ Multiloop Incremental Bootstrapping for Low-Resource Machine Translation
+ WuyingLiu
+ WeiLi
+ LinWang
+ 1–11
+ Due to the scarcity of high-quality bilingual sentence pairs, some deep-learning-based machine translation algorithms cannot achieve good performance in low-resource machine translation. On this basis, we integrate ideas from machine learning algorithm improvement and data augmentation, propose a novel multiloop incremental bootstrapping framework, and design the corresponding semi-supervised learning algorithm. This framework is a meta-framework independent of specific machine translation algorithms. The algorithm makes full use of bilingual seed data of appropriate scale and super-large-scale monolingual data to expand bilingual sentence pair data incrementally, and trains machine translation models step by step to improve the translation quality. The experimental results of neural machine translation on multiple language pairs prove that our proposed framework can make use of continuous monolingual data to improve itself. Its effectiveness is reflected not only in the easy implementation of state-of-the-art low-resource machine translation, but also in the practical option to quickly establish precise domain machine translation systems.
+ 2023.mtsummit-research.1
+ liu-etal-2023-multiloop
+
+
+ Joint Dropout: Improving Generalizability in Low-Resource Neural Machine Translation through Phrase Pair Variables
+ AliAraabi
+ VladNiculae
+ ChristofMonz
+ 12–25
+ Despite the tremendous success of Neural Machine Translation (NMT), its performance on low-resource language pairs still remains subpar, partly due to the limited ability to handle previously unseen inputs, i.e., generalization. In this paper, we propose a method called Joint Dropout that addresses the challenge of low-resource neural machine translation by substituting phrases with variables, resulting in a significant enhancement of compositionality, which is a key aspect of generalization. We observe a substantial improvement in translation quality for language pairs with minimal resources, as seen in BLEU and Direct Assessment scores. Furthermore, we conduct an error analysis and find that Joint Dropout also enhances the generalizability of low-resource NMT in terms of robustness and adaptability across different domains.
+ 2023.mtsummit-research.2
+ araabi-etal-2023-joint
+
+
+ A Study of Multilingual versus Meta-Learning for Language Model Pre-Training for Adaptation to Unseen Low Resource Languages
+ JyotsanaKhatri
+ RudraMurthy
+ Amar PrakashAzad
+ PushpakBhattacharyya
+ 26–34
+ In this paper, we compare two approaches to train a multilingual language model: (i) simple multilingual learning using data-mixing, and (ii) meta-learning. We examine the performance of these models by extending them to unseen language pairs and further finetuning them for the task of unsupervised NMT. We perform several experiments with varying amounts of data and give a comparative analysis of the approaches. We observe that both approaches give comparable performance, and meta-learning gives slightly better results in a few cases of low amounts of data. For the Oriya-Punjabi language pair, meta-learning performs better than multilingual learning when using 2M and 3M sentences.
+ 2023.mtsummit-research.3
+ khatri-etal-2023-study
+
+
+ Data Augmentation with Diversified Rephrasing for Low-Resource Neural Machine Translation
+ YuanGao
+ FengHou
+ HuiaJahnke
+ RuiliWang
+ 35–47
+ Data augmentation is an effective way to enhance the performance of neural machine translation models, especially for low-resource languages. Existing data augmentation methods are either at a token level or a sentence level. The data augmented using token level methods lack syntactic diversity and may alter original meanings. Sentence level methods usually generate low-quality source sentences that are not semantically paired with the original target sentences. In this paper, we propose a novel data augmentation method to generate diverse, high-quality and meaning-preserved new instances. Our method leverages high-quality translation models trained with high-resource languages to rephrase an original sentence by translating it into an intermediate language and then back to the original language. Through this process, the high-performing translation models guarantee the quality of the rephrased sentences, and the syntactic knowledge from the intermediate language can bring syntactic diversity to the rephrased sentences. Experimental results show our method can enhance the performance in various low-resource machine translation tasks. Moreover, by combining our method with other techniques that facilitate NMT, we can yield even better results.
+ 2023.mtsummit-research.4
+ gao-etal-2023-data
+
+
+ A Dual Reinforcement Method for Data Augmentation using Middle Sentences for Machine Translation
+ WenyiTang
+ YvesLepage
+ 48–58
+ This paper presents an approach to enhance the quality of machine translation by leveraging middle sentences as pivot points and employing dual reinforcement learning. Conventional methods for generating parallel sentence pairs for machine translation rely on parallel corpora, which may be scarce, resulting in limitations in translation quality. In contrast, our proposed method entails training two machine translation models in opposite directions, utilizing the middle sentence as a bridge for a virtuous feedback loop between the two models. This feedback loop resembles reinforcement learning, facilitating the models to make informed decisions based on mutual feedback. Experimental results substantiate that our proposed method significantly improves machine translation quality.
+ 2023.mtsummit-research.5
+ tang-lepage-2023-dual
+
+
+ Perturbation-based QE: An Explainable, Unsupervised Word-level Quality Estimation Method for Blackbox Machine Translation
+ Tu AnhDinh
+ JanNiehues
+ 59–71
+ Quality Estimation (QE) is the task of predicting the quality of Machine Translation (MT) system output, without using any gold-standard translation references. State-of-the-art QE models are supervised: they require human-labeled quality of some MT system output on some datasets for training, making them domain-dependent and MT-system-dependent. There has been research on unsupervised QE, which requires glass-box access to the MT systems, or parallel MT data to generate synthetic errors for training QE models. In this paper, we present Perturbation-based QE, a word-level Quality Estimation approach that works simply by analyzing MT system output on perturbed input source sentences. Our approach is unsupervised, explainable, and can evaluate any type of blackbox MT system, including the currently prominent large language models (LLMs) with opaque internal processes. For language directions with no labeled QE data, our approach has similar or better performance than the zero-shot supervised approach on the WMT21 shared task. Our approach is better at detecting gender bias and word-sense-disambiguation errors in translation than supervised QE, indicating its robustness to out-of-domain usage. The performance gap is larger when detecting errors on a nontraditional translation-prompting LLM, indicating that our approach is more generalizable to different MT systems. We give examples demonstrating our approach’s explainability power, where it shows which input source words have influence on a certain MT output word.
+ 2023.mtsummit-research.6
+ dinh-niehues-2023-perturbation
+
+
+ Semi-supervised Learning for Quality Estimation of Machine Translation
+ TarunBhatia
+ MartinKraemer
+ EduardoVellasques
+ EleftheriosAvramidis
+ 72–83
+ We investigate whether using semi-supervised learning (SSL) methods can be beneficial for the task of word-level Quality Estimation of Machine Translation in low resource conditions. We show that the Mean Teacher network can provide equal or significantly better MCC scores (up to +12%) than supervised methods when a limited amount of labeled data is available. Additionally, following previous work on SSL, we investigate Pseudo-Labeling in combination with SSL, which nevertheless does not provide consistent improvements.
+ 2023.mtsummit-research.7
+ bhatia-etal-2023-semi
+
+
+ Learning from Past Mistakes: Quality Estimation from Monolingual Corpora and Machine Translation Learning Stages
+ ThierryEtchegoyhen
+ DavidPonce
+ 84–98
+ Quality Estimation (QE) of Machine Translation output suffers from the lack of annotated data to train supervised models across domains and language pairs. In this work, we describe a method to generate synthetic QE data based on Neural Machine Translation (NMT) models at different learning stages. Our approach consists in training QE models on the errors produced by different NMT model checkpoints, obtained during the course of model training, under the assumption that gradual learning will induce errors that more closely resemble those produced by NMT models in adverse conditions. We test this approach on English-German and Romanian-English WMT QE test sets, demonstrating that pairing translations from earlier checkpoints with translations of converged models outperforms the use of reference human translations and can achieve competitive results against human-labelled data. We also show that combining post-edited data with our synthetic data yields significant improvements across the board. Our approach thus opens new possibilities for an efficient use of monolingual corpora to generate quality synthetic QE data, thereby mitigating the data bottleneck.
+ 2023.mtsummit-research.8
+ etchegoyhen-ponce-2023-learning
+
+
+ Exploring Domain-shared and Domain-specific Knowledge in Multi-Domain Neural Machine Translation
+ ZhiboMan
+ YujieZhang
+ YuanmengChen
+ YufengChen
+ JinanXu
+ 99–110
+ Currently, multi-domain neural machine translation (NMT) has become a significant research topic in domain adaptation for machine translation, which trains a single model by mixing data from multiple domains. Multi-domain NMT aims to improve the performance of low-resource domains through data augmentation. However, mixed-domain data brings more translation ambiguity. Previous work has focused on learning either domain-general or domain-specific knowledge, so acquiring both kinds of knowledge simultaneously remains a challenge. To this end, we propose a unified framework for simultaneously learning domain-general and domain-specific knowledge, and we are the first to apply parameter differentiation in multi-domain NMT. Specifically, we design the differentiation criterion and differentiation granularity to obtain domain-specific parameters. Experimental results on the multi-domain UM-Corpus English-to-Chinese and OPUS German-to-English datasets show that the average BLEU scores of the proposed method exceed the strong baseline by 1.22 and 1.87, respectively. In addition, we present a case study to illustrate the effectiveness of the proposed method in acquiring domain knowledge.
+ 2023.mtsummit-research.9
+ man-etal-2023-exploring
+
+
+ Enhancing Translation of Myanmar Sign Language by Transfer Learning and Self-Training
+ Hlaing MyatNwe
+ KiyoakiShirai
+ NatthawutKertkeidkachorn
+ ThanarukTheeramunkong
+ Ye KyawThu
+ ThepchaiSupnithi
+ NatsudaKaothanthong
+ 111–122
+ This paper proposes a method to develop a machine translation (MT) system from Myanmar Sign Language (MSL) to Myanmar Written Language (MWL) and vice versa for the deaf community. Translation of MSL is a difficult task since only a small amount of a parallel corpus between MSL and MWL is available. To address the challenge for MT of the low-resource language, transfer learning is applied. An MT model is trained first for a high-resource language pair, American Sign Language (ASL) and English, then it is used as an initial model to train an MT model between MSL and MWL. The mT5 model is used as a base MT model in this transfer learning. Additionally, a self-training technique is applied to generate synthetic translation pairs of MSL and MWL from a large monolingual MWL corpus. Furthermore, since the segmentation of a sentence is required as preprocessing of MT for the Myanmar language, several segmentation schemes are empirically compared. Results of experiments show that both transfer learning and self-training can enhance the performance of the translation between MSL and MWL compared with a baseline model fine-tuned from a small MSL-MWL parallel corpus only.
+ 2023.mtsummit-research.10
+ nwe-etal-2023-enhancing
+
+
+ Improving Embedding Transfer for Low-Resource Machine Translation
+ Van HienTran
+ ChenchenDing
+ HidekiTanaka
+ MasaoUtiyama
+ 123–134
+ Low-resource machine translation (LRMT) poses a substantial challenge due to the scarcity of parallel training data. This paper introduces a new method to improve the transfer of the embedding layer from the Parent model to the Child model in LRMT, utilizing trained token embeddings in the Parent model’s high-resource vocabulary. Our approach involves projecting all tokens into a shared semantic space and measuring the semantic similarity between tokens in the low-resource and high-resource languages. These measures are then utilized to initialize token representations in the Child model’s low-resource vocabulary. We evaluated our approach on three benchmark datasets of low-resource language pairs: Myanmar-English, Indonesian-English, and Turkish-English. The experimental results demonstrate that our method outperforms previous methods regarding translation quality. Additionally, our approach is computationally efficient, leading to reduced training time compared to prior works.
+ 2023.mtsummit-research.11
+ tran-etal-2023-improving
+
+
+ Boosting Unsupervised Machine Translation with Pseudo-Parallel Data
+ IvanaKvapilíková
+ OndřejBojar
+ 135–147
+ Even with the latest developments in deep learning and large-scale language modeling, the task of machine translation (MT) of low-resource languages remains a challenge. Neural MT systems can be trained in an unsupervised way without any translation resources but the quality lags behind, especially in truly low-resource conditions. We propose a training strategy that relies on pseudo-parallel sentence pairs mined from monolingual corpora in addition to synthetic sentence pairs back-translated from monolingual corpora. We experiment with different training schedules and reach an improvement of up to 14.5 BLEU points (English to Ukrainian) over a baseline trained on back-translated data only.
+ 2023.mtsummit-research.12
+ kvapilikova-bojar-2023-boosting
+
+
+ A Study on the Effectiveness of Large Language Models for Translation with Markup
+ RajDabre
+ BiankaBuschbeck
+ MiriamExel
+ HidekiTanaka
+ 148–159
+ In this paper we evaluate the utility of large language models (LLMs) for translation of text with markup, in which the most important and challenging aspect is to correctly transfer markup tags while ensuring that the content both inside and outside tags is correctly translated. While LLMs have been shown to be effective for plain text translation, their effectiveness for structured document translation is not well understood. To this end, we experiment with BLOOM and BLOOMZ, which are open-source multilingual LLMs, using zero-, one- and few-shot prompting, and compare with a domain-specific in-house NMT system using a detag-and-project approach for markup tags. We observe that LLMs with in-context learning exhibit poorer translation quality compared to the domain-specific NMT system; however, they are effective in transferring markup tags, especially the large BLOOM model (176 billion parameters). This is further confirmed by our human evaluation, which also reveals the types of errors of the different tag transfer techniques. While LLM-based approaches come with the risk of losing, hallucinating and corrupting tags, they excel at placing them correctly in the translation.
+ 2023.mtsummit-research.13
+ dabre-etal-2023-study
+
+
+ A Case Study on Context Encoding in Multi-Encoder based Document-Level Neural Machine Translation
+ RamakrishnaAppicharla
+ BabanGain
+ SantanuPal
+ AsifEkbal
+ 160–172
+ Recent studies have shown that multi-encoder models are agnostic to the choice of context and that the context encoder generates noise which helps to improve the models in terms of BLEU score. In this paper, we further explore this idea by training multi-encoder models on three different context settings, viz. the previous two sentences, two random sentences, and a mix of both as context, and evaluating them on a context-aware pronoun translation test set. Specifically, we evaluate the models on the ContraPro test set to study how different contexts affect pronoun translation accuracy. The results show that the model can perform well on the ContraPro test set even when the context is random. We also analyze the source representations to study whether the context encoder generates noise or not. Our analysis shows that the context encoder provides sufficient information to learn discourse-level information. Additionally, we observe that mixing the selected context (the previous two sentences in this case) and the random context is generally better than the other settings.
+ 2023.mtsummit-research.14
+ appicharla-etal-2023-case
+
+
+ In-context Learning as Maintaining Coherency: A Study of On-the-fly Machine Translation Using Large Language Models
+ SuzannaSia
+ KevinDuh
+ 173–185
+ The phenomenon of in-context learning has typically been thought of as “learning from examples”. In this work, which focuses on Machine Translation, we present a perspective of in-context learning as the desired generation task maintaining coherency with its context, i.e., the prompt examples. We first investigate randomly sampled prompts across four domains, and find that translation performance improves when shown in-domain prompts. Next, we investigate coherency for the in-domain setting, which uses prompt examples from a moving window. We study this with respect to other factors that have previously been identified in the literature, such as length, surface similarity and sentence embedding similarity. Our results across three models (GPTNeo2.7B, Bloom3B, XGLM2.9B) and three translation directions (en→{pt, de, fr}) suggest that the long-term coherency of the prompts and the test sentence is a good indicator of downstream translation performance. In doing so, we demonstrate the efficacy of in-context Machine Translation for on-the-fly adaptation.
+ 2023.mtsummit-research.15
+ sia-duh-2023-context
+
+
+ Beyond Correlation: Making Sense of the Score Differences of New MT Evaluation Metrics
+ Chi-kiuLo
+ RebeccaKnowles
+ CyrilGoutte
+ 186–199
+ While many new automatic metrics for machine translation evaluation have been proposed in recent years, BLEU scores are still used as the primary metric in the vast majority of MT research papers. There are many reasons that researchers may be reluctant to switch to new metrics, from external pressures (reviewers, prior work) to the ease of use of metric toolkits. Another reason is a lack of intuition about the meaning of novel metric scores. In this work, we examine “rules of thumb” about metric score differences and how they do (and do not) correspond to human judgments of statistically significant differences between systems. In particular, we show that common rules of thumb about BLEU score differences do not in fact guarantee that human annotators will find significant differences between systems. We also show ways in which these rules of thumb fail to generalize across translation directions or domains.
+ 2023.mtsummit-research.16
+ lo-etal-2023-beyond
+
+
+ Bad MT Systems are Good for Quality Estimation
+ IrynaTryhubyshyn
+ AlešTamchyna
+ OndřejBojar
+ 200–208
+ Quality estimation (QE) is the task of predicting quality of outputs produced by machine translation (MT) systems. Currently, the highest-performing QE systems are supervised and require training on data with golden quality scores. In this paper, we investigate the impact of the quality of the underlying MT outputs on the performance of QE systems. We find that QE models trained on datasets with lower-quality translations often outperform those trained on higher-quality data. We also demonstrate that good performance can be achieved by using a mix of data from different MT systems.
+ 2023.mtsummit-research.17
+ tryhubyshyn-etal-2023-bad
+
+
+ Improving Domain Robustness in Neural Machine Translation with Fused Topic Knowledge Embeddings
+ DanaiXezonaki
+ TalaatKhalil
+ DavidStap
+ BrandonDenis
+ 209–221
+ Domain robustness is a key challenge for Neural Machine Translation (NMT). Translating text from a different distribution than the training set requires NMT models to generalize well to unseen domains. In this work we propose a novel way to address domain robustness by fusing external topic knowledge into the NMT architecture. We employ a pretrained denoising autoencoder and fuse topic information into the system during continued pretraining and finetuning of the model on the downstream NMT task. Our results show that incorporating external topic knowledge, as well as additional pretraining, can improve the out-of-domain performance of NMT models. The proposed methodology matches the state of the art in out-of-domain performance. Our analysis shows that a low overlap between the pretraining and finetuning corpora, as well as the quality of topic representations, helps the NMT systems become more robust under domain shift.
+ 2023.mtsummit-research.18
+ xezonaki-etal-2023-improving
+
+
+ Instance-Based Domain Adaptation for Improving Terminology Translation
+ PrashanthNayak
+ JohnKelleher
+ RejwanulHaque
+ AndyWay
+ 222–234
+ Terms are essential indicators of a domain, and domain term translation is dealt with as a priority in any translation workflow. Translation service providers who use machine translation (MT) expect term translation to be unambiguous and consistent with the context and domain in question. Although current state-of-the-art neural MT (NMT) models are able to produce high-quality translations for many languages, they are still not at the level required when it comes to translating domain-specific terms. This study presents a terminology-aware instance-based adaptation method for improving terminology translation in NMT. We conducted our experiments for French-to-English and found that our proposed approach achieves a statistically significant improvement over the baseline NMT system in translating domain-specific terms. Specifically, the translation of multi-word terms is improved by 6.7% compared to the strong baseline.
+ 2023.mtsummit-research.19
+ nayak-etal-2023-instance
+
+
+ Learning from Mistakes: Towards Robust Neural Machine Translation for Disfluent L2 Sentences
+ Shuyue StellaLi
+ PhilippKoehn
+ 235–247
+ We study sentences written by second-language (L2) learners to improve the robustness of current neural machine translation (NMT) models on this type of data. Current large datasets used to train NMT systems are mostly Wikipedia or government documents written by highly competent speakers of that language, especially English. However, given that English is the most common second language, it is crucial that machine translation systems are robust against the large number of sentences written by L2 learners of English. By studying the difficulties faced by humans in their L2 acquisition process, we are able to transfer such insights to machine translation systems to recover from source-side fluency variations. In this work, we create additional training data with artificial errors similar to mistakes made by L2 learners of various fluency levels to improve the quality of the machine translation system. We test our method in zero-shot settings on the JFLEG-es (English-Spanish) dataset. The quality of our machine translation system on disfluent sentences outperforms the baseline by 1.8 BLEU points.
+ 2023.mtsummit-research.20
+ li-koehn-2023-learning
+
+
+ The Role of Compounds in Human vs. Machine Translation Quality
+ KristynaNeumannova
+ OndřejBojar
+ 248–260
+ We focus on the production of German compounds in English-to-German manual and automatic translation. Using the WMT21 news translation test set as an example, we observe that even the best MT systems produce far fewer compounds than three independent manual translations. Despite this striking difference, we observe that this insufficiency is not apparent in manual evaluation methods that target the overall translation quality (DA and MQM). Simple automatic methods like BLEU somewhat surprisingly provide a better indication of this quality aspect. Our manual analysis of system outputs, including our freshly trained Transformer models, confirms that current deep neural systems operating at the level of subword units are capable of constructing novel words, including novel compounds. This effect, however, cannot be measured using static dictionaries of compounds such as GermaNet. German compounds thus pose an interesting challenge for the future development of MT systems.
+ 2023.mtsummit-research.21
+ neumannova-bojar-2023-role
+
+
+ Benchmarking Dialectal Arabic-Turkish Machine Translation
+ HasanAlkheder
+ HoudaBouamor
+ NizarHabash
+ AhmetZengin
+ 261–271
+ Due to the significant influx of Syrian refugees in Turkey in recent years, the Syrian Arabic dialect has become increasingly prevalent in certain regions of Turkey. Developing a machine translation system between Turkish and Syrian Arabic would be crucial in facilitating communication between the Turkish and Syrian communities in these regions, which can have a positive impact on various domains such as politics, trade, and humanitarian aid. Such a system would also contribute positively to the growing Arab-focused tourism industry in Turkey. In this paper, we present the first research effort exploring translation between Syrian Arabic and Turkish. We use a set of 2,000 parallel sentences from the MADAR corpus containing 25 different city dialects from different cities across the Arab world, in addition to Modern Standard Arabic (MSA), English, and French. Additionally, we explore the translation performance into Turkish from other Arabic dialects and compare the results to the performance achieved when translating from Syrian Arabic. We build our MADAR-Turk data set by manually translating the set of 2,000 sentences from the Damascus dialect of Syria to Turkish with the help of two native Arabic speakers from Syria who are also highly fluent in Turkish. We evaluate the quality of the translations and report the results achieved. We make this first-of-a-kind data set publicly available to support research in machine translation between these important but less studied language pairs.
+ 2023.mtsummit-research.22
+ alkheder-etal-2023-benchmarking
+
+
+ Context-aware Neural Machine Translation for English-Japanese Business Scene Dialogues
+ SumireHonda
+ PatrickFernandes
+ ChrysoulaZerva
+ 272–285
+ Despite the remarkable advancements in machine translation, the current sentence-level paradigm faces challenges when dealing with highly-contextual languages like Japanese. In this paper, we explore how context-awareness can improve the performance of the current Neural Machine Translation (NMT) models for English-Japanese business dialogues translation, and what kind of context provides meaningful information to improve translation. As business dialogue involves complex discourse phenomena but offers scarce training resources, we adapted a pretrained mBART model, finetuning on multi-sentence dialogue data, which allows us to experiment with different contexts. We investigate the impact of larger context sizes and propose novel context tokens encoding extra-sentential information, such as speaker turn and scene type. We make use of Conditional Cross-Mutual Information (CXMI) to explore how much of the context the model uses and generalise CXMI to study the impact of the extra sentential context. Overall, we find that models leverage both preceding sentences and extra-sentential context (with CXMI increasing with context size) and we provide a more focused analysis on honorifics translation. Regarding translation quality, increased source-side context paired with scene and speaker information improves the model performance compared to previous work and our context-agnostic baselines, measured in BLEU and COMET metrics.
+ 2023.mtsummit-research.23
+ honda-etal-2023-context
+
+
+ A Context-Aware Annotation Framework for Customer Support Live Chat Machine Translation
+ MiguelMenezes
+ M. AminFarajian
+ HelenaMoniz
+ João VarelasGraça
+ 286–297
+ To measure the quality of context-aware machine translation (MT) systems, existing solutions have recommended that human annotators consider the full context of a document. In our work, we revised a well-known machine translation quality assessment framework, Multidimensional Quality Metrics (MQM) (Lommel et al., 2014), by introducing a set of nine annotation categories that allow mapping MT errors to source-document contextual phenomena; for simplicity’s sake we named such phenomena contextual triggers. Our analysis shows that the adapted category set enhanced MQM’s potential for MT error identification, covering up to 61% more errors compared to the traditional non-contextual core MQM application. Subsequently, we analyzed the severity of these MT “contextual errors”, showing that the majority fall under the critical and major levels, further indicating the impact of such errors. Finally, we measured the ability of existing evaluation metrics to detect the proposed MT “contextual errors”. The results show that current state-of-the-art metrics fall short in detecting MT errors that are caused by contextual triggers on the source-document side. With this work, we hope to understand how impactful context is for enhancing quality within an MT workflow and to draw attention to the future integration of the proposed contextual annotation framework into MQM’s core typology.
+ 2023.mtsummit-research.24
+ menezes-etal-2023-context
+
+
+ Targeted Data Augmentation Improves Context-aware Neural Machine Translation
+ HarritxuGete
+ ThierryEtchegoyhen
+ GorkaLabaka
+ 298–312
+ Progress in document-level Machine Translation is hindered by the lack of parallel training data that include context information. In this work, we evaluate the potential of data augmentation techniques to circumvent these limitations, showing that significant gains can be achieved via upsampling, similar context sampling and back-translations, targeted on context-relevant data. We apply these methods on standard document-level datasets in English-German and English-French and demonstrate their relevance to improve the translation of contextual phenomena. In particular, we show that relatively small volumes of targeted data augmentation lead to significant improvements over a strong context-concatenation baseline and standard back-translation of document-level data. We also compare the accuracy of the selected methods depending on data volumes or distance to relevant context information, and explore their use in combination.
+ 2023.mtsummit-research.25
+ gete-etal-2023-targeted
+
+
+ Target Language Monolingual Translation Memory based NMT by Cross-lingual Retrieval of Similar Translations and Reranking
+ TakuyaTamura
+ XiaotianWang
+ TakehitoUtsuro
+ MasaakiNagata
+ 313–323
+ Retrieve-edit-rerank is a text generation framework composed of three steps: retrieving sentences using the input sentence as a query, generating multiple output sentence candidates, and selecting the final output sentence from these candidates. This simple approach has outperformed other existing and more complex methods. This paper focuses on the retrieving and the reranking steps. In the retrieving step, we propose retrieving similar target language sentences from a target language monolingual translation memory using language-independent sentence embeddings generated by mSBERT or LaBSE. We demonstrate that this approach significantly outperforms existing methods that use monolingual inter-sentence similarity measures such as edit distance, which is only applicable to a parallel translation memory. In the reranking step, we propose a new reranking score for selecting the best sentences, which considers both the log-likelihood of each candidate and the sentence-embedding-based similarity between the input and the candidate. We evaluated the proposed method for English-to-Japanese translation on the ASPEC corpus and for English-to-French translation on the EU Bookshop Corpus (EUBC). The proposed method significantly exceeded the baseline in BLEU score, notably with a 1.4-point improvement on the EUBC dataset over the original retrieve-edit-rerank method.
+ 2023.mtsummit-research.26
+ tamura-etal-2023-target
+
+
+ Towards Zero-Shot Multilingual Poetry Translation
+ Wai LeiSong
+ HaoyunXu
+ Derek F.Wong
+ RunzheZhan
+ Lidia S.Chao
+ ShanshanWang
+ 324–335
+ The application of machine translation in the field of poetry has always presented significant challenges. Conventional machine translation techniques are inadequate for capturing and translating the unique style of poetry. The absence of a parallel poetry corpus and the distinctive structure of poetry further restrict the effectiveness of traditional methods. This paper introduces a zero-shot method that is capable of translating poetry style without the need for a large-scale training corpus. Specifically, we treat poetry translation as a standard machine translation problem and subsequently inject the poetry style upon completion of the translation process. Our injection model only requires back-translation and easily obtainable monolingual data, making it a low-cost solution. We conducted experiments on three translation directions and presented automatic and human evaluations, demonstrating that our proposed method outperforms existing online systems and other competitive baselines. These results validate the feasibility and potential of our proposed approach and provide new prospects for poetry translation.
+ 2023.mtsummit-research.27
+ song-etal-2023-towards
+
+
+ Leveraging Highly Accurate Word Alignment for Low Resource Translation by Pretrained Multilingual Model
+ JingyiZhu
+ MinatoKondo
+ TakuyaTamura
+ TakehitoUtsuro
+ MasaakiNagata
+ 336–347
+ Recently, there has been a growing interest in pretrained models in the field of natural language processing. As opposed to training models from scratch, pretrained models have been shown to produce superior results in low-resource translation tasks. In this paper, we introduced the use of pretrained seq2seq models for preordering and translation tasks. We utilized manual word alignment data and mBERT-based generated word alignment data for training preordering models and compared the effectiveness of various types of mT5 and mBART models for preordering. For the translation task, we chose mBART as our baseline model and evaluated several input manners. Our approach was evaluated on the Asian Language Treebank dataset, consisting of 20,000 parallel sentences in Japanese, English, and Hindi, with Japanese on either the source or target side. We also used 3,000 in-house parallel sentences in Chinese and Japanese. The results indicated that mT5-large trained with manual word alignment achieved a preordering performance exceeding a 0.9 RIBES score on Ja-En and Ja-Zh pairs. Moreover, our proposed approach significantly outperformed the baseline model in most translation directions of the Ja-En, Ja-Zh, and Ja-Hi pairs in at least one of the BLEU and COMET scores.
+ 2023.mtsummit-research.28
+ zhu-etal-2023-leveraging
+
+
+ Pivot Translation for Zero-resource Language Pairs Based on a Multilingual Pretrained Model
+ KenjiImamura
+ MasaoUtiyama
+ EiichiroSumita
+ 348–359
+ A multilingual translation model enables a single model to handle multiple languages. However, the translation quality for unlearned language pairs (i.e., zero-shot translation quality) is still poor. By contrast, pivot translation translates source texts into the target language via a pivot language such as English, thus enabling machine translation without parallel texts between the source and target languages. In this paper, we perform pivot translation using a multilingual model and compare it with direct translation. Without using direct source-target parallel texts, we improve translation quality by fine-tuning the model with machine-translated pseudo-translations. We also discuss what types of parallel texts are suitable for effectively improving translation quality in multilingual pivot translation.
+ 2023.mtsummit-research.29
+ imamura-etal-2023-pivot
+
+
+ Character-level NMT and language similarity
+ JosefJon
+ OndřejBojar
+ 360–371
+ We explore the effectiveness of character-level neural machine translation using Transformer architecture for various levels of language similarity and size of the training dataset. We evaluate the models using automatic MT metrics and show that translation between similar languages benefits from character-level input segmentation, while for less related languages, character-level vanilla Transformer-base often lags behind subword-level segmentation. We confirm previous findings that it is possible to close the gap by finetuning the already trained subword-level models to character-level.
+ 2023.mtsummit-research.30
+ jon-bojar-2023-character
+
+
+ Negative Lexical Constraints in Neural Machine Translation
+ JosefJon
+ DusanVaris
+ MichalNovák
+ João PauloAires
+ OndřejBojar
+ 372–384
+ This paper explores negative lexical constraining in English to Czech neural machine translation. Negative lexical constraining is used to prohibit certain words or expressions in the translation produced by the NMT model. We compared various methods based on modifying either the decoding process or the training data. The comparison was performed on two tasks: paraphrasing and feedback-based translation refinement. We also studied how the methods “evade” the constraints, meaning that the disallowed expression is still present in the output, but in a changed form, most interestingly the case where a different surface form (for example different inflection) is produced. We propose a way to mitigate the issue through training with stemmed negative constraints, so that the ability of the model to induce different forms of a word might be used to prohibit the usage of all possible forms of the constraint. This helps to some extent, but the problem still persists in many cases.
+ 2023.mtsummit-research.31
+ jon-etal-2023-negative
+
+
+ Post-editing of Technical Terms based on Bilingual Example Sentences
+ Elsie K. Y.Chan
+ JohnLee
+ ChesterCheng
+ BenjaminTsou
+ 385–392
+ As technical fields become ever more specialized, and with the continuous emergence of novel technical terms, it may not always be possible to engage bilingual experts in the field to perform translation. This paper investigates the performance of bilingual non-experts in Computer-Assisted Translation. The translators were asked to identify and correct errors in the MT output of technical terms in patent materials, aided only by example bilingual sentences. Targeting English-to-Chinese translation, we automatically extract the example sentences from a bilingual corpus of English and Chinese patents. We identify the most frequent translation candidates of a term, and then select the most relevant example sentences for each candidate according to semantic similarity. Even when given only two example sentences per translation candidate, the non-expert translators were able to post-edit effectively, correcting 67.2% of the MT errors while mistakenly revising correct MT output in only 17% of the cases.
+ 2023.mtsummit-research.32
+ chan-etal-2023-post
+
+
+ A Filtering Approach to Object Region Detection in Multimodal Machine Translation
+ AliHatami
+ PaulBuitelaar
+ MihaelArcan
+ 393–405
+ Recent studies in Multimodal Machine Translation (MMT) have explored the use of visual information in a multimodal setting to analyze its redundancy with textual information. The aim of this work is to develop a more effective approach to incorporating relevant visual information into the translation process and improve the overall performance of MMT models. This paper proposes an object-level filtering approach in Multimodal Machine Translation, where the approach is applied to object regions extracted from an image to filter out irrelevant objects based on the image captions to be translated. Using the filtered image helps the model to consider only relevant objects and their relative locations to each other. Different matching methods, including string matching and word embeddings, are employed to identify relevant objects. Gaussian blurring is used to soften irrelevant objects from the image and to evaluate the effect of object filtering on translation quality. The performance of the filtering approaches was evaluated on the Multi30K dataset in English to German, French, and Czech translations, based on BLEU, ChrF2, and TER metrics.
+ 2023.mtsummit-research.33
+ hatami-etal-2023-filtering
+
+
+
+
+ Proceedings of Machine Translation Summit XIX, Vol. 2: Users Track
+ MasaruYamada
+ Felixdo Carmo
+ Asia-Pacific Association for Machine Translation
+ Macau SAR, China
+ September
+ 2023
+ 2023.mtsummit-users
+ mtsummit
+
+
+ 2023.mtsummit-users.0
+ mtsummit-2023-machine-translation
+
+
+ Exploring undergraduate translation students’ perceptions towards machine translation: A qualitative questionnaire survey
+ JiaZhang
+ 1–10
+ Machine translation (MT) has relatively recently been introduced in higher education institutions, with specialised courses provided for students. However, such courses are often offered at the postgraduate level or towards the last year of an undergraduate programme (e.g., Arenas & Moorkens, 2019; Doherty et al., 2012). Most previous studies have focussed on postgraduate students or undergraduate students in the last year of their programme and surveyed their perceptions or attitudes towards MT with quantitative questionnaires (e.g., Liu et al., 2022; Yang et al., 2021), yet undergraduate students earlier in their translation education remain overlooked. As such, not much is known about how they perceive and use MT and what their training needs may be. This study investigates the perceptions towards MT of undergraduate students at the early stage of translator training via qualitative questionnaires. Year-two translation students with little or no MT knowledge and no real-life translation experience (n=20) were asked to fill out a questionnaire with open-ended questions. Their answers were manually analysed by the researcher using NVivo to identify themes and arguments. It was revealed that even without proper training, the participants recognised MT’s potential advantages and disadvantages to a certain degree. MT was more often used as an instrument for learning language and translation than as a translation tool per se. None of the students reported post-editing machine-generated translation in their translation assignments. Instead, they referenced MT output to understand terms, slang, fixed combinations and complicated sentences and to produce accurate, authentic and diversified phrases and sentences. They held a positive attitude towards MT quality and agreed that MT increased their translation quality, and they felt more confident with the tasks.
While they were willing to experiment with MT as a translation tool and perform post-editing in future tasks, they were doubtful that MT could be introduced in the classroom at their current stage of translation learning. They feared that MT would impact their independent and critical thinking. Students did not mention any potential negative impacts of MT on the development of their language proficiency or translation competency. It is hoped that the findings will make an evidence-based contribution to the design of MT curricula and teaching pedagogies. Keywords: machine translation, post-editing, translator training, perception, attitudes, teaching pedagogy References: Arenas, A. G., & Moorkens, J. (2019). Machine translation and post-editing training as part of a master’s programme. Journal of Specialised Translation, 31, 217–238. Doherty, S., Kenny, D., & Way, A. (2012). Taking statistical machine translation to the student translator. Proceedings of the 10th Conference of the Association for Machine Translation in the Americas: Commercial MT User Program. Liu, K., Kwok, H. L., Liu, J., & Cheung, A. K. (2022). Sustainability and influence of machine translation: Perceptions and attitudes of translation instructors and learners in Hong Kong. Sustainability, 14(11), 6399. Yang, Y., Wang, X., & Yuan, Q. (2021). Measuring the usability of machine translation in the classroom context. Translation and Interpreting Studies, 16(1), 101–123.
+ 2023.mtsummit-users.1
+ zhang-2023-exploring
+
+
+ MT and legal translation: applications in training
+ SuzanaCunha
+ 11–23
+ This paper investigates the introduction of machine translation (MT) in the legal translation class by means of a pilot study conducted with two groups of students. Both groups took courses in legal translation, but only one was familiarised with post-editing (PE). The groups post-edited an extract of a Portuguese company formation document, translated by an open-access neural machine translation (NMT) system and, subsequently, reflected on the assigned task. Although the scope of the study was limited, it was sufficient to confirm that prior exposure to machine translation post-editing (MTPE) did not significantly alter either group’s editing operations. The pilot study is part of a broader investigation into how technology affects the decision-making process of trainee legal translators, and its results contributed to fine-tuning a methodological tool that aims to integrate MTPE procedures into an existing process-oriented legal translation approach developed by Prieto Ramos (2014). The study was repeated this year; this time, both groups of trainees were introduced to and used the tool in class. A comparison of the two studies’ results is expected to provide insight into the productive use of MTPE in other domain-specific texts.
+ 2023.mtsummit-users.2
+ cunha-2023-mt
+
+
+ Technology Preparedness and Translator Training: Implications for Pedagogy
+ HariVenkatesan
+ 24–41
+ With increasing acknowledgement of the enhanced quality now achievable by Machine Translation, new possibilities have emerged in translation, both vis-à-vis the division of labour between human and machine in the translation process and the acceptability of lower-quality language in exchange for efficiency. This paper presents surveys of four cohorts of post-graduate students of translation from the University of Macau to see if perceived trainee awareness and preparedness has kept pace with these possibilities. It is found that trainees across the years generally lack confidence in their perceived awareness, are hesitant in employing MT, and show definite reservations when reconsidering issues such as quality and division of labour. While the number of respondents is small, it is interesting to note that the awareness and preparedness mentioned above are found to be similar across the four years. The implication for training is that technology be fully integrated into the translation process in order to provide trainees with a template/framework to handle diverse situations, particularly those that require offering translations of a lower quality with a short turnaround time. The focus here is on Chinese-English translation, but the discussion may find resonance with other language pairs. Keywords: Translator training, Computer-Assisted Translation, Machine Translation, translation pedagogy, Chinese-English translation
+ 2023.mtsummit-users.3
+ venkatesan-2023-technology
+
+
+ Reception of machine-translated and human-translated subtitles – A case study
+ FrederikeSchierl
+ 42–53
+ Accessibility and inclusion have become key terms of the last decades, and this does not exclude linguistics. Machine-translated subtitling has become the new approach to overcoming linguistic accessibility barriers, since it has proven to be fast and thus cost-efficient for audiovisual media, as opposed to human translation, which is time-intensive and costly. Machine translation can be considered a solution when a translation is urgently needed. Overall, studies researching the benefits of subtitling yield different results, depending also on the application context (see Chan et al. 2022, Hu et al. 2020). Still, the acceptance of machine-translated subtitles is limited (see Tuominen et al., 2023) and users are rather skeptical, especially regarding the quality of MT subtitles. In the presented project, I investigated the effects of machine-translated subtitling (raw machine translation) compared to human-translated subtitling on the consumer, presenting the results of a case study, knowing that HT as the gold standard for translation is increasingly put into question and being aware of today’s convincing NMT output. The presented study investigates the use of (machine-translated) subtitles by the average consumer due to the current strong societal interest. I base my research project on the 3 R concept, i.e. response, reaction, and repercussion (Gambier, 2009): participants were asked to watch two video presentations on educational topics, one in German and another in Finnish, subtitled either with machine translation or by a human translator, or in a mixed condition (machine-translated and human-translated). Subtitle languages are English, German, and Finnish. Afterwards, they were asked to respond to questions on the video content (information retrieval) and evaluate the subtitles based on the User Experience Questionnaire (Laugwitz et al., 2008) and NASA Task Load Index (NASA, 2006).
The case study shows that information retrieval in the HT conditions is higher, except for the direction Finnish-German. However, users generally report a better user experience for all languages, which indicates higher immersion. Participants also report that long subtitles combined with a fast pace contribute to more stress and more distraction from the other visual elements. Generally, users recognise the potential of MT subtitles, but also state that a human-in-the-loop is still needed to ensure publishable quality. References: Chan, Win Shan, Jan-Louis Kruger, and Stephen Doherty. 2022. ‘An Investigation of Subtitles as Learning Support in University Education’. Journal of Specialised Translation, no. 38: 155–79. Gambier, Yves. 2009. ‘Challenges in Research on Audiovisual Translation.’ In Translation Research Projects 2, edited by Pym, Anthony and Alexander Perekrestenko, 17–25. Tarragona: Intercultural Studies Group. Hu, Ke, Sharon O’Brien, and Dorothy Kenny. 2020. ‘A Reception Study of Machine Translated Subtitles for MOOCs’. Perspectives 28 (4): 521–38. https://doi.org/10.1080/0907676X.2019.1595069. Laugwitz, Bettina, Theo Held, and Martin Schrepp. 2008. ‘Construction and Evaluation of a User Experience Questionnaire’. In Symposium of the Austrian HCI and Usability Engineering Group, edited by Andreas Holzinger, 63–76. Springer. NASA. 2006. ‘NASA TLX: Task Load Index’. Tuominen, Tiina, Maarit Koponen, Kaisa Vitikainen, Umut Sulubacak, and Jörg Tiedemann. 2023. ‘Exploring the Gaps in Linguistic Accessibility of Media: The Potential of Automated Subtitling as a Solution’. Journal of Specialised Translation, no. 39: 77–89.
+ 2023.mtsummit-users.4
+ schierl-2023-reception
+
+
+ Machine Translation Implementation in Automatic Subtitling from a Subtitlers’ Perspective
+ BinaXie
+ 54–64
+ In recent years, automatic subtitling has gained considerable scholarly attention. Implementing machine translation in subtitling editors, a primary process in automatic subtitling, still faces challenges, and there remains a significant research gap concerning machine translation implementation in automatic subtitling. This project compared videos with different levels of non-verbal input, subtitled from English into Simplified Chinese, to examine post-editing effort in automatic subtitling. The research collected the following data: process logs, which record the total time spent on the subtitles and keystrokes, and a user experience questionnaire (UEQ). Twelve subtitlers from a translation agency in Mainland China were invited to complete the task. The results show that there are no significant differences between videos with low and high levels of non-verbal input in terms of time spent. Furthermore, the subtitlers spent more effort on revising spotting and segmentation than on translation when they post-edited texts with a high level of non-verbal input. While a majority of subtitlers show a positive attitude towards the application of machine translation, their apprehension lies in potential overreliance on its usage.
+ 2023.mtsummit-users.5
+ xie-2023-machine
+
+
+ Improving Standard German Captioning of Spoken Swiss German: Evaluating Multilingual Pre-trained Models
+ Jonathan DavidMutal
+ PierretteBouillon
+ JohannaGerlach
+ MarianneStarlander
+ 65–76
+ Multilingual pre-trained language models are often the best alternative in low-resource settings. In the context of a cascade architecture for automatic Standard German captioning of spoken Swiss German, we evaluate different models on the task of transforming normalised Swiss German ASR output into Standard German. Instead of training a large model from scratch, we fine-tuned publicly available pre-trained models, which reduces the cost of training high-quality neural machine translation models. Results show that pre-trained multilingual models achieve the highest scores, and that a higher number of languages included in pre-training improves the performance. We also observed that the type of source and target included in fine-tuning data impacts the results.
+ 2023.mtsummit-users.6
+ mutal-etal-2023-improving
+
+
+ Leveraging Multilingual Knowledge Graph to Boost Domain-specific Entity Translation of ChatGPT
+ MinZhang
+ LiminLiu
+ ZhaoYanqing
+ XiaosongQiao
+ SuChang
+ XiaofengZhao
+ JunhaoZhu
+ MingZhu
+ SongPeng
+ YingluLi
+ YilunLiu
+ WenbingMa
+ MengyaoPiao
+ ShiminTao
+ HaoYang
+ YanfeiJiang
+ 77–87
+ Recently, ChatGPT has shown promising results for Machine Translation (MT) in general domains and is becoming a new paradigm for translation. In this paper, we focus on how to apply ChatGPT to domain-specific translation and propose to leverage Multilingual Knowledge Graph (MKG) to help ChatGPT improve the domain entity translation quality. To achieve this, we extract the bilingual entity pairs from MKG for the domain entities that are recognized from source sentences. We then introduce these pairs into translation prompts, instructing ChatGPT to use the correct translations of the domain entities. To evaluate the novel MKG method for ChatGPT, we conduct comparative experiments on three Chinese-English (zh-en) test datasets constructed from three specific domains, of which one domain is from biomedical science, and the other two are from the Information and Communications Technology (ICT) industry — Visible Light Communication (VLC) and wireless domains. Experimental results demonstrate that both the overall translation quality of ChatGPT (+6.21, +3.13 and +11.25 in BLEU scores) and the translation accuracy of domain entities (+43.2%, +30.2% and +37.9% absolute points) are significantly improved with MKG on the three test datasets.
+ 2023.mtsummit-users.7
+ zhang-etal-2023-leveraging
+
+
+ Human-in-the-loop Machine Translation with Large Language Model
+ XinyiYang
+ RunzheZhan
+ Derek F.Wong
+ JunchaoWu
+ Lidia S.Chao
+ 88–98
+ The large language model (LLM) has garnered significant attention due to its in-context learning mechanisms and emergent capabilities. The research community has conducted several pilot studies to apply LLMs to machine translation tasks and evaluate their performance from diverse perspectives. However, previous research has primarily focused on the LLM itself and has not explored human intervention in the inference process of LLM. The characteristics of LLM, such as in-context learning and prompt engineering, closely mirror human cognitive abilities in language tasks, offering an intuitive solution for human-in-the-loop generation. In this study, we propose a human-in-the-loop pipeline that guides LLMs to produce customized outputs with revision instructions. The pipeline initiates by prompting the LLM to produce a draft translation, followed by the utilization of automatic retrieval or human feedback as supervision signals to enhance the LLM’s translation through in-context learning. The human-machine interactions generated in this pipeline are also stored in an external database to expand the in-context retrieval database, enabling us to leverage human supervision in an offline setting. We evaluate the proposed pipeline using the GPT-3.5-turbo API on five domain-specific benchmarks for German-English translation. The results demonstrate the effectiveness of the pipeline in tailoring in-domain translations and improving translation performance compared to direct translation instructions. Additionally, we discuss the experimental results from the following perspectives: 1) the effectiveness of different in-context retrieval methods; 2) the construction of a retrieval database under low-resource scenarios; 3) the observed differences across selected domains; 4) the quantitative analysis of sentence-level and word-level statistics; and 5) the qualitative analysis of representative translation cases.
+ 2023.mtsummit-users.8
+ yang-etal-2023-human
+
+
+ The impact of machine translation on the translation quality of undergraduate translation students
+ JiaZhang
+ HongQian
+ 99–108
+ Post-editing (PE) refers to checking, proofreading, and revising the translation output of any automated translation (Gouadec, 2007, p. 25). It is needed because the meaning of a text cannot yet be accurately and fluently conveyed by machine translation (MT) alone. The importance of PE and, accordingly, PE training has been widely acknowledged, and specialised courses have recently been introduced across universities and other organisations worldwide. However, scant consideration is given to when PE skills should be introduced in translation training. PE courses are usually offered to advanced translation learners, i.e., those at the postgraduate level or in the last year of an undergraduate program. Also, existing empirical studies most often investigate the impact of MT on postgraduate students or undergraduate students in the last year of their study. This paper reports on a study that aims to determine the possible effects of MT and PE on the translation quality of students at the early stage of translator training, i.e., undergraduate translation students with only basic translation knowledge. Methodologically, an experiment was conducted to compare students’ (n=10) PEMT-based translations with from-scratch translations produced without the assistance of machine translation. Second-year students of an undergraduate translation programme were invited to translate two English texts of similar difficulty into Chinese. One of the texts was translated directly, while the other was done with reference to machine-generated translation. Translation quality can be dynamic. When examined from different perspectives using different methods, the quality of a translation can vary. Several methods of translation quality assessment were adopted in this project, including rubrics-based scoring, error analysis and fixed-point translation analysis. It was found that the quality of students’ PE translations was compromised compared with that of from-scratch translations.
In addition, errors were more homogenised in the PEMT-based translations. It is hoped that this study can shed some light on the role of PEMT in translator training and contribute to the curricula and course design of post-editing for translator education. Reference: Gouadec, D. (2007). Translation as a Profession. John Benjamins Publishing. Keywords: machine translation, post-editing, translator training, translation quality assessment, error analysis, undergraduate students
+ 2023.mtsummit-users.9
+ zhang-qian-2023-impact
+
+
+ Leveraging Latent Topic Information to Improve Product Machine Translation
+ BryanZhang
+ StephanWalter
+ AmitaMisra
+ LilingTan
+ 109–118
+ Meeting the expectations of e-commerce customers involves offering a seamless online shopping experience in their preferred language. To achieve this, modern e-commerce platforms rely on machine translation systems to provide multilingual product information on a large scale. However, maintaining high-quality machine translation that can keep up with the ever-expanding volume of product data remains an open challenge for industrial machine translation systems. In this context, topical clustering emerges as a valuable approach, leveraging latent signals and interpretable textual patterns to potentially enhance translation quality and facilitate industry-scale translation data discovery. This paper proposes two innovative methods: topic-based data selection and topic-signal augmentation, both utilizing latent topic clusters to improve the quality of machine translation in e-commerce. Furthermore, we present a data discovery workflow that utilizes topic clusters to effectively manage the growing multilingual product catalogs, addressing the challenges posed by their expansion.
+ 2023.mtsummit-users.10
+ zhang-etal-2023-leveraging-latent
+
+
+ Translating Dislocations or Parentheticals: Investigating the Role of Prosodic Boundaries for Spoken Language Translation of French into English
+ NicolasBallier
+ BehnooshNamdarzadeh
+ MariaZimina
+ Jean-BaptisteYunès
+ 119–131
+ This paper examines some of the effects of prosodic boundaries on ASR outputs and Spoken Language Translations into English for two competing French structures (“c’est” dislocation vs. “c’est” parentheticals). One native speaker of French read 104 test sentences that were then submitted to two systems. We compared the outputs of two toolkits, SYSTRAN Pure Neural Server (SPNS9) (Crego et al., 2016) and Whisper. For SPNS9, we compared the translation of the text file used for the reading with the translation of the transcription generated through Vocapia ASR. We also tested the transcription engine for speech recognition by uploading an MP3 file and used the same procedure for OpenAI’s Whisper (Web-scale Supervised Pretraining for Speech Recognition) system (Radford et al., 2022). We reported WER for the transcription tasks and the BLEU scores for the different models. We evidenced the variability of the punctuation in the ASR outputs and discussed it in relation to the duration of the utterance. We discussed the effects of the prosodic boundaries. We described the status of the boundary in the speech-to-text systems, discussing the consequences for neural machine translation of rendering the prosodic boundary as a comma, a full stop, or any other punctuation symbol. We used the reference transcript of the reading phase to compute the edit distance between the reference transcript and the ASR output. We also used textometric analyses with iTrameur (Fleury and Zimina, 2014) for insights into the errors that can be attributed to ASR or to neural machine translation.
+ 2023.mtsummit-users.11
+ ballier-etal-2023-translating
+
+
+ Exploring Multilingual Pretrained Machine Translation Models for Interactive Translation
+ AngelNavarro
+ FranciscoCasacuberta
+ 132–142
+ Pre-trained large language models (LLMs) constitute very important tools in many artificial intelligence applications. In this work, we explore the use of these models in interactive machine translation environments. In particular, we have chosen mBART (multilingual Bidirectional and Auto-Regressive Transformer) as one of these LLMs. The system enables users to refine the translation output interactively by providing feedback. The system utilizes a two-step process, where the NMT (Neural Machine Translation) model generates a preliminary translation in the first step, and the user performs one correction in the second step, repeating the process until the sentence is correctly translated. We assessed the performance of both mBART and the fine-tuned version by comparing them to a state-of-the-art machine translation model on a benchmark dataset with regard to user effort, WSR (Word Stroke Ratio), and MAR (Mouse Action Ratio). The experimental results indicate that all the models performed comparably, suggesting that mBART is a viable option for an interactive machine translation environment, as it eliminates the need to train a model from scratch for this particular task. The implications of this finding extend to the development of new machine translation models for interactive environments, as it indicates that novel pre-trained models exhibit state-of-the-art performance in this domain, highlighting the potential benefits of adapting these models to specific needs.
+ 2023.mtsummit-users.12
+ navarro-casacuberta-2023-exploring
+
+
+ Machine translation of Korean statutes examined from the perspective of quality and productivity
+ JieunLee
+ HyoeunChoi
+ 143–151
+ Because machine translation (MT) still falls short of human parity, human intervention is needed to ensure quality translation. The existing literature indicates that machine translation post-editing (MTPE) generally enhances translation productivity, but the question of quality remains for domain-specific texts (e.g. Aranberri et al., 2014; Jia et al., 2022; Kim et al., 2019; Lee, 2021a,b). Although legal translation is considered one of the most complex specialist translation domains, because of the surge in demand for legal translation, MT has been utilized to some extent for documents of less importance (Roberts, 2022). Given that little research has examined the productivity and quality of MT and MTPE in Korean-English legal translation, we sought to examine the productivity and quality of MT and MTPE of Korean statutes, using DeepL, a neural machine translation engine which has recently started its Korean language service. This paper presents the preliminary findings from a research project that investigated DeepL MT quality and the quality and productivity of MTPE outputs and human translations by seven professional translators.
+ 2023.mtsummit-users.13
+ lee-choi-2023-machine
+
+
+ Fine-tuning MBART-50 with French and Farsi data to improve the translation of Farsi dislocations into English and French
+ BehnooshNamdarzadeh
+ SadafMohseni
+ LichaoZhu
+ GuillaumeWisniewski
+ NicolasBallier
+ 152–161
+ In this paper, we discuss the improvements brought by the fine-tuning of mBART50 for the translation of a specific Farsi dataset of dislocations. Given our BLEU scores, our evaluation is mostly qualitative: we assess the improvements of our fine-tuning in the translations into French of our test dataset of Farsi. We describe the fine-tuning procedure and discuss the quality of the results in the translations from Farsi. We assess the sentences in the French translations that contain English tokens and, for the English translations, we examine the ability of the fine-tuned system to translate Farsi dislocations into English without replicating the dislocated item as a double subject. We scrutinized the Farsi training data used to train mBART50 (Tang et al., 2021). We fine-tuned mBART50 with samples from an in-house French-Farsi aligned translation of a short story. In spite of the scarcity of available resources, we found that fine-tuning with aligned French-Farsi data dramatically improved the grammatical well-formedness of the predictions for French, even if serious semantic issues remained. We replicated the experiment with the English translation of the same Farsi short story for a Farsi-English fine-tuning and found that similar semantic inadequacies cropped up, and that some translations were worse than our mBART50 baseline. We showcased the fine-tuning of mBART50 with supplementary data and discussed the asymmetry of the situation: adding little data in the fine-tuning is sufficient to improve morpho-syntax for one language pair but seems to degrade translation into English.
+ 2023.mtsummit-users.14
+ namdarzadeh-etal-2023-fine
+
+
+ KG-IQES: An Interpretable Quality Estimation System for Machine Translation Based on Knowledge Graph
+ JunhaoZhu
+ MinZhang
+ HaoYang
+ SongPeng
+ ZhanglinWu
+ YanfeiJiang
+ XijunQiu
+ WeiqiangPan
+ MingZhu
+ MaMiaomiao
+ WeidongZhang
+ 162–170
+ The widespread use of machine translation (MT) has driven the need for effective automatic quality estimation (AQE) methods. How to enhance the interpretability of MT output quality estimation is well worth exploring in the industry. From the perspective of the alignment of named entities (NEs) in the source and translated sentences, we construct a multilingual knowledge graph (KG) consisting of domain-specific NEs, and design a KG-based interpretable quality estimation (QE) system for machine translations (KG-IQES). KG-IQES effectively estimates the translation quality without relying on reference translations. Its effectiveness has been verified in our business scenarios.
+ 2023.mtsummit-users.15
+ zhu-etal-2023-kg
+
+
+ Enhancing Gender Representation in Neural Machine Translation: A Comparative Analysis of Annotating Strategies for English-Spanish and English-Polish Language Pairs
+ CeliaSoler Uguet
+ FredBane
+ MahmoudAymo
+ João PedroFernandes Torres
+ AnnaZaretskaya
+ TàniaBlanch Miró
+ 171–172
+ Machine translation systems have been shown to demonstrate gender bias (Savoldi et al., 2021; Stafanovičs et al., 2020; Stanovsky et al., 2020), and contribute to this bias with systematically unfair translations. In this presentation, we explore a method of enforcing gender in NMT. We generalize the method proposed by Vincent et al. (2022) to create training data not requiring a first-person speaker. Drawing from other works that use special tokens to pass additional information to NMT systems (e.g. Ailem et al., 2021), we annotate the training data with special tokens to mark the gender of a given noun in the text, which enables the NMT system to produce the correct gender during translation. These tokens are also used to mark the gender in a source sentence at inference time. However, in production scenarios, gender is often unknown at inference time, so we propose two methods of leveraging language models to obtain these labels. Our experiment is set up in a fine-tuning scenario, adapting an existing translation model with gender-annotated data. We focus on the English to Spanish and Polish language pairs. Without guidance, NMT systems often ignore signals that indicate the correct gender for translation. To this end, we consider two methods of annotating the source English sentence for gender, such as the noun developer in the following sentence: The developer argued with the designer because she did not like the design. a) We use a coreference resolution model based on SpanBERT (Joshi et al., 2020) to connect any gender-indicating pronouns to their head nouns. b) We use the GPT-3.5 model prompted to identify the gender of each person in the sentence based on the context within the sentence. For test data, we use a collection of sentences from Stanovsky et al. including two professions and one pronoun that can refer only to one of them. 
We use the above two methods to annotate the source sentence we want to translate, produce the translations with our fine-tuned model and compare the accuracy of the gender translation in both cases. The correctness of the gender was evaluated by professional linguists. Overall, we observed a significant improvement in gender translations compared to the baseline (a 7% improvement for Spanish and a 50% improvement for Polish), with SpanBERT outperforming GPT on this task. The Polish MT model still struggles to produce the correct gender (even the translations produced with the ‘gold truth’ gender markings are only correct in 56% of the cases). We discuss limitations to this method. Our research is intended as a reference for fellow MT practitioners, as it offers a comparative analysis of two practical implementations that show the potential to enhance the accuracy of gender in translation, thereby elevating the overall quality of translation and mitigating gender bias.
+ 2023.mtsummit-users.16
+ soler-uguet-etal-2023-enhancing
+
+
+ Brand Consistency for Multilingual E-commerce Machine Translation
+ BryanZhang
+ StephanWalter
+ Saurabh ChetanBirari
+ OzlemEren
+ 173–182
+ In the realm of e-commerce, it is crucial to ensure consistent localization of brand terms in product information translations. With the ever-evolving e-commerce landscape, new brands and their localized versions are consistently emerging. However, these diverse brand forms and aliases present a significant challenge in machine translation (MT). This study investigates MT brand consistency problem in multilingual e-commerce and proposes practical and sustainable solutions to maintain brand consistency in various scenarios within the e-commerce industry. Through experimentation and analysis of an English-Arabic MT system, we demonstrate the effectiveness of our proposed solutions.
+ 2023.mtsummit-users.17
+ zhang-etal-2023-brand
+
+
+ Developing automatic verbatim transcripts for international multilingual meetings: an end-to-end solution
+ AkshatDewan
+ MichalZiemski
+ HenriMeylan
+ LorenzoConcina
+ BrunoPouliquen
+ 183–194
+ This paper presents an end-to-end solution for the creation of fully automated conference meeting transcripts and their machine translations into various languages. This tool has been developed at the World Intellectual Property Organization (WIPO) using in-house developed speech-to-text (S2T) and machine translation (MT) components. Beyond describing data collection and fine-tuning, resulting in a highly customized and robust system, this paper describes the architecture and evolution of the technical components as well as highlights the business impact and benefits from the user side. We also point out particular challenges in the evolution and adoption of the system and how the new approach created a new product and replaced existing established workflows in conference management documentation.
+ 2023.mtsummit-users.18
+ dewan-etal-2023-developing
+
+
+ Optimizing Machine Translation through Prompt Engineering: An Investigation into ChatGPT’s Customizability
+ MasaruYamada
+ 195–204
+ This paper explores the influence of integrating the purpose of the translation and the target audience into prompts on the quality of translations produced by ChatGPT. Drawing on previous translation studies, industry practices, and ISO standards, the research underscores the significance of the pre-production phase in the translation process. The study reveals that the inclusion of suitable prompts in large-scale language models like ChatGPT can yield flexible translations, a feat yet to be realized by conventional Machine Translation (MT). The research scrutinizes the changes in translation quality when prompts are used to generate translations that meet specific conditions. The evaluation is conducted from a practicing translator’s viewpoint, both subjectively and qualitatively, supplemented by the use of OpenAI’s word embedding API for cosine similarity calculations. The findings suggest that the integration of the purpose and target audience into prompts can indeed modify the generated translations, generally enhancing the translation quality by industry standards. The study also demonstrates the practical application of the “good translation” concept, particularly in the context of marketing documents and culturally dependent idioms.
+ 2023.mtsummit-users.19
+ yamada-2023-optimizing
+
+
+ Comparing Chinese-English MT Performance Involving ChatGPT and MT Providers and the Efficacy of AI mediated Post-Editing
+ LarryCady
+ BenjaminTsou
+ JohnLee
+ 205–216
+ The recent introduction of ChatGPT has caused much stir in the translation industry because of its impressive translation performance against leaders in the industry. We review some major issues based on BLEU comparisons of Chinese-to-English (C2E) and English-to-Chinese (E2C) machine translation (MT) performance by ChatGPT against a range of leading MT providers in mostly technical domains. Based on sample aligned sentences from a sizable bilingual Chinese-English patent corpus and other sources, we find that while ChatGPT performs better generally, it does not consistently perform better than others in all areas or cases. We also draw on novice translators as post-editors to explore a major component in MT post-editing: optimization of terminology. Many new technical words, including MWEs (Multi-Word Expressions), are problematic because they involve terminological developments which must strike a balance between proper encapsulation of technical innovation and conformity to past traditions. Drawing on the above-mentioned corpus, we have been developing an AI-mediated MT post-editing (MTPE) system through the optimization of precedent rendition distribution and semantic association to enhance the work of translators and MTPE practitioners.
+ 2023.mtsummit-users.20
+ cady-etal-2023-comparing
+
+
+ Challenges of Human vs Machine Translation of Emotion-Loaded Chinese Microblog Texts
+ ShenbinQian
+ ConstantinOrăsan
+ Félixdo Carmo
+ DipteshKanojia
+ 217–236
+ This paper attempts to identify challenges professional translators face when translating emotion-loaded texts as well as errors machine translation (MT) makes when translating this content. We invited ten Chinese-English translators to translate thirty posts of a Chinese microblog, and interviewed them about the challenges encountered during translation and the problems they believe MT might have. Further, we analysed more than five thousand automatic translations of microblog posts to observe problems in MT outputs. We establish that the most challenging problem for human translators is emotion-carrying words, which translators also consider a problem for MT. Analysis of MT outputs shows that this is also the most common source of MT errors. We also find that what is challenging for MT, such as non-standard writing, is not necessarily an issue for humans. Our work contributes to a better understanding of the challenges for the translation of microblog posts by humans and MT, caused by different forms of expression of emotion.
+ 2023.mtsummit-users.21
+ qian-etal-2023-challenges
+
+
+
+
+ Proceedings of the Second Workshop on Corpus Generation and Corpus Augmentation for Machine Translation
+ Asia-Pacific Association for Machine Translation
+ Macau SAR, China
+ September
+ 2023
+ 2023.mtsummit-coco4mt
+ mtsummit
+
+
+ 2023.mtsummit-coco4mt.0
+ mtsummit-2023-corpus
+
+
+ Do Not Discard – Extracting Useful Fragments from Low-Quality Parallel Data to Improve Machine Translation
+ SteinþórSteingrímsson
+ PintuLohar
+ HrafnLoftsson
+ AndyWay
+ 1–13
+ When parallel corpora are preprocessed for machine translation (MT) training, a part of the parallel data is commonly discarded and deemed non-parallel due to odd-length ratio, overlapping text in source and target sentences or failing some other form of a semantic equivalency test. For language pairs with limited parallel resources, this can be costly as in such cases modest amounts of acceptable data may be useful to help build MT systems that generate higher quality translations. In this paper, we refine parallel corpora for two language pairs, English–Bengali and English–Icelandic, by extracting sub-sentence fragments from sentence pairs that would otherwise have been discarded, in order to increase recall when compiling training data. We find that by including the fragments, translation quality of NMT systems trained on the data improves significantly when translating from English to Bengali and from English to Icelandic.
+ 2023.mtsummit-coco4mt.1
+ steingrimsson-etal-2023-discard
+
+
+ Development of Urdu-English Religious Domain Parallel Corpus
+ SadafAbdul Rauf
+ Noor eHira
+ 14–21
+ Despite the abundance of monolingual corpora accessible online, there remains a scarcity of domain-specific parallel corpora. This scarcity poses a challenge in the development of robust translation systems tailored for such specialized domains. Addressing this gap, we have developed a parallel religious domain corpus for Urdu-English. This corpus consists of 18,426 parallel sentences from Sunan Daud, carefully curated to capture the unique linguistic and contextual aspects of religious texts. The developed corpus is then used to train Urdu-English religious domain Neural Machine Translation (NMT) systems; the best system scored 27.9 BLEU points.
+ 2023.mtsummit-coco4mt.2
+ abdul-rauf-hira-2023-development
+
+
+ Findings of the CoCo4MT 2023 Shared Task on Corpus Construction for Machine Translation
+ AnanyaGanesh
+ MarineCarpuat
+ WilliamChen
+ KatharinaKann
+ ConstantineLignos
+ John E.Ortega
+ JonneSaleva
+ ShabnamTafreshi
+ RodolfoZevallos
+ 22–27
+ This paper provides an overview of the first shared task on choosing beneficial instances for machine translation, conducted as part of the CoCo4MT 2023 Workshop at MTSummit. This shared task was motivated by the need to make the data annotation process for machine translation more efficient, particularly for low-resource languages for which collecting human translations may be difficult or expensive. The task involved developing methods for selecting the most beneficial instances for training a machine translation system without access to an existing parallel dataset in the target language, such that the best selected instances can then be manually translated. Two teams participated in the shared task, namely the Williams team and the AST team. Submissions were evaluated by training a machine translation model on each submission’s chosen instances and comparing their performance with the chrF++ score. The system that ranked first is by the Williams team, which finds representative instances by clustering the training data.
+ 2023.mtsummit-coco4mt.3
+ ganesh-etal-2023-findings
+
+
+ Williams College’s Submission for the Coco4MT 2023 Shared Task
+ AlexRoot
+ MarkHopkins
+ 28–32
+ Professional translation is expensive. As a consequence, when developing a translation system in the absence of a pre-existing parallel corpus, it is important to strategically choose sentences to have professionally translated for the training corpus. In our contribution to the Coco4MT 2023 Shared Task, we explore how sentence embeddings can be leveraged to choose an impactful set of sentences to translate. Based on six language pairs of the JHU Bible corpus, we demonstrate that a technique based on SimCSE embeddings outperforms a competitive suite of baselines.
+ 2023.mtsummit-coco4mt.4
+ root-hopkins-2023-williams
+
+
+ The AST Submission for the CoCo4MT 2023 Shared Task on Corpus Construction for Low-Resource Machine Translation
+ SteinþórSteingrímsson
+ 33–38
+ We describe the AST submission for the CoCo4MT 2023 shared task. The task aims to identify the best candidates for translation in a source data set so that the translated parallel data can be used for fine-tuning the mBART-50 model. We experiment with three methods: scoring sentences based on n-gram coverage, using LaBSE to estimate semantic similarity, and identifying misalignments and mistranslations by comparing machine-translated source sentences to corresponding manually translated segments in high-resource languages. We find that we obtain the best results by combining these three methods, using LaBSE and machine translation for filtering, and one of our n-gram scoring approaches for ordering sentences.
+ 2023.mtsummit-coco4mt.5
+ steingrimsson-2023-ast
+
+
+
+
+ 2023.alt-1
+ 2023.wat-1
+
+
+
diff --git a/data/xml/2023.wat.xml b/data/xml/2023.wat.xml
new file mode 100644
index 0000000000..7bd134c78f
--- /dev/null
+++ b/data/xml/2023.wat.xml
@@ -0,0 +1,89 @@
+
+
+
+
+ Proceedings of the 10th Workshop on Asian Translation
+ ToshiakiNakazawa
+ KazutakaKinugawa
+ HideyaMino
+ IsaoGoto
+ RajDabre
+ ShoheiHigashiyama
+ ShantipriyaParida
+ MakotoMorishita
+ OndrejBojar
+ AkikoEriguchi
+ YusukeOda
+ ChenhuiChu
+ SadaoKurohashi
+ Asia-Pacific Association for Machine Translation
+ Macau SAR, China
+ September
+ 2023
+ 2023.wat-1
+ wat
+
+
+ 2023.wat-1.0
+ wat-2023-asian
+
+
+ Overview of the 10th Workshop on Asian Translation
+ ToshiakiNakazawa
+ KazutakaKinugawa
+ HideyaMino
+ IsaoGoto
+ RajDabre
+ ShoheiHigashiyama
+ ShantipriyaParida
+ MakotoMorishita
+ OndřejBojar
+ AkikoEriguchi
+ YusukeOda
+ ChenhuiChu
+ SadaoKurohashi
+ 1–28
+ This paper presents the results of the shared tasks from the 10th Workshop on Asian Translation (WAT2023). For WAT2023, two teams submitted their translation results for the human evaluation. We also accepted one research paper. About 40 translation results were submitted to the automatic evaluation server, and selected submissions were manually evaluated.
+ 2023.wat-1.1
+ nakazawa-etal-2023-overview
+
+
+ Mitigating Domain Mismatch in Machine Translation via Paraphrasing
+ HyugaKoretaka
+ TomoyukiKajiwara
+ AtsushiFujita
+ TakashiNinomiya
+ 29–40
+ Quality of machine translation (MT) deteriorates significantly when translating texts having characteristics that differ from the training data, such as content domain. Although previous studies have focused on adapting MT models on a bilingual parallel corpus in the target domain, this approach is not applicable when no parallel data are available for the target domain or when utilizing black-box MT systems. To mitigate problems caused by such domain mismatch without relying on any corpus in the target domain, this study proposes a method to search for better translations by paraphrasing input texts of MT. To obtain better translations even for input texts from unforeknown domains, we generate their multiple paraphrases, translate each, and rerank the resulting translations to select the most likely one. Experimental results on Japanese-to-English translation reveal that the proposed method improves translation quality in terms of BLEU score for input texts from specific domains.
+ 2023.wat-1.2
+ koretaka-etal-2023-mitigating
+
+
+ BITS-P at WAT 2023: Improving Indic Language Multimodal Translation by Image Augmentation using Diffusion Models
+ AmulyaDash
+ Hrithik RajGupta
+ YashvardhanSharma
+ 41–45
+ This paper describes the proposed system for multimodal machine translation. We have participated in multimodal translation tasks for English into three Indic languages: Hindi, Bengali, and Malayalam. We leverage the inherent richness of multimodal data to bridge the gap of ambiguity in translation. We fine-tuned the ‘No Language Left Behind’ (NLLB) machine translation model for multimodal translation, further enhancing the model accuracy by image data augmentation using latent diffusion. Our submission achieves the best BLEU score for English-Hindi, English-Bengali, and English-Malayalam language pairs for both Evaluation and Challenge test sets.
+ 2023.wat-1.3
+ dash-etal-2023-bits
+
+
+ OdiaGenAI’s Participation at WAT2023
+ SkShahid
+ Guneet SinghKohli
+ SambitSekhar
+ DebasishDhal
+ AditSharma
+ ShubhendraKhusawash
+ ShantipriyaParida
+ Stig-ArneGrönroos
+ Satya RanjanDash
+ 46–52
+ This paper offers an in-depth overview of the team “ODIAGEN’s” translation system submitted to the Workshop on Asian Translation (WAT2023). Our focus lies in the domain of Indic Multimodal tasks, specifically targeting English to Hindi, English to Malayalam, and English to Bengali translations. The system uses a state-of-the-art Transformer-based architecture, specifically the NLLB-200 model, fine-tuned with language-specific Visual Genome Datasets. With this robust system, we were able to manage both text-to-text and multimodal translations, demonstrating versatility in handling different translation modes. Our results showcase strong performance across the board, with particularly promising results in the Hindi and Bengali translation tasks. A noteworthy achievement of our system lies in its stellar performance across all text-to-text translation tasks. In the categories of English to Hindi, English to Bengali, and English to Malayalam translations, our system claimed the top positions for both the evaluation and challenge sets. This system not only advances our understanding of the challenges and nuances of Indic language translation but also opens avenues for future research to enhance translation accuracy and performance.
+ 2023.wat-1.4
+ shahid-etal-2023-odiagenais
+
+
+
diff --git a/data/yaml/venues/alt.yaml b/data/yaml/venues/alt.yaml
new file mode 100644
index 0000000000..5fdbd7818a
--- /dev/null
+++ b/data/yaml/venues/alt.yaml
@@ -0,0 +1,2 @@
+acronym: alt
+name: 'ALT2023: Ancient Language Translation Workshop'
From a98d9442afc25abaabe19bfdacb17981dcb8bd4a Mon Sep 17 00:00:00 2001
From: "Hou-Chiang, Tseng"
Date: Wed, 15 Nov 2023 21:09:08 +0800
Subject: [PATCH 06/12] Fix 2023 ROCLING mis-parsed authors (#2875)
---
data/xml/2023.rocling.xml | 35 +++++++++++++++++++++++------------
1 file changed, 23 insertions(+), 12 deletions(-)
diff --git a/data/xml/2023.rocling.xml b/data/xml/2023.rocling.xml
index c3b72c0e62..e8e181ee87 100644
--- a/data/xml/2023.rocling.xml
+++ b/data/xml/2023.rocling.xml
@@ -27,22 +27,25 @@
Story Co-telling Dialogue Generation via Reinforcement Learning and Knowledge Graph
- Chia-Hui ChangYu-Kai Lee
+ Yu-KaiLee
+ Chia-HuiChang
12–20
2023.rocling-1.2
yu-kai-lee-2023-story
Improving End-to-end Taiwanese-Speech-to-Chinese-Text Translation by Semi-supervised Learning
- Chung-Che WangYu-Chun Lin
- Jyh-ShingJang
+ Yu-ChunLin
+ Chung-CheWang
+ Jyh-Shing RogerJang
21–28
2023.rocling-1.3
yu-chun-lin-jang-2023-improving
Construction of Message Deliver Service Dialog Systems
- Chia-Hui ChangCheng-Hung Yeh
+ Cheng-HungYeh
+ Chia-HuiChang
29–37
2023.rocling-1.4
cheng-hung-yeh-2023-construction
@@ -76,7 +79,8 @@
Improving Low-Resource Speech Recognition through Multilingual Fine-Tuning with Language Identifiers and Self-Training
- Michal PtaszynskiKarol Nowakowski
+ KarolNowakowski
+ MichalPtaszynski
63–70
2023.rocling-1.8
karol-nowakowski-2023-improving
@@ -140,7 +144,8 @@
Is GPT-4 a Good Islamic Expert for Answering Quran Questions?
- Eric AtwellSarah Alnefaie
+ SarahAlnefaie
+ EricAtwell
Mohammad AmmarAlsalka
124–133
2023.rocling-1.15
@@ -190,7 +195,9 @@
Fine-Tuning and Evaluation of Question Generation for Slovak Language
OndrejMegela
- Ján StašDaniel Hladek, Matus Pleva
+ DanielHladek
+ MatusPleva
+ JánStaš
Ming-HsiangSu
Yuan-FuLiao
171–178
@@ -222,7 +229,8 @@
Sentence-level Revision with Neural Reinforcement Learning
- Kenji HashimotoZhendong Du
+ ZhendongDu
+ KenjiHashimoto
202–209
2023.rocling-1.24
zhendong-du-2023-sentence
@@ -240,7 +248,8 @@
KNOT-MCTS: An Effective Approach to Addressing Hallucinations in Generative Language Modeling for Question Answering
Chung-WenWu
Guan-TangHuang
- Berlin ChenYue-Yang He
+ Yue-YangHe
+ BerlinChen
215–221
2023.rocling-1.26
wu-etal-2023-knot
@@ -276,7 +285,8 @@
EugeneSy
Tzu-ChengPeng
Shih-HsuanHuang
- Yung-Chun ChangHeng-Yu Lin
+ Heng-YuLin
+ Yung-ChunChang
242–249
2023.rocling-1.30
sy-etal-2023-fine
@@ -508,8 +518,9 @@
The NTNU Super Monster Team (SPMT) system for the Formosa Speech Recognition Challenge 2023 - Hakka ASR
Tzu-TingYang
- Hsin WeiWang
- Berlin ChenMeng-Ting Tsai
+ Hsin-WeiWang
+ Meng-TingTsai
+ BerlinChen
414–422
2023.rocling-1.55
yang-etal-2023-ntnu
From b34ea79039a25ff106d4adc63a6a5871b2395462 Mon Sep 17 00:00:00 2001
From: anthology-assist <126604033+anthology-assist@users.noreply.github.com>
Date: Wed, 15 Nov 2023 07:09:45 -0600
Subject: [PATCH 07/12] Ingestion: sigdial workshop yrrsds (#2876)
---
data/xml/2023.sigdial.xml | 1 +
data/xml/2023.yrrsds.xml | 216 +++++++++++++++++++++++++++++++++++
data/yaml/venues/yrrsds.yaml | 3 +
3 files changed, 220 insertions(+)
create mode 100644 data/xml/2023.yrrsds.xml
create mode 100644 data/yaml/venues/yrrsds.yaml
diff --git a/data/xml/2023.sigdial.xml b/data/xml/2023.sigdial.xml
index 0af4ce3cc2..204c60cc3b 100644
--- a/data/xml/2023.sigdial.xml
+++ b/data/xml/2023.sigdial.xml
@@ -720,6 +720,7 @@
2023.icard-1
2023.cs4oa-1
2023.mmnlg-1
+ 2023.yrrsds-1
diff --git a/data/xml/2023.yrrsds.xml b/data/xml/2023.yrrsds.xml
new file mode 100644
index 0000000000..accf1c0be5
--- /dev/null
+++ b/data/xml/2023.yrrsds.xml
@@ -0,0 +1,216 @@
+
+
+
+
+ Proceedings of the 19th Annual Meeting of the Young Researchers' Roundtable on Spoken Dialogue Systems
+ VojtechHudecek
+ PatriciaSchmidtova
+ TanviDinkar
+ JavierChiyah-Garcia
+ WeronikaSieinska
+ Association for Computational Linguistics
+ Prague, Czechia
+ September
+ 2023
+ 2023.yrrsds-1
+ yrrsds
+ ws
+
+
+ 2023.yrrsds-1.0
+ yrrsds-2023-young
+
+
+ Processing Referential Ambiguities in Situated Dialogue Systems
+ JavierChiyah-Garcia
+ 1–4
+ Position paper for YRRSDS 2023
+ 2023.yrrsds-1.1
+ chiyah-garcia-2023-processing
+
+
+ Safety and Robustness in Conversational AI
+ TanviDinkar
+ 5–8
+ In this position paper, I will present the research interests in my PostDoc on safety and robustness specific to conversational AI, including the relevant overlap from my PhD.
+ 2023.yrrsds-1.2
+ dinkar-2023-safety
+
+
+ Incremental Speech Processing for Voice Assistant Accessibility
+ AngusAddlesee
+ 9–11
+ Speech production is nuanced and unique to every individual, but today’s Spoken Dialogue Systems (SDSs) are trained to use general speech patterns to successfully improve performance on various evaluation metrics. However, these patterns do not apply to certain user groups - often the very people that can benefit the most from SDSs. For example, people with dementia produce more disfluent speech than the general population. The healthcare domain is now a popular setting for spoken dialogue and human-robot interaction research. This trend is similar when observing company behaviour. Charities promote industry voice assistants, the creators are getting HIPAA compliance, and their features sometimes target vulnerable user groups. It is therefore critical to adapt SDSs to be more accessible.
+ 2023.yrrsds-1.3
+ addlesee-2023-incremental
+
+
+ Advancing Spoken Dialog Systems for Manufacturing: From Conceptual Architecture and Taxonomy to Real Case Applications and Future Directions
+ SilviaColabianchi
+ 12–14
+ This research encompasses a comprehensive exploration of Spoken Dialogue Systems (SDSs) in the manufacturing sector. It begins by establishing a conceptual architecture and taxonomy to guide the design and selection of SDS elements. Real case applications, including worker safety and cybersecurity support, validate the research findings and highlight areas for improvement. Looking ahead, the study delves into the potential of Large Language Models (LLMs) and multi-modal applications. Emphasizing the importance of extreme personalization, the study highlights the need to cater to the diverse qualifications and preferences of workers. Additionally, it investigates the integration of SDSs with other sensory modalities, such as images, videos, and augmented or virtual reality scenarios, to enhance the user experience and productivity. The research also addresses crucial considerations related to knowledge base optimization. It examines semantic variations of words across different application contexts, the continuous updating of procedures and data, and the adaptability of SDSs to diverse dialects and linguistic abilities, particularly in low-schooling personnel scenarios. Privacy, industrial protection, and ethical concerns in the era of LLMs and external players like OpenAI are given due attention. The study explores the boundaries of knowledge that conversational systems should possess, advocating for transparency, explainability, and responsible data handling practices.
+ 2023.yrrsds-1.4
+ colabianchi-2023-advancing
+
+
+ Conversational Grounding in Multimodal Dialog Systems
+ BisweshMohapatra
+ 15–17
+ The process of “conversational grounding” is an interactive process that has been studied extensively in cognitive science, whereby participants in a conversation check to make sure their interlocutors understand what is being referred to. This interactive process uses multiple modes of communication to establish the information between the participants. This could include information provided through eye-gaze, head movements, intonation in speech, along with the content of the speech. While the process is essential to successful communication between humans and between humans and machines, work needs to be done on testing and building the capabilities of current dialogue systems in managing conversational grounding, especially in multimodal media of communication. Recent work, such as that of Benotti and Blackburn, has shown the importance of conversational grounding in dialogue systems and how current systems fail at it. This is essential for the advancement of Embodied Conversational Agents and Social Robots. Thus my PhD project aims to test, understand and improve the functioning of current dialogue models with respect to conversational grounding.
+ 2023.yrrsds-1.5
+ mohapatra-2023-conversational
+
+
+ SQL Comment Generation and Additional Research Interests
+ AlyssaAllen
+ 18–20
+ My research interests focus on natural language generation (NLG), specifically how to make system outputs more intuitive and comprehensible for the human user, and on conversational entrainment and alignment, from the perspective of how dialogue systems could or should personalize their responses to the human user. As it relates to NLG, my current work focuses on training a system to auto-generate comments for SQL queries produced by a Text-to-SQL parser. The goal is to make the connection between technical SQL language and the user’s question more transparent. My linguistic training lies primarily at the intersection of computational and socio-linguistics. As such, my curiosities in conversational entrainment and alignment focus on the extent to which conversational agents can or should adjust their language based on human characteristics such as age, race, or gender.
+ 2023.yrrsds-1.6
+ allen-2023-sql
+
+
+ On Referring Language Use in Visually Grounded Dialogue
+ BramWillemsen
+ 21–23
+ Position paper for YRRSDS 2023
+ 2023.yrrsds-1.7
+ willemsen-2023-referring
+
+
+ Challenges and Approaches in Designing Social SDS in the LLM Era
+ KojiInoue
+ 24–25
+ Large language models (LLMs) have brought about a significant transformation in spoken dialogue systems (SDSs). It is anticipated that these systems will be implemented into diverse robotic applications and employed in a variety of social settings. The author presents research interests aimed at realizing social SDSs from multiple perspectives, including task design, turn-taking mechanisms, and evaluation methodologies. Additionally, future research in social SDSs should delve into a deeper understanding of user mental states and a relationship with society via multi-party conversations. Finally, the author suggests topics for discussion regarding the future directions of SDS researchers in the LLM era.
+ 2023.yrrsds-1.8
+ inoue-2023-challenges
+
+
+ Breakdowns and Repairs. Detecting Patterns that Lead to Breakdowns in Customer Service Messages
+ AnouckBraggaar
+ 26–29
+ Many companies use dialogue systems for their customer service, and although there has been a rise in the usage of these systems (Costello and LoDolce, 2022), many of these systems still face challenges in comprehending and properly responding to the customer (Følstad et al., 2021). In our project we aim to figure out how to develop and improve these conversational agents. Part of this project (detailed in this paper) will focus on the detection of breakdown patterns and the possible solutions (repairs) to mitigate negative results of these errors.
+ 2023.yrrsds-1.9
+ braggaar-2023-breakdowns
+
+
+ Towards More Natural Dialogues: Integrating Open-Domain Dialogue Skills into Task-Oriented Agents
+ ArmandStricker
+ 30–32
+ Position paper on the intersection between chitchat and task-oriented dialogues (TODs), with a focus on integrating capabilities typically associated with chitchat systems into task-oriented agents.
+ 2023.yrrsds-1.10
+ stricker-2023-towards
+
+
+ The Future of Designing Spoken Dialogue Systems and Analyzing Written Conversations
+ LiviaQian
+ 33–34
+ This is my position paper for YRRSDS 2023. In it, I write about the details of my research interests as well as past, current and future projects, talk about the status of spoken dialogue system research, include a short bio, and suggest topics for discussion.
+ 2023.yrrsds-1.11
+ qian-2023-future
+
+
+ Exploring the Synergy of Deep Learning and Anthropomorphism in Multimodal Dialogue Systems
+ IwonaChristop
+ 35–36
+ This position paper is an overview of the author’s main research interests and work, covering deep learning techniques in audio classification, sign languages, and multimodality in dialogue systems. The author also shares her opinion on current and future research on dialogue agents, and suggests topics for discussion panels.
+ 2023.yrrsds-1.12
+ christop-2023-exploring
+
+
+ A Perspective on Anchoring and Dialogue History Propagation for Smoother Interactions with Spoken Task-Oriented Dialogue Systems
+ LucasDruart
+ 37–39
+ Task-Oriented Dialogue (TOD) systems provide interactive assistance to a user in order to accomplish a specific task such as making a reservation at a restaurant or booking a room in a hotel. Speech presents itself as a natural interface for TOD systems. A typical approach to implement them is to use a modular architecture (Gao et al., 2018). A core component of such dialogue systems is Spoken Language Understanding (SLU), whose goal is to extract the relevant information from the user’s utterances. While spoken dialogue was the focus of earlier work (Williams et al., 2013; Henderson et al., 2014), recent work has focused on text inputs with no regard for the specificities of spoken language (Wu et al., 2019; Heck et al., 2020; Feng et al., 2021). However, this approach fails to account for the differences between written and spoken language (Faruqui and Hakkani-Tür, 2022) such as disfluencies. My research focuses on Spoken Language Understanding in the context of Task-Oriented Dialogue. More specifically, I am interested in the two following research directions: (i) annotation schemas for spoken TODs, and (ii) integration of dialogue history for contextually coherent predictions.
+ 2023.yrrsds-1.13
+ druart-2023-perspective
+
+
+ More Human-Like Interaction in Spoken Dialogue Systems: Global Context for Natural Language Understanding and Multimodal Solutions
+ KacperDudzic
+ 40–41
+ My position paper for the YRRSDS 2023 workshop.
+ 2023.yrrsds-1.14
+ dudzic-2023-human
+
+
+ Designing and Evaluating LLM-based Conversational Agents for Behaviour Change
+ SelinaMeyer
+ 42–43
+ My PhD focuses on conversational agents for behaviour change, with a focus on the feasibility of applying Large Language Models (LLMs) such as GPT-4 in this context.
+ 2023.yrrsds-1.15
+ meyer-2023-designing
+
+
+ Stylized Dialog Response Generation
+ SourabrataMukherjee
+ 44–46
+ My primary research focus lies in the domain of Text Style Transfer (TST), a fascinating area within Natural Language Processing (NLP). TST involves the transformation of text into a desired style while approximately preserving its underlying content. In my research, I am also driven by the goal of incorporating TST techniques into NLP systems, particularly within the realm of dialogue systems. I am intrigued by the concept of Stylized Dialog Response Generation, which aims to enhance the versatility and adaptability of dialog systems in generating text responses with specific style attributes. By advancing our understanding of TST and its integration into dialogue systems, my research seeks to contribute to the broader field of human-computer interaction. Through the development of robust and versatile dialogue systems with enhanced style transfer capabilities, we can facilitate more engaging and personalized conversational experiences.
+ 2023.yrrsds-1.16
+ mukherjee-2023-stylized
+
+
+ Take the Most out of Text Data Augmentation Strategies For Intent Clustering And Induction Based on DSTC 11 Track 2
+ MikołajKrzymiński
+ 47–48
+ A brief introduction to the author’s key interests and research topics, which are multimodal dialogue systems and the impact of data augmentation on NLU performance. In addition, the author shares his biography and his view on the future of dialogue assistants.
+ 2023.yrrsds-1.17
+ krzyminski-2023-take
+
+
+ Advancing Dialogue Systems: Measuring User Satisfaction and Embracing Multimodality
+ AdrianCharkiewicz
+ 49–50
+ This submission discusses my research interests in two areas: measuring user satisfaction in goal-oriented dialogue systems and exploring the potential of multi-modal interactions. For goal-oriented dialogue systems, I focus on evaluating and enhancing user satisfaction throughout the interaction process, aiming to propose innovative strategies and address the limitations of existing evaluation techniques. Additionally, I explore the benefits of multi-modal dialogue systems, highlighting their ability to provide more natural and immersive conversations by incorporating various communication modes such as speech, text, gestures, and visuals.
+ 2023.yrrsds-1.18
+ charkiewicz-2023-advancing
+
+
+ Information Extraction and Program Synthesis from Goal-Oriented Dialogue
+ SopanKhosla
+ 51–53
+ My research interests broadly lie in the area of Information Extraction from Spoken Dialogue, with a special focus on state modeling, anaphora resolution, program synthesis & planning, and intent classification in goal-oriented conversations. My aim is to create embedded dialogue systems that can interact with humans in a collaborative setup to solve tasks in a digital/non-digital environment. Most goal-oriented conversations involve an expert and a layperson. The aim for the expert is to consider all the information provided by the layperson, identify the underlying set of issues or intents, and prescribe solutions. While human experts are very good at extracting such information, the AI agents that underpin most automatic dialog systems today are not. Most existing assistants (or chatbots) only consider individual utterances and do not ground them in the context of the dialogue. My work in this direction has focused on making these systems more effective at extracting the most relevant information from the dialogue to help the human user reach their end-goal.
+ 2023.yrrsds-1.19
+ khosla-2023-information
+
+
+ Modelling Emotions in Task-Oriented Dialogue
+ ShutongFeng
+ 54–56
+ My research interests lie in the area of modelling natural and human-like conversations, with a special focus on emotions in task-oriented dialogue (ToD) systems. ToD systems need to produce semantically and grammatically correct responses to fulfil the user’s goal. Being able to perceive and express emotions pushes them one more step towards achieving human-likeness. To begin with, I constructed a dataset with meaningful emotion labels as well as a wide coverage of emotions and linguistic features in ToDs. Then, I improved emotion recognition in conversations (ERC) in the task-oriented domain by exploiting key characteristics of ToDs. Currently, I am working towards enhancing ToD systems with emotions.
+ 2023.yrrsds-1.20
+ feng-2023-modelling
+
+
+ Incrementally Enriching the Common Ground: A Research Path
+ BrielenMadureira
+ 57–58
+ I am broadly interested in evaluation of dialogue systems, in all its many facets: The data they are trained on, their ability to perform a task successfully, their skills with respect to various dialogue phenomena, their resemblance to human cognitive processes, and their ethical and societal impact. More specifically, my research topics focus on understanding the possibilities and limits of current multimodal neural network-based models to incrementally encode information for natural language understanding in general and also for building common ground and asking for clarification. Besides, I am interested in dialogue games as a means to elicit and collect dialogue data and to evaluate the abilities of dialogue models.
+ 2023.yrrsds-1.21
+ madureira-2023-incrementally
+
+
+ Commonsense Enabled Conversational Model and System-Initiated Transitions in Unified SDSs
+ YeLiu
+ 59–61
+ My research work centers on how to enable a human-like interaction through generating contextual, emotional or proactive responses, both in task-oriented and in chitchat spoken dialogue systems (SDSs), because natural language generation (NLG) is an indispensable component in SDSs and can directly affect the user interactive experience of the entire dialogue system. In addition to NLG, I am also interested in natural language understanding (NLU), as it plays a crucial role in SDSs and is a prerequisite for dialogue systems to generate replies.
+ 2023.yrrsds-1.22
+ liu-2023-commonsense
+
+
+ Causality Reasoning for Empathy-Enriched and Personality-Conditioned Spoken Dialogue System
+ YahuiFu
+ 62–63
+ The author’s objective centers around developing a spoken dialogue system (SDS) that can emulate the cognitive and conversational qualities of a human friend. Key attributes such as empathy, knowledge/causality reasoning, and personality are integral components of human interaction. The proposed approach involves the creation of an Empathy-enriched SDS, capable of comprehending human emotions and circumstances, thus providing companionship and assistance akin to a trusted friend. Additionally, the Causality-reasoning for SDS aims to ground the system in commonsense knowledge and equip it with the ability to reason about causalities, such as predicting user desires/reactions and system intentions/reactions, thereby enhancing the system’s intelligence and human-like behavior. Finally, the concept of a Personality-conditioned SDS involves enabling systems to exhibit distinct personalities, further enhancing the naturalness of human-robot interaction.
+ 2023.yrrsds-1.23
+ fu-2023-causality
+
+
+ Tutorials and User Adaptation in Task Oriented Dialogue
+ RyuHirai
+ 64–65
+ This position paper describes my research interests, spoken dialogue system research, and suggested topics for discussion.
+ 2023.yrrsds-1.24
+ hirai-2023-tutorials
+
+
+
diff --git a/data/yaml/venues/yrrsds.yaml b/data/yaml/venues/yrrsds.yaml
new file mode 100644
index 0000000000..7e00fbca61
--- /dev/null
+++ b/data/yaml/venues/yrrsds.yaml
@@ -0,0 +1,3 @@
+acronym: YRRSDS
+name: The 19th Annual Meeting of the Young Researchers' Roundtable on Spoken Dialogue
+ Systems
From 761715b54e9a42a62d232cd03110187fe250a441 Mon Sep 17 00:00:00 2001
From: acl-pwc-bot <94475230+acl-pwc-bot@users.noreply.github.com>
Date: Thu, 16 Nov 2023 02:04:59 +0100
Subject: [PATCH 08/12] Update metadata from Papers with Code
---
data/xml/2020.acl.xml | 30 +++++++++++++++---------------
data/xml/2020.coling.xml | 11 ++++++-----
data/xml/2020.conll.xml | 4 ++--
data/xml/2020.emnlp.xml | 16 ++++++++--------
data/xml/2020.findings.xml | 10 +++++-----
data/xml/2020.icon.xml | 2 +-
data/xml/2020.insights.xml | 2 +-
data/xml/2020.lifelongnlp.xml | 2 +-
data/xml/2020.lrec.xml | 2 +-
data/xml/2020.wnut.xml | 2 +-
data/xml/2021.acl.xml | 31 ++++++++++++++++---------------
data/xml/2021.adaptnlp.xml | 2 +-
data/xml/2021.case.xml | 2 +-
data/xml/2021.cl.xml | 2 +-
data/xml/2021.conll.xml | 2 +-
data/xml/2021.dash.xml | 2 +-
data/xml/2021.eacl.xml | 10 +++++-----
data/xml/2021.econlp.xml | 2 +-
data/xml/2021.emnlp.xml | 16 ++++++++--------
data/xml/2021.eval4nlp.xml | 2 +-
data/xml/2021.findings.xml | 26 ++++++++++++++------------
data/xml/2021.naacl.xml | 4 ++--
data/xml/2021.nllp.xml | 2 +-
data/xml/2021.nlp4dh.xml | 2 +-
data/xml/2021.nodalida.xml | 2 +-
data/xml/2021.ranlp.xml | 4 ++--
data/xml/2021.wnut.xml | 4 ++--
data/xml/2022.acl.xml | 29 ++++++++++++++++-------------
data/xml/2022.ccl.xml | 2 +-
data/xml/2022.coling.xml | 10 +++++-----
data/xml/2022.dadc.xml | 2 +-
data/xml/2022.findings.xml | 24 +++++++++++++-----------
data/xml/2022.jeptalnrecital.xml | 4 ++--
data/xml/2022.konvens.xml | 2 +-
data/xml/2022.law.xml | 2 +-
data/xml/2022.lrec.xml | 2 +-
data/xml/2022.ltedi.xml | 1 +
data/xml/2022.naacl.xml | 17 +++++++++--------
data/xml/2022.repl4nlp.xml | 4 ++--
data/xml/2022.semeval.xml | 7 ++++---
data/xml/2022.spnlp.xml | 2 +-
data/xml/C12.xml | 4 ++--
data/xml/C18.xml | 10 +++++-----
data/xml/D14.xml | 6 +++---
data/xml/D15.xml | 6 +++---
data/xml/D17.xml | 8 ++++----
data/xml/D18.xml | 20 ++++++++++----------
data/xml/D19.xml | 18 +++++++++---------
data/xml/F14.xml | 2 +-
data/xml/I13.xml | 6 +++---
data/xml/I17.xml | 6 +++---
data/xml/K15.xml | 2 +-
data/xml/K18.xml | 4 ++--
data/xml/K19.xml | 4 ++--
data/xml/N16.xml | 4 ++--
data/xml/N18.xml | 10 +++++-----
data/xml/N19.xml | 24 ++++++++++++------------
data/xml/P14.xml | 4 ++--
data/xml/P15.xml | 2 +-
data/xml/P16.xml | 7 ++++---
data/xml/P17.xml | 6 +++---
data/xml/P18.xml | 12 ++++++------
data/xml/P19.xml | 22 +++++++++++-----------
data/xml/Q16.xml | 2 +-
data/xml/S18.xml | 2 +-
data/xml/U15.xml | 2 +-
data/xml/W14.xml | 6 +++---
data/xml/W16.xml | 12 ++++++------
data/xml/W17.xml | 8 ++++----
data/xml/W18.xml | 8 ++++----
data/xml/W19.xml | 15 +++++++--------
data/xml/Y18.xml | 2 +-
72 files changed, 280 insertions(+), 268 deletions(-)
diff --git a/data/xml/2020.acl.xml b/data/xml/2020.acl.xml
index 86c58c3a39..c58bc1dc35 100644
--- a/data/xml/2020.acl.xml
+++ b/data/xml/2020.acl.xml
@@ -673,7 +673,7 @@
li-etal-2020-dice
ShannonAI/dice_loss_for_NLP
- CoNLL-2003
+ CoNLL 2003
MSRA CN NER
OntoNotes 4.0
OntoNotes 5.0
@@ -2052,7 +2052,7 @@
namysl-etal-2020-nat
mnamysl/nat-acl2020
- CoNLL-2003
+ CoNLL 2003
Named Entity Recognition without Labelled Data: A Weak Supervision Approach
@@ -2068,7 +2068,7 @@
lison-etal-2020-named
NorskRegnesentral/weak-supervision-for-NER
Broad Twitter Corpus
- CoNLL-2003
+ CoNLL 2003
Probing Linguistic Features of Sentence-Level Representations in Neural Relation Extraction
@@ -2773,7 +2773,7 @@
brantley-etal-2020-active
xkianteb/leaqi
- CoNLL-2003
+ CoNLL 2003
ExpBERT: Representation Engineering with Natural Language Explanations
@@ -2815,7 +2815,7 @@
jiang-etal-2020-generalizing
jzbjyb/SpanRel
- CoNLL-2003
+ CoNLL 2003
CoNLL-2012
OIE2016
Penn Treebank
@@ -3491,7 +3491,7 @@
10.18653/v1/2020.acl-main.236
arora-etal-2020-contextual
- CoNLL-2003
+ CoNLL 2003
GLUE
@@ -6576,7 +6576,7 @@
tabassum-etal-2020-code
jeniyat/StackOverflowNER
- CoNLL-2003
+ CoNLL 2003
Dialogue-Based Relation Extraction
@@ -7691,7 +7691,7 @@
ShannonAI/mrc-for-flat-nested-ner
ACE 2004
ACE 2005
- CoNLL-2003
+ CoNLL 2003
GENIA
MSRA CN NER
OntoNotes 4.0
@@ -8529,7 +8529,7 @@
ouchi-etal-2020-instance
hiroki13/instance-based-ner
- CoNLL-2003
+ CoNLL 2003
MIE: A Medical Information Extractor towards Medical Dialogues
@@ -8563,7 +8563,7 @@
ACE 2004
ACE 2005
CoNLL 2002
- CoNLL-2003
+ CoNLL 2003
GENIA
OntoNotes 5.0
@@ -8625,7 +8625,7 @@
wu-etal-2020-single
microsoft/vert-papers
CoNLL 2002
- CoNLL-2003
+ CoNLL 2003
Synchronous Double-channel Recurrent Network for Aspect-Opinion Pair Extraction
@@ -10705,7 +10705,7 @@
rijhwani-etal-2020-soft
neulab/soft-gazetteers
- CoNLL-2003
+ CoNLL 2003
A Prioritization Model for Suicidality Risk Assessment
@@ -11143,7 +11143,7 @@
lin-etal-2020-triggerner
INK-USC/TriggerNER
BC5CDR
- CoNLL-2003
+ CoNLL 2003
Addressing Posterior Collapse with Mutual Information for Improved Variational Neural Machine Translation
@@ -11583,7 +11583,7 @@
airaria/TextBrewer
CMRC
CMRC 2018
- CoNLL-2003
+ CoNLL 2003
DRCD
SQuAD
XNLI
@@ -11777,7 +11777,7 @@
qi-etal-2020-stanza
stanfordnlp/stanza
- CoNLL-2003
+ CoNLL 2003
jiant: A Software Toolkit for Research on General-Purpose Text Understanding Models
diff --git a/data/xml/2020.coling.xml b/data/xml/2020.coling.xml
index fb6da2fb12..442c82d684 100644
--- a/data/xml/2020.coling.xml
+++ b/data/xml/2020.coling.xml
@@ -69,7 +69,7 @@
ma-etal-2020-charbert
wtma/CharBERT
CoLA
- CoNLL-2003
+ CoNLL 2003
GLUE
MRPC
QNLI
@@ -484,7 +484,7 @@
2020.coling-main.36
10.18653/v1/2020.coling-main.36
ashby-weir-2020-leveraging
- CoNLL-2003
+ CoNLL 2003
Multimodal Review Generation with Privacy and Fairness Awareness
@@ -1047,7 +1047,7 @@
luoma-pyysalo-2020-exploring
jouniluoma/bert-ner-cmv
CoNLL 2002
- CoNLL-2003
+ CoNLL 2003
Cross-lingual Annotation Projection in Legal Texts
@@ -4375,6 +4375,7 @@
GLUE
LAMA
Open Entity
+ Wikidata5M
Invertible Tree Embeddings using a Cryptographic Role Embedding Scheme
@@ -6126,7 +6127,7 @@
2020.coling-main.460
10.18653/v1/2020.coling-main.460
vo-etal-2020-identifying
- CoNLL-2003
+ CoNLL 2003
SICK
@@ -7317,7 +7318,7 @@
2020.coling-main.550
10.18653/v1/2020.coling-main.550
suwaileh-etal-2020-ready
- CoNLL-2003
+ CoNLL 2003
Mining Crowdsourcing Problems from Discussion Forums of Workers
diff --git a/data/xml/2020.conll.xml b/data/xml/2020.conll.xml
index e44610ab8a..2804c58dd2 100644
--- a/data/xml/2020.conll.xml
+++ b/data/xml/2020.conll.xml
@@ -222,8 +222,8 @@
10.18653/v1/2020.conll-1.16
reiss-etal-2020-identifying
codait/text-extensions-for-pandas
+ CoNLL 2003
CoNLL++
- CoNLL-2003
RCV1
@@ -459,7 +459,7 @@
10.18653/v1/2020.conll-1.35
boros-etal-2020-alleviating
embeddia/stacked-ner
- CoNLL-2003
+ CoNLL 2003
Analysing Word Representation from the Input and Output Embeddings in Neural Network Language Models
diff --git a/data/xml/2020.emnlp.xml b/data/xml/2020.emnlp.xml
index e1d47ef489..c65befe00a 100644
--- a/data/xml/2020.emnlp.xml
+++ b/data/xml/2020.emnlp.xml
@@ -1475,7 +1475,7 @@
chen-etal-2020-local
GT-SALT/LADA
- CoNLL-2003
+ CoNLL 2003
Grounded Compositional Outputs for Adaptive Language Modeling
@@ -2776,7 +2776,7 @@
gui-etal-2020-uncertainty
jiacheng-ye/UANet
- CoNLL-2003
+ CoNLL 2003
Penn Treebank
@@ -6116,7 +6116,7 @@
gao-gormley-2020-training
GaoSida/Neural-SampleRank
- CoNLL-2003
+ CoNLL 2003
Multilevel Text Alignment with Cross-Document Attention
@@ -7314,7 +7314,7 @@
wang-etal-2020-ain
Alibaba-NLP/AIN
ATIS
- CoNLL-2003
+ CoNLL 2003
HIT: Nested Named Entity Recognition via Head-Tail Pair and Token Interaction
@@ -7887,7 +7887,7 @@
yamada-etal-2020-luke
studio-ousia/luke
- CoNLL-2003
+ CoNLL 2003
Open Entity
ReCoRD
SQuAD
@@ -9619,7 +9619,7 @@
10.18653/v1/2020.emnlp-main.636
bhattacharjee-etal-2020-bert
- CoNLL-2003
+ CoNLL 2003
IMDb Movie Reviews
@@ -10466,7 +10466,7 @@
zhang-etal-2020-seqmix
rz-zhang/SeqMix
- CoNLL-2003
+ CoNLL 2003
AxCell: Automatic Extraction of Results from Machine Learning Papers
@@ -10958,7 +10958,7 @@
luo-etal-2020-named
CoNLL 2002
- CoNLL-2003
+ CoNLL 2003
Text Classification Using Label Names Only: A Language Model Self-Training Approach
diff --git a/data/xml/2020.findings.xml b/data/xml/2020.findings.xml
index e780555e4d..7877f341d4 100644
--- a/data/xml/2020.findings.xml
+++ b/data/xml/2020.findings.xml
@@ -416,8 +416,8 @@
10.18653/v1/2020.findings-emnlp.28
chen-etal-2020-enhance
+ CoNLL 2003
CoNLL-2000
- CoNLL-2003
Multilingual Argument Mining: Datasets and Analysis
@@ -861,7 +861,7 @@
2020.findings-emnlp.60
10.18653/v1/2020.findings-emnlp.60
peng-etal-2020-toward
- CoNLL-2003
+ CoNLL 2003
From Disjoint Sets to Parallel Data to Train Seq2Seq Models for Sentiment Transfer
@@ -2451,7 +2451,7 @@
lester-etal-2020-constrained
blester125/constrained-decoding
- CoNLL-2003
+ CoNLL 2003
SNIPS
WNUT 2017
@@ -4997,7 +4997,7 @@
2020.findings-emnlp.338
10.18653/v1/2020.findings-emnlp.338
lai-etal-2020-context
- CoNLL-2003
+ CoNLL 2003
GLUE
@@ -5265,7 +5265,7 @@
2020.findings-emnlp.356
10.18653/v1/2020.findings-emnlp.356
wang-etal-2020-embeddings
- CoNLL-2003
+ CoNLL 2003
NLP Service APIs and Models for Efficient Registration of New Clients
diff --git a/data/xml/2020.icon.xml b/data/xml/2020.icon.xml
index f6f744727a..9de6efc808 100644
--- a/data/xml/2020.icon.xml
+++ b/data/xml/2020.icon.xml
@@ -26,7 +26,7 @@
2020.icon-main.1
nittala-shrivastava-2020-weave
nv-ravindra/the-weave-corpus
- CoNLL-2003
+ CoNLL 2003
Increasing accuracy of a semantic word labelling tool based on a small lexicon
diff --git a/data/xml/2020.insights.xml b/data/xml/2020.insights.xml
index 91a32765be..39201dc986 100644
--- a/data/xml/2020.insights.xml
+++ b/data/xml/2020.insights.xml
@@ -217,7 +217,7 @@
10.18653/v1/2020.insights-1.15
lignos-kamyab-2020-build
- CoNLL-2003
+ CoNLL 2003
HINT3: Raising the bar for Intent Detection in the Wild
diff --git a/data/xml/2020.lifelongnlp.xml b/data/xml/2020.lifelongnlp.xml
index d10cf47724..43c7bfd9f4 100644
--- a/data/xml/2020.lifelongnlp.xml
+++ b/data/xml/2020.lifelongnlp.xml
@@ -29,7 +29,7 @@
2020.lifelongnlp-1.1
kim-2020-deep
ATIS
- CoNLL-2003
+ CoNLL 2003
Supervised Adaptation of Sequence-to-Sequence Speech Recognition Systems using Batch-Weighting
diff --git a/data/xml/2020.lrec.xml b/data/xml/2020.lrec.xml
index bf10dfe367..a0b45233b8 100644
--- a/data/xml/2020.lrec.xml
+++ b/data/xml/2020.lrec.xml
@@ -7009,7 +7009,7 @@
smith-etal-2020-scienceexamcer
ScienceExamCER
ARC
- CoNLL-2003
+ CoNLL 2003
NorNE: Annotating Named Entities for Norwegian
diff --git a/data/xml/2020.wnut.xml b/data/xml/2020.wnut.xml
index fa735bc78e..397b508fe9 100644
--- a/data/xml/2020.wnut.xml
+++ b/data/xml/2020.wnut.xml
@@ -25,8 +25,8 @@
2020.wnut-1.1
10.18653/v1/2020.wnut-1.1
kaplan-2020-may
+ CoNLL 2003
CoNLL++
- CoNLL-2003
“Did you really mean what you said?” : Sarcasm Detection in Hindi-English Code-Mixed Data using Bilingual Word Embeddings
diff --git a/data/xml/2021.acl.xml b/data/xml/2021.acl.xml
index 5c14ceeed4..b0e55d7268 100644
--- a/data/xml/2021.acl.xml
+++ b/data/xml/2021.acl.xml
@@ -274,7 +274,7 @@
LeeSureman/Sequence-Labeling-Early-Exit
CLUE
- CoNLL-2003
+ CoNLL 2003
Universal Dependencies
@@ -293,7 +293,7 @@
10.18653/v1/2021.acl-long.17
li-etal-2021-modularized
- CoNLL-2003
+ CoNLL 2003
WNUT 2017
@@ -740,7 +740,7 @@
Alibaba-NLP/StructuralKD
CoNLL 2002
- CoNLL-2003
+ CoNLL 2003
Penn Treebank
WikiAnn
@@ -972,7 +972,7 @@
microsoft/vert-papers
CoNLL 2002
- CoNLL-2003
+ CoNLL 2003
Compare to The Knowledge: Graph Neural Fake News Detection with External Knowledge
@@ -2294,9 +2294,9 @@
modelscope/adaseq
BC5CDR
CMeEE
+ CoNLL 2003
CoNLL++
CoNLL-2000
- CoNLL-2003
MSRA CN NER
NCBI Disease
Resume NER
@@ -3355,8 +3355,8 @@
Alibaba-NLP/ACE
CoNLL 2002
+ CoNLL 2003
CoNLL-2000
- CoNLL-2003
Penn Treebank
SemEval 2014 Task 4 Sub Task 2
SemEval 2016
@@ -3517,7 +3517,7 @@
tricktreat/locate-and-label
ACE 2004
ACE 2005
- CoNLL-2003
+ CoNLL 2003
GENIA
Weibo NER
@@ -5397,7 +5397,7 @@
10.18653/v1/2021.acl-long.332
radmard-etal-2021-subsequence
- CoNLL-2003
+ CoNLL 2003
Convolutions and Self-Attention: Re-interpreting Relative Positions in Pre-trained Language Models
@@ -5736,7 +5736,7 @@
JiachengLi1995/TALLOR
BC5CDR
- CoNLL-2003
+ CoNLL 2003
Prefix-Tuning: Optimizing Continuous Prompts for Generation
@@ -6029,7 +6029,7 @@
zhang-etal-2021-de
zwkatgithub/DSCAU
- CoNLL-2003
+ CoNLL 2003
A Span-Based Model for Joint Overlapped and Discontinuous Named Entity Recognition
@@ -6905,6 +6905,7 @@
10.18653/v1/2021.acl-long.426
zhou-etal-2021-defense
+ dugu9sword/dne
AG News
IMDb Movie Reviews
SNLI
@@ -7004,7 +7005,7 @@
zhang-etal-2021-crowdsourcing
izhx/CLasDA
- CoNLL-2003
+ CoNLL 2003
Exploring Distantly-Labeled Rationales in Neural Network Models
@@ -7315,7 +7316,7 @@
yhcc/BARTNER
ACE 2004
ACE 2005
- CoNLL-2003
+ CoNLL 2003
GENIA
OntoNotes 5.0
@@ -7820,7 +7821,7 @@
Yinghao-Li/CHMM-ALT
BC5CDR
- CoNLL-2003
+ CoNLL 2003
NCBI Disease
@@ -12422,7 +12423,7 @@
liu-etal-2021-explainaboard
neulab/ExplainaBoard
- CoNLL-2003
+ CoNLL 2003
Exploring Word Usage Change with Continuously Evolving Embeddings
@@ -12514,7 +12515,7 @@
lison-etal-2021-skweak
NorskRegnesentral/skweak
100DOH
- CoNLL-2003
+ CoNLL 2003
NoReC
diff --git a/data/xml/2021.adaptnlp.xml b/data/xml/2021.adaptnlp.xml
index 91a878711c..53382300f1 100644
--- a/data/xml/2021.adaptnlp.xml
+++ b/data/xml/2021.adaptnlp.xml
@@ -48,7 +48,7 @@
Contextual embedding models such as BERT can be easily fine-tuned on labeled samples to create a state-of-the-art model for many downstream tasks. However, the fine-tuned BERT model suffers considerably from unlabeled data when applied to a different domain. In unsupervised domain adaptation, we aim to train a model that works well on a target domain when provided with labeled source samples and unlabeled target samples. In this paper, we propose a pseudo-label guided method for unsupervised domain adaptation. Two models are fine-tuned on labeled source samples as pseudo labeling models. To learn representations for the target domain, one of those models is adapted by masked language modeling from the target domain. Then those models are used to assign pseudo-labels to target samples. We train the final model with those samples. We evaluate our method on named entity segmentation and sentiment analysis tasks. These experiments show that our approach outperforms baseline methods.
2021.adaptnlp-1.2
chen-etal-2021-pseudo
- CoNLL-2003
+ CoNLL 2003
Conditional Adversarial Networks for Multi-Domain Text Classification
diff --git a/data/xml/2021.case.xml b/data/xml/2021.case.xml
index 3a58802213..e621d3afd1 100644
--- a/data/xml/2021.case.xml
+++ b/data/xml/2021.case.xml
@@ -272,7 +272,7 @@
bouscarrat-etal-2021-amu
euranova/AMU-EURANOVA-CASE-2021
CoNLL 2002
- CoNLL-2003
+ CoNLL 2003
Team “DaDeFrNi” at CASE 2021 Task 1: Document and Sentence Classification for Protest Event Detection
diff --git a/data/xml/2021.cl.xml b/data/xml/2021.cl.xml
index f76cc477cd..cf620b504b 100644
--- a/data/xml/2021.cl.xml
+++ b/data/xml/2021.cl.xml
@@ -70,7 +70,7 @@
2021.cl-1.5
agarwal-etal-2021-interpretability
- CoNLL-2003
+ CoNLL 2003
Supervised and Unsupervised Neural Approaches to Text Readability
diff --git a/data/xml/2021.conll.xml b/data/xml/2021.conll.xml
index b155c15155..ba349fa98f 100644
--- a/data/xml/2021.conll.xml
+++ b/data/xml/2021.conll.xml
@@ -706,7 +706,7 @@
belkebir-habash-2021-automatic
10.18653/v1/2021.conll-1.47
- camel-lab/arabic_error_type_annotation
+ camel-lab/arabic_error_type_annotation
The Emergence of the Shape Bias Results from Communicative Efficiency
diff --git a/data/xml/2021.dash.xml b/data/xml/2021.dash.xml
index 44da20acd9..38556351be 100644
--- a/data/xml/2021.dash.xml
+++ b/data/xml/2021.dash.xml
@@ -132,7 +132,7 @@
2021.dash-1.10
10.18653/v1/2021.dash-1.10
muthuraman-etal-2021-data
- CoNLL-2003
+ CoNLL 2003
Building Low-Resource NER Models Using Non-Speaker Annotations
diff --git a/data/xml/2021.eacl.xml b/data/xml/2021.eacl.xml
index 7b82dc200c..6ca7cf47e8 100644
--- a/data/xml/2021.eacl.xml
+++ b/data/xml/2021.eacl.xml
@@ -1988,7 +1988,7 @@
2021.eacl-main.145
shelmanov-etal-2021-active
10.18653/v1/2021.eacl-main.145
- CoNLL-2003
+ CoNLL 2003
MultiHumES: Multilingual Humanitarian Dataset for Extractive Summarization
@@ -4201,7 +4201,7 @@
hsieh-etal-2021-enconter
10.18653/v1/2021.eacl-main.313
LARC-CMU-SMU/Enconter
- CoNLL-2003
+ CoNLL 2003
Meta-Learning for Effective Multi-task and Multilingual Modelling
@@ -4483,8 +4483,8 @@
ushio-camacho-collados-2021-ner
10.18653/v1/2021.eacl-demos.7
asahi417/tner
+ CoNLL 2003
CoNLL++
- CoNLL-2003
FIN
WNUT 2017
@@ -4531,7 +4531,7 @@
nguyen-etal-2021-trankit
10.18653/v1/2021.eacl-demos.10
nlp-uoregon/trankit
- CoNLL-2003
+ CoNLL 2003
DebIE: A Platform for Implicit and Explicit Debiasing of Word Embedding Spaces
@@ -5237,7 +5237,7 @@
tu-lignos-2021-tmr
10.18653/v1/2021.eacl-srw.21
CoNLL 2002
- CoNLL-2003
+ CoNLL 2003
IPM NEL
diff --git a/data/xml/2021.econlp.xml b/data/xml/2021.econlp.xml
index 130e030b56..a173328a4a 100644
--- a/data/xml/2021.econlp.xml
+++ b/data/xml/2021.econlp.xml
@@ -25,7 +25,7 @@
2021.econlp-1.1
hu-paroubek-2021-fine
10.18653/v1/2021.econlp-1.1
- CoNLL-2003
+ CoNLL 2003
EDGAR-CORPUS: Billions of Tokens Make The World Go Round
diff --git a/data/xml/2021.emnlp.xml b/data/xml/2021.emnlp.xml
index 768486a3ee..569733e119 100644
--- a/data/xml/2021.emnlp.xml
+++ b/data/xml/2021.emnlp.xml
@@ -3303,7 +3303,7 @@
wang-etal-2021-dylex
10.18653/v1/2021.emnlp-main.211
huawei-noah/noah-research
- CoNLL-2003
+ CoNLL 2003
MapRE: An Effective Semantic Mapping Approach for Low-resource Relation Extraction
@@ -3689,8 +3689,8 @@
Fixed a typo and added a footnote.
- osekilab/rnng-lc
Updated code link.
+ osekilab/rnng-eyetrack
A Simple and Effective Positional Encoding for Transformers
@@ -5899,7 +5899,7 @@
10.18653/v1/2021.emnlp-main.373
gorokoba560/norm-analysis-of-transformer
- CoNLL-2003
+ CoNLL 2003
MultiNLI
SST
@@ -6445,7 +6445,7 @@
lowell-etal-2021-unsupervised
10.18653/v1/2021.emnlp-main.408
- CoNLL-2003
+ CoNLL 2003
EBM-NLP
Evidence Inference
IMDb Movie Reviews
@@ -6872,8 +6872,8 @@
10.18653/v1/2021.emnlp-main.437
wzhouad/NLL-IE
+ CoNLL 2003
CoNLL++
- CoNLL-2003
TACRED
@@ -12758,7 +12758,7 @@
huang-etal-2021-shot
10.18653/v1/2021.emnlp-main.813
- CoNLL-2003
+ CoNLL 2003
SNIPS
WNUT 2017
@@ -12969,7 +12969,7 @@
adapter-hub/efficient-task-transfer
BoolQ
COPA
- CoNLL-2003
+ CoNLL 2003
DROP
GLUE
SuperGLUE
@@ -13870,7 +13870,7 @@
simoncini-spanakis-2021-seqattack
10.18653/v1/2021.emnlp-demo.35
- CoNLL-2003
+ CoNLL 2003
InVeRo-XL: Making Cross-Lingual Semantic Role Labeling Accessible with Intelligible Verbs and Roles
diff --git a/data/xml/2021.eval4nlp.xml b/data/xml/2021.eval4nlp.xml
index b7b773d257..fc5626205a 100644
--- a/data/xml/2021.eval4nlp.xml
+++ b/data/xml/2021.eval4nlp.xml
@@ -86,7 +86,7 @@
palen-michel-etal-2021-seqscore
10.18653/v1/2021.eval4nlp-1.5
bltlab/seqscore
- CoNLL-2003
+ CoNLL 2003
MasakhaNER
diff --git a/data/xml/2021.findings.xml b/data/xml/2021.findings.xml
index b3aa20fabc..17c4dd74f0 100644
--- a/data/xml/2021.findings.xml
+++ b/data/xml/2021.findings.xml
@@ -401,7 +401,7 @@
namysl-etal-2021-empirical
mnamysl/nat-acl2021
- CoNLL-2003
+ CoNLL 2003
Spatial Dependency Parsing for Semi-Structured Document Information Extraction
@@ -2353,7 +2353,7 @@
10.18653/v1/2021.findings-acl.161
cui-etal-2021-template
Nealcly/templateNER
- CoNLL-2003
+ CoNLL 2003
“Does it Matter When I Think You Are Lying?” Improving Deception Detection by Integrating Interlocutor’s Judgements in Conversations
@@ -2391,7 +2391,7 @@
2021.findings-acl.164
10.18653/v1/2021.findings-acl.164
wang-etal-2021-structured
- CoNLL-2003
+ CoNLL 2003
End-to-End Construction of NLP Knowledge Graph
@@ -4939,7 +4939,7 @@
agarwal-nenkova-2021-utility
Broad Twitter Corpus
- CoNLL-2003
+ CoNLL 2003
On the Cost-Effectiveness of Stacking of Neural and Non-Neural Methods for Text Classification: Scenarios and Performance Prediction
@@ -5581,7 +5581,7 @@
10.18653/v1/2021.findings-acl.396
guo-roth-2021-constrained
- CoNLL-2003
+ CoNLL 2003
He is very intelligent, she is very beautiful? On Mitigating Social Biases in Language Modelling and Generation
@@ -6508,7 +6508,7 @@
shaffer-2021-language-clustering
10.18653/v1/2021.findings-emnlp.4
- CoNLL-2003
+ CoNLL 2003
Neural News Recommendation with Collaborative News Encoding and Structural User Encoding
@@ -7516,6 +7516,7 @@
nguyen-etal-2021-uncertainty-aware
10.18653/v1/2021.findings-emnlp.69
+ nhungnt7/UCE
OpoSum
@@ -7775,7 +7776,7 @@
li-etal-2021-task-adaptive
10.18653/v1/2021.findings-emnlp.86
- CoNLL-2003
+ CoNLL 2003
GLUE
MultiNLI
QNLI
@@ -7794,7 +7795,7 @@
sung-etal-2021-cnnbif-cnn
10.18653/v1/2021.findings-emnlp.87
- CoNLL-2003
+ CoNLL 2003
Compositional Generalization via Semantic Tagging
@@ -8571,7 +8572,7 @@
wang-etal-2021-learning-language-description
10.18653/v1/2021.findings-emnlp.139
- CoNLL-2003
+ CoNLL 2003
BERT might be Overkill: A Tiny but Effective Biomedical Entity Linker based on Residual Convolutional Neural Networks
@@ -9022,7 +9023,7 @@
Added acknowledgment.
10.18653/v1/2021.findings-emnlp.169
- CoNLL-2003
+ CoNLL 2003
GLUE
QNLI
SNLI
@@ -9713,7 +9714,7 @@
babelscape/wikineural
WikiNEuRal
CoNLL 2002
- CoNLL-2003
+ CoNLL 2003
WikiAnn
@@ -9928,6 +9929,7 @@
10.18653/v1/2021.findings-emnlp.230
kevivk/mwp_adversarial
+ ASDiv
MAWPS
@@ -12633,7 +12635,7 @@
2021.findings-emnlp.410
ansell-etal-2021-mad-g
10.18653/v1/2021.findings-emnlp.410
- CoNLL-2003
+ CoNLL 2003
MasakhaNER
diff --git a/data/xml/2021.naacl.xml b/data/xml/2021.naacl.xml
index 4b0fafff4d..615b8c6b8d 100644
--- a/data/xml/2021.naacl.xml
+++ b/data/xml/2021.naacl.xml
@@ -1153,7 +1153,7 @@
cao-etal-2021-low
stevenxcao/subnetwork-probing
- CoNLL-2003
+ CoNLL 2003
An Empirical Comparison of Instance Attribution Methods for NLP
@@ -2544,7 +2544,7 @@
DandyQi/MaskedCRF
ATIS
- CoNLL-2003
+ CoNLL 2003
Heterogeneous Graph Neural Networks for Concept Prerequisite Relation Learning in Educational Data
diff --git a/data/xml/2021.nllp.xml b/data/xml/2021.nllp.xml
index c3299123b4..14b7ae790c 100644
--- a/data/xml/2021.nllp.xml
+++ b/data/xml/2021.nllp.xml
@@ -257,7 +257,7 @@
2021.nllp-1.18
trias-etal-2021-named
10.18653/v1/2021.nllp-1.18
- CoNLL-2003
+ CoNLL 2003
Summarization of German Court Rulings
diff --git a/data/xml/2021.nlp4dh.xml b/data/xml/2021.nlp4dh.xml
index faeaab961f..c222ff859d 100644
--- a/data/xml/2021.nlp4dh.xml
+++ b/data/xml/2021.nlp4dh.xml
@@ -216,7 +216,7 @@
2021.nlp4dh-1.20
sierra-munera-krestel-2021-enjoy
hpi-information-systems/cross-domain-ner
- CoNLL-2003
+ CoNLL 2003
An Exploratory Study on Temporally Evolving Discussion around Covid-19 using Diachronic Word Embeddings
diff --git a/data/xml/2021.nodalida.xml b/data/xml/2021.nodalida.xml
index 3658b24acd..2abe1b2cc4 100644
--- a/data/xml/2021.nodalida.xml
+++ b/data/xml/2021.nodalida.xml
@@ -156,7 +156,7 @@
The current recipe for better model performance within NLP is to increase model size and training data. While it gives us models with increasingly impressive results, it also makes it more difficult to train and deploy state-of-the-art models for NLP due to increasing computational costs. Model compression is a field of research that aims to alleviate this problem. The field encompasses different methods that aim to preserve the performance of a model while decreasing the size of it. One such method is knowledge distillation. In this article, we investigate the effect of knowledge distillation for named entity recognition models in Swedish. We show that while some sequence tagging models benefit from knowledge distillation, not all models do. This prompts us to ask questions about in which situations and for which models knowledge distillation is beneficial. We also reason about the effect of knowledge distillation on computational costs.
2021.nodalida-main.13
hagstrom-johansson-2021-knowledge
- CoNLL-2003
+ CoNLL 2003
Fine-grained Named Entity Annotation for Finnish
diff --git a/data/xml/2021.ranlp.xml b/data/xml/2021.ranlp.xml
index 7b59e0a6dc..a6ba454b60 100644
--- a/data/xml/2021.ranlp.xml
+++ b/data/xml/2021.ranlp.xml
@@ -1132,7 +1132,7 @@
2021.ranlp-1.100
loukachevitch-etal-2021-nerel
nerel-ds/nerel
- CoNLL-2003
+ CoNLL 2003
DocRED
NNE
@@ -1322,7 +1322,7 @@
Biomedical Named Entities are complex, so approximate matching has been used to improve entity coverage. However, the usual approximate matching approach fetches only one matching result, which is often noisy. In this work, we propose a method for biomedical NER that fetches multiple approximate matches for a given phrase to leverage their variations to estimate entity-likeness. The model uses pooling to discard the unnecessary information from the noisy matching results, and learn the entity-likeness of the phrase with multiple approximate matches. Experimental results on three benchmark datasets from the biomedical domain, BC2GM, NCBI-disease, and BC4CHEMD, demonstrate the effectiveness. Our model improves the average by up to +0.21 points compared to a BioBERT-based NER.
2021.ranlp-1.117
nguyen-le-etal-2021-learning
- CoNLL-2003
+ CoNLL 2003
Extending a Text-to-Pictograph System to French and to Arasaac
diff --git a/data/xml/2021.wnut.xml b/data/xml/2021.wnut.xml
index d0cd32770d..f765f1fcc3 100644
--- a/data/xml/2021.wnut.xml
+++ b/data/xml/2021.wnut.xml
@@ -207,7 +207,7 @@
10.18653/v1/2021.wnut-1.16
overfit-ir/parstwiner
ParsTwiner
- CoNLL-2003
+ CoNLL 2003
PEYMA
@@ -517,7 +517,7 @@
2021.wnut-1.40
davidson-etal-2021-improved
10.18653/v1/2021.wnut-1.40
- CoNLL-2003
+ CoNLL 2003
Contrapositive Local Class Inference
diff --git a/data/xml/2022.acl.xml b/data/xml/2022.acl.xml
index d274b9bd9c..2f11e8e8e8 100644
--- a/data/xml/2022.acl.xml
+++ b/data/xml/2022.acl.xml
@@ -231,7 +231,7 @@
2022.acl-long.14.software.zip
li-etal-2022-unsupervised-multiple
10.18653/v1/2022.acl-long.14
- CoNLL-2003
+ CoNLL 2003
Discriminative Marginalized Probabilistic Neural Method for Multi-Document Summarization of Medical Literature
@@ -1098,7 +1098,7 @@
tricktreat/piqn
ACE 2004
ACE 2005
- CoNLL-2003
+ CoNLL 2003
Few-NERD
GENIA
MSRA CN NER
@@ -2008,7 +2008,7 @@
10.18653/v1/2022.acl-long.125
cambridgeltl/composable-sft
- CoNLL-2003
+ CoNLL 2003
GLUE
MLQA
MasakhaNER
@@ -3225,6 +3225,7 @@
WebQuestions
WebQuestionsSP
WikiMovies
+ Wikidata5M
Learning to Mediate Disparities Towards Pragmatic Communication
@@ -4678,7 +4679,7 @@ in the Case of Unambiguous Gender
wang-etal-2022-promda
10.18653/v1/2022.acl-long.292
garyyufei/promda
- CoNLL-2003
+ CoNLL 2003
SST
@@ -4725,6 +4726,7 @@ in the Case of Unambiguous Gender
FB15k-237
WN18
WN18RR
+ Wikidata5M
Do Transformer Models Show Similar Attention Patterns to Task-Specific Human Gaze?
@@ -5430,7 +5432,7 @@ in the Case of Unambiguous Gender
thunlp/pl-marker
ACE 2004
ACE 2005
- CoNLL-2003
+ CoNLL 2003
Few-NERD
OntoNotes 5.0
SciERC
@@ -6372,7 +6374,7 @@ in the Case of Unambiguous Gender
lu-etal-2022-unified
10.18653/v1/2022.acl-long.395
ASTE
- CoNLL-2003
+ CoNLL 2003
SciERC
@@ -6868,7 +6870,7 @@ in the Case of Unambiguous Gender
10.18653/v1/2022.acl-long.426
JiachengLi1995/UCTopic
BC5CDR
- CoNLL-2003
+ CoNLL 2003
KP20k
KPTimes
WNUT 2017
@@ -7879,8 +7881,8 @@ in the Case of Unambiguous Gender
syuoni/eznlp
ACE 2004
ACE 2005
+ CoNLL 2003
CoNLL++
- CoNLL-2003
MSRA CN NER
OntoNotes 4.0
OntoNotes 5.0
@@ -8026,7 +8028,7 @@ in the Case of Unambiguous Gender
kangISU/Conf-MPU-DS-NER
BC5CDR
- CoNLL-2003
+ CoNLL 2003
UniXcoder: Unified Cross-Modal Pre-training for Code Representation
@@ -8146,7 +8148,7 @@ in the Case of Unambiguous Gender
studio-ousia/luke
- CoNLL-2003
+ CoNLL 2003
LAMA
MLQA
RELX
@@ -8401,8 +8403,8 @@ in the Case of Unambiguous Gender
Michael-Tanzer/BERT-mem-lowres
CIFAR-10
+ CoNLL 2003
CoNLL++
- CoNLL-2003
WNUT 2017
@@ -9121,7 +9123,7 @@ in the Case of Unambiguous Gender
airi-institute/uncertainty_transformers
CoLA
- CoNLL-2003
+ CoNLL 2003
GLUE
MRPC
SST
@@ -11295,7 +11297,7 @@ in the Case of Unambiguous Gender
10.18653/v1/2022.acl-srw.9
urchade/gnner
- CoNLL-2003
+ CoNLL 2003
SciERC
@@ -11524,6 +11526,7 @@ in the Case of Unambiguous Gender
Update the PDF due to a Softconf uploading error.
FB15k-237
SimpleQuestions
+ Wikidata5M
Discourse on ASR Measurement: Introducing the ARPOCA Assessment Tool
diff --git a/data/xml/2022.ccl.xml b/data/xml/2022.ccl.xml
index ad8c706803..9497dd44dd 100644
--- a/data/xml/2022.ccl.xml
+++ b/data/xml/2022.ccl.xml
@@ -1086,7 +1086,7 @@
jiangxu-peiqi-2022-low
wjx-git/deptriggerner
BC5CDR
- CoNLL-2003
+ CoNLL 2003
Fundamental Analysis based Neural Network for Stock Movement Prediction
diff --git a/data/xml/2022.coling.xml b/data/xml/2022.coling.xml
index 7092cdfa16..589529b6a3 100644
--- a/data/xml/2022.coling.xml
+++ b/data/xml/2022.coling.xml
@@ -2784,7 +2784,7 @@
2022.coling-1.209
chen-etal-2022-lightner
zjunlp/DeepKE
- CoNLL-2003
+ CoNLL 2003
Cross-modal Contrastive Attention Model for Medical Report Generation
@@ -2997,7 +2997,7 @@
2022.coling-1.224
yang-etal-2022-see
unveiled-the-red-hat/SEE-Few
- CoNLL-2003
+ CoNLL 2003
Weibo NER
@@ -3785,7 +3785,7 @@
2022.coling-1.284
ding-etal-2022-cogbert
PosoSAgapo/cogbert
- CoNLL-2003
+ CoNLL 2003
GLUE
QNLI
@@ -4388,7 +4388,7 @@
Named entity recognition has become an increasingly useful tool for digital humanities research, especially when it comes to historical texts. However, historical texts pose a wide range of challenges to both named entity recognition and natural language processing in general that are still difficult to address even with modern neural methods. In this article we focus on named entity recognition for historical French, and in particular for Early Modern French (16th-18th c.), i.e. Ancien Régime French. However, instead of developing a specialised architecture to tackle the particularities of this state of language, we opt for a data-driven approach by developing a new corpus with fine-grained entity annotation, covering three centuries of literature corresponding to the early modern period; we try to annotate as much data as possible producing a corpus that is many times bigger than the most popular NER evaluation corpora for both Contemporary English and French. We then fine-tune existing state-of-the-art architectures for Early Modern and Contemporary French, obtaining results that are on par with those of the current state-of-the-art NER systems for Contemporary English. Both the corpus and the fine-tuned models are released.
2022.coling-1.327
ortiz-suarez-gabay-2022-data
- CoNLL-2003
+ CoNLL 2003
Reproducibility and Automation of the Appraisal Taxonomy
@@ -8073,7 +8073,7 @@
2022.coling-1.615
zhou-etal-2022-making
xzhou20/plugin-tuning
- CoNLL-2003
+ CoNLL 2003
GLUE
Penn Treebank
diff --git a/data/xml/2022.dadc.xml b/data/xml/2022.dadc.xml
index d0d1dc3fcb..adf3ab5f81 100644
--- a/data/xml/2022.dadc.xml
+++ b/data/xml/2022.dadc.xml
@@ -32,7 +32,7 @@
2022.dadc-1.1
das-paik-2022-resilience
10.18653/v1/2022.dadc-1.1
- CoNLL-2003
+ CoNLL 2003
GreaseVision: Rewriting the Rules of the Interface
diff --git a/data/xml/2022.findings.xml b/data/xml/2022.findings.xml
index 1089be3ad4..8de9cc7791 100644
--- a/data/xml/2022.findings.xml
+++ b/data/xml/2022.findings.xml
@@ -1060,8 +1060,8 @@
cgraywang/deepstruct
ACE 2005
ATIS
+ CoNLL 2003
CoNLL++
- CoNLL-2003
FewRel
GENIA
KELM
@@ -1825,7 +1825,7 @@
10.18653/v1/2022.findings-acl.116
vistec-ai/thai-nner
- CoNLL-2003
+ CoNLL 2003
DaN+
NNE
@@ -2435,7 +2435,7 @@
10.18653/v1/2022.findings-acl.154
gt-salt/guided-adversarial-augmentation
- CoNLL-2003
+ CoNLL 2003
Label Semantics for Few Shot Named Entity Recognition
@@ -2452,7 +2452,7 @@
ma-etal-2022-label
10.18653/v1/2022.findings-acl.155
- CoNLL-2003
+ CoNLL 2003
NCBI Disease
WNUT 2017
@@ -2825,7 +2825,7 @@
2022.findings-acl.179
xia-etal-2022-learn
10.18653/v1/2022.findings-acl.179
- CoNLL-2003
+ CoNLL 2003
Phoneme transcription of endangered languages: an evaluation of recent ASR architectures in the single speaker scenario
@@ -3001,7 +3001,7 @@
10.18653/v1/2022.findings-acl.191
BLiMP
CoLA
- CoNLL-2003
+ CoNLL 2003
FCE
GLUE
Open Entity
@@ -3456,7 +3456,7 @@
ma-etal-2022-encbp
10.18653/v1/2022.findings-acl.221
CARER
- CoNLL-2003
+ CoNLL 2003
GLUE
GoEmotions
QNLI
@@ -4311,7 +4311,7 @@
2022.findings-acl.277.software.zip
chen-etal-2022-x
10.18653/v1/2022.findings-acl.277
- CoNLL-2003
+ CoNLL 2003
GLUE
MRPC
MultiNLI
@@ -5711,7 +5711,7 @@
gong-etal-2022-harmless
10.18653/v1/2022.findings-naacl.38
- CoNLL-2003
+ CoNLL 2003
MovieLens
@@ -5838,6 +5838,7 @@
10.18653/v1/2022.findings-naacl.46
elanmarkowitz/statik
+ Wikidata5M
CoCoA-MT: A Dataset and Benchmark for Contrastive Controlled MT with Application to Formality
@@ -6021,6 +6022,7 @@
ConceptNet
LAMA
WebQuestions
+ Wikidata5M
YAGO
@@ -6127,7 +6129,7 @@
cao-etal-2022-attention
10.18653/v1/2022.findings-naacl.64
- CoNLL-2003
+ CoNLL 2003
GLUE
QNLI
@@ -6570,7 +6572,7 @@
airi-institute/al_nlp_feasible
AG News
- CoNLL-2003
+ CoNLL 2003
IMDb Movie Reviews
diff --git a/data/xml/2022.jeptalnrecital.xml b/data/xml/2022.jeptalnrecital.xml
index 271fb30d67..f6e0ec9c5a 100644
--- a/data/xml/2022.jeptalnrecital.xml
+++ b/data/xml/2022.jeptalnrecital.xml
@@ -122,7 +122,7 @@
fra
millour-etal-2022-fenec
alicemillour/fenec
- CoNLL-2003
+ CoNLL 2003
The QUAERO French Medical Corpus
@@ -254,7 +254,7 @@
2022.jeptalnrecital-taln.19
fra
amalvy-etal-2022-remplacement
- CoNLL-2003
+ CoNLL 2003
RésumeSVD : Un outil efficace et performant pour le résumé de texte non supervisé (RésumeSVD : An efficient and effective tool for unsupervised text summarization )
diff --git a/data/xml/2022.konvens.xml b/data/xml/2022.konvens.xml
index c3bd0584db..c59e9f55e8 100644
--- a/data/xml/2022.konvens.xml
+++ b/data/xml/2022.konvens.xml
@@ -202,7 +202,7 @@
156–166
2022.konvens-1.19
remus-etal-2022-like
- CoNLL-2003
+ CoNLL 2003
FrameNet
diff --git a/data/xml/2022.law.xml b/data/xml/2022.law.xml
index afd61c46af..c0ef99e1de 100644
--- a/data/xml/2022.law.xml
+++ b/data/xml/2022.law.xml
@@ -86,7 +86,7 @@
In this paper we explore the use of an NLP system to assist the work of Security Force Monitor (SFM). SFM creates data about the organizational structure, command personnel and operations of police, army and other security forces, which assists human rights researchers, journalists and litigators in their work to help identify and bring to account specific units and personnel alleged to have committed abuses of human rights and international criminal law. This paper presents an NLP system that extracts from English language news reports the names of security force units and the biographical details of their personnel, and infers the formal relationship between them. Published alongside this paper are the system’s code and training dataset. We find that the experimental NLP system performs the task at a fair to good level. Its performance is sufficient to justify further development into a live workflow that will give insight into whether its performance translates into savings in time and resource that would make it an effective technical intervention.
2022.law-1.7
bauer-etal-2022-nlp
- CoNLL-2003
+ CoNLL 2003
Advantages of a Complex Multilayer Annotation Scheme: The Case of the Prague Dependency Treebank
diff --git a/data/xml/2022.lrec.xml b/data/xml/2022.lrec.xml
index a6ab2775b8..76a4d61ba2 100644
--- a/data/xml/2022.lrec.xml
+++ b/data/xml/2022.lrec.xml
@@ -5037,8 +5037,8 @@
2022.lrec-1.404
ivanova-etal-2022-comparing
therosko/annotated_datasets_en_comparisson
+ CoNLL 2003
CoNLL++
- CoNLL-2003
LitBank
diff --git a/data/xml/2022.ltedi.xml b/data/xml/2022.ltedi.xml
index 5daca73e3b..bb2d4ee0ab 100644
--- a/data/xml/2022.ltedi.xml
+++ b/data/xml/2022.ltedi.xml
@@ -472,6 +472,7 @@
tavchioski-etal-2022-e8
10.18653/v1/2022.ltedi-1.36
+ Wikidata5M
Nozza@LT-EDI-ACL2022: Ensemble Modeling for Homophobia and Transphobia Detection
diff --git a/data/xml/2022.naacl.xml b/data/xml/2022.naacl.xml
index 87aa5ab9d3..289470ab16 100644
--- a/data/xml/2022.naacl.xml
+++ b/data/xml/2022.naacl.xml
@@ -1171,6 +1171,7 @@
mathur-etal-2022-doctime
10.18653/v1/2022.naacl-main.73
+ TimeQA
FactPEGASUS: Factuality-Aware Pre-training and Fine-tuning for Abstractive Summarization
@@ -3228,7 +3229,7 @@
min-etal-2022-metaicl
10.18653/v1/2022.naacl-main.201
- facebookresearch/metaicl
+ facebookresearch/metaicl
Hate Speech
Natural Instructions
@@ -4869,7 +4870,7 @@
10.18653/v1/2022.naacl-main.297
LindgeW/MetaAug4NER
- CoNLL-2003
+ CoNLL 2003
Weibo NER
@@ -5215,7 +5216,7 @@
10.18653/v1/2022.naacl-main.318
frankaging/Causal-Distill
- CoNLL-2003
+ CoNLL 2003
GLUE
SQuAD
WikiText-103
@@ -6152,7 +6153,7 @@
10.18653/v1/2022.naacl-main.379
nardien/kala
- CoNLL-2003
+ CoNLL 2003
NCBI Disease
NewsQA
WNUT 2017
@@ -6810,7 +6811,7 @@
10.18653/v1/2022.naacl-main.420
rtmaww/EntLM
- CoNLL-2003
+ CoNLL 2003
Few-Shot Document-Level Relation Extraction
@@ -7301,7 +7302,7 @@
tugraz-isds/kd
BC5CDR
- CoNLL-2003
+ CoNLL 2003
Analysing the Correlation between Lexical Ambiguity and Translation Quality in a Multimodal Setting using WordNet
@@ -7545,7 +7546,7 @@
epfl-dlab/nelight
AIDA CoNLL-YAGO
- CoNLL-2003
+ CoNLL 2003
Static and Dynamic Speaker Modeling based on Graph Neural Network for Emotion Recognition in Conversation
@@ -7903,7 +7904,7 @@
10.18653/v1/2022.naacl-demo.14
nlp-uoregon/famie
- CoNLL-2003
+ CoNLL 2003
diff --git a/data/xml/2022.repl4nlp.xml b/data/xml/2022.repl4nlp.xml
index b235bfb901..4b055f5249 100644
--- a/data/xml/2022.repl4nlp.xml
+++ b/data/xml/2022.repl4nlp.xml
@@ -98,7 +98,7 @@
10.18653/v1/2022.repl4nlp-1.6
dfki-nlp/fewie
- CoNLL-2003
+ CoNLL 2003
Few-NERD
WNUT 2017
@@ -176,7 +176,7 @@
10.18653/v1/2022.repl4nlp-1.11
frankaging/limits-cross-domain-transfer
- CoNLL-2003
+ CoNLL 2003
GLUE
MRPC
QNLI
diff --git a/data/xml/2022.semeval.xml b/data/xml/2022.semeval.xml
index d468d26908..ea9ce12cac 100644
--- a/data/xml/2022.semeval.xml
+++ b/data/xml/2022.semeval.xml
@@ -293,7 +293,7 @@
oh-2022-kpfriends
10.18653/v1/2022.semeval-1.21
- CoNLL-2003
+ CoNLL 2003
daminglu123 at SemEval-2022 Task 2: Using BERT and LSTM to Do Text Classification
@@ -1998,6 +1998,7 @@
10.18653/v1/2022.semeval-1.156
EMBEDDIA/semeval-2022-MNS
+ Wikidata5M
HFL at SemEval-2022 Task 8: A Linguistics-inspired Regression Model with Data Augmentation for Multilingual News Similarity
@@ -2603,7 +2604,7 @@
10.18653/v1/2022.semeval-1.199
CoNLL 2002
- CoNLL-2003
+ CoNLL 2003
MultiCoNER
@@ -2986,8 +2987,8 @@
tavan-najafi-2022-marsan
10.18653/v1/2022.semeval-1.226
+ CoNLL 2003
CoNLL++
- CoNLL-2003
MultiCoNER
diff --git a/data/xml/2022.spnlp.xml b/data/xml/2022.spnlp.xml
index 33d0ad02f8..36b7040f67 100644
--- a/data/xml/2022.spnlp.xml
+++ b/data/xml/2022.spnlp.xml
@@ -73,7 +73,7 @@
2022.spnlp-1.4
daza-etal-2022-slotgan
10.18653/v1/2022.spnlp-1.4
- CoNLL-2003
+ CoNLL 2003
A Joint Learning Approach for Semi-supervised Neural Topic Modeling
diff --git a/data/xml/C12.xml b/data/xml/C12.xml
index b108103507..8c17e2e77a 100644
--- a/data/xml/C12.xml
+++ b/data/xml/C12.xml
@@ -54,7 +54,7 @@
51–66
C12-1004
al-rfou-skiena-2012-speedread
- CoNLL-2003
+ CoNLL 2003
Experiments with Term Translation
@@ -2996,7 +2996,7 @@
1281–1290
C12-2125
wachsmuth-stein-2012-optimal
- CoNLL-2003
+ CoNLL 2003
Update Summarization Based on Co-Ranking with Constraints
diff --git a/data/xml/C18.xml b/data/xml/C18.xml
index ae321212c1..bd97514067 100644
--- a/data/xml/C18.xml
+++ b/data/xml/C18.xml
@@ -678,7 +678,7 @@
C18-1059
al-olimat-etal-2018-practical
halolimat/SpExtor
- CoNLL-2003
+ CoNLL 2003
An Empirical Study on Fine-Grained Named Entity Recognition
@@ -1576,8 +1576,8 @@
Recent advances in language modeling using recurrent neural networks have made it viable to model language as distributions over characters. By learning to predict the next character on the basis of previous characters, such models have been shown to automatically internalize linguistic concepts such as words, sentences, subclauses and even sentiment. In this paper, we propose to leverage the internal states of a trained character language model to produce a novel type of word embedding which we refer to as contextual string embeddings. Our proposed embeddings have the distinct properties that they (a) are trained without any explicit notion of words and thus fundamentally model words as sequences of characters, and (b) are contextualized by their surrounding text, meaning that the same word will have different embeddings depending on its contextual use. We conduct a comparative evaluation against previous embeddings and find that our embeddings are highly useful for downstream tasks: across four classic sequence labeling tasks we consistently outperform the previous state-of-the-art. In particular, we significantly outperform previous work on English and German named entity recognition (NER), allowing us to report new state-of-the-art F1-scores on the CoNLL03 shared task. We release all code and pre-trained language models in a simple-to-use framework to the research community, to enable reproduction of these experiments and application of our proposed embeddings to other tasks: https://github.com/zalandoresearch/flair
C18-1139
akbik-etal-2018-contextual
+ CoNLL 2003
CoNLL-2000
- CoNLL-2003
Penn Treebank
@@ -1840,7 +1840,7 @@
Neural network approaches to Named-Entity Recognition reduce the need for carefully hand-crafted features. While some features do remain in state-of-the-art systems, lexical features have been mostly discarded, with the exception of gazetteers. In this work, we show that this is unfair: lexical features are actually quite useful. We propose to embed words and entity types into a low-dimensional vector space we train from annotated data produced by distant supervision thanks to Wikipedia. From this, we compute — offline — a feature vector representing each word. When used with a vanilla recurrent neural network model, this representation yields substantial improvements. We establish a new state-of-the-art F1 score of 87.95 on ONTONOTES 5.0, while matching state-of-the-art performance with a F1 score of 91.73 on the over-studied CONLL-2003 dataset.
C18-1161
ghaddar-langlais-2018-robust
- CoNLL-2003
+ CoNLL 2003
DBpedia
OntoNotes 5.0
@@ -2894,7 +2894,7 @@
We study three general multi-task learning (MTL) approaches on 11 sequence tagging tasks. Our extensive empirical results show that in about 50% of the cases, jointly learning all 11 tasks improves upon either independent or pairwise learning of the tasks. We also show that pairwise MTL can inform us what tasks can benefit others or what tasks can be benefited if they are learned jointly. In particular, we identify tasks that can always benefit others as well as tasks that can always be harmed by others. Interestingly, one of our MTL approaches yields embeddings of the tasks that reveal the natural clustering of semantic and syntactic tasks. Our inquiries have opened the doors to further utilization of MTL in NLP.
C18-1251
changpinyo-etal-2018-multi
- CoNLL-2003
+ CoNLL 2003
English Web Treebank
FrameNet
Universal Dependencies
@@ -3146,7 +3146,7 @@
Generating character-level features is an important step for achieving good results in various natural language processing tasks. To alleviate the need for human labor in generating hand-crafted features, methods that utilize neural architectures such as Convolutional Neural Network (CNN) or Recurrent Neural Network (RNN) to automatically extract such features have been proposed and have shown great results. However, CNN generates position-independent features, and RNN is slow since it needs to process the characters sequentially. In this paper, we propose a novel method of using a densely connected network to automatically extract character-level features. The proposed method does not require any language or task specific assumptions, and shows robustness and effectiveness while being faster than CNN- or RNN-based methods. Evaluating this method on three sequence labeling tasks - slot tagging, Part-of-Speech (POS) tagging, and Named-Entity Recognition (NER) - we obtain state-of-the-art performance with a 96.62 F1-score and 97.73% accuracy on slot tagging and POS tagging, respectively, and comparable performance to the state-of-the-art 91.13 F1-score on NER.
C18-1273
lee-etal-2018-character
- CoNLL-2003
+ CoNLL 2003
Neural Machine Translation Incorporating Named Entity
diff --git a/data/xml/D14.xml b/data/xml/D14.xml
index bc4575a45a..044e99af7b 100644
--- a/data/xml/D14.xml
+++ b/data/xml/D14.xml
@@ -136,7 +136,7 @@
D14-1012
10.3115/v1/D14-1012
guo-etal-2014-revisiting
- CoNLL-2003
+ CoNLL 2003
Combining Punctuation and Disfluency Prediction: An Empirical Study
@@ -1014,7 +1014,7 @@
D14-1097
10.3115/v1/D14-1097
marcheggiani-artieres-2014-experimental
- CoNLL-2003
+ CoNLL 2003
Language Modeling with Functional Head Constraint for Code Switching Speech Recognition
@@ -1720,7 +1720,7 @@
10.3115/v1/D14-1162
pennington-etal-2014-glove
stanfordnlp/GloVe
- CoNLL-2003
+ CoNLL 2003
Jointly Learning Word Representations and Composition Functions Using Predicate-Argument Structures
diff --git a/data/xml/D15.xml b/data/xml/D15.xml
index c81c731905..d81f9d8ecf 100644
--- a/data/xml/D15.xml
+++ b/data/xml/D15.xml
@@ -657,7 +657,7 @@
D15-1058
10.18653/v1/D15-1058
radford-etal-2015-named
- CoNLL-2003
+ CoNLL 2003
“A Spousal Relation Begins with a Deletion of engage and Ends with an Addition of divorce”: Learning State Changing Verbs from Wikipedia Revision History
@@ -1155,7 +1155,7 @@
10.18653/v1/D15-1102
lu-roth-2015-joint
- CoNLL-2003
+ CoNLL 2003
FINET: Context-Aware Fine-Grained Named Entity Typing
@@ -1180,7 +1180,7 @@
10.18653/v1/D15-1104
luo-etal-2015-joint
- CoNLL-2003
+ CoNLL 2003
How Much Information Does a Human Translator Add to the Original?
diff --git a/data/xml/D17.xml b/data/xml/D17.xml
index 47ca638d98..a1f3ea359b 100644
--- a/data/xml/D17.xml
+++ b/data/xml/D17.xml
@@ -3224,7 +3224,7 @@ and the code is available at https://github.com/qizhex/RACE_AR_baselines
Neural networks have achieved state-of-the-art performance on several structured-output prediction tasks, trained in a fully supervised fashion. However, annotated examples in structured domains are often costly to obtain, which thus limits the applications of neural networks. In this work, we propose Maximum Margin Reward Networks, a neural network-based framework that aims to learn from both explicit (full structures) and implicit supervision signals (delayed feedback on the correctness of the predicted structure). On named entity recognition and semantic parsing, our model outperforms previous systems on the benchmark datasets, CoNLL-2003 and WebQuestionsSP.
peng-etal-2017-maximum
- CoNLL-2003
+ CoNLL 2003
WebQuestions
@@ -3618,7 +3618,7 @@ and efficiency of on-line policy optimization compared to other companion
In this paper, we utilize the linguistic structures of texts to improve named entity recognition by BRNN-CNN, a special bidirectional recursive network attached with a convolutional network. Motivated by the observation that named entities are highly related to linguistic constituents, we propose a constituent-based BRNN-CNN for named entity recognition. In contrast to classical sequential labeling methods, the system first identifies which text chunks are possible named entities by whether they are linguistic constituents. Then it classifies these chunks with a constituency tree structure by recursively propagating syntactic and semantic information to each constituent node. This method surpasses current state-of-the-art on OntoNotes 5.0 with automatically generated parses.
li-etal-2017-leveraging
jacobvsdanniel/tf_rnn
- CoNLL-2003
+ CoNLL 2003
Fast and Accurate Entity Recognition with Iterated Dilated Convolutions
@@ -3632,7 +3632,7 @@ and efficiency of on-line policy optimization compared to other companion
Today when many practitioners run basic NLP on the entire web and large-volume traffic, faster methods are paramount to saving time and energy costs. Recent advances in GPU hardware have led to the emergence of bi-directional LSTMs as a standard method for obtaining per-token vector representations serving as input to labeling tasks such as NER (often followed by prediction in a linear-chain CRF). Though expressive and accurate, these models fail to fully exploit GPU parallelism, limiting their computational efficiency. This paper proposes a faster alternative to Bi-LSTMs for NER: Iterated Dilated Convolutional Neural Networks (ID-CNNs), which have better capacity than traditional CNNs for large context and structured prediction. Unlike LSTMs whose sequential processing on sentences of length N requires O(N) time even in the face of parallelism, ID-CNNs permit fixed-depth convolutions to run in parallel across entire documents. We describe a distinct combination of network structure, parameter sharing and training procedures that enable dramatic 14-20x test-time speedups while retaining accuracy comparable to the Bi-LSTM-CRF. Moreover, ID-CNNs trained to aggregate context from the entire document are more accurate than Bi-LSTM-CRFs while attaining 8x faster test time speeds.
strubell-etal-2017-fast
iesl/dilated-cnn-ner
- CoNLL-2003
+ CoNLL 2003
OntoNotes 5.0
@@ -4401,7 +4401,7 @@ and efficiency of on-line policy optimization compared to other companion
10.18653/v1/D17-2017
Named-entity recognition (NER) aims at identifying entities of interest in a text. Artificial neural networks (ANNs) have recently been shown to outperform existing NER systems. However, ANNs remain challenging to use for non-expert users. In this paper, we present NeuroNER, an easy-to-use named-entity recognition tool based on ANNs. Users can annotate entities using a graphical web-based user interface (BRAT): the annotations are then used to train an ANN, which in turn predict entities’ locations and categories in new texts. NeuroNER makes this annotation-training-prediction flow smooth and accessible to anyone.
dernoncourt-etal-2017-neuroner
- CoNLL-2003
+ CoNLL 2003
SupWSD: A Flexible Toolkit for Supervised Word Sense Disambiguation
diff --git a/data/xml/D18.xml b/data/xml/D18.xml
index 92d86f4a64..c9c86e1d82 100644
--- a/data/xml/D18.xml
+++ b/data/xml/D18.xml
@@ -307,7 +307,7 @@
wang-lu-2018-neural
ACE 2004
ACE 2005
- CoNLL-2003
+ CoNLL 2003
GENIA
NNE
@@ -325,7 +325,7 @@
10.18653/v1/D18-1020
chen-etal-2018-variational
mingdachen/vsl
- CoNLL-2003
+ CoNLL 2003
Joint Representation Learning of Cross-lingual Words and Entities via Attentive Distant Supervision
@@ -2075,7 +2075,7 @@
10.18653/v1/D18-1153
liu-etal-2018-efficient
LiyuanLucasLiu/LD-Net
- CoNLL-2003
+ CoNLL 2003
Automatic Event Salience Identification
@@ -2457,7 +2457,7 @@
10.18653/v1/D18-1179
peters-etal-2018-dissecting
Billion Word Benchmark
- CoNLL-2003
+ CoNLL 2003
MultiNLI
Penn Treebank
@@ -2978,7 +2978,7 @@
clark-etal-2018-semi
CCGbank
- CoNLL-2003
+ CoNLL 2003
OntoNotes 5.0
Penn Treebank
@@ -3895,7 +3895,7 @@
Character-based neural models have recently proven very useful for many NLP tasks. However, there is a gap of sophistication between methods for learning representations of sentences and words. While, most character models for learning representations of sentences are deep and complex, models for learning representations of words are shallow and simple. Also, in spite of considerable research on learning character embeddings, it is still not clear which kind of architecture is the best for capturing character-to-word representations. To address these questions, we first investigate the gaps between methods for learning word and sentence representations. We conduct detailed experiments and comparisons on different state-of-the-art convolutional models, and also investigate the advantages and disadvantages of their constituents. Furthermore, we propose IntNet, a funnel-shaped wide convolutional neural architecture with no down-sampling for learning representations of the internal structure of words by composing their characters from limited, supervised training corpora. We evaluate our proposed model on six sequence labeling datasets, including named entity recognition, part-of-speech tagging, and syntactic chunking. Our in-depth analysis shows that IntNet significantly outperforms other character embedding models and obtains new state-of-the-art performance without relying on any external knowledge or resources.
10.18653/v1/D18-1279
xin-etal-2018-learning
- CoNLL-2003
+ CoNLL 2003
Penn Treebank
@@ -4311,7 +4311,7 @@
10.18653/v1/D18-1310
wu-etal-2018-evaluating
minghao-wu/CRF-AE
- CoNLL-2003
+ CoNLL 2003
Improved Dependency Parsing using Implicit Word Connections Learned from Unlabeled Data
@@ -4414,7 +4414,7 @@
Several recent papers investigate Active Learning (AL) for mitigating the data dependence of deep learning for natural language processing. However, the applicability of AL to real-world problems remains an open question. While in supervised learning, practitioners can try many different methods, evaluating each against a validation set before selecting a model, AL affords no such luxury. Over the course of one AL run, an agent annotates its dataset exhausting its labeling budget. Thus, given a new task, we have no opportunity to compare models and acquisition functions. This paper provides a large-scale empirical study of deep active learning, addressing multiple tasks and, for each, multiple datasets, multiple models, and a full suite of acquisition functions. We find that across all settings, Bayesian active learning by disagreement, using uncertainty estimates provided either by Dropout or Bayes-by-Backprop significantly improves over i.i.d. baselines and usually outperforms classic uncertainty sampling.
10.18653/v1/D18-1318
siddhant-lipton-2018-deep
- CoNLL-2003
+ CoNLL 2003
Bayesian Compression for Natural Language Processing
@@ -4741,7 +4741,7 @@
Character-level patterns have been widely used as features in English Named Entity Recognition (NER) systems. However, to date there has been no direct investigation of the inherent differences between name and nonname tokens in text, nor whether this property holds across multiple languages. This paper analyzes the capabilities of corpus-agnostic Character-level Language Models (CLMs) in the binary task of distinguishing name tokens from non-name tokens. We demonstrate that CLMs provide a simple and powerful model for capturing these differences, identifying named entity tokens in a diverse set of languages at close to the performance of full NER systems. Moreover, by adding very simple CLM-based features we can significantly improve the performance of an off-the-shelf NER system for multiple languages.
10.18653/v1/D18-1345
yu-etal-2018-strength
- CoNLL-2003
+ CoNLL 2003
Code-switched Language Models Using Dual RNNs and Same-Source Pretraining
@@ -7838,7 +7838,7 @@
In this paper, we present APLenty, an annotation tool for creating high-quality sequence labeling datasets using active and proactive learning. A major innovation of our tool is the integration of automatic annotation with active learning and proactive learning. This makes the task of creating labeled datasets easier, less time-consuming and requiring less human effort. APLenty is highly flexible and can be adapted to various other tasks.
10.18653/v1/D18-2019
nghiem-ananiadou-2018-aplenty
- CoNLL-2003
+ CoNLL 2003
Interactive Instance-based Evaluation of Knowledge Base Question Answering
diff --git a/data/xml/D19.xml b/data/xml/D19.xml
index e8481c4c95..bb77dd3e50 100644
--- a/data/xml/D19.xml
+++ b/data/xml/D19.xml
@@ -58,7 +58,7 @@
D19-1003.Attachment.zip
10.18653/v1/D19-1003
lowell-etal-2019-practical
- CoNLL-2003
+ CoNLL 2003
Transfer Learning Between Related Tasks Using Expected Label Proportions
@@ -1401,7 +1401,7 @@
jain-etal-2019-entity
alankarj/cross_lingual_ner
CoNLL 2002
- CoNLL-2003
+ CoNLL 2003
Polyglot-NER
@@ -1414,7 +1414,7 @@
10.18653/v1/D19-1101
simpson-gurevych-2019-bayesian
UKPLab/arxiv2018-bayesian-ensembles
- CoNLL-2003
+ CoNLL 2003
A systematic comparison of methods for low-resource dependency parsing on genuinely low-resource languages
@@ -5088,7 +5088,7 @@
10.18653/v1/D19-1367
jiang-etal-2019-improved
jiangyingjunn/i-darts
- CoNLL-2003
+ CoNLL 2003
PTB Diagnostic ECG Database
Penn Treebank
@@ -5523,7 +5523,7 @@
10.18653/v1/D19-1399
jie-lu-2019-dependency
allanj/ner_with_dependency
- CoNLL-2003
+ CoNLL 2003
OntoNotes 5.0
@@ -7183,7 +7183,7 @@
wang-etal-2019-crossweigh
ZihanWangKi/CrossWeigh
CoNLL++
- CoNLL-2003
+ CoNLL 2003
WNUT 2017
@@ -7475,7 +7475,7 @@
D19-1539
10.18653/v1/D19-1539
baevski-etal-2019-cloze
- CoNLL-2003
+ CoNLL 2003
GLUE
MRPC
MultiNLI
@@ -8990,7 +8990,7 @@
10.18653/v1/D19-1650
mayhew-etal-2019-ner
Broad Twitter Corpus
- CoNLL-2003
+ CoNLL 2003
Penn Treebank
@@ -9906,7 +9906,7 @@ A plethora of methods have been proposed to emphasize specific lexico-semantic r
10.18653/v1/D19-3028
gong-etal-2019-neuronblocks
Microsoft/NeuronBlocks
- CoNLL-2003
+ CoNLL 2003
GLUE
WikiQA
diff --git a/data/xml/F14.xml b/data/xml/F14.xml
index 2ed65f04c3..55901c34e0 100644
--- a/data/xml/F14.xml
+++ b/data/xml/F14.xml
@@ -408,7 +408,7 @@
437-442
F14-2009
sagot-gabor-2014-named
- CoNLL-2003
+ CoNLL 2003
Smoothing methods for a morpho-statistical approach of automatic diacritization Arabic texts (Méthodes de lissage d’une approche morpho-statistique pour la voyellation automatique des textes arabes) [in French]
diff --git a/data/xml/I13.xml b/data/xml/I13.xml
index 444223beb2..9d21607f2e 100644
--- a/data/xml/I13.xml
+++ b/data/xml/I13.xml
@@ -566,7 +566,7 @@
525–533
I13-1060
wang-manning-2013-learning
- CoNLL-2003
+ CoNLL 2003
Learning Efficient Information Extraction on Heterogeneous Texts
@@ -576,7 +576,7 @@
534–542
I13-1061
wachsmuth-etal-2013-learning
- CoNLL-2003
+ CoNLL 2003
TopicRank: Graph-Based Topic Ranking for Keyphrase Extraction
@@ -1693,7 +1693,7 @@
1285–1291
I13-1183
wang-manning-2013-effect
- CoNLL-2003
+ CoNLL 2003
Case Study of Model Adaptation: Transfer Learning and Online Learning
diff --git a/data/xml/I17.xml b/data/xml/I17.xml
index 08bcd5d93d..8595ba7256 100644
--- a/data/xml/I17.xml
+++ b/data/xml/I17.xml
@@ -630,7 +630,7 @@
Despite successful applications across a broad range of NLP tasks, conditional random fields (“CRFs”), in particular the linear-chain variant, are only able to model local features. While this has important benefits in terms of inference tractability, it limits the ability of the model to capture long-range dependencies between items. Attempts to extend CRFs to capture long-range dependencies have largely come at the cost of computational complexity and approximate inference. In this work, we propose an extension to CRFs by integrating external memory, taking inspiration from memory networks, thereby allowing CRFs to incorporate information far beyond neighbouring steps. Experiments across two tasks show substantial improvements over strong CRF and LSTM baselines.
liu-etal-2017-capturing
liufly/mecrf
- CoNLL-2003
+ CoNLL 2003
Named Entity Recognition with Stack Residual LSTM and Trainable Bias Decoding
@@ -642,7 +642,7 @@
Recurrent Neural Network models are the state-of-the-art for Named Entity Recognition (NER). We present two innovations to improve the performance of these models. The first innovation is the introduction of residual connections between the Stacked Recurrent Neural Network model to address the degradation problem of deep neural networks. The second innovation is a bias decoding mechanism that allows the trained system to adapt to non-differentiable and externally computed objectives, such as the entity-based F-measure. Our work improves the state-of-the-art results for both Spanish and English languages on the standard train/development/test split of the CoNLL 2003 Shared Task NER dataset.
tran-etal-2017-named
Billion Word Benchmark
- CoNLL-2003
+ CoNLL 2003
Neuramanteau: A Neural Network Ensemble Model for Lexical Blends
@@ -1363,7 +1363,7 @@
I17-2017
We present Segment-level Neural CRF, which combines neural networks with a linear chain CRF for segment-level sequence modeling tasks such as named entity recognition (NER) and syntactic chunking. Our segment-level CRF can consider higher-order label dependencies compared with conventional word-level CRF. Since it is difficult to consider all possible variable length segments, our method uses segment lattice constructed from the word-level tagging model to reduce the search space. Performing experiments on NER and chunking, we demonstrate that our method outperforms conventional word-level CRF with neural networks.
sato-etal-2017-segment
- CoNLL-2003
+ CoNLL 2003
Integrating Vision and Language Datasets to Measure Word Concreteness
diff --git a/data/xml/K15.xml b/data/xml/K15.xml
index 0809387a3d..3375d340b0 100644
--- a/data/xml/K15.xml
+++ b/data/xml/K15.xml
@@ -108,7 +108,7 @@
K15-1009
10.18653/v1/K15-1009
qu-etal-2015-big
- CoNLL-2003
+ CoNLL 2003
Penn Treebank
diff --git a/data/xml/K18.xml b/data/xml/K18.xml
index 943658cb20..276cd871e7 100644
--- a/data/xml/K18.xml
+++ b/data/xml/K18.xml
@@ -121,7 +121,7 @@
Many name tagging approaches use local contextual information with much success, but can fail when the local context is ambiguous or limited. We present a new framework to improve name tagging by utilizing local, document-level, and corpus-level contextual information. For each word, we retrieve document-level context from other sentences within the same document and corpus-level context from sentences in other documents. We propose a model that learns to incorporate document-level and corpus-level contextual information alongside local contextual information via document-level and corpus-level attentions, which dynamically weight their respective contextual information and determines the influence of this information through gating mechanisms. Experiments on benchmark datasets show the effectiveness of our approach, which achieves state-of-the-art results for Dutch, German, and Spanish on the CoNLL-2002 and CoNLL-2003 datasets. We will make our code and pre-trained models publicly available for research purposes.
10.18653/v1/K18-1009
zhang-etal-2018-global
- CoNLL-2003
+ CoNLL 2003
Pervasive Attention: 2D Convolutional Neural Networks for Sequence-to-Sequence Prediction
@@ -567,7 +567,7 @@
Word order is clearly a vital part of human language, but it has been used comparatively lightly in distributional vector models. This paper presents a new method for incorporating word order information into word vector embedding models by combining the benefits of permutation-based order encoding with the more recent method of skip-gram with negative sampling. The new method introduced here is called Embeddings Augmented by Random Permutations (EARP). It operates by applying permutations to the coordinates of context vector representations during the process of training. Results show an 8% improvement in accuracy on the challenging Bigger Analogy Test Set, and smaller but consistent improvements on other analogy reference sets. These findings demonstrate the importance of order-based information in analogical retrieval tasks, and the utility of random permutations as a means to augment neural embeddings.
10.18653/v1/K18-1045
cohen-widdows-2018-bringing
- CoNLL-2003
+ CoNLL 2003
Aggregated Semantic Matching for Short Text Entity Linking
diff --git a/data/xml/K19.xml b/data/xml/K19.xml
index 5ea931631f..2c03a09247 100644
--- a/data/xml/K19.xml
+++ b/data/xml/K19.xml
@@ -637,7 +637,7 @@
10.18653/v1/K19-1048
huang-etal-2019-learning
xhuang28/NewBioNer
- CoNLL-2003
+ CoNLL 2003
NCBI Disease
WebKB
@@ -670,7 +670,7 @@
10.18653/v1/K19-1050
hollenstein-etal-2019-cognival
DS3Lab/cognival
- CoNLL-2003
+ CoNLL 2003
SQuAD
diff --git a/data/xml/N16.xml b/data/xml/N16.xml
index 7712b59ff5..e22f1451e2 100644
--- a/data/xml/N16.xml
+++ b/data/xml/N16.xml
@@ -356,8 +356,8 @@
10.18653/v1/N16-1030
lample-etal-2016-neural
glample/tagger
+ CoNLL 2003
CoNLL++
- CoNLL-2003
Dynamic Feature Induction: The Last Gist to the State-of-the-Art
@@ -1302,7 +1302,7 @@
N16-1118
10.18653/v1/N16-1118
melamud-etal-2016-role
- CoNLL-2003
+ CoNLL 2003
Improve Chinese Word Embeddings by Exploiting Internal Structure
diff --git a/data/xml/N18.xml b/data/xml/N18.xml
index 2fdaa1abe4..e296f4d09b 100644
--- a/data/xml/N18.xml
+++ b/data/xml/N18.xml
@@ -36,7 +36,7 @@
10.18653/v1/N18-1001
wang-etal-2018-label-aware
- CoNLL-2003
+ CoNLL 2003
Weibo NER
@@ -982,7 +982,7 @@
yang-etal-2018-collective
bloomberg/sgtb
AIDA CoNLL-YAGO
- CoNLL-2003
+ CoNLL 2003
DeepAlignment: Unsupervised Ontology Matching with Refined Word Vectors
@@ -1229,8 +1229,8 @@
10.18653/v1/N18-1089
yasunaga-etal-2018-robust
michiyasunaga/pos_adv
+ CoNLL 2003
CoNLL-2000
- CoNLL-2003
Penn Treebank
Universal Dependencies
@@ -1727,7 +1727,7 @@
N18-1127
10.18653/v1/N18-1127
aguilar-etal-2018-modeling
- CoNLL-2003
+ CoNLL 2003
IPM NEL
WNUT 2016 NER
WNUT 2017
@@ -2740,8 +2740,8 @@
peters-etal-2018-deep
ACL ARC
+ CoNLL 2003
CoNLL++
- CoNLL-2003
OntoNotes 5.0
Reddit Corpus
SNLI
diff --git a/data/xml/N19.xml b/data/xml/N19.xml
index bf3723db2e..572ee60952 100644
--- a/data/xml/N19.xml
+++ b/data/xml/N19.xml
@@ -1090,7 +1090,7 @@
jie-etal-2019-better
CoNLL 2002
- CoNLL-2003
+ CoNLL 2003
Event Detection without Triggers
@@ -1131,7 +1131,7 @@
10.18653/v1/N19-1082
qian-etal-2019-graphie
thomas0809/GraphIE
- CoNLL-2003
+ CoNLL 2003
OpenKI: Integrating Open Information Extraction and Knowledge Bases with Relation Inference
@@ -1547,7 +1547,7 @@
liu-etal-2019-linguistic
Billion Word Benchmark
BookCorpus
- CoNLL-2003
+ CoNLL 2003
Mutual Information Maximization for Simple and Accurate Part-Of-Speech Induction
@@ -1624,7 +1624,7 @@
N19-1117
10.18653/v1/N19-1117
liu-etal-2019-knowledge-augmented
- CoNLL-2003
+ CoNLL 2003
WikiText-2
@@ -1854,7 +1854,7 @@
10.18653/v1/N19-1133
guo-etal-2019-star
dmlc/dgl
- CoNLL-2003
+ CoNLL 2003
Penn Treebank
SNLI
SST
@@ -2056,7 +2056,7 @@
10.18653/v1/N19-1149
dai-etal-2019-using
daixiangau/naacl2019-select-pretraining-data-for-ner
- CoNLL-2003
+ CoNLL 2003
Predicting Annotation Difficulty to Improve Task Routing and Model Performance for Biomedical Information Extraction
@@ -3381,7 +3381,7 @@
N19-1249
10.18653/v1/N19-1249
lu-etal-2019-sc
- CoNLL-2003
+ CoNLL 2003
Universal Dependencies
@@ -4588,7 +4588,7 @@
N19-1335
10.18653/v1/N19-1335
tu-gimpel-2019-benchmarking
- CoNLL-2003
+ CoNLL 2003
Evaluating and Enhancing the Robustness of Dialogue Systems: A Case Study on a Negotiation Agent
@@ -4814,7 +4814,7 @@
sasaki-etal-2019-subword
losyer/compact_reconstruction
- CoNLL-2003
+ CoNLL 2003
SNLI
@@ -5763,7 +5763,7 @@
google-research/bert
CPED
CoLA
- CoNLL-2003
+ CoNLL 2003
CoQA
DBpedia
GLUE
@@ -6145,7 +6145,7 @@
N19-2023
10.18653/v1/N19-2023
johnson-etal-2019-cross
- CoNLL-2003
+ CoNLL 2003
Neural Text Normalization with Subword Units
@@ -6642,7 +6642,7 @@
10.18653/v1/N19-4010
akbik-etal-2019-flair
zalandoresearch/flair
- CoNLL-2003
+ CoNLL 2003
IMDb Movie Reviews
diff --git a/data/xml/P14.xml b/data/xml/P14.xml
index 26f1a4edf0..604dccef88 100644
--- a/data/xml/P14.xml
+++ b/data/xml/P14.xml
@@ -2632,7 +2632,7 @@
P14-2076
10.3115/v1/P14-2076
hachey-etal-2014-cheap
- CoNLL-2003
+ CoNLL 2003
Identifying Real-Life Complex Task Names with Task-Intrinsic Entities from Microblogs
@@ -3547,7 +3547,7 @@
P14-5003
10.3115/v1/P14-5003
strakova-etal-2014-open
- CoNLL-2003
+ CoNLL 2003
Community Evaluation and Exchange of Word Vectors at wordvectors.org
diff --git a/data/xml/P15.xml b/data/xml/P15.xml
index 9762601c8e..ce3e59b567 100644
--- a/data/xml/P15.xml
+++ b/data/xml/P15.xml
@@ -171,7 +171,7 @@
P15-1013
10.3115/v1/P15-1013
primadhanty-etal-2015-low
- CoNLL-2003
+ CoNLL 2003
Learning Word Representations by Jointly Modeling Syntagmatic and Paradigmatic Relations
diff --git a/data/xml/P16.xml b/data/xml/P16.xml
index 1f7887b256..b663b30d5b 100644
--- a/data/xml/P16.xml
+++ b/data/xml/P16.xml
@@ -976,6 +976,7 @@
CBT
CNN/Daily Mail
Children's Book Test
+ PTR_NETS
SearchQA
@@ -1138,8 +1139,8 @@
10.18653/v1/P16-1101
ma-hovy-2016-end
+ CoNLL 2003
CoNLL++
- CoNLL-2003
Penn Treebank
@@ -1517,7 +1518,7 @@
10.18653/v1/P16-1134
P16-1134.Notes.pdf
zhuo-etal-2016-segment
- CoNLL-2003
+ CoNLL 2003
RCV1
@@ -2583,7 +2584,7 @@
P16-1228.Notes.pdf
hu-etal-2016-harnessing
- CoNLL-2003
+ CoNLL 2003
GLUE
SST
diff --git a/data/xml/P17.xml b/data/xml/P17.xml
index c8cc80231b..d90faf0384 100644
--- a/data/xml/P17.xml
+++ b/data/xml/P17.xml
@@ -412,7 +412,7 @@ two word-vectors results in a vector that is only a small angle away from the ve
nguyen-etal-2017-aggregating
thanhan/seqcrowd-acl17
- CoNLL-2003
+ CoNLL 2003
Multi-space Variational Encoder-Decoders for Semi-supervised Labeled Sequence Transduction
@@ -2259,7 +2259,7 @@ two word-vectors results in a vector that is only a small angle away from the ve
Pre-trained word embeddings learned from unlabeled text have become a standard component of neural network architectures for NLP tasks. However, in most cases, the recurrent network that operates on word-level representations to produce context sensitive representations is trained on relatively little labeled data. In this paper, we demonstrate a general semi-supervised approach for adding pretrained context embeddings from bidirectional language models to NLP systems and apply it to sequence labeling tasks. We evaluate our model on two standard datasets for named entity recognition (NER) and chunking, and in both cases achieve state of the art results, surpassing previous systems that use other forms of transfer or joint learning with additional labeled data and task specific gazetteers.
peters-etal-2017-semi
- CoNLL-2003
+ CoNLL 2003
Learning Symmetric Collaborative Dialogue Agents with Dynamic Knowledge Graph Embeddings
@@ -2723,7 +2723,7 @@ two word-vectors results in a vector that is only a small angle away from the ve
We propose a sequence labeling framework with a secondary training objective, learning to predict surrounding words for every word in the dataset. This language modeling objective incentivises the system to learn general-purpose patterns of semantic and syntactic composition, which are also useful for improving accuracy on different sequence labeling tasks. The architecture was evaluated on a range of datasets, covering the tasks of error detection in learner texts, named entity recognition, chunking and POS-tagging. The novel language modeling objective provided consistent performance improvements on every benchmark, without requiring any additional annotated or unannotated data.
rei-2017-semi
marekrei/sequence-labeler
- CoNLL-2003
+ CoNLL 2003
FCE
Penn Treebank
diff --git a/data/xml/P18.xml b/data/xml/P18.xml
index fa7bd16dee..c07ccdd80e 100644
--- a/data/xml/P18.xml
+++ b/data/xml/P18.xml
@@ -481,7 +481,7 @@
P18-1030.Poster.pdf
10.18653/v1/P18-1030
zhang-etal-2018-sentence
- CoNLL-2003
+ CoNLL 2003
IMDb Movie Reviews
MR
Penn Treebank
@@ -662,8 +662,8 @@
shen-etal-2018-baseline
dinghanshen/SWEM
AG News
+ CoNLL 2003
CoNLL-2000
- CoNLL-2003
DBpedia
GLUE
MR
@@ -4145,7 +4145,7 @@
P18-2020.Poster.pdf
10.18653/v1/P18-2020
riedl-pado-2018-named
- CoNLL-2003
+ CoNLL 2003
A dataset for identifying actionable feedback in collaborative software development
@@ -4391,7 +4391,7 @@
10.18653/v1/P18-2038
ye-ling-2018-hybrid
ZhixiuYe/HSCRF-pytorch
- CoNLL-2003
+ CoNLL 2003
A Study of the Importance of External Knowledge in the Named Entity Recognition Task
@@ -4407,7 +4407,7 @@
10.18653/v1/P18-2039
seyler-etal-2018-study
CoNLL 2002
- CoNLL-2003
+ CoNLL 2003
YAGO
@@ -6042,7 +6042,7 @@
10.18653/v1/P18-4013
yang-zhang-2018-ncrf
jiesutd/NCRFpp
- CoNLL-2003
+ CoNLL 2003
Penn Treebank
diff --git a/data/xml/P19.xml b/data/xml/P19.xml
index 5dfe89f7c1..f0cd02ccf9 100644
--- a/data/xml/P19.xml
+++ b/data/xml/P19.xml
@@ -1956,7 +1956,7 @@
xia-etal-2019-multi
ACE 2004
ACE 2005
- CoNLL-2003
+ CoNLL 2003
ERNIE: Enhanced Language Representation with Informative Entities
@@ -2132,7 +2132,7 @@
10.18653/v1/P19-1149
zhang-sennrich-2019-lightweight
bzhangGo/lrn
- CoNLL-2003
+ CoNLL 2003
SNLI
SQuAD
WMT 2014
@@ -3342,7 +3342,7 @@
10.18653/v1/P19-1233
liu-etal-2019-gcdt
Adaxry/GCDT
- CoNLL-2003
+ CoNLL 2003
Unsupervised Learning of PCFGs with Normalizing Flow
@@ -3381,7 +3381,7 @@
10.18653/v1/P19-1236
jia-etal-2019-cross
jiachenwestlake/Cross-Domain_NER
- CoNLL-2003
+ CoNLL 2003
Graph-based Dependency Parsing with Graph Neural Networks
@@ -4769,7 +4769,7 @@
10.18653/v1/P19-1336
zhou-etal-2019-dual
CoNLL 2002
- CoNLL-2003
+ CoNLL 2003
Scalable Syntax-Aware Language Models Using Knowledge Distillation
@@ -7434,7 +7434,7 @@
10.18653/v1/P19-1524
liu-etal-2019-towards
lyutyuh/acl19_subtagger
- CoNLL-2003
+ CoNLL 2003
OntoNotes 5.0
@@ -7473,7 +7473,7 @@
ACE 2004
ACE 2005
CoNLL 2002
- CoNLL-2003
+ CoNLL 2003
GENIA
@@ -7539,7 +7539,7 @@
P19-1532
10.18653/v1/P19-1532
liu-etal-2019-prism
- CoNLL-2003
+ CoNLL 2003
Label-Agnostic Sequence Labeling by Copying Nearest Neighbors
@@ -7551,7 +7551,7 @@
10.18653/v1/P19-1533
wiseman-stratos-2019-label
swiseman/neighbor-tagging
- CoNLL-2003
+ CoNLL 2003
Penn Treebank
@@ -8136,7 +8136,7 @@
10.18653/v1/P19-1575
fiacco-etal-2019-deep
- CoNLL-2003
+ CoNLL 2003
MultiNLI
@@ -9599,7 +9599,7 @@
10.18653/v1/P19-2026
martins-etal-2019-joint
AIDA CoNLL-YAGO
- CoNLL-2003
+ CoNLL 2003
Dialogue-Act Prediction of Future Responses Based on Conversation History
diff --git a/data/xml/Q16.xml b/data/xml/Q16.xml
index 01462b2120..39eee80128 100644
--- a/data/xml/Q16.xml
+++ b/data/xml/Q16.xml
@@ -325,7 +325,7 @@
Q16-1026
chiu-nichols-2016-named
- CoNLL-2003
+ CoNLL 2003
DBpedia
OntoNotes 5.0
diff --git a/data/xml/S18.xml b/data/xml/S18.xml
index 432335495d..2ff00102df 100644
--- a/data/xml/S18.xml
+++ b/data/xml/S18.xml
@@ -2573,7 +2573,7 @@
yadav-etal-2018-deep
vikas95/Pref_Suff_Span_NN
CoNLL 2002
- CoNLL-2003
+ CoNLL 2003
Fine-grained Entity Typing through Increased Discourse Context and Adaptive Classification Thresholds
diff --git a/data/xml/U15.xml b/data/xml/U15.xml
index e577008455..cf91c7783e 100644
--- a/data/xml/U15.xml
+++ b/data/xml/U15.xml
@@ -108,7 +108,7 @@
U15-1010
salinas-alvarado-etal-2015-domain
FIN
- CoNLL-2003
+ CoNLL 2003
Do POS Tags Help to Learn Better Morphological Segmentations?
diff --git a/data/xml/W14.xml b/data/xml/W14.xml
index 3b97d86f95..c12b39bb01 100644
--- a/data/xml/W14.xml
+++ b/data/xml/W14.xml
@@ -2516,7 +2516,7 @@
W14-1601
10.3115/v1/W14-1601
sogaard-etal-2014-whats
- CoNLL-2003
+ CoNLL 2003
Domain-Specific Image Captioning
@@ -2595,7 +2595,7 @@
W14-1609
10.3115/v1/W14-1609
passos-etal-2014-lexicon
- CoNLL-2003
+ CoNLL 2003
RCV1
@@ -9315,7 +9315,7 @@
107–112
W14-5117
govind-etal-2014-multiobjective
- CoNLL-2003
+ CoNLL 2003
Improving the accuracy of pronunciation lexicon using Naive Bayes classifier with character n-gram as feature: for language classified pronunciation lexicon generation
diff --git a/data/xml/W16.xml b/data/xml/W16.xml
index 7d2d78dacc..e9da62ed55 100644
--- a/data/xml/W16.xml
+++ b/data/xml/W16.xml
@@ -5054,7 +5054,7 @@
W16-2501
10.18653/v1/W16-2501
chiu-etal-2016-intrinsic
- CoNLL-2003
+ CoNLL 2003
A critique of word similarity as a method for evaluating distributional semantic models
@@ -5085,7 +5085,7 @@
W16-2504
10.18653/v1/W16-2504
nayak-etal-2016-evaluating
- CoNLL-2003
+ CoNLL 2003
Story Cloze Evaluator: Vector Space Representation Evaluation by Predicting What Happens Next
@@ -5525,7 +5525,7 @@
W16-2703
10.18653/v1/W16-2703
jiang-etal-2016-evaluating
- CoNLL-2003
+ CoNLL 2003
German NER with a Multilingual Rule Based Information Extraction System: Analysis and Issues
@@ -5536,7 +5536,7 @@
W16-2704
10.18653/v1/W16-2704
druzhkina-etal-2016-german
- CoNLL-2003
+ CoNLL 2003
Spanish NER with Word Representations and Conditional Random Fields
@@ -12717,7 +12717,7 @@
36–45
W16-6306
akarapu-chowdary-2016-extending
- CoNLL-2003
+ CoNLL 2003
Sentence Based Discourse Classification for Hindi Story Text-to-Speech (TTS) System
@@ -12842,7 +12842,7 @@
154–160
W16-6320
athavale-etal-2016-towards
- CoNLL-2003
+ CoNLL 2003
Vaidya: A Spoken Dialog System for Health Domain
diff --git a/data/xml/W17.xml b/data/xml/W17.xml
index ef40a2194d..9adc994a41 100644
--- a/data/xml/W17.xml
+++ b/data/xml/W17.xml
@@ -361,7 +361,7 @@
87–95
W17-0211
sodergren-nugues-2017-multilingual
- CoNLL-2003
+ CoNLL 2003
Linear Ensembles of Word Embedding Models
@@ -7417,7 +7417,7 @@ is able to handle phenomena related to scope by means of an higher-order type th
10.18653/v1/W17-4114
Recently, neural models have shown superior performance over conventional models in NER tasks. These models use CNN to extract sub-word information along with RNN to predict a tag for each word. However, these models have been tested almost entirely on English texts. It remains unclear whether they perform similarly in other languages. We worked on Japanese NER using neural models and discovered two obstacles of the state-of-the-art model. First, CNN is unsuitable for extracting Japanese sub-word information. Secondly, a model predicting a tag for each word cannot extract an entity when a part of a word composes an entity. The contributions of this work are (1) verifying the effectiveness of the state-of-the-art NER model for Japanese, (2) proposing a neural model for predicting a tag for each character using word and character information. Experimentally obtained results demonstrate that our model outperforms the state-of-the-art neural English NER model in Japanese.
misawa-etal-2017-character
- CoNLL-2003
+ CoNLL 2003
Word Representation Models for Morphologically Rich Languages in Neural Machine Translation
@@ -7815,7 +7815,7 @@ is able to handle phenomena related to scope by means of an higher-order type th
Standard approaches in entity identification hard-code boundary detection and type prediction into labels and perform Viterbi. This has two disadvantages: 1. the runtime complexity grows quadratically in the number of types, and 2. there is no natural segment-level representation. In this paper, we propose a neural architecture that addresses these disadvantages. We frame the problem as multitasking, separating boundary detection and type prediction but optimizing them jointly. Despite its simplicity, this architecture performs competitively with fully structured models such as BiLSTM-CRFs while scaling linearly in the number of types. Furthermore, by construction, the model induces type-disambiguating embeddings of predicted mentions.
stratos-2017-entity
karlstratos/mention2vec
- CoNLL-2003
+ CoNLL 2003
Towards Neural Machine Translation with Latent Tree Attention
@@ -9875,7 +9875,7 @@ is able to handle phenomena related to scope by means of an higher-order type th
10.18653/v1/W17-5004
We investigate the utility of different auxiliary objectives and training strategies within a neural sequence labeling approach to error detection in learner writing. Auxiliary costs provide the model with additional linguistic information, allowing it to learn general-purpose compositional features that can then be exploited for other objectives. Our experiments show that a joint learning approach trained with parallel labels on in-domain data improves performance over the previous best error detection system. While the resulting model has the same number of parameters, the additional objectives allow it to be optimised more efficiently and achieve better performance.
rei-yannakoudakis-2017-auxiliary
- CoNLL-2003
+ CoNLL 2003
FCE
diff --git a/data/xml/W18.xml b/data/xml/W18.xml
index 1aa12821af..157b88b185 100644
--- a/data/xml/W18.xml
+++ b/data/xml/W18.xml
@@ -3929,7 +3929,7 @@
10.18653/v1/W18-2506
pressel-etal-2018-baseline
dpressel/baseline
- CoNLL-2003
+ CoNLL 2003
SST
WNUT 2017
@@ -5513,7 +5513,7 @@
W18-3402
10.18653/v1/W18-3402
hedderich-klakow-2018-training
- CoNLL-2003
+ CoNLL 2003
Multi-task learning for historical text normalization: Size matters
@@ -10960,7 +10960,7 @@
10.18653/v1/W18-5622
tourille-etal-2018-evaluation
strayMat/bio-medical_ner
- CoNLL-2003
+ CoNLL 2003
NCBI Disease Corpus
@@ -11121,7 +11121,7 @@
Slot filling is a crucial task in the Natural Language Understanding (NLU) component of a dialogue system. Most approaches for this task rely solely on the domain-specific datasets for training. We propose a joint model of slot filling and Named Entity Recognition (NER) in a multi-task learning (MTL) setup. Our experiments on three slot filling datasets show that using NER as an auxiliary task improves slot filling performance and achieve competitive performance compared with state-of-the-art. In particular, NER is effective when supervised at the lower layer of the model. For low-resource scenarios, we found that MTL is effective for one dataset.
10.18653/v1/W18-5711
louvan-magnini-2018-exploring
- CoNLL-2003
+ CoNLL 2003
Why are Sequence-to-Sequence Models So Dull? Understanding the Low-Diversity Problem of Chatbots
diff --git a/data/xml/W19.xml b/data/xml/W19.xml
index a9a3a81dd7..ac4e9b9ee8 100644
--- a/data/xml/W19.xml
+++ b/data/xml/W19.xml
@@ -1765,7 +1765,7 @@
W19-1504
10.18653/v1/W19-1504
zupon-etal-2019-lightly
- CoNLL-2003
+ CoNLL 2003
Semi-Supervised Teacher-Student Architecture for Relation Extraction
@@ -2544,7 +2544,7 @@
10.18653/v1/W19-2011
jin-etal-2019-probing
Andy-jqa/bioelmo
- CoNLL-2003
+ CoNLL 2003
SNLI
@@ -6302,7 +6302,7 @@ In this paper, we describe a compression scheme for lexicons when represented as
W19-3711
10.18653/v1/W19-3711
moreno-etal-2019-tlr
- CoNLL-2003
+ CoNLL 2003
Tuning Multilingual Transformers for Language-Specific Named Entity Recognition
@@ -7740,7 +7740,7 @@ One of the references was wrong therefore it is corrected to cite the appropriat
W19-4302
10.18653/v1/W19-4302
peters-etal-2019-tune
- CoNLL-2003
+ CoNLL 2003
GLUE
MRPC
MultiNLI
@@ -10296,7 +10296,7 @@ One of the references was wrong therefore it is corrected to cite the appropriat
W19-4826
10.18653/v1/W19-4826
gralinski-etal-2019-geval
- CoNLL-2003
+ CoNLL 2003
From Balustrades to Pierre Vinken: Looking for Syntax in Transformer Self-Attentions
@@ -13538,7 +13538,7 @@ One of the references was wrong therefore it is corrected to cite the appropriat
40–49
W19-5807
magnolini-etal-2019-use
- CoNLL-2003
+ CoNLL 2003
Learning Household Task Knowledge from WikiHow Descriptions
@@ -14123,7 +14123,6 @@ One of the references was wrong therefore it is corrected to cite the appropriat
W19-5945
10.18653/v1/W19-5945
keizer-etal-2019-user
- skeizer/madrigal
Dialogue Act Classification in Team Communication for Robot Assisted Disaster Response
@@ -14790,7 +14789,7 @@ One of the references was wrong therefore it is corrected to cite the appropriat
W19-6143
plank-2019-neural
bplank/danish_ner_transfer
- CoNLL-2003
+ CoNLL 2003
Universal Dependencies
diff --git a/data/xml/Y18.xml b/data/xml/Y18.xml
index f6d93fb13d..d291f0a233 100644
--- a/data/xml/Y18.xml
+++ b/data/xml/Y18.xml
@@ -511,7 +511,7 @@
AravindhAmaresan
Y18-1061
panchendrarajan-amaresan-2018-bidirectional
- CoNLL-2003
+ CoNLL 2003
SmartWrite: Extracting Chinese Lexical Grammar Patterns Using Dependency Parsing
From 267e7b08709ee5797d4a3fb74bea0a5c9ecf3ae2 Mon Sep 17 00:00:00 2001
From: acl-pwc-bot <94475230+acl-pwc-bot@users.noreply.github.com>
Date: Thu, 23 Nov 2023 02:05:38 +0100
Subject: [PATCH 09/12] Update metadata from Papers with Code
---
data/xml/2020.aacl.xml | 1 +
data/xml/2020.emnlp.xml | 2 +-
data/xml/2021.emnlp.xml | 1 +
data/xml/2021.findings.xml | 1 +
data/xml/2022.coling.xml | 1 +
data/xml/2022.findings.xml | 2 +-
data/xml/2022.lrec.xml | 2 ++
data/xml/N18.xml | 2 +-
data/xml/P19.xml | 2 +-
9 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/data/xml/2020.aacl.xml b/data/xml/2020.aacl.xml
index b6764cb4ec..a9316f53dc 100644
--- a/data/xml/2020.aacl.xml
+++ b/data/xml/2020.aacl.xml
@@ -621,6 +621,7 @@
2020.aacl-main.49
gao-etal-2020-systematic
HQ01/gSCAN_with_language_conditioned_embedding
+ GSCAN
SCAN
diff --git a/data/xml/2020.emnlp.xml b/data/xml/2020.emnlp.xml
index c65befe00a..eb38820ce8 100644
--- a/data/xml/2020.emnlp.xml
+++ b/data/xml/2020.emnlp.xml
@@ -2918,7 +2918,7 @@
10.18653/v1/2020.emnlp-main.190
sen-saffari-2020-models
- amazon-research/qa-dataset-converter
+ amazon-research/qa-dataset-converter
Natural Questions
NewsQA
QuAC
diff --git a/data/xml/2021.emnlp.xml b/data/xml/2021.emnlp.xml
index 569733e119..ffde7ab1ad 100644
--- a/data/xml/2021.emnlp.xml
+++ b/data/xml/2021.emnlp.xml
@@ -2602,6 +2602,7 @@
10.18653/v1/2021.emnlp-main.166
LauraRuis/groundedSCAN
+ GSCAN
SCAN
diff --git a/data/xml/2021.findings.xml b/data/xml/2021.findings.xml
index 17c4dd74f0..49b349f93a 100644
--- a/data/xml/2021.findings.xml
+++ b/data/xml/2021.findings.xml
@@ -6779,6 +6779,7 @@
10.18653/v1/2021.findings-emnlp.21
ylkuo/compositional-gscan
+ GSCAN
An Unsupervised Method for Building Sentence Simplification Corpora in Multiple Languages
diff --git a/data/xml/2022.coling.xml b/data/xml/2022.coling.xml
index 589529b6a3..da7371e668 100644
--- a/data/xml/2022.coling.xml
+++ b/data/xml/2022.coling.xml
@@ -6891,6 +6891,7 @@
Various corrections.
valvoda/neuraltransducer
+ GSCAN
SCAN
diff --git a/data/xml/2022.findings.xml b/data/xml/2022.findings.xml
index 8de9cc7791..099873e70b 100644
--- a/data/xml/2022.findings.xml
+++ b/data/xml/2022.findings.xml
@@ -2726,7 +2726,7 @@
beau-crabbe-2022-impact
10.18653/v1/2022.findings-acl.173
- codegenfact/BertranX
+ codegenfactors/BertranX
CoNaLa
Django
diff --git a/data/xml/2022.lrec.xml b/data/xml/2022.lrec.xml
index 76a4d61ba2..dd630e2344 100644
--- a/data/xml/2022.lrec.xml
+++ b/data/xml/2022.lrec.xml
@@ -768,6 +768,7 @@
Sentiment analysis is one of the most widely studied tasks in natural language processing. While BERT-based models have achieved state-of-the-art results in this task, little attention has been given to its performance variability across class labels, multi-source and multi-domain corpora. In this paper, we present an improved state-of-the-art and comparatively evaluate BERT-based models for sentiment analysis on Italian corpora. The proposed model is evaluated over eight sentiment analysis corpora from different domains (social media, finance, e-commerce, health, travel) and sources (Twitter, YouTube, Facebook, Amazon, Tripadvisor, Opera and Personal Healthcare Agent) on the prediction of positive, negative and neutral classes. Our findings suggest that BERT-based models are confident in predicting positive and negative examples but not as much with neutral examples. We release the sentiment analysis model as well as a new financial-domain sentiment corpus.
2022.lrec-1.62
roccabruna-etal-2022-multi
+ sislab/multi-source-multi-domain-sentiment-analysis-with-bert-based-models
IMDb Movie Reviews
@@ -4152,6 +4153,7 @@
Specialist high-quality information is typically first available in English, and it is written in a language that may be difficult to understand by most readers. While Machine Translation technologies contribute to mitigate the first issue, the translated content will most likely still contain complex language. In order to investigate and address both problems simultaneously, we introduce Simple TICO-19, a new language resource containing manual simplifications of the English and Spanish portions of the TICO-19 corpus for Machine Translation of COVID-19 literature. We provide an in-depth description of the annotation process, which entailed designing an annotation manual and employing four annotators (two native English speakers and two native Spanish speakers) who simplified over 6,000 sentences from the English and Spanish portions of the TICO-19 corpus. We report several statistics on the new dataset, focusing on analysing the improvements in readability from the original texts to their simplified versions. In addition, we propose baseline methodologies for automatically generating the simplifications, translations and joint translation and simplifications contained in our dataset.
2022.lrec-1.331
shardlow-alva-manchego-2022-simple
+ mmu-tdmlab/simpletico19
Building Comparable Corpora for Assessing Multi-Word Term Alignment
diff --git a/data/xml/N18.xml b/data/xml/N18.xml
index e296f4d09b..46656c99f2 100644
--- a/data/xml/N18.xml
+++ b/data/xml/N18.xml
@@ -526,7 +526,7 @@
10.18653/v1/N18-1037
wang-etal-2018-scene
- Yusics/bist-parser
+ Yusics/bist-parser
COCO
Visual Genome
Visual Question Answering
diff --git a/data/xml/P19.xml b/data/xml/P19.xml
index f0cd02ccf9..650fc51fcb 100644
--- a/data/xml/P19.xml
+++ b/data/xml/P19.xml
@@ -9075,7 +9075,7 @@
In the main paper (attached), in Table 3, the row for "Cardinality (soft)" has incorrect values under the NLVR and NLVR2 columns. The respective values should be 16 and 23.6. The value of 16 was reported in Suhr et al. 2017 (P17-2034; the error occurred when I accidentally overwrote the NLVR cell value).
suhr-etal-2019-corpus
- lil-lab/nlvr
+ lil-lab/nlvr
CLEVR
CLEVR-Humans
COCO
From eaf16bf152bf345991c1b12f3de4368cd0cbc9d5 Mon Sep 17 00:00:00 2001
From: Matt Post
Date: Sun, 26 Nov 2023 17:49:54 -0500
Subject: [PATCH 10/12] Add SIGDIAL / INLG DOIs (#2887)
---
data/xml/2020.inlg.xml | 46 ++++++++++++++++++++++++++++
data/xml/2020.sigdial.xml | 41 +++++++++++++++++++++++++
data/xml/2021.inlg.xml | 45 +++++++++++++++++++++++++++
data/xml/2021.sigdial.xml | 61 ++++++++++++++++++++++++++++++++++++-
data/xml/2022.inlg.xml | 37 ++++++++++++++++++----
data/xml/2022.sigdial.xml | 64 +++++++++++++++++++++++++++++++++++++++
data/xml/2023.inlg.xml | 36 ++++++++++++++++++++++
data/xml/2023.sigdial.xml | 60 ++++++++++++++++++++++++++++++++++++
8 files changed, 383 insertions(+), 7 deletions(-)
diff --git a/data/xml/2020.inlg.xml b/data/xml/2020.inlg.xml
index 810809ebb5..9cc7cf2a42 100644
--- a/data/xml/2020.inlg.xml
+++ b/data/xml/2020.inlg.xml
@@ -31,6 +31,7 @@
This paper presents a novel fusion method for integrating an external language model (LM) into the Transformer based sequence-to-sequence (seq2seq) model. While paired data are basically required to train the seq2seq model, the external LM can be trained with only unpaired data. Thus, it is important to leverage memorized knowledge in the external LM for building the seq2seq model, since it is hard to prepare a large amount of paired data. However, the existing fusion methods assume that the LM is integrated with recurrent neural network-based seq2seq models instead of the Transformer. Therefore, this paper proposes a fusion method that can explicitly utilize network structures in the Transformer. The proposed method, called memory attentive fusion, leverages the Transformer-style attention mechanism that repeats source-target attention in a multi-hop manner for reading the memorized knowledge in the LM. Our experiments on two text-style conversion tasks demonstrate that the proposed method performs better than conventional fusion methods.
2020.inlg-1.1
ihori-etal-2020-memory
+ 10.18653/v1/2020.inlg-1.1
Arabic NLG Language Functions
@@ -41,6 +42,7 @@
2020.inlg-1.2
abed-reiter-2020-arabic
waelmohammedabed/natural-language-generation-for-the-arabic-language
+ 10.18653/v1/2020.inlg-1.2
Generating Intelligible Plumitifs Descriptions: Use Case Application with Ethical Considerations
@@ -55,6 +57,7 @@
2020.inlg-1.3
2020.inlg-1.3.Supplementary_Attachment.pdf
beauchemin-etal-2020-generating
+ 10.18653/v1/2020.inlg-1.3
RecipeNLG: A Cooking Recipes Dataset for Semi-Structured Text Generation
@@ -69,6 +72,7 @@
2020.inlg-1.4
bien-etal-2020-recipenlg
RecipeNLG
+ 10.18653/v1/2020.inlg-1.4
Controlled Text Generation with Adversarial Learning
@@ -80,6 +84,7 @@
2020.inlg-1.5
2020.inlg-1.5.Supplementary_Attachment.pdf
betti-etal-2020-controlled
+ 10.18653/v1/2020.inlg-1.5
Studying the Impact of Filling Information Gaps on the Output Quality of Neural Data-to-Text
@@ -93,6 +98,7 @@
nlgcat/adding_data
RotoWire
SportSett
+ 10.18653/v1/2020.inlg-1.6
Improving the Naturalness and Diversity of Referring Expression Generation models using Minimum Risk Training
@@ -105,6 +111,7 @@
panagiaris-etal-2020-improving
COCO
RefCOCO
+ 10.18653/v1/2020.inlg-1.7
Assessing Discourse Relations in Language Generation from GPT-2
@@ -114,6 +121,7 @@
Recent advances in NLP have been attributed to the emergence of large-scale pre-trained language models. GPT-2, in particular, is suited for generation tasks given its left-to-right language modeling objective, yet the linguistic quality of its generated text has largely remained unexplored. Our work takes a step in understanding GPT-2’s outputs in terms of discourse coherence. We perform a comprehensive study on the validity of explicit discourse relations in GPT-2’s outputs under both organic generation and fine-tuned scenarios. Results show GPT-2 does not always generate text containing valid discourse relations; nevertheless, its text is more aligned with human expectation in the fine-tuned scenario. We propose a decoupled strategy to mitigate these problems and highlight the importance of explicitly modeling discourse information.
2020.inlg-1.8
ko-li-2020-assessing
+ 10.18653/v1/2020.inlg-1.8
Data-to-Text Generation with Iterative Text Editing
@@ -125,6 +133,7 @@
2020.inlg-1.9.Supplementary_Attachment.pdf
kasner-dusek-2020-data
kasnerz/d2t_iterative_editing
+ 10.18653/v1/2020.inlg-1.9
The CACAPO Dataset: A Multilingual, Multi-Domain Dataset for Neural Pipeline and End-to-End Data-to-Text Generation
@@ -139,6 +148,7 @@
van-der-lee-etal-2020-cacapo
RotoWire
WebNLG
+ 10.18653/v1/2020.inlg-1.10
Towards Generating Query to Perform Query Focused Abstractive Summarization using Pre-trained Model
@@ -151,6 +161,7 @@
deen-abdullah/QABSBERT
CNN/Daily Mail
NEWSROOM
+ 10.18653/v1/2020.inlg-1.11
SimpleNLG-TI: Adapting SimpleNLG to Tibetan
@@ -161,6 +172,7 @@
Surface realisation is the last but not the least phase of Natural Language Generation, which aims to produce high-quality natural language text based on meaning representations. In this article, we present our work on SimpleNLG-TI, a Tibetan surface realiser, which follows the design paradigm of SimpleNLG-EN. SimpleNLG-TI is built up by our investigation of the core features of Tibetan morphology and syntax. Through this work, we provide a robust and flexible surface realiser for Tibetan generation systems.
2020.inlg-1.12
kuanzhuo-etal-2020-simplenlg
+ 10.18653/v1/2020.inlg-1.12
Machine Translation Pre-training for Data-to-Text Generation - A Case Study in Czech
@@ -170,6 +182,7 @@
While there is a large body of research studying deep learning methods for text generation from structured data, almost all of it focuses purely on English. In this paper, we study the effectiveness of machine translation based pre-training for data-to-text generation in non-English languages. Since the structured data is generally expressed in English, text generation into other languages involves elements of translation, transliteration and copying - elements already encoded in neural machine translation systems. Moreover, since data-to-text corpora are typically small, this task can benefit greatly from pre-training. We conduct experiments on Czech, a morphologically complex language. Results show that machine translation pre-training lets us train end-to-end models that significantly improve upon unsupervised pre-training and linguistically informed pipelined neural systems, as judged by automatic metrics and human evaluation. We also show that this approach enjoys several desirable properties, including improved performance in low data scenarios and applicability to low resource languages.
2020.inlg-1.13
kale-roy-2020-machine
+ 10.18653/v1/2020.inlg-1.13
Text-to-Text Pre-Training for Data-to-Text Tasks
@@ -183,6 +196,7 @@
MultiWOZ
ToTTo
WebNLG
+ 10.18653/v1/2020.inlg-1.14
DaMata: A Robot-Journalist Covering the Brazilian Amazon Deforestation
@@ -197,6 +211,7 @@
2020.inlg-1.15
rosa-teixeira-etal-2020-damata
botsdobem/demo_inpe_covid
+ 10.18653/v1/2020.inlg-1.15
Generating Quantified Referring Expressions through Attention-Driven Incremental Perception
@@ -205,6 +220,7 @@
We model the production of quantified referring expressions (QREs) that identify collections of visual items. A previous approach, called Perceptual Cost Pruning, modeled human QRE production using a preference-based referring expression generation algorithm, first removing facts from the input knowledge base based on a model of perceptual cost. In this paper, we present an alternative model that incrementally constructs a symbolic knowledge base through simulating human visual attention/perception from raw images. We demonstrate that this model produces the same output as Perceptual Cost Pruning. We argue that this is a more extensible approach and a step toward developing a wider range of process-level models of human visual description.
2020.inlg-1.16
briggs-2020-generating
+ 10.18653/v1/2020.inlg-1.16
Rich Syntactic and Semantic Information Helps Unsupervised Text Style Transfer
@@ -216,6 +232,7 @@
2020.inlg-1.17
2020.inlg-1.17.Supplementary_Attachment.pdf
gong-etal-2020-rich
+ 10.18653/v1/2020.inlg-1.17
PARENTing via Model-Agnostic Reinforcement Learning to Correct Pathological Behaviors in Data-to-Text Generation
@@ -229,6 +246,7 @@
rebuffel-etal-2020-parenting
KaijuML/PARENTing-rl
WikiBio
+ 10.18653/v1/2020.inlg-1.18
Evaluating Semantic Accuracy of Data-to-Text Generation with Natural Language Inference
@@ -240,6 +258,7 @@
2020.inlg-1.19.Supplementary_Attachment.pdf
dusek-kasner-2020-evaluating
ufal/nlgi_eval
+ 10.18653/v1/2020.inlg-1.19
Chart-to-Text: Generating Natural Language Descriptions for Charts by Adapting the Transformer Model
@@ -251,6 +270,7 @@
obeid-hoque-2020-chart
JasonObeid/Chart2Text
Chart2Text
+ 10.18653/v1/2020.inlg-1.20
Market Comment Generation from Data with Noisy Alignments
@@ -264,6 +284,7 @@
End-to-end models on data-to-text learn the mapping of data and text from the aligned pairs in the dataset. However, these alignments are not always obtained reliably, especially for the time-series data, for which real time comments are given to some situation and there might be a delay in the comment delivery time compared to the actual event time. To handle this issue of possible noisy alignments in the dataset, we propose a neural network model with multi-timestep data and a copy mechanism, which allows the models to learn the correspondences between data and text from the dataset with noisier alignments. We focus on generating market comments in Japanese that are delivered each time an event occurs in the market. The core idea of our approach is to utilize multi-timestep data, which is not only the latest market price data when the comment is delivered, but also the data obtained at several timesteps earlier. On top of this, we employ a copy mechanism that is suitable for referring to the content of data records in the market price data. We confirm the superiority of our proposal by two evaluation metrics and show the accuracy improvement of the sentence generation using the time series data by our proposed method.
2020.inlg-1.21
hamazono-etal-2020-market
+ 10.18653/v1/2020.inlg-1.21
A Gold Standard Methodology for Evaluating Accuracy in Data-To-Text Systems
@@ -274,6 +295,7 @@
2020.inlg-1.22
thomson-reiter-2020-gold
nlgcat/evaluating_accuracy
+ 10.18653/v1/2020.inlg-1.22
Twenty Years of Confusion in Human Evaluation: NLG Needs Evaluation Sheets and Standardised Definitions
@@ -291,6 +313,7 @@
Human assessment remains the most trusted form of evaluation in NLG, but highly diverse approaches and a proliferation of different quality criteria used by researchers make it difficult to compare results and draw conclusions across papers, with adverse implications for meta-evaluation and reproducibility. In this paper, we present (i) our dataset of 165 NLG papers with human evaluations, (ii) the annotation scheme we developed to label the papers for different aspects of evaluations, (iii) quantitative analyses of the annotations, and (iv) a set of recommendations for improving standards in evaluation reporting. We use the annotations as a basis for examining information included in evaluation reports, and levels of consistency in approaches, experimental design and terminology, focusing in particular on the 200+ different terms that have been used for evaluated aspects of quality. We conclude that due to a pervasive lack of clarity in reports and extreme diversity in approaches, human evaluation in NLG presents as extremely confused in 2020, and that the field is in urgent need of standard methods and terminology.
2020.inlg-1.23
howcroft-etal-2020-twenty
+ 10.18653/v1/2020.inlg-1.23
Disentangling the Properties of Human Evaluation Methods: A Classification System to Support Comparability, Meta-Evaluation and Reproducibility Testing
@@ -303,6 +326,7 @@
Extended the Acknowledgments section.
belz-etal-2020-disentangling
+ 10.18653/v1/2020.inlg-1.24
Stable Style Transformer: Delete and Generate Approach with Encoder-Decoder for Text Style Transfer
@@ -311,6 +335,7 @@
Text style transfer is the task that generates a sentence by preserving the content of the input sentence and transferring the style. Most existing studies are progressing on non-parallel datasets because parallel datasets are limited and hard to construct. In this work, we introduce a method that follows two stages in non-parallel datasets. The first stage is to delete attribute markers of a sentence directly through a classifier. The second stage is to generate a transferred sentence by combining the content tokens and the target style. We experiment on two benchmark datasets and evaluate context, style, fluency, and semantic. It is difficult to select the best system using only these automatic metrics, but it is possible to select stable systems. We consider only robust systems in all automatic evaluation metrics to be the minimum conditions that can be used in real applications. Many previous systems are difficult to use in certain situations because performance is significantly lower in several evaluation metrics. However, our system is stable in all automatic evaluation metrics and has results comparable to other models. Also, we compare the performance results of our system and the unstable system through human evaluation.
2020.inlg-1.25
lee-2020-stable
+ 10.18653/v1/2020.inlg-1.25
Listener’s Social Identity Matters in Personalised Response Generation
@@ -321,6 +346,7 @@
Personalised response generation enables generating human-like responses by means of assigning the generator a social identity. However, pragmatics theory suggests that human beings adjust the way of speaking based on not only who they are but also whom they are talking to. In other words, when modelling personalised dialogues, it might be favourable if we also take the listener’s social identity into consideration. To validate this idea, we use gender as a typical example of a social variable to investigate how the listener’s identity influences the language used in Chinese dialogues on social media. Also, we build personalised generators. The experiment results demonstrate that the listener’s identity indeed matters in the language use of responses and that the response generator can capture such differences in language use. More interestingly, by additionally modelling the listener’s identity, the personalised response generator performs better in its own identity.
2020.inlg-1.26
chen-etal-2020-listeners
+ 10.18653/v1/2020.inlg-1.26
Understanding and Explicitly Measuring Linguistic and Stylistic Properties of Deception via Generation and Translation
@@ -331,6 +357,7 @@
Massive digital disinformation is one of the main risks of modern society. Hundreds of models and linguistic analyses have been done to compare and contrast misleading and credible content online. However, most models do not remove the confounding factor of a topic or narrative when training, so the resulting models learn a clear topical separation for misleading versus credible content. We study the feasibility of using two strategies to disentangle the topic bias from the models to understand and explicitly measure linguistic and stylistic properties of content from misleading versus credible content. First, we develop conditional generative models to create news content that is characteristic of different credibility levels. We perform multi-dimensional evaluation of model performance on mimicking both the style and linguistic differences that distinguish news of different credibility using machine translation metrics and classification models. We show that even though generative models are able to imitate both the style and language of the original content, additional conditioning on both the news category and the topic leads to reduced performance. In a second approach, we perform deception style “transfer” by translating deceptive content into the style of credible content and vice versa. Extending earlier studies, we demonstrate that, when conditioned on a topic, deceptive content is shorter, less readable, more biased, and more subjective than credible content, and transferring the style from deceptive to credible content is more challenging than the opposite direction.
2020.inlg-1.27
saldanha-etal-2020-understanding
+ 10.18653/v1/2020.inlg-1.27
Shared Task on Evaluating Accuracy
@@ -340,6 +367,7 @@
We propose a shared task on methodologies and algorithms for evaluating the accuracy of generated texts, specifically summaries of basketball games produced from basketball box score and other game data. We welcome submissions based on protocols for human evaluation, automatic metrics, as well as combinations of human evaluations and metrics.
2020.inlg-1.28
reiter-thomson-2020-shared
+ 10.18653/v1/2020.inlg-1.28
ReproGen: Proposal for a Shared Task on Reproducibility of Human Evaluations in NLG
@@ -351,6 +379,7 @@
Across NLP, a growing body of work is looking at the issue of reproducibility. However, replicability of human evaluation experiments and reproducibility of their results is currently under-addressed, and this is of particular concern for NLG where human evaluations are the norm. This paper outlines our ideas for a shared task on reproducibility of human evaluations in NLG which aims (i) to shed light on the extent to which past NLG evaluations are replicable and reproducible, and (ii) to draw conclusions regarding how evaluations can be designed and reported to increase replicability and reproducibility. If the task is run over several years, we hope to be able to document an overall increase in levels of replicability and reproducibility over time.
2020.inlg-1.29
belz-etal-2020-reprogen
+ 10.18653/v1/2020.inlg-1.29
Task Proposal: Abstractive Snippet Generation for Web Pages
@@ -364,6 +393,7 @@
We propose a shared task on abstractive snippet generation for web pages, a novel task of generating query-biased abstractive summaries for documents that are to be shown on a search results page. Conventional snippets are extractive in nature, which recently gave rise to copyright claims from news publishers as well as a new copyright legislation being passed in the European Union, limiting the fair use of web page contents for snippets. At the same time, abstractive summarization has matured considerably in recent years, potentially allowing for more personalization of snippets in the future. Taken together, these facts render further research into generating abstractive snippets both timely and promising.
2020.inlg-1.30
syed-etal-2020-task
+ 10.18653/v1/2020.inlg-1.30
BERT-Based Simplification of Japanese Sentence-Ending Predicates in Descriptive Text
@@ -374,6 +404,7 @@
Japanese sentence-ending predicates intricately combine content words and functional elements, such as aspect, modality, and honorifics; this can often hinder the understanding of language learners and children. Conventional lexical simplification methods, which replace difficult target words with simpler synonyms acquired from lexical resources in a word-by-word manner, are not always suitable for the simplification of such Japanese predicates. Given this situation, we propose a BERT-based simplification method, the core feature of which is the high ability to substitute the whole predicates with simple ones while maintaining their core meanings in the context by utilizing pre-trained masked language models. Experimental results showed that our proposed methods consistently outperformed the conventional thesaurus-based method by a wide margin. Furthermore, we investigated in detail the effectiveness of the average token embedding and dropout, and the remaining errors of our BERT-based methods.
2020.inlg-1.31
kato-etal-2020-bert
+ 10.18653/v1/2020.inlg-1.31
Amplifying the Range of News Stories with Creativity: Methods and their Evaluation, in Portuguese
@@ -383,6 +414,7 @@
Headlines are key for attracting people to a story, but writing appealing headlines requires time and talent. This work aims to automate the production of creative short texts (e.g., news headlines) for an input context (e.g., existing headlines), thus amplifying its range. Well-known expressions (e.g., proverbs, movie titles), which typically include word-play and resort to figurative language, are used as a starting point. Given an input text, they can be recommended by exploiting Semantic Textual Similarity (STS) techniques, or adapted towards higher relatedness. For the latter, three methods that exploit static word embeddings are proposed. Experimentation in Portuguese led to some conclusions, based on human opinions: STS methods that look exclusively at the surface text recommend more related expressions; resulting expressions are somewhat related to the input, but adaptation leads to higher relatedness and novelty; humour can be an indirect consequence, but most outputs are not funny.
2020.inlg-1.32
mendes-goncalo-oliveira-2020-amplifying
+ 10.18653/v1/2020.inlg-1.32
Lessons from Computational Modelling of Reference Production in Mandarin and English
@@ -395,6 +427,7 @@
Adds the definition of the "real over-specification".
chen-van-deemter-2020-lessons
+ 10.18653/v1/2020.inlg-1.33
Generating Varied Training Corpora in Runyankore Using a Combined Semantic and Syntactic, Pattern-Grammar-based Approach
@@ -403,6 +436,7 @@
Machine learning algorithms have been applied to achieve high levels of accuracy in tasks associated with the processing of natural language. However, these algorithms require large amounts of training data in order to perform efficiently. Since most Bantu languages lack the required training corpora because they are computationally under-resourced, we investigated how to generate a large varied training corpus in Runyankore, a Bantu language indigenous to Uganda. We found the use of a combined semantic and syntactic, pattern and grammar-based approach to be applicable to this purpose, and used it to generate one million sentences, both labelled and unlabelled, which can be applied as training data for machine learning algorithms. The generated text was evaluated in two ways: (1) assessing the semantics encoded in word embeddings obtained from the generated text, which showed correct word similarity; and (2) applying the labelled data to tasks such as sentiment analysis, which achieved satisfactory levels of accuracy.
2020.inlg-1.34
byamugisha-2020-generating
+ 10.18653/v1/2020.inlg-1.34
Schema-Guided Natural Language Generation
@@ -420,6 +454,7 @@
du-etal-2020-schema
alexa/schema-guided-nlg
SG-NLG
+ 10.18653/v1/2020.inlg-1.35
OMEGA : A probabilistic approach to referring expression generation in a virtual environment
@@ -428,6 +463,7 @@
In recent years, referring expression generation algorithms were inspired by game theory and probability theory. In this paper, an algorithm is designed for the generation of referring expressions (REG) that bases on both models by integrating maximization of utilities into the content determination process. It implements cognitive models for assessing visual salience of objects and additional features. In order to evaluate the algorithm properly and validate the applicability of existing models and evaluative information criteria, both production and comprehension studies are conducted using a complex domain of objects, providing new directions of approaching the evaluation of REG algorithms.
2020.inlg-1.36
langner-2020-omega
+ 10.18653/v1/2020.inlg-1.36
Neural NLG for Methodius: From RST Meaning Representations to Texts
@@ -443,6 +479,7 @@
stevens-guille-etal-2020-neural
methodius-project/neural-methodius
WebNLG
+ 10.18653/v1/2020.inlg-1.37
From “Before” to “After”: Generating Natural Language Instructions from Image Pairs in a Simple Visual Domain
@@ -459,6 +496,7 @@
References to appendix were corrected as they were broken (e.g. pages 318, 319 "...Appendix ??" changed to "...Appendix [A,B,...]")
rojowiec-etal-2020-generating
+ 10.18653/v1/2020.inlg-1.38
What BERT Sees: Cross-Modal Transfer for Visual Question Generation
@@ -476,6 +514,7 @@
COCO
VQG
Visual Question Answering
+ 10.18653/v1/2020.inlg-1.39
When an Image Tells a Story: The Role of Visual and Semantic Information for Generating Paragraph Descriptions
@@ -487,6 +526,7 @@
ilinykh-dobnik-2020-image
Image Description Sequences
Image Paragraph Captioning
+ 10.18653/v1/2020.inlg-1.40
Transformer based Natural Language Generation for Question-Answering
@@ -498,6 +538,7 @@
2020.inlg-1.41
akermi-etal-2020-tansformer
Universal Dependencies
+ 10.18653/v1/2020.inlg-1.41
Rapformer: Conditional Rap Lyrics Generation with Denoising Autoencoders
@@ -510,6 +551,7 @@
2020.inlg-1.42
2020.inlg-1.42.Supplementary_Attachment.zip
nikolov-etal-2020-rapformer
+ 10.18653/v1/2020.inlg-1.42
Reducing Non-Normative Text Generation from Language Models
@@ -522,6 +564,7 @@
2020.inlg-1.43
peng-etal-2020-reducing
ROCStories
+ 10.18653/v1/2020.inlg-1.43
ReviewRobot: Explainable Paper Review Generation based on Knowledge Synthesis
@@ -538,6 +581,7 @@
EagleW/ReviewRobot
ReviewRobot Dataset
PeerRead
+ 10.18653/v1/2020.inlg-1.44
Gradations of Error Severity in Automatic Image Descriptions
@@ -553,6 +597,7 @@
2020.inlg-1.45
2020.inlg-1.45.Supplementary_Attachment.zip
van-miltenburg-etal-2020-gradations
+ 10.18653/v1/2020.inlg-1.45
Policy-Driven Neural Response Generation for Knowledge-Grounded Dialog Systems
@@ -566,6 +611,7 @@
Open-domain dialog systems aim to generate relevant, informative and engaging responses. In this paper, we propose using a dialog policy to plan the content and style of target, open-domain responses in the form of an action plan, which includes knowledge sentences related to the dialog context, targeted dialog acts, topic information, etc. For training, the attributes within the action plan are obtained by automatically annotating the publicly released Topical-Chat dataset. We condition neural response generators on the action plan, which is then realized as target utterances at the turn and sentence levels. We also investigate different dialog policy models to predict an action plan given the dialog context. Through automated and human evaluation, we measure the appropriateness of the generated responses and check if the generation models indeed learn to realize the given action plans. We demonstrate that a basic dialog policy that operates at the sentence level generates better responses in comparison to turn-level generation as well as baseline models with no action plan. Additionally, the basic dialog policy has the added benefit of controllability.
2020.inlg-1.46
hedayatnia-etal-2020-policy
+ 10.18653/v1/2020.inlg-1.46
diff --git a/data/xml/2020.sigdial.xml b/data/xml/2020.sigdial.xml
index 312548e318..a613544d1b 100644
--- a/data/xml/2020.sigdial.xml
+++ b/data/xml/2020.sigdial.xml
@@ -32,6 +32,7 @@
2020.sigdial-1.1
hsueh-ma-2020-semantic
+ 10.18653/v1/2020.sigdial-1.1
Counseling-Style Reflection Generation Using Generative Pretrained Transformers with Augmented Context
@@ -44,6 +45,7 @@
2020.sigdial-1.2
shen-etal-2020-counseling
+ 10.18653/v1/2020.sigdial-1.2
Learning from Mistakes: Combining Ontologies via Self-Training for Dialogue Generation
@@ -58,6 +60,7 @@
reed-etal-2020-learning
E2E
+ 10.18653/v1/2020.sigdial-1.3
TripPy: A Triple Copy Strategy for Value Independent Neural Dialog State Tracking
@@ -74,6 +77,7 @@
heck-etal-2020-trippy
MultiWOZ
+ 10.18653/v1/2020.sigdial-1.4
Conversational Agents for Intelligent Buildings
@@ -86,6 +90,7 @@
2020.sigdial-1.5
sieinska-etal-2020-conversational
+ 10.18653/v1/2020.sigdial-1.5
Retico: An incremental framework for spoken dialogue systems
@@ -95,6 +100,7 @@
2020.sigdial-1.6
michael-2020-retico
+ 10.18653/v1/2020.sigdial-1.6
MC-Saar-Instruct: a Platform for Minecraft Instruction Giving Agents
@@ -109,6 +115,7 @@
2020.sigdial-1.7
kohn-etal-2020-mc
+ 10.18653/v1/2020.sigdial-1.7
ConvoKit: A Toolkit for the Analysis of Conversations
@@ -124,6 +131,7 @@
chang-etal-2020-convokit
CornellNLP/Cornell-Conversational-Analysis-Toolkit
+ 10.18653/v1/2020.sigdial-1.8
Commonsense Evidence Generation and Injection in Reading Comprehension
@@ -139,6 +147,7 @@
CoS-E
CommonsenseQA
ConceptNet
+ 10.18653/v1/2020.sigdial-1.9
Identifying Collaborative Conversations using Latent Discourse Behaviors
@@ -151,6 +160,7 @@
In this work, we study collaborative online conversations. Such conversations are rich in content, constructive and motivated by a shared goal. Automatically identifying such conversations requires modeling complex discourse behaviors, which characterize the flow of information, sentiment and community structure within discussions. To help capture these behaviors, we define a hybrid relational model in which relevant discourse behaviors are formulated as discrete latent variables and scored using neural networks. These variables provide the information needed for predicting the overall collaborative characterization of the entire conversational thread. We show that adding inductive bias in the form of latent variables results in performance improvement, while providing a natural way to explain the decision.
2020.sigdial-1.10
jain-etal-2020-identifying
+ 10.18653/v1/2020.sigdial-1.10
A Case Study of User Communication Styles with Customer Service Agents versus Intelligent Virtual Agents
@@ -161,6 +171,7 @@
2020.sigdial-1.11
hewitt-beaver-2020-case
+ 10.18653/v1/2020.sigdial-1.11
It’s About Time: Turn-Entry Timing For Situated Human-Robot Dialogue
@@ -173,6 +184,7 @@
2020.sigdial-1.12
gervits-etal-2020-time
+ 10.18653/v1/2020.sigdial-1.12
Learning Word Groundings from Humans Facilitated by Robot Emotional Displays
@@ -184,6 +196,7 @@
mcneill-kennington-2020-learning
ImageNet
+ 10.18653/v1/2020.sigdial-1.13
Learning and Reasoning for Robot Dialog and Navigation Tasks
@@ -198,6 +211,7 @@
A sponsor was removed from the Acknowledgments section.
lu-etal-2020-learning
+ 10.18653/v1/2020.sigdial-1.14
An Attentive Listening System with Android ERICA: Comparison of Autonomous and WOZ Interactions
@@ -212,6 +226,7 @@
2020.sigdial-1.15
inoue-etal-2020-attentive
+ 10.18653/v1/2020.sigdial-1.15
A Spoken Dialogue System for Spatial Question Answering in a Physical Blocks World
@@ -225,6 +240,7 @@
platonov-etal-2020-spoken
CLEVR
+ 10.18653/v1/2020.sigdial-1.16
rrSDS: Towards a Robot-ready Spoken Dialogue System
@@ -237,6 +253,7 @@
Spoken interaction with a physical robot requires a dialogue system that is modular, multimodal, distributive, incremental and temporally aligned. In this demo paper, we make significant contributions towards fulfilling these requirements by expanding upon the ReTiCo incremental framework. We outline the incremental and multimodal modules and how their computation can be distributed. We demonstrate the power and flexibility of our robot-ready spoken dialogue system to be integrated with almost any robot.
2020.sigdial-1.17
kennington-etal-2020-rrsds
+ 10.18653/v1/2020.sigdial-1.17
Discovering Knowledge Graph Schema from Short Natural Language Text via Dialog
@@ -249,6 +266,7 @@
2020.sigdial-1.18
ghosh-etal-2020-discovering
+ 10.18653/v1/2020.sigdial-1.18
User Impressions of Questions to Acquire Lexical Knowledge
@@ -259,6 +277,7 @@
2020.sigdial-1.19
komatani-nakano-2020-user
+ 10.18653/v1/2020.sigdial-1.19
Simulating Turn-Taking in Conversations with Delayed Transmission
@@ -269,6 +288,7 @@
2020.sigdial-1.20
michael-moller-2020-simulating
+ 10.18653/v1/2020.sigdial-1.20
Is this Dialogue Coherent? Learning from Dialogue Acts and Entities
@@ -280,6 +300,7 @@
cervone-riccardi-2020-dialogue
alecervi/switchboard-coherence-corpus
+ 10.18653/v1/2020.sigdial-1.21
Analyzing Speaker Strategy in Referential Communication
@@ -290,6 +311,7 @@
2020.sigdial-1.22
mcmahan-stone-2020-analyzing
+ 10.18653/v1/2020.sigdial-1.22
Contextualized Emotion Recognition in Conversation as Sequence Tagging
@@ -306,6 +328,7 @@
DailyDialog
IEMOCAP
MELD
+ 10.18653/v1/2020.sigdial-1.23
How Self-Attention Improves Rare Class Performance in a Question-Answering Dialogue Agent
@@ -317,6 +340,7 @@
2020.sigdial-1.24
stiff-etal-2020-self
+ 10.18653/v1/2020.sigdial-1.24
Filtering conversations through dialogue acts labels for improving corpus-based convergence studies
@@ -328,6 +352,7 @@
2020.sigdial-1.25
fuscone-etal-2020-filtering
+ 10.18653/v1/2020.sigdial-1.25
Nontrivial Lexical Convergence in a Geography-Themed Game
@@ -339,6 +364,7 @@
The present study aims to examine the prevalent notion that people entrain to the vocabulary of a dialogue system. Although previous research shows that people will replace their choice of words with simple substitutes, studies using more challenging substitutions are sparse. In this paper, we investigate whether people adapt their speech to the vocabulary of a dialogue system when the system’s suggested words are not direct synonyms. 32 participants played a geography-themed game with a remote-controlled agent and were primed by referencing strategies (rather than individual terms) introduced in follow-up questions. Our results suggest that context-appropriate substitutes support convergence and that the convergence has a lasting effect within a dialogue session if the system’s wording is more consistent with the norms of the domain than the original wording of the speaker.
2020.sigdial-1.26
bergqvist-etal-2020-nontrivial
+ 10.18653/v1/2020.sigdial-1.26
A unifying framework for modeling acoustic/prosodic entrainment: definition and evaluation on two large corpora
@@ -351,6 +377,7 @@
2020.sigdial-1.27
galvez-etal-2020-unifying
+ 10.18653/v1/2020.sigdial-1.27
Unsupervised Evaluation of Interactive Dialog with DialoGPT
@@ -363,6 +390,7 @@
mehri-eskenazi-2020-unsupervised
shikib/fed
FED
+ 10.18653/v1/2020.sigdial-1.28
Towards Unified Dialogue System Evaluation: A Comprehensive Analysis of Current Evaluation Protocols
@@ -373,6 +401,7 @@
2020.sigdial-1.29
finch-choi-2020-towards
+ 10.18653/v1/2020.sigdial-1.29
Human-Human Health Coaching via Text Messages: Corpus, Annotation, and Analysis
@@ -390,6 +419,7 @@
2020.sigdial-1.30
gupta-etal-2020-human
+ 10.18653/v1/2020.sigdial-1.30
Agent-Based Dynamic Collaboration Support in a Smart Office Space
@@ -402,6 +432,7 @@
2020.sigdial-1.31
wang-etal-2020-agent
+ 10.18653/v1/2020.sigdial-1.31
Emora STDM: A Versatile Framework for Innovative Dialogue System Development
@@ -413,6 +444,7 @@
finch-choi-2020-emora
emora-chat/emora_stdm
+ 10.18653/v1/2020.sigdial-1.32
Boosting Naturalness of Language in Task-oriented Dialogues via Adversarial Training
@@ -422,6 +454,7 @@
2020.sigdial-1.33
zhu-2020-boosting
+ 10.18653/v1/2020.sigdial-1.33
A Sequence-to-sequence Approach for Numerical Slot-filling Dialog Systems
@@ -431,6 +464,7 @@
2020.sigdial-1.34
shi-2020-sequence
+ 10.18653/v1/2020.sigdial-1.34
Beyond Domain APIs: Task-oriented Conversational Modeling with Unstructured Knowledge Access
@@ -446,6 +480,7 @@
kim-etal-2020-beyond
+ 10.18653/v1/2020.sigdial-1.35
Multi-Action Dialog Policy Learning with Interactive Human Teaching
@@ -457,6 +492,7 @@
2020.sigdial-1.36
jhunjhunwala-etal-2020-multi
+ 10.18653/v1/2020.sigdial-1.36
Is Your Goal-Oriented Dialog Model Performing Really Well? Empirical Analysis of System-wise Evaluation
@@ -471,6 +507,7 @@
2020.sigdial-1.37
takanobu-etal-2020-goal
+ 10.18653/v1/2020.sigdial-1.37
Similarity Scoring for Dialogue Behaviour Comparison
@@ -481,6 +518,7 @@
2020.sigdial-1.38
ultes-maier-2020-similarity
+ 10.18653/v1/2020.sigdial-1.38
Collection and Analysis of Dialogues Provided by Two Speakers Acting as One
@@ -496,6 +534,7 @@
2020.sigdial-1.39
arimoto-etal-2020-collection
+ 10.18653/v1/2020.sigdial-1.39
Adaptive Dialog Policy Learning with Hindsight and User Modeling
@@ -508,6 +547,7 @@
2020.sigdial-1.40
cao-etal-2020-adaptive
+ 10.18653/v1/2020.sigdial-1.40
Dialogue Policies for Learning Board Games through Multimodal Communication
@@ -522,6 +562,7 @@
2020.sigdial-1.41
zare-etal-2020-dialogue
+ 10.18653/v1/2020.sigdial-1.41
diff --git a/data/xml/2021.inlg.xml b/data/xml/2021.inlg.xml
index fb27ffc488..0e964de256 100644
--- a/data/xml/2021.inlg.xml
+++ b/data/xml/2021.inlg.xml
@@ -29,6 +29,7 @@
han-etal-2021-generating
Jiuzhouh/Multi-Score
WebNLG
+ 10.18653/v1/2021.inlg-1.1
Neural Methodius Revisited: Do Discourse Relations Help with Pre-Trained Models Too?
@@ -41,6 +42,7 @@
2021.inlg-1.2
maskharashvili-etal-2021-neural
aleksadre/methodiusneuralinlg2021
+ 10.18653/v1/2021.inlg-1.2
Exploring Input Representation Granularity for Generating Questions Satisfying Question-Answer Congruence
@@ -54,6 +56,7 @@
2021.inlg-1.3
kannan-etal-2021-exploring
SQuAD
+ 10.18653/v1/2021.inlg-1.3
Towards Zero-Shot Multilingual Synthetic Question and Answer Generation for Cross-Lingual Reading Comprehension
@@ -71,6 +74,7 @@
TyDi QA
XQuAD
mC4
+ 10.18653/v1/2021.inlg-1.4
Chefbot: A Novel Framework for the Generation of Commonsense-enhanced Responses for Task-based Dialogue Systems
@@ -80,6 +84,7 @@
Conversational systems aim to generate responses that are accurate, relevant and engaging, either through utilising neural end-to-end models or through slot filling. Human-to-human conversations are enhanced by not only the latest utterance of the interlocutor, but also by recalling relevant information about concepts/objects covered in the dialogue and integrating them into their responses. Such information may contain recent referred concepts, commonsense knowledge and more. A concrete scenario of such dialogues is the cooking scenario, i.e. when an artificial agent (personal assistant, robot, chatbot) and a human converse about a recipe. We will demo a novel system for commonsense enhanced response generation in the scenario of cooking, where the conversational system is able to not only provide directions for cooking step-by-step, but also display commonsense capabilities by offering explanations of how objects can be used and provide recommendations for replacing ingredients.
2021.inlg-1.5
strathearn-gkatzia-2021-chefbot
+ 10.18653/v1/2021.inlg-1.5
Predicting Antonyms in Context using BERT
@@ -91,6 +96,7 @@
2021.inlg-1.6
niwa-etal-2021-predicting
SemEval-2018 Task 9: Hypernym Discovery
+ 10.18653/v1/2021.inlg-1.6
Examining Covert Gender Bias: A Case Study in Turkish and English Machine Translation Models
@@ -102,6 +108,7 @@
2021.inlg-1.7
ciora-etal-2021-examining
NurIren/Gender-Bias-in-TR-to-EN-MT-Models
+ 10.18653/v1/2021.inlg-1.7
WeaSuL: Weakly Supervised Dialogue Policy Learning: Reward Estimation for Multi-turn Dialogue
@@ -110,6 +117,7 @@
An intelligent dialogue system in a multi-turn setting should not only generate responses of good quality, but should also generate responses that can lead to the long-term success of the dialogue. Although current approaches have improved response quality, they overlook the training signals present in the dialogue data. We can leverage these signals to generate weakly supervised training data for learning a dialog policy and reward estimator, and make the policy take actions (generate responses) that can foresee the future direction of a successful (rewarding) conversation. We simulate the dialogue between an agent and a user (modelled similarly to an agent with a supervised learning objective) to interact with each other. The agent uses dynamic blocking to generate ranked diverse responses and exploration-exploitation to select among the top-K responses. Each simulated state-action pair is evaluated (serving as a weak annotation) with three quality modules: Semantic Relevant, Semantic Coherence and Consistent Flow. Empirical studies with two benchmarks indicate that our model can significantly outperform baselines on response quality and lead to a successful conversation under both automatic evaluation and human judgment.
2021.inlg-1.8
khandelwal-2021-weasul-weakly
+ 10.18653/v1/2021.inlg-1.8
Multi-Sentence Knowledge Selection in Open-Domain Dialogue
@@ -126,6 +134,7 @@
eric-etal-2021-multi
alexa/wow-plus-plus
Wizard of Wikipedia
+ 10.18653/v1/2021.inlg-1.9
Self-Training for Compositional Neural NLG in Task-Oriented Dialogue
@@ -138,6 +147,7 @@
2021.inlg-1.10
li-etal-2021-self
znculee/treenlg-bart
+ 10.18653/v1/2021.inlg-1.10
Generating Racing Game Commentary from Vision, Language, and Structured Data
@@ -152,6 +162,7 @@
We propose the task of automatically generating commentaries for races in a motor racing game, from vision, structured numerical, and textual data. Commentaries provide information to support spectators in understanding events in races. Commentary generation models need to interpret the race situation and generate the correct content at the right moment. We divide the task into two subtasks: utterance timing identification and utterance generation. Because existing datasets do not have such alignments of data in multiple modalities, this setting has not been explored in depth. In this study, we introduce a new large-scale dataset that contains aligned video data, structured numerical data, and transcribed commentaries that consist of 129,226 utterances in 1,389 races in a game. Our analysis reveals that the characteristics of commentaries change over time or from viewpoints. Our experiments on the subtasks show that it is still challenging for a state-of-the-art vision encoder to capture useful information from videos to generate accurate commentaries. We make the dataset and baseline implementation publicly available for further research.
2021.inlg-1.11
ishigaki-etal-2021-generating
+ 10.18653/v1/2021.inlg-1.11
Explaining Decision-Tree Predictions by Addressing Potential Conflicts between Predictions and Plausible Expectations
@@ -163,6 +174,7 @@
We offer an approach to explain Decision Tree (DT) predictions by addressing potential conflicts between aspects of these predictions and plausible expectations licensed by background information. We define four types of conflicts, operationalize their identification, and specify explanatory schemas that address them. Our human evaluation focused on the effect of explanations on users’ understanding of a DT’s reasoning and their willingness to act on its predictions. The results show that (1) explanations that address potential conflicts are considered at least as good as baseline explanations that just follow a DT path; and (2) the conflict-based explanations are deemed especially valuable when users’ expectations disagree with the DT’s predictions.
2021.inlg-1.12
maruf-etal-2021-explaining
+ 10.18653/v1/2021.inlg-1.12
Formulating Neural Sentence Ordering as the Asymmetric Traveling Salesman Problem
@@ -173,6 +185,7 @@
2021.inlg-1.13
keswani-jhamtani-2021-formulating
vkeswani/bertsp
+ 10.18653/v1/2021.inlg-1.13
Underreporting of errors in NLG output, and what to do about it
@@ -192,6 +205,7 @@
2021.inlg-1.14
2021.inlg-1.14.Supplementary_Attachment.zip
van-miltenburg-etal-2021-underreporting
+ 10.18653/v1/2021.inlg-1.14
What can Neural Referential Form Selectors Learn?
@@ -204,6 +218,7 @@
2021.inlg-1.15.Supplementary_Attachment.zip
chen-etal-2021-neural-referential
WebNLG
+ 10.18653/v1/2021.inlg-1.15
HI-CMLM: Improve CMLM with Hybrid Decoder Input
@@ -220,6 +235,7 @@
Mask-predict CMLM (Ghazvininejad et al., 2019) has achieved stunning performance among non-autoregressive NMT models, but we find that the mechanism of predicting all of the target words depending only on the hidden state of [MASK] is neither effective nor efficient in the initial iterations of refinement, resulting in ungrammatical repetitions and slow convergence. In this work, we mitigate this problem by combining the copied source with embeddings of [MASK] in the decoder. Notably, it is not straightforward copying, which has been shown to be useless, but a novel heuristic hybrid strategy: fence-mask. Experimental results show that it gains consistent boosts on both the WMT14 En<->De and WMT16 En<->Ro corpora of 0.5 BLEU on average, and 1 BLEU for less-informative short sentences. This reveals that incorporating additional information through proper strategies is beneficial for improving CMLM, particularly the translation quality of short texts, and for speeding up early-stage convergence.
2021.inlg-1.16
wang-etal-2021-hi
+ 10.18653/v1/2021.inlg-1.16
Using BERT for choosing classifiers in Mandarin
@@ -232,6 +248,7 @@
2021.inlg-1.17
jarnfors-etal-2021-using
Chinese Classifier
+ 10.18653/v1/2021.inlg-1.17
Enriching the E2E dataset
@@ -245,6 +262,7 @@
castro-ferreira-etal-2021-enriching
ThiagoCF05/EnrichedE2E
WebNLG
+ 10.18653/v1/2021.inlg-1.18
Goal-Oriented Script Construction
@@ -260,6 +278,7 @@
Corrected the Acknowledgement section.
veronica320/wikihow-gosc
WikiHow
+ 10.18653/v1/2021.inlg-1.19
Single Example Can Improve Zero-Shot Data Generation
@@ -273,6 +292,7 @@
2021.inlg-1.20
burnyshev-etal-2021-single
SGD
+ 10.18653/v1/2021.inlg-1.20
SAPPHIRE: Approaches for Enhanced Concept-to-Text Generation
@@ -287,6 +307,7 @@
feng-etal-2021-sapphire
styfeng/sapphire
CommonGen
+ 10.18653/v1/2021.inlg-1.21
Contextualizing Variation in Text Style Transfer Datasets
@@ -299,6 +320,7 @@
schoch-etal-2021-contextualizing
GYAFC
Penn Treebank
+ 10.18653/v1/2021.inlg-1.22
Generation Challenges: Results of the Accuracy Evaluation Shared Task
@@ -309,6 +331,7 @@
2021.inlg-1.23
thomson-reiter-2021-generation
ehudreiter/accuracysharedtask
+ 10.18653/v1/2021.inlg-1.23
The ReproGen Shared Task on Reproducibility of Human Evaluations in NLG: Overview and Results
@@ -320,6 +343,7 @@
The NLP field has recently seen a substantial increase in work related to reproducibility of results, and more generally in recognition of the importance of having shared definitions and practices relating to evaluation. Much of the work on reproducibility has so far focused on metric scores, with reproducibility of human evaluation results receiving far less attention. As part of a research programme designed to develop theory and practice of reproducibility assessment in NLP, we organised the first shared task on reproducibility of human evaluations, ReproGen 2021. This paper describes the shared task in detail, summarises results from each of the reproduction studies submitted, and provides further comparative analysis of the results. Out of nine initial team registrations, we received submissions from four teams. Meta-analysis of the four reproduction studies revealed varying degrees of reproducibility, and allowed very tentative first conclusions about what types of evaluation tend to have better reproducibility.
2021.inlg-1.24
belz-etal-2021-reprogen
+ 10.18653/v1/2021.inlg-1.24
Text-in-Context: Token-Level Error Detection for Table-to-Text Generation
@@ -332,6 +356,7 @@
kasner-etal-2021-text
kasnerz/accuracysharedtask_cuni-upf
RotoWire
+ 10.18653/v1/2021.inlg-1.25
Shared Task in Evaluating Accuracy: Leveraging Pre-Annotations in the Validation Process
@@ -341,6 +366,7 @@
We hereby present our submission to the Shared Task in Evaluating Accuracy at the INLG 2021 Conference. Our evaluation protocol relies on three main components; rules and text classifiers that pre-annotate the dataset, a human annotator that validates the pre-annotations, and a web interface that facilitates this validation. Our submission consists in fact of two submissions; we first analyze solely the performance of the rules and classifiers (pre-annotations), and then the human evaluation aided by the former pre-annotations using the web interface (hybrid). The code for the web interface and the classifiers is publicly available.
2021.inlg-1.26
garneau-lamontagne-2021-shared
+ 10.18653/v1/2021.inlg-1.26
Automatic Verification of Data Summaries
@@ -351,6 +377,7 @@
We present a generic method to compute the factual accuracy of a generated data summary with minimal user effort. We look at the problem as a fact-checking task to verify the numerical claims in the text. The verification algorithm assumes that the data used to generate the text is available. In this paper, we describe how the proposed solution has been used to identify incorrect claims about basketball textual summaries in the context of the Accuracy Shared Task at INLG 2021.
2021.inlg-1.27
rezgui-etal-2021-automatic
+ 10.18653/v1/2021.inlg-1.27
Grounding NBA Matchup Summaries
@@ -361,6 +388,7 @@
nomoto-2021-grounding
ehudreiter/accuracysharedtask
RotoWire
+ 10.18653/v1/2021.inlg-1.28
Reproducing a Comparison of Hedged and Non-hedged NLG Texts
@@ -370,6 +398,7 @@
2021.inlg-1.29
mahamood-2021-reproducing
saad-mahamood/reprohum2021
+ 10.18653/v1/2021.inlg-1.29
Another PASS: A Reproduction Study of the Human Evaluation of a Football Report Generation System
@@ -381,6 +410,7 @@
This paper reports results from a reproduction study in which we repeated the human evaluation of the PASS Dutch-language football report generation system (van der Lee et al., 2017). The work was carried out as part of the ReproGen Shared Task on Reproducibility of Human Evaluations in NLG, in Track A (Paper 1). We aimed to repeat the original study exactly, with the main difference that a different set of evaluators was used. We describe the study design, present the results from the original and the reproduction study, and then compare and analyse the differences between the two sets of results. For the two ‘headline’ results of average Fluency and Clarity, we find that in both studies, the system was rated more highly for Clarity than for Fluency, and Clarity had higher standard deviation. Clarity and Fluency ratings were higher, and their standard deviations lower, in the reproduction study than in the original study by substantial margins. Clarity had a higher degree of reproducibility than Fluency, as measured by the coefficient of variation. Data and code are publicly available.
2021.inlg-1.30
mille-etal-2021-another
+ 10.18653/v1/2021.inlg-1.30
A Reproduction Study of an Annotation-based Human Evaluation of MT Outputs
@@ -390,6 +420,7 @@
In this paper we report our reproduction study of the Croatian part of an annotation-based human evaluation of machine-translated user reviews (Popovic, 2020). The work was carried out as part of the ReproGen Shared Task on Reproducibility of Human Evaluation in NLG. Our aim was to repeat the original study exactly, except for using a different set of evaluators. We describe the experimental design, characterise differences between original and reproduction study, and present the results from each study, along with analysis of the similarity between them. For the six main evaluation results of Major/Minor/All Comprehension error rates and Major/Minor/All Adequacy error rates, we find that (i) 4/6 system rankings are the same in both studies, (ii) the relative differences between systems are replicated well for Major Comprehension and Adequacy (Pearson’s > 0.9), but not for the corresponding Minor error rates (Pearson’s 0.36 for Adequacy, 0.67 for Comprehension), and (iii) the individual system scores for both types of Minor error rates had a higher degree of reproducibility than the corresponding Major error rates. We also examine inter-annotator agreement and compare the annotations obtained in the original and reproduction studies.
2021.inlg-1.31
popovic-belz-2021-reproduction
+ 10.18653/v1/2021.inlg-1.31
TUDA-Reproducibility @ ReproGen: Replicability of Human Evaluation of Text-to-Text and Concept-to-Text Generation
@@ -400,6 +431,7 @@
This paper describes our contribution to the Shared Task ReproGen by Belz et al. (2021), which investigates the reproducibility of human evaluations in the context of Natural Language Generation. We selected the paper “Generation of Company descriptions using concept-to-text and text-to-text deep models: data set collection and systems evaluation” (Qader et al., 2018) and aimed to replicate, as closely to the original as possible, the human evaluation and the subsequent comparison between the human judgements and the automatic evaluation metrics. Here, we first outline the text generation task of the paper of Qader et al. (2018). Then, we document how we approached our replication of the paper’s human evaluation. We also discuss the difficulties we encountered and which information was missing. Our replication has medium to strong correlation (0.66 Spearman overall) with the original results of Qader et al. (2018), but due to the missing information about how Qader et al. (2018) compared the human judgements with the metric scores, we have refrained from reproducing this comparison.
2021.inlg-1.32
richter-etal-2021-tuda
+ 10.18653/v1/2021.inlg-1.32
DialogSum Challenge: Summarizing Real-Life Scenario Dialogues
@@ -415,6 +447,7 @@
DialogSum
MuTual
SAMSum Corpus
+ 10.18653/v1/2021.inlg-1.33
Quality Evaluation of the Low-Resource Synthetically Generated Code-Mixed Hinglish Text
@@ -424,6 +457,7 @@
In this shared task, we seek the participating teams to investigate the factors influencing the quality of the code-mixed text generation systems. We synthetically generate code-mixed Hinglish sentences using two distinct approaches and employ human annotators to rate the generation quality. We propose two subtasks, quality rating prediction and annotators’ disagreement prediction of the synthetic Hinglish dataset. The proposed subtasks will put forward the reasoning and explanation of the factors influencing the quality and human perception of the code-mixed text.
2021.inlg-1.34
srivastava-singh-2021-quality
+ 10.18653/v1/2021.inlg-1.34
Shared Task on Feedback Comment Generation for Language Learners
@@ -437,6 +471,7 @@
In this paper, we propose a generation challenge called Feedback comment generation for language learners. It is a task where given a text and a span, a system generates, for the span, an explanatory note that helps the writer (language learner) improve their writing skills. The motivations for this challenge are: (i) practically, it will be beneficial for both language learners and teachers if a computer-assisted language learning system can provide feedback comments just as human teachers do; (ii) theoretically, feedback comment generation for language learners has a mixed aspect of other generation tasks together with its unique features and it will be interesting to explore what kind of generation technique is effective against what kind of writing rule. To this end, we have created a dataset and developed baseline systems to estimate baseline performance. With these preparations, we propose a generation challenge of feedback comment generation.
2021.inlg-1.35
nagata-etal-2021-shared
+ 10.18653/v1/2021.inlg-1.35
The SelectGen Challenge: Finding the Best Training Samples for Few-Shot Neural Text Generation
@@ -448,6 +483,7 @@
We propose a shared task on training instance selection for few-shot neural text generation. Large-scale pretrained language models have led to dramatic improvements in few-shot text generation. Nonetheless, almost all previous work simply applies random sampling to select the few-shot training instances. Little to no attention has been paid to the selection strategies and how they would affect model performance. Studying the selection strategy can help us (1) make the most use of our annotation budget in downstream tasks and (2) better benchmark few-shot text generative models. We welcome submissions that present their selection strategies and the effects on the generation quality.
2021.inlg-1.36
chang-etal-2021-selectgen
+ 10.18653/v1/2021.inlg-1.36
Affective Decoding for Empathetic Response Generation
@@ -461,6 +497,7 @@
2021.inlg-1.37
zeng-etal-2021-affective
zenggo/affective-decoding-4-empathetic-dialog
+ 10.18653/v1/2021.inlg-1.37
Controllable Sentence Simplification with a Unified Text-to-Text Transfer Transformer
@@ -474,6 +511,7 @@
ASSET
TurkCorpus
WikiLarge
+ 10.18653/v1/2021.inlg-1.38
SEPRG: Sentiment aware Emotion controlled Personalized Response Generation
@@ -485,6 +523,7 @@
Social chatbots have gained immense popularity, and their appeal lies not just in their capacity to respond to the diverse requests from users, but also in the ability to develop an emotional connection with users. To further develop and promote social chatbots, we need to concentrate on increasing user interaction and take into account both the intellectual and emotional quotient in the conversational agents. Therefore, in this work, we propose the task of sentiment aware emotion controlled personalized dialogue generation giving the machine the capability to respond emotionally and in accordance with the persona of the user. As sentiment and emotions are highly co-related, we use the sentiment knowledge of the previous utterance to generate the correct emotional response in accordance with the user persona. We design a Transformer based Dialogue Generation framework, that generates responses that are sensitive to the emotion of the user and corresponds to the persona and sentiment as well. Moreover, the persona information is encoded by a different Transformer encoder, along with the dialogue history, is fed to the decoder for generating responses. We annotate the PersonaChat dataset with sentiment information to improve the response quality. Experimental results on the PersonaChat dataset show that the proposed framework significantly outperforms the existing baselines, thereby generating personalized emotional responses in accordance with the sentiment that provides better emotional connection and user satisfaction as desired in a social chatbot.
2021.inlg-1.39
firdaus-etal-2021-seprg
+ 10.18653/v1/2021.inlg-1.39
Biomedical Data-to-Text Generation via Fine-Tuning Transformers
@@ -497,6 +536,7 @@
yermakov-etal-2021-biomedical
bayer-science-for-a-better-life/data2text-bioleaflets
BioLeaflets
+ 10.18653/v1/2021.inlg-1.40
Decoding, Fast and Slow: A Case Study on Balancing Trade-Offs in Incremental, Character-level Pragmatic Reasoning
@@ -509,6 +549,7 @@
2021.inlg-1.41
zarriess-etal-2021-decoding
COCO
+ 10.18653/v1/2021.inlg-1.41
GraphPlan: Story Generation by Planning with Event Graph
@@ -520,6 +561,7 @@
Story generation is a task that aims to automatically generate a meaningful story. This task is challenging because it requires high-level understanding of the semantic meaning of sentences and causality of story events. Naive sequence-to-sequence models generally fail to acquire such knowledge, as it is difficult to guarantee logical correctness in a text generation model without strategic planning. In this study, we focus on planning a sequence of events assisted by event graphs and use the events to guide the generator. Rather than using a sequence-to-sequence model to output a sequence, as in some existing works, we propose to generate an event sequence by walking on an event graph. The event graphs are built automatically based on the corpus. To evaluate the proposed approach, we incorporate human participation, both in event planning and story generation. Based on the large-scale human annotation results, our proposed approach has been shown to provide more logically correct event sequences and stories compared with previous approaches.
2021.inlg-1.42
chen-etal-2021-graphplan
+ 10.18653/v1/2021.inlg-1.42
BERT-based distractor generation for Swedish reading comprehension questions using a small-scale dataset
@@ -530,6 +572,7 @@
2021.inlg-1.43
kalpakchi-boye-2021-bert
dkalpakchi/swequad-mc
+ 10.18653/v1/2021.inlg-1.43
Exploring Structural Encoding for Data-to-Text Generation
@@ -540,6 +583,7 @@
2021.inlg-1.44
mahapatra-garain-2021-exploring
WikiBio
+ 10.18653/v1/2021.inlg-1.44
Attention Is Indeed All You Need: Semantically Attention-Guided Decoding for Data-to-Text NLG
@@ -551,6 +595,7 @@
juraska-walker-2021-attention
jjuraska/data2text-nlg
ViGGO
+ 10.18653/v1/2021.inlg-1.45
diff --git a/data/xml/2021.sigdial.xml b/data/xml/2021.sigdial.xml
index eb2ae300bb..b25b893a1f 100644
--- a/data/xml/2021.sigdial.xml
+++ b/data/xml/2021.sigdial.xml
@@ -33,6 +33,7 @@
2021.sigdial-1.1
see-manning-2021-understanding
+ 10.18653/v1/2021.sigdial-1.1
Towards Continuous Estimation of Dissatisfaction in Spoken Dialog
@@ -44,6 +45,7 @@
2021.sigdial-1.2
ward-etal-2021-towards
+ 10.18653/v1/2021.sigdial-1.2
DialogStitch: Synthetic Deeper and Multi-Context Task-Oriented Dialogs
@@ -62,6 +64,7 @@
DailyDialog
VisDial
Wizard of Wikipedia
+ 10.18653/v1/2021.sigdial-1.3
Individual Interaction Styles: Evidence from a Spoken Chat Corpus
@@ -71,6 +74,7 @@
2021.sigdial-1.4
ward-2021-individual
+ 10.18653/v1/2021.sigdial-1.4
Evaluation of In-Person Counseling Strategies To Develop Physical Activity Chatbot for Women
@@ -86,6 +90,7 @@
liang-etal-2021-evaluation
KaihuiLiang/physical-activity-counseling
+ 10.18653/v1/2021.sigdial-1.5
Improving Named Entity Recognition in Spoken Dialog Systems by Context and Speech Pattern Modeling
@@ -96,6 +101,7 @@
2021.sigdial-1.6
nguyen-yu-2021-improving
+ 10.18653/v1/2021.sigdial-1.6
SoDA: On-device Conversational Slot Extraction
@@ -106,6 +112,7 @@
2021.sigdial-1.7
ravi-kozareva-2021-soda
+ 10.18653/v1/2021.sigdial-1.7
Getting to Production with Few-shot Natural Language Generation Models
@@ -126,6 +133,7 @@
2021.sigdial-1.8
heidari-etal-2021-getting
+ 10.18653/v1/2021.sigdial-1.8
ARTA: Collection and Classification of Ambiguous Requests and Thoughtful Actions
@@ -139,6 +147,7 @@
tanaka-etal-2021-arta
ahclab/arta_corpus
+ 10.18653/v1/2021.sigdial-1.9
Integrated taxonomy of errors in chat-oriented dialogue systems
@@ -152,6 +161,7 @@
higashinaka-etal-2021-integrated
ryuichiro-higashinaka/taxonomy-of-errors
+ 10.18653/v1/2021.sigdial-1.10
Effective Social Chatbot Strategies for Increasing User Initiative
@@ -163,6 +173,7 @@
2021.sigdial-1.11
hardy-etal-2021-effective
+ 10.18653/v1/2021.sigdial-1.11
Generative Conversational Networks
@@ -180,6 +191,7 @@
ATIS
CIFAR-10
SNIPS
+ 10.18653/v1/2021.sigdial-1.12
Commonsense-Focused Dialogues for Response Generation: An Empirical Study
@@ -202,6 +214,7 @@
DailyDialog
FED
MuTual
+ 10.18653/v1/2021.sigdial-1.13
Velocidapter: Task-oriented Dialogue Comprehension Modeling Pairing Synthetic Text Generation with Domain Adaptation
@@ -217,6 +230,7 @@
cuthalionn/velocidapter
RACE
TriviaQA
+ 10.18653/v1/2021.sigdial-1.14
An Analysis of State-of-the-Art Models for Situated Interactive MultiModal Conversations (SIMMC)
@@ -233,6 +247,7 @@
kottur-etal-2021-analysis
SIMMC
+ 10.18653/v1/2021.sigdial-1.15
A Simple yet Effective Method for Sentence Ordering
@@ -243,6 +258,7 @@
2021.sigdial-1.16
shen-baldwin-2021-simple
+ 10.18653/v1/2021.sigdial-1.16
Topic Shift Detection for Mixed Initiative Response
@@ -255,6 +271,7 @@
2021.sigdial-1.17
konigari-etal-2021-topic
+ 10.18653/v1/2021.sigdial-1.17
Improving Unsupervised Dialogue Topic Segmentation with Utterance-Pair Coherence Scoring
@@ -269,6 +286,7 @@
DailyDialog
Doc2Dial
doc2dial
+ 10.18653/v1/2021.sigdial-1.18
Fundamental Exploration of Evaluation Metrics for Persona Characteristics of Text Utterances
@@ -281,6 +299,7 @@
To maintain utterance quality of a persona-aware dialog system, inappropriate utterances for the persona should be thoroughly filtered. When evaluating the appropriateness of a large number of arbitrary utterances to be registered in the utterance database of a retrieval-based dialog system, evaluation metrics that require a reference (or a “correct” utterance) for each evaluation target cannot be used. In addition, practical utterance filtering requires the ability to select utterances based on the intensity of persona characteristics. Therefore, we are developing metrics that can be used to capture the intensity of persona characteristics and can be computed without references tailored to the evaluation targets. To this end, we explore existing metrics and propose two new metrics: persona speaker probability and persona term salience. Experimental results show that our proposed metrics show weak to moderate correlations between scores of persona characteristics based on human judgments and outperform other metrics overall in filtering inappropriate utterances for particular personas.
2021.sigdial-1.19
miyazaki-etal-2021-fundamental
+ 10.18653/v1/2021.sigdial-1.19
Multi-Referenced Training for Dialogue Response Generation
@@ -293,6 +312,7 @@
ZHAOTING/dialog-processing
DailyDialog
+ 10.18653/v1/2021.sigdial-1.20
Contrastive Response Pairs for Automatic Evaluation of Non-task-oriented Neural Conversational Models
@@ -307,6 +327,7 @@
2021.sigdial-1.21
okano-etal-2021-contrastive
+ 10.18653/v1/2021.sigdial-1.21
How does BERT process disfluency?
@@ -320,6 +341,7 @@
tian-etal-2021-bert
SNLI
+ 10.18653/v1/2021.sigdial-1.22
Hi-DST: A Hierarchical Approach for Scalable and Extensible Dialogue State Tracking
@@ -334,6 +356,7 @@
MultiWOZ
SGD
SQuAD
+ 10.18653/v1/2021.sigdial-1.23
Dialogue State Tracking with Multi-Level Fusion of Predicted Dialogue States and Conversations
@@ -349,6 +372,7 @@
helloacl/DST-DCPDS
MultiWOZ
+ 10.18653/v1/2021.sigdial-1.24
Recent Neural Methods on Dialogue State Tracking for Task-Oriented Dialogue Systems: A Survey
@@ -362,6 +386,7 @@
MultiWOZ
SGD
+ 10.18653/v1/2021.sigdial-1.25
Scikit-talk: A toolkit for processing real-world conversational speech data
@@ -369,10 +394,12 @@
GaborParti
Chu-RenHuang
252–256
- We present Scikit-talk, an open-source toolkit for processing collections of real-world conversational speech in Python. First of its kind, the toolkit equips those interested in studying or modeling conversations with an easy-to-use interface to build and explore large collections of transcriptions and annotations of talk-in-interaction. Designed for applications in speech processing and Conversational AI, Scikit-talk provides tools to custom-build datasets for tasks such as intent prototyping, dialog flow testing, and conversation design. Its preprocessor module comes with several pre-built interfaces for common transcription formats, which aim to make working across multiple data sources more accessible. The explorer module provides a collection of tools to explore and analyse this data type via string matching and unsupervised machine learning techniques. Scikit-talk serves as a platform to collect and connect different transcription formats and representations of talk, enabling the user to quickly build multilingual datasets of varying detail and granularity. Thus, the toolkit aims to make working with authentic conversational speech data in Python more accessible and to provide the user with comprehensive options to work with representations of talk in appropriate detail for any downstream task. For the latest updates and information on currently supported languages and language resources, please refer to: https://pypi.org/project/scikit-talk/
+ We present Scikit-talk, an open-source toolkit for processing collections of real-world conversational speech in Python. First of its kind, the toolkit equips those interested in studying or modeling conversations with an easy-to-use interface to build and explore large collections of transcriptions and annotations of talk-in-interaction. Designed for applications in speech processing and Conversational AI, Scikit-talk provides tools to custom-build datasets for tasks such as intent prototyping, dialog flow testing, and conversation design. Its preprocessor module comes with several pre-built interfaces for common transcription formats, which aim to make working across multiple data sources more accessible. The explorer module provides a collection of tools to explore and analyse this data type via string matching and unsupervised machine learning techniques. Scikit-talk serves as a platform to collect and connect different transcription formats and representations of talk, enabling the user to quickly build multilingual datasets of varying detail and granularity. Thus, the toolkit aims to make working with authentic conversational speech data in Python more accessible and to provide the user with comprehensive options to work with representations of talk in appropriate detail for any downstream task. For the latest updates and information on currently supported languages and language resources, please refer to: https://pypi.org/project/scikit-talk/
+
2021.sigdial-1.26
liesenfeld-etal-2021-scikit
+ 10.18653/v1/2021.sigdial-1.26
ERICA: An Empathetic Android Companion for Covid-19 Quarantine
@@ -387,6 +414,7 @@
2021.sigdial-1.27
ishii-etal-2021-erica
+ 10.18653/v1/2021.sigdial-1.27
A multi-party attentive listening robot which stimulates involvement from side participants
@@ -400,6 +428,7 @@
2021.sigdial-1.28
inoue-etal-2021-multi
+ 10.18653/v1/2021.sigdial-1.28
A Cloud-based User-Centered Time-Offset Interaction Application
@@ -416,6 +445,7 @@
2021.sigdial-1.29
chierici-etal-2021-cloud
+ 10.18653/v1/2021.sigdial-1.29
Telling Stories through Multi-User Dialogue by Modeling Character Relations
@@ -428,6 +458,7 @@
si-etal-2021-telling
CRD3
+ 10.18653/v1/2021.sigdial-1.30
Summarizing Behavioral Change Goals from SMS Exchanges to Support Health Coaches
@@ -442,6 +473,7 @@
2021.sigdial-1.31
gupta-etal-2021-summarizing
+ 10.18653/v1/2021.sigdial-1.31
Rare-Class Dialogue Act Tagging for Alzheimer’s Disease Diagnosis
@@ -453,6 +485,7 @@
2021.sigdial-1.32
nasreen-etal-2021-rare
+ 10.18653/v1/2021.sigdial-1.32
CIDER: Commonsense Inference for Dialogue Explanation and Reasoning
@@ -476,6 +509,7 @@
MultiNLI
SQuAD
SWAG
+ 10.18653/v1/2021.sigdial-1.33
Where Are We in Discourse Relation Recognition?
@@ -488,6 +522,7 @@
atwell-etal-2021-discourse
Penn Treebank
+ 10.18653/v1/2021.sigdial-1.34
Annotation Inconsistency and Entity Bias in MultiWOZ
@@ -503,6 +538,7 @@
2021.sigdial-1.35
qian-etal-2021-annotation
+ 10.18653/v1/2021.sigdial-1.35
On the Need for Thoughtful Data Collection for Multi-Party Dialogue: A Survey of Available Corpora and Collection Methods
@@ -519,6 +555,7 @@
Molweni
OpenSubtitles
Serial Speakers
+ 10.18653/v1/2021.sigdial-1.36
How Should Agents Ask Questions For Situated Learning? An Annotated Dialogue Corpus
@@ -534,6 +571,7 @@
USArmyResearchLab/ARL-HuRDL
HuRDL
+ 10.18653/v1/2021.sigdial-1.37
How Will I Argue? A Dataset for Evaluating Recommender Systems for Argumentations
@@ -546,6 +584,7 @@
brenneis-etal-2021-will
hhucn/argumentation-attitude-dataset
+ 10.18653/v1/2021.sigdial-1.38
From Argument Search to Argumentative Dialogue: A Topic-independent Approach to Argument Acquisition for Dialogue Systems
@@ -561,6 +600,7 @@
rach-etal-2021-argument
csacro/from-argument-search-to-argumentative-dialogue
+ 10.18653/v1/2021.sigdial-1.39
What to Fact-Check: Guiding Check-Worthy Information Detection in News Articles through Argumentative Discourse Structure
@@ -573,6 +613,7 @@
alhindi-etal-2021-fact
tariq60/whattofactcheck
+ 10.18653/v1/2021.sigdial-1.40
How “open” are the conversations with open-domain chatbots? A proposal for Speech Event based evaluation
@@ -583,6 +624,7 @@
2021.sigdial-1.41
dogruoz-skantze-2021-open
+ 10.18653/v1/2021.sigdial-1.41
Blending Task Success and User Satisfaction: Analysis of Learned Dialogue Behaviour with Multiple Rewards
@@ -594,6 +636,7 @@
ultes-maier-2021-blending
MultiWOZ
+ 10.18653/v1/2021.sigdial-1.42
Diversity as a By-Product: Goal-oriented Language Generation Leads to Linguistic Variation
@@ -605,6 +648,7 @@
2021.sigdial-1.43
schuz-etal-2021-diversity
+ 10.18653/v1/2021.sigdial-1.43
DTAFA: Decoupled Training Architecture for Efficient FAQ Retrieval
@@ -616,6 +660,7 @@
2021.sigdial-1.44
assem-etal-2021-dtafa
+ 10.18653/v1/2021.sigdial-1.44
Projection of Turn Completion in Incremental Spoken Dialogue Systems
@@ -626,6 +671,7 @@
2021.sigdial-1.45
ekstedt-skantze-2021-projection
+ 10.18653/v1/2021.sigdial-1.45
A Task-Oriented Dialogue Architecture via Transformer Neural Language Models and Symbolic Injection
@@ -639,6 +685,7 @@
2021.sigdial-1.46
romero-etal-2021-task
+ 10.18653/v1/2021.sigdial-1.46
Domain-independent User Simulation with Transformers for Task-oriented Dialogue Systems
@@ -656,6 +703,7 @@
lin-etal-2021-domain
MultiWOZ
+ 10.18653/v1/2021.sigdial-1.47
A Practical 2-step Approach to Assist Enterprise Question-Answering Live Chat
@@ -666,6 +714,7 @@
2021.sigdial-1.48
liao-fares-2021-practical
+ 10.18653/v1/2021.sigdial-1.48
A Brief Study on the Effects of Training Generative Dialogue Models with a Semantic loss
@@ -679,6 +728,7 @@
parthasarathi-etal-2021-brief
ppartha03/Semantic-Loss-Dialogue-Generation
+ 10.18653/v1/2021.sigdial-1.49
Do Encoder Representations of Generative Dialogue Models have sufficient summary of the Information about the task ?
@@ -691,6 +741,7 @@
parthasarathi-etal-2021-encoder
ppartha03/Dialogue-Probe-Tasks-Public
+ 10.18653/v1/2021.sigdial-1.50
GenSF: Simultaneous Adaptation of Generative Pre-trained Models and Slot Filling
@@ -702,6 +753,7 @@
mehri-eskenazi-2021-gensf
shikib/generative_slot_filling
+ 10.18653/v1/2021.sigdial-1.51
Schema-Guided Paradigm for Zero-Shot Dialog
@@ -714,6 +766,7 @@
Shikib/schema_attention_model
STAR
+ 10.18653/v1/2021.sigdial-1.52
Coreference-Aware Dialogue Summarization
@@ -729,6 +782,7 @@
seq-to-mind/coref_dial_summ
SAMSum Corpus
+ 10.18653/v1/2021.sigdial-1.53
Weakly Supervised Extractive Summarization with Attention
@@ -740,6 +794,7 @@
2021.sigdial-1.54
zhuang-etal-2021-weakly
+ 10.18653/v1/2021.sigdial-1.54
Incremental temporal summarization in multi-party meetings
@@ -753,6 +808,7 @@
manuvinakurike-etal-2021-incremental
CNN/Daily Mail
+ 10.18653/v1/2021.sigdial-1.55
Mitigating Topic Bias when Detecting Decisions in Dialogue
@@ -765,6 +821,7 @@
2021.sigdial-1.56
karan-etal-2021-mitigating
+ 10.18653/v1/2021.sigdial-1.56
Assessing Political Prudence of Open-domain Chatbots
@@ -779,6 +836,7 @@
bang-etal-2021-assessing
HLTCHKUST/chatbot-political-prudence-test
+ 10.18653/v1/2021.sigdial-1.57
Large-Scale Quantitative Evaluation of Dialogue Agents’ Response Strategies against Offensive Users
@@ -791,6 +849,7 @@
li-etal-2021-large
lithiumh/offensive
+ 10.18653/v1/2021.sigdial-1.58
diff --git a/data/xml/2022.inlg.xml b/data/xml/2022.inlg.xml
index b74bb450a8..61792352a7 100644
--- a/data/xml/2022.inlg.xml
+++ b/data/xml/2022.inlg.xml
@@ -27,6 +27,7 @@
2022.inlg-main.1
2022.inlg-main.1.software.zip
han-etal-2022-evaluating
+ 10.18653/v1/2022.inlg-main.1
Template-based Approach to Zero-shot Intent Recognition
@@ -41,6 +42,7 @@
2022.inlg-main.2
2022.inlg-main.2.software.zip
lamanov-etal-2022-template
+ 10.18653/v1/2022.inlg-main.2
“Slow Service” ↛ “Great Food”: Enhancing Content Preservation in Unsupervised Text Style Transfer
@@ -51,6 +53,7 @@
2022.inlg-main.3
2022.inlg-main.3.software.zip
zhu-bhat-2022-slow
+ 10.18653/v1/2022.inlg-main.3
Arabic Image Captioning using Pre-training of Deep Bidirectional Transformers
@@ -63,6 +66,7 @@
2022.inlg-main.4
2022.inlg-main.4.software.zip
emami-etal-2022-arabic
+ 10.18653/v1/2022.inlg-main.4
Plot Writing From Pre-Trained Language Models
@@ -74,6 +78,7 @@
2022.inlg-main.5
2022.inlg-main.5.software.zip
jin-etal-2022-plot
+ 10.18653/v1/2022.inlg-main.5
Paraphrasing via Ranking Many Candidates
@@ -83,6 +88,7 @@
2022.inlg-main.6
2022.inlg-main.6.software.zip
lee-2022-paraphrasing
+ 10.18653/v1/2022.inlg-main.6
Evaluating Legal Accuracy of Neural Generators on the Generation of Criminal Court Dockets Description
@@ -95,6 +101,7 @@
2022.inlg-main.7
2022.inlg-main.7.software.zip
garneau-etal-2022-evaluating
+ 10.18653/v1/2022.inlg-main.7
Automatic Generation of Factual News Headlines in Finnish
@@ -107,6 +114,7 @@
2022.inlg-main.8
2022.inlg-main.8.software.zip
koppatz-etal-2022-automatic
+ 10.18653/v1/2022.inlg-main.8
Generating Coherent and Informative Descriptions for Groups of Visual Objects and Categories: A Simple Decoding Approach
@@ -120,6 +128,7 @@
2022.inlg-main.9
2022.inlg-main.9.software.zip
attari-etal-2022-generating
+ 10.18653/v1/2022.inlg-main.9
Dealing with hallucination and omission in neural Natural Language Generation: A use case on meteorology.
@@ -132,6 +141,7 @@
2022.inlg-main.10
2022.inlg-main.10.software.zip
gonzalez-corbelle-etal-2022-dealing
+ 10.18653/v1/2022.inlg-main.10
Amortized Noisy Channel Neural Machine Translation
@@ -143,6 +153,7 @@
2022.inlg-main.11
2022.inlg-main.11.software.zip
pang-etal-2022-amortized
+ 10.18653/v1/2022.inlg-main.11
Math Word Problem Generation with Multilingual Language Models
@@ -155,6 +166,7 @@
2022.inlg-main.12
2022.inlg-main.12.software.zip
niyarepola-etal-2022-math
+ 10.18653/v1/2022.inlg-main.12
Comparing informativeness of an NLG chatbot vs graphical app in diet-information domain
@@ -165,6 +177,7 @@
2022.inlg-main.13
2022.inlg-main.13.software.zip
balloccu-reiter-2022-comparing
+ 10.18653/v1/2022.inlg-main.13
Generation of Student Questions for Inquiry-based Learning
@@ -177,6 +190,7 @@
2022.inlg-main.14
2022.inlg-main.14.software.zip
ros-etal-2022-generation
+ 10.18653/v1/2022.inlg-main.14
Keyword Provision Question Generation for Facilitating Educational Reading Comprehension Preparation
@@ -188,6 +202,7 @@
2022.inlg-main.15
2022.inlg-main.15.software.zip
chan-etal-2022-keyword
+ 10.18653/v1/2022.inlg-main.15
Generating Landmark-based Manipulation Instructions from Image Pairs
@@ -200,6 +215,7 @@
2022.inlg-main.16
2022.inlg-main.16.software.zip
zarriess-etal-2022-generating
+ 10.18653/v1/2022.inlg-main.16
Zero-shot Cross-Linguistic Learning of Event Semantics
@@ -217,6 +233,7 @@
2022.inlg-main.17
2022.inlg-main.17.software.zip
alikhani-etal-2022-zero
+ 10.18653/v1/2022.inlg-main.17
Nominal Metaphor Generation with Multitask Learning
@@ -230,6 +247,7 @@
li-etal-2022-nominal
Updated paper pdf due to openreview issue.
+ 10.18653/v1/2022.inlg-main.18
Look and Answer the Question: On the Role of Vision in Embodied Question Answering
@@ -241,6 +259,7 @@
2022.inlg-main.19
2022.inlg-main.19.software.zip
ilinykh-etal-2022-look
+ 10.18653/v1/2022.inlg-main.19
Strategies for framing argumentative conclusion generation
@@ -253,6 +272,7 @@
2022.inlg-main.20
2022.inlg-main.20.software.zip
heinisch-etal-2022-strategies
+ 10.18653/v1/2022.inlg-main.20
LAFT: Cross-lingual Transfer for Text Generation by Language-Agnostic Finetuning
@@ -265,6 +285,7 @@
2022.inlg-main.21
2022.inlg-main.21.software.zip
wu-etal-2022-laft
+ 10.18653/v1/2022.inlg-main.21
Quantum Natural Language Generation on Near-Term Devices
@@ -276,6 +297,7 @@
2022.inlg-main.22
2022.inlg-main.22.software.zip
karamlou-etal-2022-quantum
+ 10.18653/v1/2022.inlg-main.22
Towards Evaluation of Multi-party Dialogue Systems
@@ -287,6 +309,7 @@
2022.inlg-main.23
2022.inlg-main.23.software.zip
mahajan-etal-2022-towards
+ 10.18653/v1/2022.inlg-main.23
Are Current Decoding Strategies Capable of Facing the Challenges of Visual Dialogue?
@@ -299,6 +322,7 @@
2022.inlg-main.24
2022.inlg-main.24.software.zip
chaudhary-etal-2022-current
+ 10.18653/v1/2022.inlg-main.24
Analogy Generation by Prompting Large Language Models: A Case Study of InstructGPT
@@ -309,6 +333,7 @@
2022.inlg-main.25
bhavya-etal-2022-analogy
+ 10.18653/v1/2022.inlg-main.25
@@ -454,7 +479,7 @@
niksss at HinglishEval: Language-agnostic BERT-based Contextual Embeddings with Catboost for Quality Evaluation of the Low-Resource Synthetically Generated Code-Mixed Hinglish Text
NikhilSingh
31-34
- This paper describes the system description for the HinglishEval challenge at INLG 2022. The goal of this task was to investigate the factors influencing the quality of the code- mixed text generation system. The task was divided into two subtasks, quality rating pre- diction and annotators’ disagreement predic- tion of the synthetic Hinglish dataset. We at- tempted to solve these tasks using sentence- level embeddings, which are obtained from mean pooling the contextualized word embed- dings for all input tokens in our text. We experimented with various classifiers on top of the embeddings produced for respective tasks. Our best-performing system ranked 1st on subtask B and 3rd on subtask A. We make our code available here: https://github.com/nikhilbyte/Hinglish-qEval
+ This paper describes the system description for the HinglishEval challenge at INLG 2022. The goal of this task was to investigate the factors influencing the quality of the code-mixed text generation system. The task was divided into two subtasks, quality rating prediction and annotators’ disagreement prediction of the synthetic Hinglish dataset. We attempted to solve these tasks using sentence-level embeddings, which are obtained from mean pooling the contextualized word embeddings for all input tokens in our text. We experimented with various classifiers on top of the embeddings produced for respective tasks. Our best-performing system ranked 1st on subtask B and 3rd on subtask A. We make our code available here: https://github.com/nikhilbyte/Hinglish-qEval
2022.inlg-genchal.5
singh-2022-niksss-hinglisheval
@@ -466,7 +491,7 @@
HridayKedia
YashvardhanSharma
35-38
- Code-Mixed text data consists of sentences having words or phrases from more than one language. Most multi-lingual communities worldwide communicate using multiple lan- guages, with English usually one of them. Hinglish is a Code-Mixed text composed of Hindi and English but written in Roman script. This paper aims to determine the factors in- fluencing the quality of Code-Mixed text data generated by the system. For the Hingli- shEval task, the proposed model uses multi- lingual BERT to find the similarity between synthetically generated and human-generated sentences to predict the quality of synthetically generated Hinglish sentences.
+ Code-Mixed text data consists of sentences having words or phrases from more than one language. Most multi-lingual communities worldwide communicate using multiple languages, with English usually one of them. Hinglish is a Code-Mixed text composed of Hindi and English but written in Roman script. This paper aims to determine the factors influencing the quality of Code-Mixed text data generated by the system. For the HinglishEval task, the proposed model uses multilingual BERT to find the similarity between synthetically generated and human-generated sentences to predict the quality of synthetically generated Hinglish sentences.
2022.inlg-genchal.6
furniturewala-etal-2022-bits
@@ -476,7 +501,7 @@
RudraDhar
DipankarDas
39-42
- In this paper we describe a system submit- ted to the INLG 2022 Generation Challenge (GenChal) on Quality Evaluation of the Low- Resource Synthetically Generated Code-Mixed Hinglish Text. We implement a Bi-LSTM- based neural network model to predict the Av- erage rating score and Disagreement score of the synthetic Hinglish dataset. In our mod- els, we used word embeddings for English and Hindi data, and one hot encodings for Hinglish data. We achieved a F1 score of 0.11, and mean squared error of 6.0 in the average rating score prediction task. In the task of Disagreement score prediction, we achieve a F1 score of 0.18, and mean squared error of 5.0.
+ In this paper we describe a system submitted to the INLG 2022 Generation Challenge (GenChal) on Quality Evaluation of the Low-Resource Synthetically Generated Code-Mixed Hinglish Text. We implement a Bi-LSTM-based neural network model to predict the Average rating score and Disagreement score of the synthetic Hinglish dataset. In our models, we used word embeddings for English and Hindi data, and one hot encodings for Hinglish data. We achieved a F1 score of 0.11, and mean squared error of 6.0 in the average rating score prediction task. In the task of Disagreement score prediction, we achieve a F1 score of 0.18, and mean squared error of 5.0.
2022.inlg-genchal.7
guha-etal-2022-ju
@@ -487,7 +512,7 @@
MajaPopović
EhudReiter
43-51
- Against a background of growing interest in reproducibility in NLP and ML, and as part of an ongoing research programme designed to develop theory and practice of reproducibility assessment in NLP, we organised the second shared task on reproducibility of evaluations in NLG, ReproGen 2022. This paper describes the shared task, summarises results from the reproduction studies submitted, and provides further comparative analysis of the results. Out of six initial team registrations, we received submissions from five teams. Meta-analysis of the five reproduction studies revealed varying degrees of reproducibility, and allowed further tentative conclusions about what types of eval- uation tend to have better reproducibility.
+ Against a background of growing interest in reproducibility in NLP and ML, and as part of an ongoing research programme designed to develop theory and practice of reproducibility assessment in NLP, we organised the second shared task on reproducibility of evaluations in NLG, ReproGen 2022. This paper describes the shared task, summarises results from the reproduction studies submitted, and provides further comparative analysis of the results. Out of six initial team registrations, we received submissions from five teams. Meta-analysis of the five reproduction studies revealed varying degrees of reproducibility, and allowed further tentative conclusions about what types of evaluation tend to have better reproducibility.
2022.inlg-genchal.8
belz-etal-2022-2022
@@ -499,7 +524,7 @@
ThiagoCastro Ferreira
AnyaBelz
52-61
- In this paper, we present the results of two re- production studies for the human evaluation originally reported by Dušek and Kasner (2020) in which the authors comparatively evaluated outputs produced by a semantic error detection system for data-to-text generation against ref- erence outputs. In the first reproduction, the original evaluators repeat the evaluation, in a test of the repeatability of the original evalua- tion. In the second study, two new evaluators carry out the evaluation task, in a test of the reproducibility of the original evaluation under otherwise identical conditions. We describe our approach to reproduction, and present and analyse results, finding different degrees of re- producibility depending on result type, data and labelling task. Our resources are available and open-sourced.
+ In this paper, we present the results of two reproduction studies for the human evaluation originally reported by Dušek and Kasner (2020) in which the authors comparatively evaluated outputs produced by a semantic error detection system for data-to-text generation against reference outputs. In the first reproduction, the original evaluators repeat the evaluation, in a test of the repeatability of the original evaluation. In the second study, two new evaluators carry out the evaluation task, in a test of the reproducibility of the original evaluation under otherwise identical conditions. We describe our approach to reproduction, and present and analyse results, finding different degrees of reproducibility depending on result type, data and labelling task. Our resources are available and open-sourced.
2022.inlg-genchal.9
huidrom-etal-2022-two
@@ -518,7 +543,7 @@
CraigThomson
EhudReiter
71-79
- We investigate the data collected for the Accuracy Evaluation Shared Task as a retrospective reproduction study. The shared task was based upon errors found by human annotation of com- puter generated summaries of basketball games. Annotation was performed in three separate stages, with texts taken from the same three systems and checked for errors by the same three annotators. We show that the mean count of errors was consistent at the highest level for each experiment, with increased variance when looking at per-system and/or per-error- type breakdowns.
+ We investigate the data collected for the Accuracy Evaluation Shared Task as a retrospective reproduction study. The shared task was based upon errors found by human annotation of computer generated summaries of basketball games. Annotation was performed in three separate stages, with texts taken from the same three systems and checked for errors by the same three annotators. We show that the mean count of errors was consistent at the highest level for each experiment, with increased variance when looking at per-system and/or per-error-type breakdowns.
2022.inlg-genchal.11
thomson-reiter-2022-accuracy
diff --git a/data/xml/2022.sigdial.xml b/data/xml/2022.sigdial.xml
index 0399e265a6..cea695463f 100644
--- a/data/xml/2022.sigdial.xml
+++ b/data/xml/2022.sigdial.xml
@@ -33,6 +33,7 @@
nu-dialogue/post-processing-networks
MultiWOZ
+ 10.18653/v1/2022.sigdial-1.1
Reducing Model Churn: Stable Re-training of Conversational Agents
@@ -46,6 +47,7 @@
google/stable-retraining-conversational-agents
TOPv2
+ 10.18653/v1/2022.sigdial-1.2
Knowledge-Grounded Conversational Data Augmentation with Generative Conversational Networks
@@ -59,6 +61,7 @@
lin-etal-2022-knowledge
Topical-Chat
+ 10.18653/v1/2022.sigdial-1.3
Guiding the Release of Safer E2E Conversational AI through Value Sensitive Design
@@ -74,6 +77,7 @@
2022.sigdial-1.4
bergman-etal-2022-guiding
+ 10.18653/v1/2022.sigdial-1.4
Controllable User Dialogue Act Augmentation for Dialogue State Tracking
@@ -86,6 +90,7 @@
2022.sigdial-1.5
lai-etal-2022-controllable
miulab/cuda-dst
+ 10.18653/v1/2022.sigdial-1.5
Developing an argument annotation scheme based on a semantic classification of arguments
@@ -97,6 +102,7 @@
Corpora of argumentative discourse are commonly analyzed in terms of argumentative units, consisting of claims and premises. Both argument detection and classification are complex discourse processing tasks. Our paper introduces a semantic classification of arguments that can help to facilitate argument detection. We report on our experiences with corpus annotations using a function-based classification of arguments and a procedure for operationalizing the scheme by using semantic templates.
2022.sigdial-1.6
kawaletz-etal-2022-developing
+ 10.18653/v1/2022.sigdial-1.6
Multi-Task Learning for Depression Detection in Dialogs
@@ -109,6 +115,7 @@
li-etal-2022-multi
chuyuanli/mtl4depr
DailyDialog
+ 10.18653/v1/2022.sigdial-1.7
To laugh or not to laugh? The use of laughter to mark discourse structure
@@ -118,6 +125,7 @@
A number of cues, both linguistic and non-linguistic, have been found to mark discourse structure in conversation. This paper investigates the role of laughter, one of the most encountered non-verbal vocalizations in human communication, in the signalling of turn boundaries. We employ a corpus of informal dyadic conversations to determine the likelihood of laughter at the end of speaker turns and to establish the potential role of laughter in discourse organization. Our results show that, on average, about 10% of the turns are marked by laughter, but also that the marking is subject to individual variation, as well as effects of other factors, such as the type of relationship between speakers. More importantly, we find that turn ends are twice more likely than transition relevance places to be marked by laughter, suggesting that, indeed, laughter plays a role in marking discourse structure.
2022.sigdial-1.8
ludusan-schuppler-2022-laugh
+ 10.18653/v1/2022.sigdial-1.8
QualityAdapt: an Automatic Dialogue Quality Estimation Framework
@@ -131,6 +139,7 @@
johndmendonca/qualityadapt
DailyDialog
FED
+ 10.18653/v1/2022.sigdial-1.9
Graph Neural Network Policies and Imitation Learning for Multi-Domain Task-Oriented Dialogues
@@ -142,6 +151,7 @@
Task-oriented dialogue systems are designed to achieve specific goals while conversing with humans. In practice, they may have to handle simultaneously several domains and tasks. The dialogue manager must therefore be able to take into account domain changes and plan over different domains/tasks in order to deal with multi-domain dialogues. However, learning with reinforcement in such context becomes difficult because the state-action dimension is larger while the reward signal remains scarce. Our experimental results suggest that structured policies based on graph neural networks combined with different degrees of imitation learning can effectively handle multi-domain dialogues. The reported experiments underline the benefit of structured policies over standard policies.
2022.sigdial-1.10
cordier-etal-2022-graph
+ 10.18653/v1/2022.sigdial-1.10
The DialPort tools
@@ -153,6 +163,7 @@
The DialPort project (http://dialport.org/), funded by the National Science Foundation (NSF), covers a group of tools and services that aim at fulfilling the needs of the dialog research community. Over the course of six years, several offerings have been created, including the DialPort Portal and DialCrowd. This paper describes these contributions, which will be demoed at SIGDIAL, including implementation, prior studies, corresponding discoveries, and the locations at which the tools will remain freely available to the community going forward.
2022.sigdial-1.11
huynh-etal-2022-dialport
+ 10.18653/v1/2022.sigdial-1.11
Simultaneous Job Interview System Using Multiple Semi-autonomous Agents
@@ -166,6 +177,7 @@
In recent years, spoken dialogue systems have been applied to job interviews where an applicant talks to a system that asks pre-defined questions, called on-demand and self-paced job interviews. We propose a simultaneous job interview system, where one interviewer can conduct one-on-one interviews with multiple applicants simultaneously by cooperating with the multiple autonomous job interview dialogue systems. However, it is challenging for interviewers to monitor and understand all the parallel interviews done by the autonomous system at the same time. As a solution to this issue, we implemented two automatic dialogue understanding functions: (1) response evaluation of each applicant’s responses and (2) keyword extraction as a summary of the responses. It is expected that interviewers, as needed, can intervene in one dialogue and smoothly ask a proper question that elaborates the interview. We report a pilot experiment where an interviewer conducted simultaneous job interviews with three candidates.
2022.sigdial-1.12
kawai-etal-2022-simultaneous
+ 10.18653/v1/2022.sigdial-1.12
Dialog Acts for Task Driven Embodied Agents
@@ -178,6 +190,7 @@
2022.sigdial-1.13
gella-etal-2022-dialog
+ 10.18653/v1/2022.sigdial-1.13
Symbol and Communicative Grounding through Object Permanence with a Mobile Robot
@@ -190,6 +203,7 @@
torres-foncesca-etal-2022-symbol
COCO
+ 10.18653/v1/2022.sigdial-1.14
Towards Personality-Aware Chatbots
@@ -203,6 +217,7 @@
2022.sigdial-1.15
fernau-etal-2022-towards
+ 10.18653/v1/2022.sigdial-1.15
Towards Socially Intelligent Agents with Mental State Transition and Human Value
@@ -218,6 +233,7 @@
2022.sigdial-1.16
qiu-etal-2022-towards
+ 10.18653/v1/2022.sigdial-1.16
Automatic Verbal Depiction of a Brick Assembly for a Robot Instructing Humans
@@ -230,6 +246,7 @@
2022.sigdial-1.17
younes-etal-2022-automatic
+ 10.18653/v1/2022.sigdial-1.17
Are Interaction Patterns Helpful for Task-Agnostic Dementia Detection? An Empirical Exploration
@@ -240,6 +257,7 @@
2022.sigdial-1.18
farzana-parde-2022-interaction
+ 10.18653/v1/2022.sigdial-1.18
EDU-AP: Elementary Discourse Unit based Argument Parser
@@ -252,6 +270,7 @@
saha-etal-2022-edu
sougata-ub/edu-ap
+ 10.18653/v1/2022.sigdial-1.19
Using Transition Duration to Improve Turn-taking in Conversational Agents
@@ -263,6 +282,7 @@
2022.sigdial-1.20
threlkeld-etal-2022-using
+ 10.18653/v1/2022.sigdial-1.20
DG2: Data Augmentation Through Document Grounded Dialogue Generation
@@ -282,6 +302,7 @@
QuAC
ShARC
doc2dial
+ 10.18653/v1/2022.sigdial-1.21
When can I Speak? Predicting initiation points for spoken dialogue agents
@@ -294,6 +315,7 @@
li-etal-2022-speak
siyan-sylvia-li/icarus_final
+ 10.18653/v1/2022.sigdial-1.22
Using Interaction Style Dimensions to Characterize Spoken Dialog Corpora
@@ -303,6 +325,7 @@
2022.sigdial-1.23
ward-2022-using
+ 10.18653/v1/2022.sigdial-1.23
Multi-Domain Dialogue State Tracking with Top-K Slot Self Attention
@@ -315,6 +338,7 @@
2022.sigdial-1.24
yang-etal-2022-multi
+ 10.18653/v1/2022.sigdial-1.24
Building a Knowledge-Based Dialogue System with Text Infilling
@@ -327,6 +351,7 @@
xue-etal-2022-building
OpenDialKG
+ 10.18653/v1/2022.sigdial-1.25
Generating Meaningful Topic Descriptions with Sentence Embeddings and LDA
@@ -339,6 +364,7 @@
2022.sigdial-1.26
sastre-martinez-etal-2022-generating
+ 10.18653/v1/2022.sigdial-1.26
How Well Do You Know Your Audience? Toward Socially-aware Question Generation
@@ -349,6 +375,7 @@
2022.sigdial-1.27
stewart-mihalcea-2022-well
+ 10.18653/v1/2022.sigdial-1.27
GenTUS: Simulating User Behaviour and Language in Task-oriented Dialogues with Generative Transformers
@@ -365,6 +392,7 @@
lin-etal-2022-gentus
MultiWOZ
+ 10.18653/v1/2022.sigdial-1.28
AARGH! End-to-end Retrieval-Generation for Task-Oriented Dialog
@@ -377,6 +405,7 @@
tomiinek/aargh
MultiWOZ
+ 10.18653/v1/2022.sigdial-1.29
A Systematic Evaluation of Response Selection for Open Domain Dialogue
@@ -389,6 +418,7 @@
2022.sigdial-1.30
hedayatnia-etal-2022-systematic
+ 10.18653/v1/2022.sigdial-1.30
Inferring Ranked Dialog Flows from Human-to-Human Conversations
@@ -399,6 +429,7 @@
2022.sigdial-1.31
sastre-martinez-nugent-2022-inferring
+ 10.18653/v1/2022.sigdial-1.31
Structured Dialogue Discourse Parsing
@@ -411,6 +442,7 @@
chijames/structured_dialogue_discourse_parsing
Molweni
+ 10.18653/v1/2022.sigdial-1.32
“Do you follow me?”: A Survey of Recent Approaches in Dialogue State Tracking
@@ -425,6 +457,7 @@
CrossWOZ
MultiWOZ
SGD
+ 10.18653/v1/2022.sigdial-1.33
MultiWOZ 2.4: A Multi-Domain Task-Oriented Dialogue Dataset with Essential Annotation Corrections to Improve State Tracking Evaluation
@@ -438,6 +471,7 @@
smartyfh/MultiWOZ2.4
MultiWOZ
+ 10.18653/v1/2022.sigdial-1.34
The Duration of a Turn Cannot be Used to Predict When It Ends
@@ -448,6 +482,7 @@
2022.sigdial-1.35
threlkeld-de-ruiter-2022-duration
+ 10.18653/v1/2022.sigdial-1.35
Getting Better Dialogue Context for Knowledge Identification by Leveraging Document-level Topic Shift
@@ -461,6 +496,7 @@
Doc2Dial
MultiDoc2Dial
doc2dial
+ 10.18653/v1/2022.sigdial-1.36
Neural Generation Meets Real People: Building a Social, Informative Open-Domain Dialogue Agent
@@ -490,6 +526,7 @@
2022.sigdial-1.37
chi-etal-2022-neural
+ 10.18653/v1/2022.sigdial-1.37
DeepCon: An End-to-End Multilingual Toolkit for Automatic Minuting of Multi-Party Dialogues
@@ -503,6 +540,7 @@
MuST-C
SAMSum Corpus
+ 10.18653/v1/2022.sigdial-1.38
ICM : Intent and Conversational Mining from Conversation Logs
@@ -515,6 +553,7 @@
2022.sigdial-1.39
mitra-etal-2022-icm
+ 10.18653/v1/2022.sigdial-1.39
Entity-based De-noising Modeling for Controllable Dialogue Summarization
@@ -525,6 +564,7 @@
2022.sigdial-1.40
liu-chen-2022-entity
+ 10.18653/v1/2022.sigdial-1.40
iEval: Interactive Evaluation Framework for Open-Domain Empathetic Chatbots
@@ -537,6 +577,7 @@
svikhnushina-etal-2022-ieval
sea94/ieval
+ 10.18653/v1/2022.sigdial-1.41
Unsupervised Domain Adaptation on Question-Answering System with Conversation Data
@@ -551,6 +592,7 @@
CNN/Daily Mail
Doc2Dial
doc2dial
+ 10.18653/v1/2022.sigdial-1.42
UniDU: Towards A Unified Generative Dialogue Understanding Framework
@@ -567,6 +609,7 @@
2022.sigdial-1.43
chen-etal-2022-unidu
+ 10.18653/v1/2022.sigdial-1.43
Advancing Semi-Supervised Task Oriented Dialog Systems by JSA Learning of Discrete Latent Variable Models
@@ -581,6 +624,7 @@
cai-etal-2022-advancing
cycrab/JSA-TOD
+ 10.18653/v1/2022.sigdial-1.44
Redwood: Using Collision Detection to Grow a Large-Scale Intent Classification Dataset
@@ -596,6 +640,7 @@
BANKING77
CLINC150
Talk2Car
+ 10.18653/v1/2022.sigdial-1.45
Dialogue Evaluation with Offline Reinforcement Learning
@@ -611,6 +656,7 @@
2022.sigdial-1.46
lubis-etal-2022-dialogue
+ 10.18653/v1/2022.sigdial-1.46
Disruptive Talk Detection in Multi-Party Dialogue within Collaborative Learning Environments with a Regularized User-Aware Network
@@ -626,6 +672,7 @@
2022.sigdial-1.47
park-etal-2022-disruptive
+ 10.18653/v1/2022.sigdial-1.47
Generating Discourse Connectives with Pre-trained Language Models: Conditioning on Discourse Relations Helps Reconstruct the PDTB
@@ -639,6 +686,7 @@
stevens-guille-etal-2022-generating
symonjorystevens-guille/penngen
+ 10.18653/v1/2022.sigdial-1.48
Toward Self-Learning End-to-End Task-oriented Dialog Systems
@@ -651,6 +699,7 @@
2022.sigdial-1.49
zhang-etal-2022-toward-self
+ 10.18653/v1/2022.sigdial-1.49
Combining Structured and Unstructured Knowledge in an Interactive Search Dialogue System
@@ -666,6 +715,7 @@
SGD
SNLI
+ 10.18653/v1/2022.sigdial-1.50
How Much Does Prosody Help Turn-taking? Investigations using Voice Activity Projection Models
@@ -677,6 +727,7 @@
ekstedt-skantze-2022-much
erikekstedt/conv_ssl
+ 10.18653/v1/2022.sigdial-1.51
What makes you change your mind? An empirical investigation in online group decision-making conversations
@@ -689,6 +740,7 @@
karadzhov-etal-2022-makes
DeliData
+ 10.18653/v1/2022.sigdial-1.52
Dialogue Term Extraction using Transfer Learning and Topological Data Analysis
@@ -705,6 +757,7 @@
MultiWOZ
SGD
+ 10.18653/v1/2022.sigdial-1.53
Evaluating N-best Calibration of Natural Language Understanding for Dialogue Systems
@@ -717,6 +770,7 @@
khojah-etal-2022-evaluating
ranimkhojah/confidence-estimation-benchmark
+ 10.18653/v1/2022.sigdial-1.54
LAD: Language Models as Data for Zero-Shot Dialog
@@ -732,6 +786,7 @@
CLINC150
HWU64
STAR
+ 10.18653/v1/2022.sigdial-1.55
Improving Bot Response Contradiction Detection via Utterance Rewriting
@@ -747,6 +802,7 @@
jind11/utterance-rewriting
CANARD
DailyDialog
+ 10.18653/v1/2022.sigdial-1.56
Comparison of Lexical Alignment with a Teachable Robot in Human-Robot and Human-Human-Robot Interactions
@@ -761,6 +817,7 @@
Speakers build rapport in the process of aligning conversational behaviors with each other. Rapport engendered with a teachable agent while instructing domain material has been shown to promote learning. Past work on lexical alignment in the field of education suffers from limitations in both the measures used to quantify alignment and the types of interactions in which alignment with agents has been studied. In this paper, we apply alignment measures based on a data-driven notion of shared expressions (possibly composed of multiple words) and compare alignment in one-on-one human-robot (H-R) interactions with the H-R portions of collaborative human-human-robot (H-H-R) interactions. We find that students in the H-R setting align with a teachable robot more than in the H-H-R setting and that the relationship between lexical alignment and rapport is more complex than what is predicted by previous theoretical and empirical work.
2022.sigdial-1.57
asano-etal-2022-comparison
+ 10.18653/v1/2022.sigdial-1.57
TREND: Trigger-Enhanced Relation-Extraction Network for Dialogues
@@ -774,6 +831,7 @@
miulab/trend
DDRel
DialogRE
+ 10.18653/v1/2022.sigdial-1.58
User Satisfaction Modeling with Domain Adaptation in Task-oriented Dialogue Systems
@@ -787,6 +845,7 @@
pan-etal-2022-user
EmoryNLP
SGD
+ 10.18653/v1/2022.sigdial-1.59
N-best Response-based Analysis of Contradiction-awareness in Neural Response Generation Models
@@ -801,6 +860,7 @@
2022.sigdial-1.60
sato-etal-2022-n
shiki-sato/nbest-contradiction-analysis
+ 10.18653/v1/2022.sigdial-1.60
A Visually-Aware Conversational Robot Receptionist
@@ -816,6 +876,7 @@
Socially Assistive Robots (SARs) have the potential to play an increasingly important role in a variety of contexts including healthcare, but most existing systems have very limited interactive capabilities. We will demonstrate a robot receptionist that not only supports task-based and social dialogue via natural spoken conversation but is also capable of visually grounded dialogue; able to perceive and discuss the shared physical environment (e.g. helping users to locate personal belongings or objects of interest). Task-based dialogues include check-in, navigation and FAQs about facilities, alongside social features such as chit-chat, access to the latest news and a quiz game to play while waiting. We also show how visual context (objects and their spatial relations) can be combined with linguistic representations of dialogue context, to support visual dialogue and question answering. We will demonstrate the system on a humanoid ARI robot, which is being deployed in a hospital reception area.
2022.sigdial-1.61
gunson-etal-2022-visually
+ 10.18653/v1/2022.sigdial-1.61
Demonstrating EMMA: Embodied MultiModal Agent for Language-guided Action Execution in 3D Simulated Environments
@@ -835,6 +896,7 @@
suglia-etal-2022-demonstrating
AI2-THOR
ALFRED
+ 10.18653/v1/2022.sigdial-1.62
GRILLBot: A multi-modal conversational agent for complex real-world tasks
@@ -848,6 +910,7 @@
We present GRILLBot, an open-source multi-modal task-oriented voice assistant to help users perform complex tasks, focusing on the domains of cooking and home improvement. GRILLBot curates and leverages web information extraction to build coverage over a broad range of tasks for which a user can receive guidance. To represent each task, we propose TaskGraphs as a dynamic graph unifying steps, requirements, and curated domain knowledge enabling contextual question answering, and detailed explanations. Multi-modal elements play a key role in GRILLBot both helping the user navigate through the task and enriching the experience with helpful videos and images that are automatically linked throughout the task. We leverage a contextual neural semantic parser to enable flexible navigation when interacting with the system by jointly encoding stateful information with the conversation history. GRILLBot enables dynamic and adaptable task planning and assistance for complex tasks by combining elements of task representations that incorporate text and structure, combined with neural models for search, question answering, and dialogue state management. GRILLBot competed in the Alexa prize TaskBot Challenge as one of the finalists.
2022.sigdial-1.63
gemmell-etal-2022-grillbot
+ 10.18653/v1/2022.sigdial-1.63
A System For Robot Concept Learning Through Situated Dialogue
@@ -860,6 +923,7 @@
2022.sigdial-1.64
kane-etal-2022-system
HuRDL
+ 10.18653/v1/2022.sigdial-1.64
diff --git a/data/xml/2023.inlg.xml b/data/xml/2023.inlg.xml
index 673c53dca1..047d41f619 100644
--- a/data/xml/2023.inlg.xml
+++ b/data/xml/2023.inlg.xml
@@ -26,6 +26,7 @@
In this paper, we introduce a new beam search algorithm that improves the generalization of neural generators to unseen examples, especially in low-resource data-to-text settings. Our algorithm aims to reduce the number of omissions and hallucinations during the decoding process. For this purpose, it relies on two regression models to explicitly characterize factual errors. We explain how to create a new dataset to train these models given an original training set of less than a thousand data points. We apply our approach in the low-resource, legal setting using the French Plum2Text dataset, as well as in English using WebNLG. We observe in our experiment that this combination improves the faithfulness of pre-trained neural text generators using both human and automatic evaluation. Moreover, our approach offers a level of interpretability by predicting the number of omissions and hallucinations present in a given generation with respect to the input data. Finally, we visualize our algorithm’s exploration of the hypothesis space at different steps during the decoding process.
2023.inlg-main.1
garneau-lamontagne-2023-guided
+ 10.18653/v1/2023.inlg-main.1
XF2T: Cross-lingual Fact-to-Text Generation for Low-Resource Languages
@@ -39,6 +40,7 @@
Multiple business scenarios require an automated generation of descriptive human-readable text from structured input data. This has resulted into substantial work on fact-to-text generation systems recently. Unfortunately, previous work on fact-to-text (F2T) generation has focused primarily on English mainly due to the high availability of relevant datasets. Only recently, the problem of cross-lingual fact-to-text (XF2T) was proposed for generation across multiple languages alongwith a dataset, XAlign for eight languages. However, there has been no rigorous work on the actual XF2T generation problem. We extend XAlign dataset with annotated data for four more languages: Punjabi, Malayalam, Assamese and Oriya. We conduct an extensive study using popular Transformer-based text generation models on our extended multi-lingual dataset, which we call XAlignV2. Further, we investigate the performance of different text generation strategies: multiple variations of pretraining, fact-aware embeddings and structure-aware input encoding. Our extensive experiments show that a multi-lingual mT5 model which uses fact-aware embeddings with structure-aware input encoding leads to best results (30.90 BLEU, 55.12 METEOR and 59.17 chrF++) across the twelve languages. We make our code, dataset and model publicly available, and hope that this will help advance further research in this critical area.
2023.inlg-main.2
sagare-etal-2023-xf2t
+ 10.18653/v1/2023.inlg-main.2
Preventing Generation of Verbatim Memorization in Language Models Gives a False Sense of Privacy
@@ -54,6 +56,7 @@
Studying data memorization in neural language models helps us understand the risks (e.g., to privacy or copyright) associated with models regurgitating training data and aids in the development of countermeasures. Many prior works—and some recently deployed defenses—focus on “verbatim memorization”, defined as a model generation that exactly matches a substring from the training set. We argue that verbatim memorization definitions are too restrictive and fail to capture more subtle forms of memorization. Specifically, we design and implement an efficient defense that _perfectly_ prevents all verbatim memorization. And yet, we demonstrate that this “perfect” filter does not prevent the leakage of training data. Indeed, it is easily circumvented by plausible and minimally modified “style-transfer” prompts—and in some cases even the non-modified original prompts—to extract memorized information. We conclude by discussing potential alternative definitions and why defining memorization is a difficult yet crucial open question for neural language models.
2023.inlg-main.3
ippolito-etal-2023-preventing
+ 10.18653/v1/2023.inlg-main.3
Fine-Tuning GPT-3 for Synthetic Danish News Generation
@@ -63,6 +66,7 @@
While GPT-3 has garnered significant attention for its capabilities in natural language generation, research on its use outside of English is still relatively limited. We focus on how GPT-3 can be fine-tuned for generating synthetic news articles in a low-resource language, namely Danish. The model’s performance is evaluated on the dimensions of human and machine detection in two separate experiments. When presented with either a real or GPT-3 generated news article, human participants achieve a 58.1% classification accuracy. Contrarily, a fine-tuned BERT classifier obtains a 92.7% accuracy on the same task. This discrepancy likely pertains to the fine-tuned GPT-3 model oversampling high-likelihood tokens in its text generation. Although this is undetectable to the human eye, it leaves a statistical discrepancy for machine classifiers to detect. We address how decisions in the experimental design favoured the machine classifiers over the human evaluators, and whether the produced synthetic articles are applicable in a real-world context.
2023.inlg-main.4
almasi-schionning-2023-fine
+ 10.18653/v1/2023.inlg-main.4
GAN-LM: Generative Adversarial Network using Language Models for Downstream Applications
@@ -75,6 +79,7 @@
2023.inlg-main.5
2023.inlg-main.5.Supplementary_Attachment.zip
hwang-etal-2023-gan
+ 10.18653/v1/2023.inlg-main.5
Summaries as Captions: Generating Figure Captions for Scientific Documents with Automated Text Summarization
@@ -92,6 +97,7 @@
2023.inlg-main.6
2023.inlg-main.6.Supplementary_Attachment.pdf
huang-etal-2023-summaries
+ 10.18653/v1/2023.inlg-main.6
Models of reference production: How do they withstand the test of time?
@@ -102,6 +108,7 @@
In recent years, many NLP studies have focused solely on performance improvement. In this work, we focus on the linguistic and scientific aspects of NLP. We use the task of generating referring expressions in context (REG-in-context) as a case study and start our analysis from GREC, a comprehensive set of shared tasks in English that addressed this topic over a decade ago. We ask what the performance of models would be if we assessed them (1) on more realistic datasets, and (2) using more advanced methods. We test the models using different evaluation metrics and feature selection experiments. We conclude that GREC can no longer be regarded as offering a reliable assessment of models’ ability to mimic human reference production, because the results are highly impacted by the choice of corpus and evaluation metrics. Our results also suggest that pre-trained language models are less dependent on the choice of corpus than classic Machine Learning models, and therefore make more robust class predictions.
2023.inlg-main.7
same-etal-2023-models
+ 10.18653/v1/2023.inlg-main.7
Generating Faithful Text From a Knowledge Graph with Noisy Reference Text
@@ -115,6 +122,7 @@
2023.inlg-main.8
2023.inlg-main.8.Supplementary_Attachment.pdf
hashem-etal-2023-generating
+ 10.18653/v1/2023.inlg-main.8
Entropy-based Sampling for Abstractive Multi-document Summarization in Low-resource Settings
@@ -125,6 +133,7 @@
Research in Multi-document Summarization (MDS) mostly focuses on the English language and depends on large MDS datasets that are not available for other languages. Some of these approaches concatenate the source documents, resulting in overlong model inputs. Existing transformer architectures are unable to process such long inputs entirely, omitting documents in the summarization process. Other solutions address this issue by implementing multi-stage approaches that also require changes in the model architecture. In this paper, we introduce various sampling approaches based on information entropy that allow us to perform MDS in a single stage. These approaches also consider all source documents without using MDS training data nor changing the model’s architecture. Besides, we build a MDS test set of German news articles to assess the performance of our methods on abstractive multi-document summaries. Experimental results show that our entropy-based approaches outperform previous state-of-the-art on German MDS, while still remaining primarily abstractive. We release our code and MDS test set to encourage further research in German abstractive MDS.
2023.inlg-main.9
mascarell-etal-2023-entropy
+ 10.18653/v1/2023.inlg-main.9
Claim Optimization in Computational Argumentation
@@ -135,6 +144,7 @@
An optimal delivery of arguments is key to persuasion in any debate, both for humans and for AI systems. This requires the use of clear and fluent claims relevant to the given debate. Prior work has studied the automatic assessment of argument quality extensively. Yet, no approach actually improves the quality so far. To fill this gap, this paper proposes the task of claim optimization: to rewrite argumentative claims in order to optimize their delivery. As multiple types of optimization are possible, we approach this task by first generating a diverse set of candidate claims using a large language model, such as BART, taking into account contextual information. Then, the best candidate is selected using various quality metrics. In automatic and human evaluation on an English-language corpus, our quality-based candidate selection outperforms several baselines, improving 60% of all claims (worsening 16% only). Follow-up analyses reveal that, beyond copy editing, our approach often specifies claims with details, whereas it adds less evidence than humans do. Moreover, its capabilities generalize well to other domains, such as instructional texts.
2023.inlg-main.10
skitalinskaya-etal-2023-claim
+ 10.18653/v1/2023.inlg-main.10
ChatGPT’s Information Seeking Strategy: Insights from the 20-Questions Game
@@ -147,6 +157,7 @@
2023.inlg-main.11
2023.inlg-main.11.Supplementary_Attachment.pdf
bertolazzi-etal-2023-chatgpts
+ 10.18653/v1/2023.inlg-main.11
This is not correct! Negation-aware Evaluation of Language Generation Systems
@@ -157,6 +168,7 @@
Large language models underestimate the impact of negations on how much they change the meaning of a sentence. Therefore, learned evaluation metrics based on these models are insensitive to negations. In this paper, we propose NegBLEURT, a negation-aware version of the BLEURT evaluation metric. For that, we designed a rule-based sentence negation tool and used it to create the CANNOT negation evaluation dataset. Based on this dataset, we fine-tuned a sentence transformer and an evaluation metric to improve their negation sensitivity. Evaluating these models on existing benchmarks shows that our fine-tuned models outperform existing metrics on the negated sentences by far while preserving their base models’ performances on other perturbations.
2023.inlg-main.12
anschutz-etal-2023-correct
+ 10.18653/v1/2023.inlg-main.12
Guidance in Radiology Report Summarization: An Empirical Evaluation and Error Analysis
@@ -168,6 +180,7 @@
Automatically summarizing radiology reports into a concise impression can reduce the manual burden of clinicians and improve the consistency of reporting. Previous work aimed to enhance content selection and factuality through guided abstractive summarization. However, two key issues persist. First, current methods heavily rely on domain-specific resources to extract the guidance signal, limiting their transferability to domains and languages where those resources are unavailable. Second, while automatic metrics like ROUGE show progress, we lack a good understanding of the errors and failure modes in this task. To bridge these gaps, we first propose a domain-agnostic guidance signal in form of variable-length extractive summaries. Our empirical results on two English benchmarks demonstrate that this guidance signal improves upon unguided summarization while being competitive with domain-specific methods. Additionally, we run an expert evaluation of four systems according to a taxonomy of 11 fine-grained errors. We find that the most pressing differences between automatic summaries and those of radiologists relate to content selection including omissions (up to 52%) and additions (up to 57%). We hypothesize that latent reporting factors and corpus-level inconsistencies may limit models to reliably learn content selection from the available data, presenting promising directions for future work.
2023.inlg-main.13
trienes-etal-2023-guidance
+ 10.18653/v1/2023.inlg-main.13
A Zero-Shot Approach for Multi-User Task-Oriented Dialog Generation
@@ -180,6 +193,7 @@
2023.inlg-main.14
2023.inlg-main.14.Supplementary_Attachment.pdf
surya-etal-2023-zero
+ 10.18653/v1/2023.inlg-main.14
Beyond the Bias: Unveiling the Quality of Implicit Causality Prompt Continuations in Language Models
@@ -191,6 +205,7 @@
Recent studies have used human continuations of Implicit Causality (IC) prompts collected in linguistic experiments to evaluate discourse understanding in large language models (LLMs), focusing on the well-known IC coreference bias in the LLMs’ predictions of the next word following the prompt. In this study, we investigate how continuations of IC prompts can be used to evaluate the text generation capabilities of LLMs in a linguistically controlled setting. We conduct an experiment using two open-source GPT-based models, employing human evaluation to assess different aspects of continuation quality. Our findings show that LLMs struggle in particular with generating coherent continuations in this rather simple setting, indicating a lack of discourse knowledge beyond the well-known IC bias. Our results also suggest that a bias congruent continuation does not necessarily equate to a higher continuation quality. Furthermore, our study draws upon insights from the Uniform Information Density hypothesis, testing different prompt modifications and decoding procedures and showing that sampling-based methods are particularly sensitive to the information density of the prompts.
2023.inlg-main.15
sieker-etal-2023-beyond
+ 10.18653/v1/2023.inlg-main.15
Enhancing factualness and controllability of Data-to-Text Generation via data Views and constraints
@@ -205,6 +220,7 @@
2023.inlg-main.16
2023.inlg-main.16.Supplementary_Attachment.pdf
thomson-etal-2023-enhancing
+ 10.18653/v1/2023.inlg-main.16
Memories for Virtual AI Characters
@@ -215,6 +231,7 @@
In this paper, we present a system for augmenting virtual AI characters with long-term memory, enabling them to remember facts about themselves, their world, and past experiences. We propose a memory-creation pipeline that converts raw text into condensed memories and a memory-retrieval system that utilizes these memories to generate character responses. Using a fact-checking pipeline based on GPT-4, our evaluation demonstrates that the character responses are grounded in the retrieved memories and maintain factual accuracy. We discuss the implications of our system for creating engaging and consistent virtual characters and highlight areas for future research, including large language model (LLM) guardrailing and virtual character personality development.
2023.inlg-main.17
landwehr-etal-2023-memories
+ 10.18653/v1/2023.inlg-main.17
Metric-Based In-context Learning: A Case Study in Text Simplification
@@ -224,6 +241,7 @@
In-context learning (ICL) for large language models has proven to be a powerful approach for many natural language processing tasks. However, determining the best method to select examples for ICL is nontrivial as the results can vary greatly depending on the quality, quantity, and order of examples used. In this paper, we conduct a case study on text simplification (TS) to investigate how to select the best and most robust examples for ICL. We propose the Metric-Based In-context Learning (MBL) method, which utilizes commonly used TS metrics such as SARI, compression ratio, and BERT-Precision for selection. Through an extensive set of experiments with various-sized GPT models on standard TS benchmarks such as TurkCorpus and ASSET, we show that examples selected by the top SARI scores perform best on larger models such as GPT-175B, while the compression ratio generally performs better on smaller models such as GPT-13B and GPT-6.7B. Furthermore, we demonstrate that MBL is generally robust to example orderings and out-of-domain test sets, and outperforms strong baselines and state-of-the-art finetuned language models. Finally, we show that the behavior of large GPT models can be implicitly controlled by the chosen metric. Our research provides a new framework for selecting examples in ICL, and demonstrates its effectiveness in text simplification tasks, breaking new ground for more accurate and efficient NLG systems.
2023.inlg-main.18
vadlamannati-sahin-2023-metric
+ 10.18653/v1/2023.inlg-main.18
Exploring the Naturalness of Cognitive Status-Informed Referring Form Selection Models
@@ -235,6 +253,7 @@
Language-capable robots must be able to efficiently and naturally communicate about objects in the environment. A key part of communication is Referring Form Selection (RFS): the process of selecting a form like it, that, or the N to use when referring to an object. Recent cognitive status-informed computational RFS models have been evaluated in terms of goodness-of-fit to human data. But it is as yet unclear whether these models actually select referring forms that are any more natural than baseline alternatives, regardless of goodness-of-fit. Through a human subject study designed to assess this question, we show that even though cognitive status-informed referring selection models achieve good fit to human data, they do not (yet) produce concrete benefits in terms of naturality. On the other hand, our results show that human utterances also had high variability in perceived naturality, demonstrating the challenges of evaluating RFS naturality.
2023.inlg-main.19
del-castillo-etal-2023-exploring
+ 10.18653/v1/2023.inlg-main.19
System-Initiated Transitions from Chit-Chat to Task-Oriented Dialogues with Transition Info Extractor and Transition Sentence Generator
@@ -246,6 +265,7 @@
In this work, we study dialogue scenarios that start from chit-chat but eventually switch to task-related services, and investigate how a unified dialogue model, which can engage in both chit-chat and task-oriented dialogues, takes the initiative during the dialogue mode transition from chit-chat to task-oriented in a coherent and cooperative manner. We first build a transition info extractor (TIE) that keeps track of the preceding chit-chat interaction and detects the potential user intention to switch to a task-oriented service. Meanwhile, in the unified model, a transition sentence generator (TSG) is extended through efficient Adapter tuning and transition prompt learning. When the TIE successfully finds task-related information from the preceding chit-chat, such as a transition domain (e.g., “train”), the TSG is activated automatically in the unified model to initiate this transition by generating a transition sentence under the guidance of transition information extracted by TIE. The experimental results show promising performance regarding the proactive transitions. We achieve an additional large improvement on the TIE model by utilizing Conditional Random Fields (CRF). The TSG can flexibly generate transition sentences while maintaining the unified capabilities of normal chit-chat and task-oriented response generation.
2023.inlg-main.20
liu-etal-2023-system-initiated
+ 10.18653/v1/2023.inlg-main.20
HL Dataset: Visually-grounded Description of Scenes, Actions and Rationales
@@ -256,6 +276,7 @@
Current captioning datasets focus on object-centric captions, describing the visible objects in the image, often ending up stating the obvious (for humans), e.g. “people eating food in a park”. Although these datasets are useful to evaluate the ability of Vision & Language models to recognize and describe visual content, they do not support controlled experiments involving model testing or fine-tuning, with more high-level captions, which humans find easy and natural to produce. For example, people often describe images based on the type of scene they depict (“people at a holiday resort”) and the actions they perform (“people having a picnic”). Such concepts are based on personal experience and contribute to forming common sense assumptions. We present the High-Level Dataset, a dataset extending 14997 images from the COCO dataset, aligned with a new set of 134,973 human-annotated (high-level) captions collected along three axes: scenes, actions and rationales. We further extend this dataset with confidence scores collected from an independent set of readers, as well as a set of narrative captions generated synthetically, by combining each of the three axes. We describe this dataset and analyse it extensively. We also present baseline results for the High-Level Captioning task.
2023.inlg-main.21
cafagna-etal-2023-hl
+ 10.18653/v1/2023.inlg-main.21
Validating Predictive Models Of Evaluative Language For Controllable Data2Text Generation
@@ -266,6 +287,7 @@
2023.inlg-main.22
2023.inlg-main.22.Supplementary_Attachment.zip
langner-klabunde-2023-validating
+ 10.18653/v1/2023.inlg-main.22
The Next Chapter: A Study of Large Language Models in Storytelling
@@ -277,6 +299,7 @@
2023.inlg-main.23
2023.inlg-main.23.Supplementary_Attachment.pdf
xie-etal-2023-next
+ 10.18653/v1/2023.inlg-main.23
Trustworthiness of Children Stories Generated by Large Language Models
@@ -286,6 +309,7 @@
Large Language Models (LLMs) have shown a tremendous capacity for generating literary text. However, their effectiveness in generating children’s stories has yet to be thoroughly examined. In this study, we evaluate the trustworthiness of children’s stories generated by LLMs using various measures, and we compare and contrast our results with both old and new children’s stories to better assess their significance. Our findings suggest that LLMs still struggle to generate children’s stories at the level of quality and nuance found in actual stories.
2023.inlg-main.24
bhandari-brennan-2023-trustworthiness
+ 10.18653/v1/2023.inlg-main.24
On Text Style Transfer via Style-Aware Masked Language Models
@@ -297,6 +321,7 @@
Text Style Transfer (TST) is performable through approaches such as latent space disentanglement, cycle-consistency losses, prototype editing, etc. The prototype editing approach, which is known to be quite successful in TST, involves two key phases: a) masking of source style-associated tokens and b) reconstruction of this source-style masked sentence conditioned on the target style. We follow a similar transduction method, in which we transpose the more difficult direct source-to-target TST task to a simpler Style-Masked Language Model (SMLM) task, wherein, similar to BERT (CITATION), the goal of our model is now to reconstruct the source sentence from its style-masked version. We arrive at the SMLM mechanism naturally by formulating prototype editing/transduction methods in a probabilistic framework, where TST resolves into estimating a hypothetical parallel dataset from a partially observed parallel dataset, wherein each domain is assumed to have a common latent style-masked prior. To generate this style-masked prior, we use “Explainable Attention” as our choice of attribution for a more precise style-masking step and also introduce a cost-effective and accurate “Attribution-Surplus” method of determining the position of masks from any arbitrary attribution model in O(1) time. We empirically show that this non-generational approach suits the “content preserving” criteria well for a task like TST, even for a complex style like Discourse Manipulation. Our model, the Style MLM, outperforms strong TST baselines and is on par with state-of-the-art TST models, which use complex architectures and orders of magnitude more parameters.
2023.inlg-main.25
narasimhan-etal-2023-text
+ 10.18653/v1/2023.inlg-main.25
Affective Natural Language Generation of Event Descriptions through Fine-grained Appraisal Conditions
@@ -306,6 +331,7 @@
Models for affective text generation have shown remarkable progress, but they commonly rely only on basic emotion theories or valence/arousal values as conditions. This is appropriate when the goal is to create explicit emotion statements (“The kid is happy.”). Emotions are, however, commonly communicated implicitly. For instance, the emotional interpretation of an event (“Their dog died.”) often does not require an explicit emotion statement. In psychology, appraisal theories explain the link between a cognitive evaluation of an event and the potentially developed emotion. They put the assessment of the situation on the spot, for instance regarding one’s own control or the responsibility for what happens. We hypothesize and subsequently show that including appraisal variables as conditions in a generation framework comes with two advantages. (1) The generation model is informed in greater detail about what makes a specific emotion and what properties it has. This leads to text generation that better fulfills the condition. (2) The variables of appraisal allow a user to perform more fine-grained control of the generated text, by stating properties of a situation instead of only providing the emotion category. Our BART- and T5-based experiments with 7 emotions (Anger, Disgust, Fear, Guilt, Joy, Sadness, Shame), and 7 appraisals (Attention, Responsibility, Control, Circumstance, Pleasantness, Effort, Certainty) show that (1) adding appraisals during training improves the accuracy of the generated texts by 10 pp in F1. Further, (2) the texts with appraisal variables are longer and contain more details. This exemplifies the greater control for users.
2023.inlg-main.26
menchaca-resendiz-klinger-2023-affective
+ 10.18653/v1/2023.inlg-main.26
Leveraging Low-resource Parallel Data for Text Style Transfer
@@ -315,6 +341,7 @@
Text style transfer (TST) involves transforming a text into a desired style while approximately preserving its content. The biggest challenge in TST is the general lack of parallel data. Many existing approaches rely on complex models using substantial non-parallel data, with mixed results. In this paper, we leverage a pretrained BART language model with minimal parallel data and incorporate low-resource methods such as hyperparameter tuning, data augmentation, and self-training, which have not been explored in TST. We further include novel style-based rewards in the training loss. Through extensive experiments in sentiment transfer, a sub-task of TST, we demonstrate that our simple yet effective approaches achieve well-balanced results, surpassing non-parallel approaches and highlighting the usefulness of parallel data even in small amounts.
2023.inlg-main.27
mukherjee-dusek-2023-leveraging
+ 10.18653/v1/2023.inlg-main.27
Reverse-Engineering Decoding Strategies Given Blackbox Access to a Language Generation System
@@ -327,6 +354,7 @@
Neural language models are increasingly deployed into APIs and websites that allow a user to pass in a prompt and receive generated text. Many of these systems do not reveal generation parameters. In this paper, we present methods to reverse-engineer the decoding method used to generate text (i.e., top-k or nucleus sampling). Our ability to discover which decoding strategy was used has implications for detecting generated text. Additionally, the process of discovering the decoding strategy can reveal biases caused by selecting decoding settings which severely truncate a model’s predicted distributions. We perform our attack on several families of open-source language models, as well as on production systems (e.g., ChatGPT).
2023.inlg-main.28
ippolito-etal-2023-reverse
+ 10.18653/v1/2023.inlg-main.28
Controlling keywords and their positions in text generation
@@ -340,6 +368,7 @@
2023.inlg-main.29
2023.inlg-main.29.Supplementary_Attachment.pdf
sasazawa-etal-2023-controlling
+ 10.18653/v1/2023.inlg-main.29
Tackling Hallucinations in Neural Chart Summarization
@@ -351,6 +380,7 @@
Hallucinations in text generation occur when the system produces text that is not grounded in the input. In this work, we tackle the problem of hallucinations in neural chart summarization. Our analysis shows that the target side of chart summarization training datasets often contains additional information, leading to hallucinations. We propose a natural language inference (NLI) based method to preprocess the training data and show through human evaluation that our method significantly reduces hallucinations. We also found that shortening long-distance dependencies in the input sequence and adding chart-related information like title and legends improves the overall performance.
2023.inlg-main.30
obaid-ul-islam-etal-2023-tackling
+ 10.18653/v1/2023.inlg-main.30
Learning Disentangled Meaning and Style Representations for Positive Text Reframing
@@ -363,6 +393,7 @@
The positive text reframing (PTR) task, which generates a text that gives a positive perspective while preserving the sense of the input text, has attracted considerable attention as an NLP application. Due to the significant representation capability of pre-trained language models (PLMs), a strong baseline can be easily obtained by just fine-tuning a PLM. However, how to interpret a diversity of contexts to give a positive perspective is still an open problem, and the issue is more serious when the size of the training data is limited. In this paper, we present a PTR framework that learns representations in which the meaning and style of text are structurally disentangled. The method utilizes pseudo-positive reframing datasets generated with two augmentation strategies. A simple but effective multi-task learning-based model is trained to fuse the generation capabilities from these datasets. Experimental results on the Positive Psychology Frames (PPF) dataset show that our approach outperforms the baselines, surpassing BART on five and T5 on six evaluation metrics. Our source code and data are available online.
2023.inlg-main.31
sheng-etal-2023-learning
+ 10.18653/v1/2023.inlg-main.31
Generating clickbait spoilers with an ensemble of large language models
@@ -372,6 +403,7 @@
Clickbait posts are a widespread problem in the webspace. The generation of spoilers, i.e. short texts that neutralize clickbait by providing information that makes it uninteresting, is one of the proposed solutions to the problem. Current state-of-the-art methods are based on passage retrieval or question answering approaches and are limited to generating spoilers only in the form of a phrase or a passage. In this work, we propose an ensemble of fine-tuned large language models for clickbait spoiler generation. Our approach is not limited to phrase or passage spoilers, but is also able to generate multipart spoilers that refer to several non-consecutive parts of text. Experimental evaluation demonstrates that the proposed ensemble model outperforms the baselines in terms of BLEU, METEOR and BERTScore metrics.
2023.inlg-main.32
wozny-lango-2023-generating
+ 10.18653/v1/2023.inlg-main.32
Reducing named entity hallucination risk to ensure faithful summary generation
@@ -384,6 +416,7 @@
2023.inlg-main.33
2023.inlg-main.33.Supplementary_Attachment.pdf
akani-etal-2023-reducing
+ 10.18653/v1/2023.inlg-main.33
Building a dual dataset of text- and image-grounded conversations and summarisation in Gàidhlig (Scottish Gaelic)
@@ -396,6 +429,7 @@
2023.inlg-main.34
2023.inlg-main.34.Supplementary_Attachment.pdf
howcroft-etal-2023-building
+ 10.18653/v1/2023.inlg-main.34
Generating Multiple Questions from Presentation Transcripts: A Pilot Study on Earnings Conference Calls
@@ -407,6 +441,7 @@
In various scenarios, such as conference oral presentations, company managers’ talks, and politicians’ speeches, individuals often contemplate the potential questions that may arise from their presentations. This common practice prompts the research question addressed in this study: to what extent can models generate multiple questions based on a given presentation transcript? To investigate this, we conduct pilot explorations using earnings conference call transcripts, which serve as regular meetings between professional investors and company managers. We experiment with different task settings and methods and evaluate the results from various perspectives. Our findings highlight that incorporating key points retrieval techniques enhances the accuracy and diversity of the generated questions.
2023.inlg-main.35
juan-etal-2023-generating
+ 10.18653/v1/2023.inlg-main.35
Mod-D2T: A Multi-layer Dataset for Modular Data-to-Text Generation
@@ -418,6 +453,7 @@
Rule-based text generators lack the coverage and fluency of their neural counterparts, but have two big advantages over them: (i) they are entirely controllable and do not hallucinate; and (ii) they can fully explain how an output was generated from an input. In this paper we leverage these two advantages to create large and reliable synthetic datasets with multiple human-intelligible intermediate representations. We present the Modular Data-to-Text (Mod-D2T) Dataset which incorporates ten intermediate-level representations between input triple sets and output text; the mappings from one level to the next can broadly be interpreted as the traditional modular tasks of an NLG pipeline. We describe the Mod-D2T dataset, evaluate its quality via manual validation and discuss its applications and limitations. Data, code and documentation are available at https://github.com/mille-s/Mod-D2T.
2023.inlg-main.36
mille-etal-2023-mod
+ 10.18653/v1/2023.inlg-main.36
diff --git a/data/xml/2023.sigdial.xml b/data/xml/2023.sigdial.xml
index 204c60cc3b..120830841f 100644
--- a/data/xml/2023.sigdial.xml
+++ b/data/xml/2023.sigdial.xml
@@ -28,6 +28,7 @@
Training dialogue systems often entails dealing with noisy training examples and unexpected user inputs. Despite their prevalence, there is currently no accurate survey of dialogue noise, nor a clear sense of the impact of each noise type on task performance. This paper addresses this gap by first constructing a taxonomy of noise encountered by dialogue systems. In addition, we run a series of experiments to show how different models behave when subjected to varying levels and types of noise. Our results reveal that models are quite robust to label errors commonly tackled by existing denoising algorithms, but that performance suffers from dialogue-specific noise. Driven by these observations, we design a data cleaning algorithm specialized for conversational settings and apply it as a proof-of-concept for targeted dialogue denoising.
2023.sigdial-1.1
chen-yu-2023-sources
+ 10.18653/v1/2023.sigdial-1.1
Investigating Explicitation of Discourse Connectives in Translation using Automatic Annotations
@@ -40,6 +41,7 @@
Discourse relations have different patterns of marking across different languages. As a result, discourse connectives are often added, omitted, or rephrased in translation. Prior work has shown a tendency for explicitation of discourse connectives, but such work was conducted using restricted sample sizes due to the difficulty of connective identification and alignment. The current study exploits automatic methods to facilitate a large-scale study of connectives in English and German parallel texts. Our results, based on over 300 types and 18,000 instances of aligned connectives and an empirical approach to comparing the cross-lingual specificity gap, provide strong evidence for the Explicitation Hypothesis. We conclude that discourse relations are indeed more explicit in translation than in texts written originally in the same language. Automatic annotations allow us to carry out translation studies of discourse relations on a large scale. Our methodology using relative entropy to study the specificity of connectives also provides more fine-grained insights into translation patterns.
2023.sigdial-1.2
yung-etal-2023-investigating
+ 10.18653/v1/2023.sigdial-1.2
What’s Hard in English RST Parsing? Predictive Models for Error Analysis
@@ -50,6 +52,7 @@
Despite recent advances in Natural Language Processing (NLP), hierarchical discourse parsing in the framework of Rhetorical Structure Theory remains challenging, and our understanding of the reasons for this is as yet limited. In this paper, we examine and model some of the factors associated with parsing difficulties in previous work: the existence of implicit discourse relations, challenges in identifying long-distance relations, out-of-vocabulary items, and more. In order to assess the relative importance of these variables, we also release two annotated English test-sets with explicit correct and distracting discourse markers associated with gold standard RST relations. Our results show that as in shallow discourse parsing, the explicit/implicit distinction plays a role, but that long-distance dependencies are the main challenge, while lack of lexical overlap is less of a problem, at least for in-domain parsing. Our final model is able to predict where errors will occur with an accuracy of 76.3% for the bottom-up parser and 76.6% for the top-down parser.
2023.sigdial-1.3
liu-etal-2023-whats
+ 10.18653/v1/2023.sigdial-1.3
Grounded Complex Task Segmentation for Conversational Assistants
@@ -60,6 +63,7 @@
Following complex instructions in conversational assistants can be quite daunting due to the shorter attention and memory spans when compared to reading the same instructions. Hence, when conversational assistants walk users through the steps of complex tasks, there is a need to structure the task into manageable pieces of information of the right length and complexity. In this paper, we tackle the recipes domain and convert instructions structured for reading into conversationally structured ones. We annotated the structure of instructions according to a conversational scenario, which provided insights into what is expected in this setting. To computationally model the conversational step’s characteristics, we tested various Transformer-based architectures, showing that a token-based approach delivers the best results. A further user study showed that users tend to favor steps of manageable complexity and length, and that the proposed methodology can improve the original web-based instructional text. Specifically, 86% of the evaluated tasks were improved from a conversational suitability point of view.
2023.sigdial-1.4
ferreira-etal-2023-grounded
+ 10.18653/v1/2023.sigdial-1.4
A Statistical Approach for Quantifying Group Difference in Topic Distributions Using Clinical Discourse Samples
@@ -72,6 +76,7 @@
Topic distribution matrices created by topic models are typically used for document classification or as features in a separate machine learning algorithm. Existing methods for evaluating these topic distributions include metrics such as coherence and perplexity; however, there is a lack of statistically grounded evaluation tools. We present a statistical method for investigating group differences in the document-topic distribution vectors created by Latent Dirichlet Allocation (LDA) that uses Aitchison geometry to transform the vectors, multivariate analysis of variance (MANOVA) to compare sample means, and partial eta squared to calculate effect size. Using a corpus of dialogues between Autistic and Typically Developing (TD) children and trained examiners, we found that the topic distributions of Autistic children differed from those of TD children when responding to questions about social difficulties (p = .0083, partial eta squared = .19). Furthermore, the examiners’ topic distributions differed between the Autistic and TD groups when discussing emotions (p = .0035, partial eta squared = .20), social difficulties (p < .001, partial eta squared = .30), and friends (p = .0224, partial eta squared = .17). These results support the use of topic modeling in studying clinically relevant features of social communication such as topic maintenance.
2023.sigdial-1.5
lawley-etal-2023-statistical
+ 10.18653/v1/2023.sigdial-1.5
OpinionConv: Conversational Product Search with Grounded Opinions
@@ -82,6 +87,7 @@
When searching for products, the opinions of others play an important role in making informed decisions. Subjective experiences about a product can be a valuable source of information. This is also true in sales conversations, where a customer and a sales assistant exchange facts and opinions about products. However, training an AI for such conversations is complicated by the fact that language models do not possess authentic opinions due to their lack of real-world experience. We address this problem by leveraging product reviews as a rich source of product opinions to ground conversational AI in true subjective narratives. With OpinionConv, we develop the first conversational AI for simulating sales conversations. To validate the generated conversations, we conduct several user studies showing that the generated opinions are perceived as realistic. Our assessors also confirm the importance of opinions as an informative basis for decision making.
2023.sigdial-1.6
sadiri-javadi-etal-2023-opinionconv
+ 10.18653/v1/2023.sigdial-1.6
Dial-M: A Masking-based Framework for Dialogue Evaluation
@@ -91,6 +97,7 @@
In dialogue systems, automatically evaluating machine-generated responses is critical and challenging. Despite the tremendous progress in dialogue generation research, its evaluation heavily depends on human judgments. The standard word-overlap-based evaluation metrics are ineffective for dialogues. As a result, most of the recently proposed metrics are model-based and reference-free, which learn to score different aspects of a conversation. However, understanding each aspect requires a separate model, which makes them computationally expensive. To this end, we propose Dial-M, a Masking-based reference-free framework for Dialogue evaluation. The main idea is to mask the keywords of the current utterance and predict them, given the dialogue history and various conditions (like knowledge, persona, etc.), thereby making the evaluation framework simple and easily extensible for multiple datasets. Despite its simplicity, Dial-M achieves comparable performance to state-of-the-art metrics on several dialogue evaluation datasets. We also discuss the interpretability of our proposed metric along with error analysis.
2023.sigdial-1.7
dey-desarkar-2023-dial
+ 10.18653/v1/2023.sigdial-1.7
From Chatter to Matter: Addressing Critical Steps of Emotion Recognition Learning in Task-oriented Dialogue
@@ -107,6 +114,7 @@
Emotion recognition in conversations (ERC) is a crucial task for building human-like conversational agents. While substantial efforts have been devoted to ERC for chit-chat dialogues, the task-oriented counterpart is largely left unattended. Directly applying chit-chat ERC models to task-oriented dialogues (ToDs) results in suboptimal performance as these models overlook key features such as the correlation between emotions and task completion in ToDs. In this paper, we propose a framework that turns a chit-chat ERC model into a task-oriented one, addressing three critical aspects: data, features and objective. First, we devise two ways of augmenting rare emotions to improve ERC performance. Second, we use dialogue states as auxiliary features to incorporate key information from the goal of the user. Lastly, we leverage a multi-aspect emotion definition in ToDs to devise a multi-task learning objective and a novel emotion-distance weighted loss function. Our framework yields significant improvements for a range of chit-chat ERC models on EmoWOZ, a large-scale dataset for user emotions in ToDs. We further investigate the generalisability of the best resulting model to predict user satisfaction in different ToD datasets. A comparison with supervised baselines shows a strong zero-shot capability, highlighting the potential usage of our framework in wider scenarios.
2023.sigdial-1.8
feng-etal-2023-chatter
+ 10.18653/v1/2023.sigdial-1.8
Analyzing Differences in Subjective Annotations by Participants and Third-party Annotators in Multimodal Dialogue Corpus
@@ -117,6 +125,7 @@
Estimating the subjective impressions of human users during a dialogue is necessary when constructing a dialogue system that can respond adaptively to their emotional states. However, such subjective impressions (e.g., how much the user enjoys the dialogue) are inherently ambiguous, and the annotation results provided by multiple annotators do not always agree because they depend on the subjectivity of the annotators. In this paper, we analyzed the annotation results using 13,226 exchanges from 155 participants in a multimodal dialogue corpus called Hazumi that we had constructed, where each exchange was annotated by five third-party annotators. We investigated the agreement between the subjective annotations given by the third-party annotators and the participants themselves, on both per-exchange annotations (i.e., participant’s sentiments) and per-dialogue (-participant) annotations (i.e., questionnaires on rapport and personality traits). We also investigated the conditions under which the annotation results are reliable. Our findings demonstrate that the dispersion of third-party sentiment annotations correlates with agreeableness of the participants, one of the Big Five personality traits.
2023.sigdial-1.9
komatani-etal-2023-analyzing
+ 10.18653/v1/2023.sigdial-1.9
Frame-oriented Summarization of Argumentative Discussions
@@ -129,6 +138,7 @@
Online discussions on controversial topics with many participants frequently include hundreds of arguments that cover different framings of the topic. But these arguments and frames are often spread across the various branches of the discussion tree structure. This makes it difficult for interested participants to follow the discussion in its entirety as well as to introduce new arguments. In this paper, we present a new rank-based approach to extractive summarization of online discussions focusing on argumentation frames that capture the different aspects of a discussion. Our approach includes three retrieval tasks to find arguments in a discussion that are (1) relevant to a frame of interest, (2) relevant to the topic under discussion, and (3) informative to the reader. Based on a joint ranking by these three criteria for a set of user-selected frames, our approach allows readers to quickly access an ongoing discussion. We evaluate our approach using a test set of 100 controversial Reddit ChangeMyView discussions, for which the relevance of a total of 1871 arguments was manually annotated.
2023.sigdial-1.10
syed-etal-2023-frame
+ 10.18653/v1/2023.sigdial-1.10
Towards Multilingual Automatic Open-Domain Dialogue Evaluation
@@ -139,6 +149,7 @@
The main limiting factor in the development of robust multilingual open-domain dialogue evaluation metrics is the lack of multilingual data and the limited availability of open-sourced multilingual dialogue systems. In this work, we propose a workaround for this lack of data by leveraging a strong multilingual pretrained encoder-based Language Model and augmenting existing English dialogue data using Machine Translation. We empirically show that the naive approach of finetuning a pretrained multilingual encoder model with translated data is insufficient to outperform the strong baseline of finetuning a multilingual model with only source data. Instead, the best approach consists in the careful curation of translated data using MT Quality Estimation metrics, excluding low quality translations that hinder its performance.
2023.sigdial-1.11
mendonca-etal-2023-towards
+ 10.18653/v1/2023.sigdial-1.11
Dialog Action-Aware Transformer for Dialog Policy Learning
@@ -149,6 +160,7 @@
Recent works usually address dialog policy learning (DPL) by training a reinforcement learning (RL) agent to determine the best dialog action. However, existing works on deep RL require a large volume of agent-user interactions to achieve acceptable performance. In this paper, we propose to make full use of the plain text knowledge from the pre-trained language model to accelerate the RL agent’s learning speed. Specifically, we design a dialog action-aware transformer encoder (DaTrans), which integrates a new fine-tuning procedure named masked last action task to encourage DaTrans to be dialog-aware and distill action-specific features. Then, DaTrans is further optimized in an RL setting with ongoing interactions and evolves through exploration in the dialog action space toward maximizing long-term accumulated rewards. The effectiveness and efficiency of the proposed model are demonstrated with both simulator evaluation and human evaluation.
2023.sigdial-1.12
wang-etal-2023-dialog
+ 10.18653/v1/2023.sigdial-1.12
The Wizard of Curiosities: Enriching Dialogues with Fun Facts
@@ -160,6 +172,7 @@
Introducing curiosities in a conversation is a way to teach something new to the person in a pleasant and enjoyable way. Enriching dialogues with contextualized curiosities can improve the users’ perception of a dialog system and their overall user experience. In this paper, we introduce a set of curated curiosities, targeting dialogues in the cooking and DIY domains. In particular, we use real human-agent conversations collected in the context of the Amazon Alexa TaskBot challenge, a multimodal and multi-turn conversational setting. According to an A/B test with over 1000 conversations, curiosities not only increase user engagement, but provide an average relative rating improvement of 9.7%.
2023.sigdial-1.13
vicente-etal-2023-wizard
+ 10.18653/v1/2023.sigdial-1.13
The Road to Quality is Paved with Good Revisions: A Detailed Evaluation Methodology for Revision Policies in Incremental Sequence Labelling
@@ -170,6 +183,7 @@
Incremental dialogue model components produce a sequence of output prefixes based on incoming input. Mistakes can occur due to local ambiguities or to wrong hypotheses, making the ability to revise past outputs a desirable property that can be governed by a policy. In this work, we formalise and characterise edits and revisions in incremental sequence labelling and propose metrics to evaluate revision policies. We then apply our methodology to profile the incremental behaviour of three Transformer-based encoders in various tasks, paving the road for better revision policies.
2023.sigdial-1.14
madureira-etal-2023-road
+ 10.18653/v1/2023.sigdial-1.14
The effect of conversation type on entrainment: Evidence from laughter
@@ -179,6 +193,7 @@
Entrainment is a phenomenon that occurs across several modalities and at different linguistic levels in conversation. Previous work has shown that its effects may be modulated by conversation extrinsic factors, such as the relation between the interlocutors or the speakers’ traits. The current study investigates the role of conversation type on laughter entrainment. Employing dyadic interaction materials in German, containing two conversation types (free dialogues and task-based interactions), we analyzed three measures of entrainment previously proposed in the literature. The results show that the entrainment effects depend on the type of conversation, with two of the investigated measures being affected by this factor. These findings represent further evidence towards the role of situational aspects as a mediating factor in conversation.
2023.sigdial-1.15
ludusan-wagner-2023-effect
+ 10.18653/v1/2023.sigdial-1.15
‘What are you referring to?’ Evaluating the Ability of Multi-Modal Dialogue Models to Process Clarificational Exchanges
@@ -190,6 +205,7 @@
Referential ambiguities arise in dialogue when a referring expression does not uniquely identify the intended referent for the addressee. Addressees usually detect such ambiguities immediately and work with the speaker to repair it using meta-communicative, Clarificational Exchanges (CE): a Clarification Request (CR) and a response. Here, we argue that the ability to generate and respond to CRs imposes specific constraints on the architecture and objective functions of multi-modal, visually grounded dialogue models. We use the SIMMC 2.0 dataset to evaluate the ability of different state-of-the-art model architectures to process CEs, with a metric that probes the contextual updates that arise from them in the model. We find that language-based models are able to encode simple multi-modal semantic information and process some CEs, excelling with those related to the dialogue history, whilst multi-modal models can use additional learning objectives to obtain disentangled object representations, which become crucial to handle complex referential ambiguities across modalities overall.
2023.sigdial-1.16
chiyah-garcia-etal-2023-referring
+ 10.18653/v1/2023.sigdial-1.16
PGTask: Introducing the Task of Profile Generation from Dialogues
@@ -200,6 +216,7 @@
Recent approaches have attempted to personalize dialogue systems by leveraging profile information into models. However, this knowledge is scarce and difficult to obtain, which makes the extraction/generation of profile information from dialogues a fundamental asset. To surpass this limitation, we introduce the Profile Generation Task (PGTask). We contribute with a new dataset for this problem, comprising profile sentences aligned with related utterances, extracted from a corpus of dialogues. Furthermore, using state-of-the-art methods, we provide a benchmark for profile generation on this novel dataset. Our experiments disclose the challenges of profile generation, and we hope that this introduces a new research direction.
2023.sigdial-1.17
ribeiro-etal-2023-pgtask
+ 10.18653/v1/2023.sigdial-1.17
Question Generation to Elicit Users’ Food Preferences by Considering the Semantic Content
@@ -210,6 +227,7 @@
To obtain a better understanding of user preferences in providing tailored services, dialogue systems have to generate semi-structured interviews that require flexible dialogue control while following a topic guide to accomplish the purpose of the interview. Toward this goal, this study proposes a semantics-aware GPT-3 fine-tuning model that generates interviews to acquire users’ food preferences. The model was trained using dialogue history and semantic representation constructed from the communicative function and semantic content of the utterance. Using two baseline models: zero-shot ChatGPT and fine-tuned GPT-3, we conducted a user study for subjective evaluations alongside automatic objective evaluations. In the user study, in impression rating, the outputs of the proposed model were superior to those of baseline models and comparable to real human interviews in terms of eliciting the interviewees’ food preferences.
2023.sigdial-1.18
zeng-etal-2023-question
+ 10.18653/v1/2023.sigdial-1.18
Roll Up Your Sleeves: Working with a Collaborative and Engaging Task-Oriented Dialogue System
@@ -230,6 +248,7 @@
We introduce TacoBot, a user-centered task-oriented digital assistant designed to guide users through complex real-world tasks with multiple steps. Covering a wide range of cooking and how-to tasks, we aim to deliver a collaborative and engaging dialogue experience. Equipped with language understanding, dialogue management, and response generation components supported by a robust search engine, TacoBot ensures efficient task assistance. To enhance the dialogue experience, we explore a series of data augmentation strategies using LLMs to train advanced neural models continuously. TacoBot builds upon our successful participation in the inaugural Alexa Prize TaskBot Challenge, where our team secured third place among ten competing teams. We offer TacoBot as an open-source framework that serves as a practical example for deploying task-oriented dialogue systems.
2023.sigdial-1.19
mo-etal-2023-roll
+ 10.18653/v1/2023.sigdial-1.19
Leveraging Large Language Models for Automated Dialogue Analysis
@@ -240,6 +259,7 @@
Developing high-performing dialogue systems benefits from the automatic identification of undesirable behaviors in system responses. However, detecting such behaviors remains challenging, as it draws on a breadth of general knowledge and understanding of conversational practices. Although recent research has focused on building specialized classifiers for detecting specific dialogue behaviors, the behavior coverage is still incomplete and there is a lack of testing on real-world human-bot interactions. This paper investigates the ability of a state-of-the-art large language model (LLM), ChatGPT-3.5, to perform dialogue behavior detection for nine categories in real human-bot dialogues. We aim to assess whether ChatGPT can match specialized models and approximate human performance, thereby reducing the cost of behavior detection tasks. Our findings reveal that neither specialized models nor ChatGPT have yet achieved satisfactory results for this task, falling short of human performance. Nevertheless, ChatGPT shows promising potential and often outperforms specialized detection models. We conclude with an in-depth examination of the prevalent shortcomings of ChatGPT, offering guidance for future research to enhance LLM capabilities.
2023.sigdial-1.20
finch-etal-2023-leveraging
+ 10.18653/v1/2023.sigdial-1.20
Are Large Language Models All You Need for Task-Oriented Dialogue?
@@ -249,6 +269,7 @@
Instruction-finetuned large language models (LLMs) have gained huge popularity recently, thanks to their ability to interact with users through conversation. In this work, we aim to evaluate their ability to complete multi-turn tasks and interact with external databases in the context of established task-oriented dialogue benchmarks. We show that in explicit belief state tracking, LLMs underperform compared to specialized task-specific models. Nevertheless, they show some ability to guide the dialogue to a successful ending through their generated responses if they are provided with correct slot values. Furthermore, this ability improves with few-shot in-domain examples.
2023.sigdial-1.21
hudecek-dusek-2023-large
+ 10.18653/v1/2023.sigdial-1.21
Multi-party Goal Tracking with LLMs: Comparing Pre-training, Fine-tuning, and Prompt Engineering
@@ -262,6 +283,7 @@
This paper evaluates the extent to which current LLMs can capture task-oriented multi-party conversations (MPCs). We have recorded and transcribed 29 MPCs between patients, their companions, and a social robot in a hospital. We then annotated this corpus for multi-party goal-tracking and intent-slot recognition. People share goals, answer each other’s goals, and provide other people’s goals in MPCs - none of which occur in dyadic interactions. To understand user goals in MPCs, we compared three methods in zero-shot and few-shot settings: we fine-tuned T5, created pre-training tasks to train DialogLM using LED, and employed prompt engineering techniques with GPT-3.5-turbo, to determine which approach can complete this novel task with limited data. GPT-3.5-turbo significantly outperformed the others in a few-shot setting. The ‘reasoning’ style prompt, when given 7% of the corpus as example annotated conversations, was the best performing method. It correctly annotated 62.32% of the goal tracking MPCs, and 69.57% of the intent-slot recognition MPCs. A ‘story’ style prompt increased model hallucination, which could be detrimental if deployed in safety-critical settings. We conclude that multi-party conversations still challenge state-of-the-art LLMs.
2023.sigdial-1.22
addlesee-etal-2023-multi
+ 10.18653/v1/2023.sigdial-1.22
ChatGPT vs. Crowdsourcing vs. Experts: Annotating Open-Domain Conversations with Speech Functions
@@ -274,6 +296,7 @@
This paper deals with the task of annotating open-domain conversations with speech functions. We propose a semi-automated method for annotating dialogs following the topic-oriented, multi-layered taxonomy of speech functions with the use of hierarchical guidelines using Large Language Models. These guidelines comprise simple questions about the topic and speaker change, sentence types, pragmatic aspects of the utterance, and examples that aid untrained annotators in understanding the taxonomy. We compare the results of dialog annotation performed by experts, crowdsourcing workers, and ChatGPT. To improve the performance of ChatGPT, several experiments utilising different prompt engineering techniques were conducted. We demonstrate that in some cases large language models can achieve human-like performance following a multi-step tree-like annotation pipeline on complex discourse annotation, which is usually challenging and costly in terms of time and money when performed by humans.
2023.sigdial-1.23
ostyakova-etal-2023-chatgpt
+ 10.18653/v1/2023.sigdial-1.23
DiactTOD: Learning Generalizable Latent Dialogue Acts for Controllable Task-Oriented Dialogue Systems
@@ -285,6 +308,7 @@
Dialogue act annotations are important to improve response generation quality in task-oriented dialogue systems. However, it can be challenging to use dialogue acts to control response generation in a generalizable way because different datasets and tasks may have incompatible annotations. While alternative methods that utilize latent action spaces or reinforcement learning do not require explicit annotations, they may lack interpretability or face difficulties defining task-specific rewards. In this work, we present a novel end-to-end latent dialogue act model (DiactTOD) that represents dialogue acts in a latent space. DiactTOD, when pre-trained on a large corpus, is able to predict and control dialogue acts to generate controllable responses using these latent representations in a zero-shot fashion. Our approach demonstrates state-of-the-art performance across a wide range of experimental settings on the MultiWOZ dataset, including zero-shot, few-shot, and full data fine-tuning with both end-to-end and policy optimization configurations.
2023.sigdial-1.24
wu-etal-2023-diacttod
+ 10.18653/v1/2023.sigdial-1.24
Approximating Online Human Evaluation of Social Chatbots with Prompting
@@ -294,6 +318,7 @@
With conversational models becoming increasingly available to the general public, developing scalable and robust evaluation metrics is crucial to minimize potential social and psychological risks for the users. Existing evaluation metrics aim to automate offline user evaluation and approximate human judgment of pre-curated dialogs. However, they are limited in their ability to capture subjective perceptions of users who actually interact with the chatbots and might not generalize to real-world settings. To address this limitation, we propose an approach to approximate online human evaluation, leveraging large language models (LLMs) from the GPT-family. We introduce a new Dialog system Evaluation framework based on Prompting (DEP), which enables a fully automatic evaluation pipeline that replicates live user studies and achieves an impressive correlation with human judgment (up to Pearson r=0.95 on a system level). The DEP approach involves collecting synthetic chat logs of evaluated bots with an LLM in the other-play setting, where the LLM is carefully conditioned to follow a specific scenario. We further explore different prompting approaches to produce evaluation scores with the same LLM. The best-performing prompts, which contain few-shot demonstrations and instructions, show outstanding performance on the tested dataset and demonstrate the ability to generalize to other dialog corpora.
2023.sigdial-1.25
svikhnushina-pu-2023-approximating
+ 10.18653/v1/2023.sigdial-1.25
Dialogue Response Generation Using Completion of Omitted Predicate Arguments Based on Zero Anaphora Resolution
@@ -303,6 +328,7 @@
Human conversation attempts to build common ground consisting of shared beliefs, knowledge, and perceptions that form the premise for understanding utterances. Recent deep learning-based dialogue systems use human dialogue data to train a mapping from a dialogue history to responses, but common ground not directly expressed in words makes it difficult to generate coherent responses by learning statistical patterns alone. We propose Dialogue Completion using Zero Anaphora Resolution (DCZAR), a framework that explicitly completes omitted information in the dialogue history and generates responses from the completed dialogue history. In this study, we conducted automatic and human evaluations by applying several pretraining methods and datasets in Japanese in various combinations. Experimental results show that the DCZAR framework contributes to the generation of more coherent and engaging responses.
2023.sigdial-1.26
ueyama-kano-2023-dialogue
+ 10.18653/v1/2023.sigdial-1.26
Syndicom: Improving Conversational Commonsense with Error-Injection and Natural Language Feedback
@@ -312,6 +338,7 @@
Commonsense reasoning is a critical aspect of human communication. Despite recent advances in conversational AI driven by large language models, commonsense reasoning remains a challenging task. In this work, we introduce Syndicom - a method for improving commonsense in dialogue response generation. Syndicom consists of two components. The first component is a dataset composed of commonsense dialogues created from a knowledge graph and synthesized into natural language. This dataset includes both valid and invalid responses to dialogue contexts, along with natural language feedback (NLF) for the invalid responses. The second contribution is a two-step procedure: training a model to predict natural language feedback (NLF) for invalid responses, and then training a response generation model conditioned on the predicted NLF, the invalid response, and the dialogue. Syndicom is scalable and does not require reinforcement learning. Empirical results on three tasks are evaluated using a broad range of metrics. Syndicom achieves a relative improvement of 53% over ChatGPT on ROUGE-1, and human evaluators prefer Syndicom over ChatGPT 57% of the time. We will publicly release the code and the full dataset.
2023.sigdial-1.27
richardson-heck-2023-syndicom
+ 10.18653/v1/2023.sigdial-1.27
“What do others think?”: Task-Oriented Conversational Modeling with Subjective Knowledge
@@ -329,6 +356,7 @@
Task-oriented Dialogue (TOD) Systems aim to build dialogue systems that assist users in accomplishing specific goals, such as booking a hotel or a restaurant. Traditional TODs rely on domain-specific APIs/DBs or external factual knowledge to generate responses, which cannot accommodate subjective user requests (e.g., “Is the WIFI reliable?” or “Does the restaurant have a good atmosphere?”). To address this issue, we propose a novel task of subjective-knowledge-based TOD (SK-TOD). We also propose the first corresponding dataset, which contains subjective knowledge-seeking dialogue contexts and manually annotated responses grounded in subjective knowledge sources. When evaluated with existing TOD approaches, we find that this task poses new challenges such as aggregating diverse opinions from multiple knowledge snippets. We hope this task and dataset can promote further research on TOD and subjective content understanding. The code and the dataset are available at https://github.com/alexa/dstc11-track5.
2023.sigdial-1.28
zhao-etal-2023-others
+ 10.18653/v1/2023.sigdial-1.28
UD_Japanese-CEJC: Dependency Relation Annotation on Corpus of Everyday Japanese Conversation
@@ -340,6 +368,7 @@
In this study, we have developed Universal Dependencies (UD) resources for spoken Japanese in the Corpus of Everyday Japanese Conversation (CEJC). The CEJC is a large corpus of spoken language that encompasses various everyday conversations in Japanese, and includes word delimitation and part-of-speech annotation. We have newly annotated Long Word Unit delimitation and Bunsetsu (Japanese phrase)-based dependencies, including Bunsetsu boundaries, for CEJC. The UD of Japanese resources was constructed in accordance with hand-maintained conversion rules from the CEJC with two types of word delimitation, part-of-speech tags and Bunsetsu-based syntactic dependency relations. Furthermore, we examined various issues pertaining to the construction of UD in the CEJC by comparing it with the written Japanese corpus and evaluating UD parsing accuracy.
2023.sigdial-1.29
omura-etal-2023-ud
+ 10.18653/v1/2023.sigdial-1.29
Unravelling Indirect Answers to Wh-Questions: Corpus Construction, Analysis, and Generation
@@ -349,6 +378,7 @@
Indirect answers, crucial in human communication, serve to maintain politeness, avoid conflicts, and align with social customs. Although there has been a substantial number of studies on recognizing and understanding indirect answers to polar questions (often known as yes/no questions), there is a dearth of such work regarding wh-questions. This study takes up the challenge by constructing what is, to our knowledge, the first corpus of indirect answers to wh-questions. We analyze and interpret indirect answers to different wh-questions based on our carefully compiled corpus. In addition, we conducted a pilot study on generating indirect answers to wh-questions by fine-tuning the pre-trained generative language model DialoGPT (Zhang et al., 2020). Our results suggest this is a task that GPT finds difficult.
2023.sigdial-1.30
yusupujiang-ginzburg-2023-unravelling
+ 10.18653/v1/2023.sigdial-1.30
A New Dataset for Causality Identification in Argumentative Texts
@@ -362,6 +392,7 @@
Existing datasets for causality identification in argumentative texts have several limitations, such as the type of input text (e.g., only claims), causality type (e.g., only positive), and the linguistic patterns investigated (e.g., only verb connectives). To resolve these limitations, we build the Webis-Causality-23 dataset, with sophisticated inputs (all units from arguments), a balanced distribution of causality types, and a larger number of linguistic patterns denoting causality. The dataset contains 1485 examples derived by combining the two paradigms of distant supervision and uncertainty sampling to identify diverse, high-quality samples of causality relations, and annotate them in a cost-effective manner.
2023.sigdial-1.31
al-khatib-etal-2023-new
+ 10.18653/v1/2023.sigdial-1.31
Controllable Generation of Dialogue Acts for Dialogue Systems via Few-Shot Response Generation and Ranking
@@ -374,6 +405,7 @@
Dialogue systems need to produce responses that realize multiple types of dialogue acts (DAs) with high semantic fidelity. In the past, natural language generators (NLGs) for dialogue were trained on large parallel corpora that map from a domain-specific DA and its semantic attributes to an output utterance. Recent work shows that pretrained language models (LLMs) offer new possibilities for controllable NLG using prompt-based learning. Here we develop a novel few-shot overgenerate-and-rank approach that achieves the controlled generation of DAs. We compare eight few-shot prompt styles that include a novel method of generating from textual pseudo-references using a textual style transfer approach. We develop six automatic ranking functions that identify outputs with both the correct DA and high semantic accuracy at generation time. We test our approach on three domains and four LLMs. To our knowledge, this is the first work on NLG for dialogue that automatically ranks outputs using both DA and attribute accuracy. For completeness, we compare our results to fine-tuned few-shot models trained with 5 to 100 instances per DA. Our results show that several prompt settings achieve perfect DA accuracy, and near perfect semantic accuracy (99.81%) and perform better than few-shot fine-tuning.
2023.sigdial-1.32
ramirez-etal-2023-controllable
+ 10.18653/v1/2023.sigdial-1.32
Reference Resolution and New Entities in Exploratory Data Visualization: From Controlled to Unconstrained Interactions with a Conversational Assistant
@@ -390,6 +422,7 @@
In the context of data visualization, as in other grounded settings, referents are created by the task the agents engage in and are salient because they belong to the shared physical setting. Our focus is on resolving references to visualizations on large displays; crucially, reference resolution is directly involved in the process of creating new entities, namely new visualizations. First, we developed a reference resolution model for a conversational assistant. We trained the assistant on controlled dialogues for data visualizations involving a single user. Second, we ported the conversational assistant including its reference resolution model to a different domain, supporting two users collaborating on a data exploration task. We explore how the new setting affects reference detection and resolution; we compare the performance in the controlled vs unconstrained setting, and discuss the general lessons that we draw from this adaptation.
2023.sigdial-1.33
bhattacharya-etal-2023-reference
+ 10.18653/v1/2023.sigdial-1.33
CONVERSER: Few-shot Conversational Dense Retrieval with Synthetic Data Generation
@@ -402,6 +435,7 @@
Conversational search provides a natural interface for information retrieval (IR). Recent approaches have demonstrated promising results in applying dense retrieval to conversational IR. However, training dense retrievers requires large amounts of in-domain paired data. This hinders the development of conversational dense retrievers, as abundant in-domain conversations are expensive to collect. In this paper, we propose Converser, a framework for training conversational dense retrievers with at most 6 examples of in-domain dialogues. Specifically, we utilize the in-context learning capability of large language models to generate conversational queries given a passage in the retrieval corpus. Experimental results on conversational retrieval benchmarks OR-QuAC and TREC CAsT 19 show that the proposed Converser achieves comparable performance to fully-supervised models, demonstrating the effectiveness of our proposed framework in few-shot conversational dense retrieval. All source code and generated datasets are available: https://github.com/MiuLab/CONVERSER
2023.sigdial-1.34
huang-etal-2023-converser
+ 10.18653/v1/2023.sigdial-1.34
Speaker Role Identification in Call Centre Dialogues: Leveraging Opening Sentences and Large Language Models
@@ -412,6 +446,7 @@
This paper addresses the task of speaker role identification in call centre dialogues, focusing on distinguishing between the customer and the agent. We propose a text-based approach that utilises the identification of the agent’s opening sentence as a key feature for role classification. The opening sentence is identified using a model trained through active learning. By combining this information with a large language model, we accurately classify the speaker roles. The proposed approach is evaluated on a dataset of call centre dialogues and achieves 93.61% accuracy. This work contributes to the field by providing an effective solution for speaker role identification in call centre settings, with potential applications in interaction analysis and information retrieval.
2023.sigdial-1.35
nghiem-etal-2023-speaker
+ 10.18653/v1/2023.sigdial-1.35
Synthesising Personality with Neural Speech Synthesis
@@ -423,6 +458,7 @@
Matching the personality of a conversational agent to the personality of the user can significantly improve the user experience, with many successful examples in text-based chatbots. It is also important for a voice-based system to be able to alter the personality of the speech as perceived by the users. In this pilot study, fifteen voices were rated using Big Five personality traits. Five content-neutral sentences were chosen for the listening tests. The audio data, together with two rated traits (Extroversion and Agreeableness), were used to train a neural speech synthesiser based on one male and one female voice. The effect of altering the personality trait features was evaluated by a second listening test. Both perceived extroversion and agreeableness in the synthetic voices were affected significantly. The controllable range was limited due to a lack of variance in the source audio data. The perceived personality traits correlated with each other and with the naturalness of the speech. Future work could make a chatbot speak in a voice with a pre-defined or adaptive personality by using personality synthesis in speech together with text-based personality generation.
2023.sigdial-1.36
gao-etal-2023-synthesising
+ 10.18653/v1/2023.sigdial-1.36
Prompting, Retrieval, Training: An exploration of different approaches for task-oriented dialogue generation
@@ -433,6 +469,7 @@
Task-oriented dialogue systems need to generate appropriate responses to help fulfill users’ requests. This paper explores different strategies, namely prompting, retrieval, and fine-tuning, for task-oriented dialogue generation. Through a systematic evaluation, we aim to provide valuable insights and guidelines for researchers and practitioners working on developing efficient and effective dialogue systems for real-world applications. Evaluation is performed on the MultiWOZ and Taskmaster-2 datasets, and we test various versions of FLAN-T5, GPT-3.5, and GPT-4 models. Costs associated with running these models are analyzed, and dialogue evaluation is briefly discussed. Our findings suggest that when testing data differs from the training data, fine-tuning may decrease performance, favoring a combination of a more general language model and a prompting mechanism based on retrieved examples.
2023.sigdial-1.37
raposo-etal-2023-prompting
+ 10.18653/v1/2023.sigdial-1.37
Bootstrapping a Conversational Guide for Colonoscopy Prep
@@ -447,6 +484,7 @@
Creating conversational systems for niche domains is a challenging task, further exacerbated by a lack of quality datasets. We explore the construction of safer conversational systems for guiding patients in preparing for colonoscopies. This has required a data generation pipeline to generate a minimum viable dataset to bootstrap a semantic parser, augmented by automatic paraphrasing. Our study suggests large language models (e.g., GPT-3.5 and GPT-4) are a viable alternative to crowd-sourced paraphrasing, but conversational systems that rely upon language models’ ability to do temporal reasoning struggle to provide accurate responses. A neural-symbolic system that performs temporal reasoning on an intermediate representation of user queries shows promising results compared to an end-to-end dialogue system, improving the number of correct responses while vastly reducing the number of incorrect or misleading ones.
2023.sigdial-1.38
arya-etal-2023-bootstrapping
+ 10.18653/v1/2023.sigdial-1.38
Applying Item Response Theory to Task-oriented Dialogue Systems for Accurately Determining User’s Task Success Ability
@@ -457,6 +495,7 @@
While task-oriented dialogue systems have improved, not all users can fully accomplish their tasks. Users with limited knowledge about the system may experience dialogue breakdowns or fail to achieve their tasks because they do not know how to interact with the system. To address this issue, it would be desirable to construct a system that can estimate the user’s task success ability and adapt to that ability. In this study, we propose a method that estimates this ability by applying item response theory (IRT), commonly used in education for estimating examinee abilities, to task-oriented dialogue systems. Through experiments predicting the probability of a correct answer to each slot by using the estimated task success ability, we found that the proposed method significantly outperformed baselines.
2023.sigdial-1.39
hirai-etal-2023-applying
+ 10.18653/v1/2023.sigdial-1.39
An Open-Domain Avatar Chatbot by Exploiting a Large Language Model
@@ -470,6 +509,7 @@
With the ambition to create avatars capable of human-level casual conversation, we developed an open-domain avatar chatbot, situated in a virtual reality environment, that employs a large language model (LLM). Introducing the LLM posed several challenges for multimodal integration, such as developing techniques to align diverse outputs and avatar control, as well as addressing the issue of slow generation speed. To address these challenges, we integrated various external modules into our system. Our system is based on the award-winning model from the Dialogue System Live Competition 5. Through this work, we hope to stimulate discussions within the research community about the potential and challenges of multimodal dialogue systems enhanced with LLMs.
2023.sigdial-1.40
yamazaki-etal-2023-open
+ 10.18653/v1/2023.sigdial-1.40
Learning Multimodal Cues of Children’s Uncertainty
@@ -486,6 +526,7 @@
Understanding uncertainty plays a critical role in achieving common ground (Clark et al., 1983). This is especially important for multimodal AI systems that collaborate with users to solve a problem or guide the user through a challenging concept. In this work, for the first time, we present a dataset annotated in collaboration with developmental and cognitive psychologists for the purpose of studying nonverbal cues of uncertainty. We then present an analysis of the data, studying different roles of uncertainty and its relationship with task difficulty and performance. Lastly, we present a multimodal machine learning model that can predict uncertainty given a real-time video clip of a participant, which we find improves upon a baseline multimodal transformer model. This work informs research on cognitive coordination in human-human and human-AI interaction and has broad implications for gesture understanding and generation. The anonymized version of our data and code will be publicly available upon the completion of the required consent forms and data sheets.
2023.sigdial-1.41
cheng-etal-2023-learning
+ 10.18653/v1/2023.sigdial-1.41
Grounding Description-Driven Dialogue State Trackers with Knowledge-Seeking Turns
@@ -500,6 +541,7 @@
Schema-guided dialogue state trackers can generalise to new domains without further training, yet they are sensitive to the writing style of the schemata. Augmenting the training set with human or synthetic schema paraphrases improves the model robustness to these variations but can be either costly or difficult to control. We propose to circumvent these issues by grounding the state tracking model in knowledge-seeking turns collected from the dialogue corpus as well as the schema. Including these turns in prompts during finetuning and inference leads to marked improvements in model robustness, as demonstrated by large average joint goal accuracy and schema sensitivity improvements on SGD and SGD-X.
2023.sigdial-1.42
coca-etal-2023-grounding
+ 10.18653/v1/2023.sigdial-1.42
Resolving References in Visually-Grounded Dialogue via Text Generation
@@ -510,6 +552,7 @@
Vision-language models (VLMs) have been shown to be effective at image retrieval based on simple text queries, but text-image retrieval based on conversational input remains a challenge. Consequently, if we want to use VLMs for reference resolution in visually-grounded dialogue, the discourse processing capabilities of these models need to be augmented. To address this issue, we propose fine-tuning a causal large language model (LLM) to generate definite descriptions that summarize coreferential information found in the linguistic context of references. We then use a pretrained VLM to identify referents based on the generated descriptions, zero-shot. We evaluate our approach on a manually annotated dataset of visually-grounded dialogues and achieve results that, on average, exceed the performance of the baselines we compare against. Furthermore, we find that using referent descriptions based on larger context windows has the potential to yield higher returns.
2023.sigdial-1.43
willemsen-etal-2023-resolving
+ 10.18653/v1/2023.sigdial-1.43
Slot Induction via Pre-trained Language Model Probing and Multi-level Contrastive Learning
@@ -521,6 +564,7 @@
Recent advanced methods in Natural Language Understanding for Task-oriented Dialogue (TOD) Systems (e.g., intent detection and slot filling) require a large amount of annotated data to achieve competitive performance. In reality, token-level annotations (slot labels) are time-consuming and difficult to acquire. In this work, we study the Slot Induction (SI) task, whose objective is to induce slot boundaries without explicit knowledge of token-level slot annotations. We propose leveraging Unsupervised Pre-trained Language Model (PLM) Probing and a Contrastive Learning mechanism to exploit (1) unsupervised semantic knowledge extracted from the PLM, and (2) additional sentence-level intent label signals available from TOD. Our approach is shown to be effective in the SI task and capable of bridging the gaps with token-level supervised models on two NLU benchmark datasets. When generalized to emerging intents, our SI objectives also provide enhanced slot label representations, leading to improved performance on the Slot Filling tasks.
2023.sigdial-1.44
nguyen-etal-2023-slot
+ 10.18653/v1/2023.sigdial-1.44
The timing bottleneck: Why timing and overlap are mission-critical for conversational user interfaces, speech recognition and dialogue systems
@@ -531,6 +575,7 @@
Speech recognition systems are a key intermediary in voice-driven human-computer interaction. Although speech recognition works well for pristine monologic audio, real-life use cases in open-ended interactive settings still present many challenges. We argue that timing is mission-critical for dialogue systems, and evaluate 5 major commercial ASR systems for their conversational and multilingual support. We find that word error rates for natural conversational data in 6 languages remain abysmal, and that overlap remains a key challenge (study 1). This especially impacts the recognition of conversational words (study 2), and in turn has dire consequences for downstream intent recognition (study 3). Our findings help to evaluate the current state of conversational ASR, contribute towards multidimensional error analysis and evaluation, and identify the phenomena that need the most attention on the way to building robust interactive speech technologies.
2023.sigdial-1.45
liesenfeld-etal-2023-timing
+ 10.18653/v1/2023.sigdial-1.45
Enhancing Task Bot Engagement with Synthesized Open-Domain Dialog
@@ -543,6 +588,7 @@
The construction of dialog systems for various types of conversations, such as task-oriented dialog (TOD) and open-domain dialog (ODD), has been an active area of research. In order to more closely mimic human-like conversations that often involve the fusion of different dialog modes, it is important to develop systems that can effectively handle both TOD and ODD and access different knowledge sources. In this work, we present a new automatic framework to enrich TODs with synthesized ODDs. We also introduce the PivotBot model, which is capable of handling both TOD and ODD modes and can access different knowledge sources to generate informative responses. Evaluation results indicate the superior ability of the proposed model to switch smoothly between TOD and ODD tasks.
2023.sigdial-1.46
li-etal-2023-enhancing-task
+ 10.18653/v1/2023.sigdial-1.46
Enhancing Performance on Seen and Unseen Dialogue Scenarios using Retrieval-Augmented End-to-End Task-Oriented System
@@ -559,6 +605,7 @@
End-to-end task-oriented dialogue (TOD) systems have achieved promising performance by leveraging sophisticated natural language understanding and natural language generation capabilities of pre-trained models. This work enables the TOD systems with more flexibility through a simple cache. The cache provides the flexibility to dynamically update the TOD systems and handle both existing and unseen dialogue scenarios. Towards this end, we first fine-tune a retrieval module to effectively retrieve the most relevant information entries from the cache. We then train end-to-end TOD models that can refer to and ground on both dialogue history and retrieved information during TOD generation. The introduced cache is straightforward to construct, and the backbone models of TOD systems are compatible with existing pre-trained generative models. Extensive experiments demonstrate the superior performance of our framework, with a notable improvement in non-empty joint goal accuracy by 6.7% compared to strong baselines.
2023.sigdial-1.47
zhang-etal-2023-enhancing-performance
+ 10.18653/v1/2023.sigdial-1.47
Transformer-based Multi-Party Conversation Generation using Dialogue Discourse Acts Planning
@@ -568,6 +615,7 @@
Recent transformer-based approaches to multi-party conversation generation may produce syntactically coherent but discursively inconsistent dialogues in some cases. To address this issue, we propose an approach to integrate a dialogue act planning stage into the end-to-end transformer-based generation pipeline. This approach consists of a transformer fine-tuning procedure based on linearized dialogue representations that include special discourse tokens. The obtained results demonstrate that incorporating discourse tokens into training sequences is sufficient to significantly improve dialogue consistency and overall generation quality. The suggested approach performs well, including for automatically annotated data. Apart from that, it is observed that increasing the weight of the discourse planning task in the loss function accelerates learning convergence.
2023.sigdial-1.48
chernyavskiy-ilvovsky-2023-transformer
+ 10.18653/v1/2023.sigdial-1.48
Incorporating Annotator Uncertainty into Representations of Discourse Relations
@@ -577,6 +625,7 @@
Annotation of discourse relations is a known difficult task, especially for non-expert annotators. In this paper, we investigate novice annotators’ uncertainty on the annotation of discourse relations on spoken conversational data. We find that dialogue context (single turn, pair of turns within speaker, and pair of turns across speakers) is a significant predictor of confidence scores. We compute distributed representations of discourse relations from co-occurrence statistics that incorporate information about confidence scores and dialogue context. We perform a hierarchical clustering analysis using these representations and show that weighting discourse relation representations with information about confidence and dialogue context coherently models our annotators’ uncertainty about discourse relation labels.
2023.sigdial-1.49
lopez-cortez-jacobs-2023-incorporating
+ 10.18653/v1/2023.sigdial-1.49
Investigating the Representation of Open Domain Dialogue Context for Transformer Models
@@ -592,6 +641,7 @@
The bulk of work adapting transformer models to open-domain dialogue represents dialogue context as the concatenated set of turns in natural language. However, it is unclear if this is the best approach. In this work, we investigate this question by means of an empirical controlled experiment varying the dialogue context format, ranging from text-only formats (all recent utterances, summaries, selected utterances) to variants that are more structurally different (triples, AMR). We compare these formats based on fine-tuned model performance on two downstream tasks—knowledge selection and response generation. We find that simply concatenating the utterances works as a strong baseline in most cases, but is outperformed in longer contexts by a hybrid approach of combining a summary of the context with recent utterances. Through empirical analysis, our work highlights the need to examine the format of context representation and offers recommendations on adapting general-purpose language models to dialogue tasks.
2023.sigdial-1.50
padmakumar-etal-2023-investigating
+ 10.18653/v1/2023.sigdial-1.50
C3: Compositional Counterfactual Contrastive Learning for Video-grounded Dialogues
@@ -602,6 +652,7 @@
Video-grounded dialogue systems aim to integrate video understanding and dialogue understanding to generate responses that are relevant to both the dialogue and video context. Most existing approaches employ deep learning models and have achieved remarkable performance, given the relatively small datasets available. However, the results are partially accomplished by exploiting biases in the datasets rather than developing multimodal reasoning, resulting in limited generalization. In this paper, we propose a novel approach of Compositional Counterfactual Contrastive Learning (C3) to develop contrastive training between factual and counterfactual samples in video-grounded dialogues. Specifically, we design factual/counterfactual samples based on the temporal steps in videos and tokens in dialogues and propose contrastive loss functions that exploit object-level or action-level variance. Different from prior approaches, we focus on contrastive hidden state representations among compositional output tokens to optimize the representation space in a generation setting. We achieved promising performance gains on the Audio-Visual Scene-Aware Dialogues (AVSD) benchmark and showed the benefits of our approach in grounding video and dialogue context.
2023.sigdial-1.51
le-etal-2023-c3
+ 10.18653/v1/2023.sigdial-1.51
No that’s not what I meant: Handling Third Position Repair in Conversational Question Answering
@@ -613,6 +664,7 @@
The ability to handle miscommunication is crucial to robust and faithful conversational AI. People usually deal with miscommunication immediately as they detect it, using highly systematic interactional mechanisms called repair. One important type of repair is Third Position Repair (TPR) whereby a speaker is initially misunderstood but then corrects the misunderstanding as it becomes apparent after the addressee’s erroneous response. Here, we collect and publicly release REPAIR-QA, the first large dataset of TPRs in a conversational question answering (QA) setting. The data comprises the TPR turns, corresponding dialogue contexts, and candidate repairs of the original turn for execution of TPRs. We demonstrate the usefulness of the data by training and evaluating strong baseline models for executing TPRs. For stand-alone TPR execution, we perform both automatic and human evaluations on a fine-tuned T5 model, as well as OpenAI’s GPT-3 LLMs. Additionally, we extrinsically evaluate the LLMs’ TPR processing capabilities in the downstream conversational QA task. The results indicate poor out-of-the-box performance on TPRs by the GPT-3 models, which then significantly improves when exposed to REPAIR-QA.
2023.sigdial-1.52
balaraman-etal-2023-thats
+ 10.18653/v1/2023.sigdial-1.52
When to generate hedges in peer-tutoring interactions
@@ -623,6 +675,7 @@
This paper explores the application of machine learning techniques to predict where hedging occurs in peer-tutoring interactions. The study uses a naturalistic face-to-face dataset annotated for natural language turns, conversational strategies, tutoring strategies, and nonverbal behaviors. These elements are processed into a vector representation of the previous turns, which serves as input to several machine learning models, including MLP and LSTM. The results show that embedding layers, capturing the semantic information of the previous turns, significantly improves the model’s performance. Additionally, the study provides insights into the importance of various features, such as interpersonal rapport and nonverbal behaviors, in predicting hedges by using Shapley values for feature explanation. We discover that the eye gaze of both the tutor and the tutee has a significant impact on hedge prediction. We further validate this observation through a follow-up ablation study.
2023.sigdial-1.53
abulimiti-etal-2023-generate
+ 10.18653/v1/2023.sigdial-1.53
PaperPersiChat: Scientific Paper Discussion Chatbot using Transformers and Discourse Flow Management
@@ -633,6 +686,7 @@
The rate of scientific publications is increasing exponentially, necessitating a significant investment of time in order to read and comprehend the most important articles. While ancillary services exist to facilitate this process, they are typically closed-model and paid services or have limited capabilities. In this paper, we present PaperPersiChat, an open chatbot system designed for the discussion of scientific papers. This system supports summarization and question-answering modes within a single end-to-end chatbot pipeline, which is guided by discourse analysis. To expedite the development of similar systems, we also release the gathered dataset, which has no publicly available analogues.
2023.sigdial-1.54
chernyavskiy-etal-2023-paperpersichat
+ 10.18653/v1/2023.sigdial-1.54
FurChat: An Embodied Conversational Agent using LLMs, Combining Open and Closed-Domain Dialogue with Facial Expressions
@@ -652,6 +706,7 @@
We demonstrate an embodied conversational agent that can function as a receptionist and generate a mixture of open and closed-domain dialogue along with facial expressions, by using a large language model (LLM) to develop an engaging conversation. We deployed the system onto a Furhat robot, which is highly expressive and capable of using both verbal and nonverbal cues during interaction. The system was designed specifically for the National Robotarium to interact with visitors through natural conversations, providing them with information about the facilities, research, news, upcoming events, etc. The system utilises the state-of-the-art GPT-3.5 model to generate such information along with domain-general conversations and facial expressions based on prompt engineering.
2023.sigdial-1.55
cherakara-etal-2023-furchat
+ 10.18653/v1/2023.sigdial-1.55
Towards Breaking the Self-imposed Filter Bubble in Argumentative Dialogues
@@ -665,6 +720,7 @@
Human users tend to selectively ignore information that contradicts their pre-existing beliefs or opinions in their process of information seeking. These “self-imposed filter bubbles” (SFB) pose a significant challenge for cooperative argumentative dialogue systems aiming to build an unbiased opinion and a better understanding of the topic at hand. To address this issue, we develop a strategy for overcoming users’ SFB within the course of the interaction. By continuously modeling the user’s position in relation to the SFB, we are able to identify the respective arguments which maximize the probability of getting outside the SFB and present them to the user. We implemented this approach in an argumentative dialogue system and evaluated it in a laboratory user study with 60 participants to show its validity and applicability. The findings suggest that the strategy was successful in breaking users’ SFBs and promoting a more reflective and comprehensive discussion of the topic.
2023.sigdial-1.56
aicher-etal-2023-towards
+ 10.18653/v1/2023.sigdial-1.56
The Open-domain Paradox for Chatbots: Common Ground as the Basis for Human-like Dialogue
@@ -674,6 +730,7 @@
There is a surge in interest in the development of open-domain chatbots, driven by the recent advancements of large language models. The “openness” of the dialogue is expected to be maximized by providing minimal information to the users about the common ground they can expect, including the presumed joint activity. However, evidence suggests that the effect is the opposite. Asking users to “just chat about anything” results in a very narrow form of dialogue, which we refer to as the “open-domain paradox”. In this position paper, we explain this paradox through the theory of common ground as the basis for human-like communication. Furthermore, we question the assumptions behind open-domain chatbots and identify paths forward for enabling common ground in human-computer dialogue.
2023.sigdial-1.57
skantze-dogruoz-2023-open
+ 10.18653/v1/2023.sigdial-1.57
MERCY: Multiple Response Ranking Concurrently in Realistic Open-Domain Conversational Systems
@@ -688,6 +745,7 @@
Automatic Evaluation (AE) and Response Selection (RS) models assign quality scores to various candidate responses and rank them in conversational setups. Prior response ranking research compares various models’ performance on synthetically generated test sets. In this work, we investigate the performance of model-based reference-free AE and RS models on our constructed response ranking datasets that mirror real-case scenarios of ranking candidates during inference time. The metrics’ unsatisfactory performance can be interpreted as low generalizability to more pragmatic conversational domains such as human-chatbot dialogs. To alleviate this issue we propose a novel RS model called MERCY that simulates human behavior in selecting the best candidate by taking into account distinct candidates concurrently and learns to rank them. In addition, MERCY leverages natural language feedback as another component to help the ranking task by explaining why each candidate response is relevant/irrelevant to the dialog context. This feedback is generated by prompting large language models in a few-shot setup. Our experiments show that MERCY outperforms baselines on the response ranking task in our curated realistic datasets.
2023.sigdial-1.58
ghazarian-etal-2023-mercy
+ 10.18653/v1/2023.sigdial-1.58
Empathetic Response Generation for Distress Support
@@ -698,6 +756,7 @@
AI-driven chatbots are seen as an attractive solution to support people undergoing emotional distress. One of the main components of such a chatbot is the ability to empathize with the user. But a significant limitation in achieving this goal is the lack of a large dialogue dataset containing empathetic support for those undergoing distress. In this work, we curate a large-scale dialogue dataset that contains ≈1.3M peer support dialogues spanning more than 4K distress-related topics. We analyze the empathetic characteristics of this dataset using statistical and visual means. To demonstrate the utility of this dataset, we train four baseline neural dialogue models that can respond empathetically to distress prompts. Two of the baselines adapt existing architectures and the other two incorporate a framework identifying levels of cognitive and emotional empathy in responses. Automatic and human evaluation of these models validate the utility of the dataset in generating empathetic responses for distress support and show that identifying levels of empathy in peer-support responses facilitates generating responses that are lengthier, richer in empathy, and closer to the ground truth.
2023.sigdial-1.59
welivita-etal-2023-empathetic
+ 10.18653/v1/2023.sigdial-1.59
Reasoning before Responding: Integrating Commonsense-based Causality Explanation for Empathetic Response Generation
@@ -709,6 +768,7 @@
Recent approaches to empathetic response generation try to incorporate commonsense knowledge or reasoning about the causes of emotions to better understand the user’s experiences and feelings. However, these approaches mainly focus on understanding the causalities of context from the user’s perspective, ignoring the system’s perspective. In this paper, we propose a commonsense-based causality explanation approach for diverse empathetic response generation that considers both the user’s perspective (user’s desires and reactions) and the system’s perspective (system’s intentions and reactions). We enhance ChatGPT’s ability to reason for the system’s perspective by integrating in-context learning with commonsense knowledge. Then, we integrate the commonsense-based causality explanation with both ChatGPT and a T5-based model. Experimental evaluations demonstrate that our method outperforms other comparable methods on both automatic and human evaluations.
2023.sigdial-1.60
fu-etal-2023-reasoning
+ 10.18653/v1/2023.sigdial-1.60
From 132ac6666a55a6b54ef346d66ee30cec8b5a79e7 Mon Sep 17 00:00:00 2001
From: acl-pwc-bot <94475230+acl-pwc-bot@users.noreply.github.com>
Date: Thu, 30 Nov 2023 02:07:28 +0100
Subject: [PATCH 11/12] Update metadata from Papers with Code
---
data/xml/2020.acl.xml | 3 +-
data/xml/2020.coling.xml | 1 -
data/xml/2020.inlg.xml | 40 +++++++++++-----------
data/xml/2020.sigdial.xml | 22 ++++++------
data/xml/2021.acl.xml | 2 ++
data/xml/2021.cl.xml | 1 +
data/xml/2021.eacl.xml | 2 +-
data/xml/2021.emnlp.xml | 3 ++
data/xml/2021.gem.xml | 1 -
data/xml/2021.inlg.xml | 56 +++++++++++++++---------------
data/xml/2021.nlp4prog.xml | 1 +
data/xml/2021.sigdial.xml | 64 +++++++++++++++++-----------------
data/xml/2022.acl.xml | 1 +
data/xml/2022.coling.xml | 1 +
data/xml/2022.findings.xml | 2 ++
data/xml/2022.naacl.xml | 3 +-
data/xml/2022.sigdial.xml | 70 +++++++++++++++++++-------------------
data/xml/D18.xml | 1 +
data/xml/D19.xml | 1 -
data/xml/P17.xml | 1 -
20 files changed, 143 insertions(+), 133 deletions(-)
diff --git a/data/xml/2020.acl.xml b/data/xml/2020.acl.xml
index c58bc1dc35..3ac01f90e7 100644
--- a/data/xml/2020.acl.xml
+++ b/data/xml/2020.acl.xml
@@ -8199,7 +8199,6 @@
Reddit
Reddit TIFU
WikiHow
- XSum
Heterogeneous Graph Neural Networks for Extractive Document Summarization
@@ -10027,6 +10026,7 @@
wang-etal-2020-rat
Microsoft/rat-sql
+ Spider-Realistic
WikiSQL
@@ -11034,6 +11034,7 @@
yin-etal-2020-tabert
facebookresearch/tabert
+ Spider-Realistic
WikiTableQuestions
diff --git a/data/xml/2020.coling.xml b/data/xml/2020.coling.xml
index 442c82d684..945c14d36e 100644
--- a/data/xml/2020.coling.xml
+++ b/data/xml/2020.coling.xml
@@ -5218,7 +5218,6 @@
2020.coling-main.392
10.18653/v1/2020.coling-main.392
zhang-etal-2020-knowledge
- BladeDancer957/KAITML
ConceptNet
EmoryNLP
IEMOCAP
diff --git a/data/xml/2020.inlg.xml b/data/xml/2020.inlg.xml
index 9cc7cf2a42..3a0baeabf6 100644
--- a/data/xml/2020.inlg.xml
+++ b/data/xml/2020.inlg.xml
@@ -41,8 +41,8 @@
The Arabic language has received very limited support from NLG researchers. In this paper, we explain the challenges of the core grammar, provide a lexical resource, and implement the first language functions for the Arabic language. We conducted a human evaluation of our functions’ ability to generate sentences from the NADA Corpus.
2020.inlg-1.2
abed-reiter-2020-arabic
- waelmohammedabed/natural-language-generation-for-the-arabic-language
10.18653/v1/2020.inlg-1.2
+ waelmohammedabed/natural-language-generation-for-the-arabic-language
Generating Intelligible Plumitifs Descriptions: Use Case Application with Ethical Considerations
@@ -71,8 +71,8 @@
Semi-structured text generation is a non-trivial problem. Although recent years have brought many improvements in natural language generation, thanks to the development of neural models trained on large-scale datasets, these approaches still struggle with producing structured, context- and commonsense-aware texts. Moreover, it is not clear how to evaluate the quality of generated texts. To address these problems, we introduce RecipeNLG – a novel dataset of cooking recipes. We discuss the data collection process and the relation between the semi-structured texts and cooking recipes. We use the dataset to approach the problem of generating recipes. Finally, we make use of multiple metrics to evaluate the generated recipes.
2020.inlg-1.4
bien-etal-2020-recipenlg
- RecipeNLG
10.18653/v1/2020.inlg-1.4
+ RecipeNLG
Controlled Text Generation with Adversarial Learning
@@ -95,10 +95,10 @@
It is unfair to expect neural data-to-text to produce high quality output when there are gaps between system input data and information contained in the training text. Thomson et al. (2020) identify and narrow information gaps in Rotowire, a popular data-to-text dataset. In this paper, we describe a study which finds that a state-of-the-art neural data-to-text system produces higher quality output, according to the information extraction (IE) based metrics, when additional input data is carefully selected from this newly available source. It remains to be shown, however, whether IE metrics used in this study correlate well with humans in judging text quality.
2020.inlg-1.6
thomson-etal-2020-studying
+ 10.18653/v1/2020.inlg-1.6
nlgcat/adding_data
RotoWire
SportSett
- 10.18653/v1/2020.inlg-1.6
Improving the Naturalness and Diversity of Referring Expression Generation models using Minimum Risk Training
@@ -109,9 +109,9 @@
In this paper we consider the problem of optimizing neural Referring Expression Generation (REG) models with sequence level objectives. Recently reinforcement learning (RL) techniques have been adopted to train deep end-to-end systems to directly optimize sequence-level objectives. However, there are two issues associated with RL training: (1) effectively applying RL is challenging, and (2) the generated sentences lack diversity and naturalness due to deficiencies in the generated word distribution, smaller vocabulary size, and repetitiveness of frequent words and phrases. To alleviate these issues, we propose a novel strategy for training REG models, using minimum risk training (MRT) with maximum likelihood estimation (MLE), and we show that our approach outperforms RL w.r.t. naturalness and diversity of the output. Specifically, our approach achieves an increase in CIDEr scores of between 23% and 57% on two datasets. We further demonstrate the robustness of the proposed method through a detailed comparison with different REG models.
2020.inlg-1.7
panagiaris-etal-2020-improving
+ 10.18653/v1/2020.inlg-1.7
COCO
RefCOCO
- 10.18653/v1/2020.inlg-1.7
Assessing Discourse Relations in Language Generation from GPT-2
@@ -132,8 +132,8 @@
2020.inlg-1.9
2020.inlg-1.9.Supplementary_Attachment.pdf
kasner-dusek-2020-data
- kasnerz/d2t_iterative_editing
10.18653/v1/2020.inlg-1.9
+ kasnerz/d2t_iterative_editing
The CACAPO Dataset: A Multilingual, Multi-Domain Dataset for Neural Pipeline and End-to-End Data-to-Text Generation
@@ -146,9 +146,9 @@
2020.inlg-1.10
2020.inlg-1.10.Supplementary_Attachment.pdf
van-der-lee-etal-2020-cacapo
+ 10.18653/v1/2020.inlg-1.10
RotoWire
WebNLG
- 10.18653/v1/2020.inlg-1.10
Towards Generating Query to Perform Query Focused Abstractive Summarization using Pre-trained Model
@@ -158,10 +158,10 @@
Query Focused Abstractive Summarization (QFAS) represents an abstractive summary from the source document based on a given query. To measure the performance of abstractive summarization tasks, different datasets have been broadly used. However, for QFAS tasks, only a limited number of datasets have been used, which are comparatively small and provide single-sentence summaries. This paper presents a query generation approach, where we considered the most similar words between documents and summaries for generating queries. By implementing our query generation approach, we prepared two relatively large datasets, namely CNN/DailyMail and Newsroom, which contain multiple sentence summaries and can be used for future QFAS tasks. We also implemented a pre-processing approach to perform QFAS tasks using a pretrained language model, BERTSUM. In our pre-processing approach, we sorted the sentences of the documents from the most query-related sentences to the least query-related sentences. Then, we fine-tuned the BERTSUM model for generating the abstractive summaries. We also experimented on one of the most widely used datasets, Debatepedia, to compare our QFAS approach with other models. The experimental results show that our approach outperforms the state-of-the-art models on three ROUGE scores.
2020.inlg-1.11
abdullah-chali-2020-towards
+ 10.18653/v1/2020.inlg-1.11
deen-abdullah/QABSBERT
CNN/Daily Mail
NEWSROOM
- 10.18653/v1/2020.inlg-1.11
SimpleNLG-TI: Adapting SimpleNLG to Tibetan
@@ -192,11 +192,11 @@
We study the pre-train + fine-tune strategy for data-to-text tasks. Our experiments indicate that text-to-text pre-training in the form of T5 (Raffel et al., 2019), enables simple, end-to-end transformer based models to outperform pipelined neural architectures tailored for data-to-text generation, as well as alternatives such as BERT and GPT-2. Importantly, T5 pre-training leads to better generalization, as evidenced by large improvements on out-of-domain test sets. We hope our work serves as a useful baseline for future research, as transfer learning becomes ever more prevalent for data-to-text tasks.
2020.inlg-1.14
kale-rastogi-2020-text
+ 10.18653/v1/2020.inlg-1.14
google-research-datasets/ToTTo
MultiWOZ
ToTTo
WebNLG
- 10.18653/v1/2020.inlg-1.14
DaMata: A Robot-Journalist Covering the Brazilian Amazon Deforestation
@@ -210,8 +210,8 @@
This demo paper introduces DaMata, a robot-journalist covering deforestation in the Brazilian Amazon. The robot-journalist is based on a pipeline architecture of Natural Language Generation, which yields multilingual daily and monthly reports based on the public data provided by DETER, a real-time deforestation satellite monitor developed and maintained by the Brazilian National Institute for Space Research (INPE). DaMata automatically generates reports in Brazilian Portuguese and English and publishes them on the Twitter platform. Corpus and code are publicly available.
2020.inlg-1.15
rosa-teixeira-etal-2020-damata
- botsdobem/demo_inpe_covid
10.18653/v1/2020.inlg-1.15
+ botsdobem/demo_inpe_covid
Generating Quantified Referring Expressions through Attention-Driven Incremental Perception
@@ -244,9 +244,9 @@
In language generation models conditioned by structured data, the classical training via maximum likelihood almost always leads models to pick up on dataset divergence (i.e., hallucinations or omissions), and to incorporate them erroneously in their own generations at inference. In this work, we build on top of previous Reinforcement Learning based approaches and show that a model-agnostic framework relying on the recently introduced PARENT metric is efficient at reducing both hallucinations and omissions. Evaluations on the widely used WikiBIO and WebNLG benchmarks demonstrate the effectiveness of this framework compared to state-of-the-art models.
2020.inlg-1.18
rebuffel-etal-2020-parenting
+ 10.18653/v1/2020.inlg-1.18
KaijuML/PARENTing-rl
WikiBio
- 10.18653/v1/2020.inlg-1.18
Evaluating Semantic Accuracy of Data-to-Text Generation with Natural Language Inference
@@ -257,8 +257,8 @@
2020.inlg-1.19
2020.inlg-1.19.Supplementary_Attachment.pdf
dusek-kasner-2020-evaluating
- ufal/nlgi_eval
10.18653/v1/2020.inlg-1.19
+ ufal/nlgi_eval
Chart-to-Text: Generating Natural Language Descriptions for Charts by Adapting the Transformer Model
@@ -268,9 +268,9 @@
Information visualizations such as bar charts and line charts are very popular for exploring data and communicating insights. Interpreting and making sense of such visualizations can be challenging for some people, such as those who are visually impaired or have low visualization literacy. In this work, we introduce a new dataset and present a neural model for automatically generating natural language summaries for charts. The generated summaries provide an interpretation of the chart and convey the key insights found within that chart. Our neural model is developed by extending the state-of-the-art model for the data-to-text generation task, which utilizes a transformer-based encoder-decoder architecture. We found that our approach outperforms the base model on a content selection metric by a wide margin (55.42% vs. 8.49%) and generates more informative, concise, and coherent summaries.
2020.inlg-1.20
obeid-hoque-2020-chart
+ 10.18653/v1/2020.inlg-1.20
JasonObeid/Chart2Text
Chart2Text
- 10.18653/v1/2020.inlg-1.20
Market Comment Generation from Data with Noisy Alignments
@@ -294,8 +294,8 @@
Most Natural Language Generation systems need to produce accurate texts. We propose a methodology for high-quality human evaluation of the accuracy of generated texts, which is intended to serve as a gold-standard for accuracy evaluations of data-to-text systems. We use our methodology to evaluate the accuracy of computer generated basketball summaries. We then show how our gold standard evaluation can be used to validate automated metrics.
2020.inlg-1.22
thomson-reiter-2020-gold
- nlgcat/evaluating_accuracy
10.18653/v1/2020.inlg-1.22
+ nlgcat/evaluating_accuracy
Twenty Years of Confusion in Human Evaluation: NLG Needs Evaluation Sheets and Standardised Definitions
@@ -452,9 +452,9 @@
Neural network based approaches to data-to-text natural language generation (NLG) have gained popularity in recent years, with the goal of generating a natural language prompt that accurately realizes an input meaning representation. To facilitate the training of neural network models, researchers created large datasets of paired utterances and their meaning representations. However, the creation of such datasets is an arduous task and they mostly consist of simple meaning representations composed of slot and value tokens to be realized. These representations do not include any contextual information that an NLG system can use when trying to generalize, such as domain information and descriptions of slots and values. In this paper, we present the novel task of Schema-Guided Natural Language Generation (SG-NLG). Here, the goal is still to generate a natural language prompt, but in SG-NLG, the input MRs are paired with rich schemata providing contextual information. To generate a dataset for SG-NLG we re-purpose an existing dataset for another task: dialog state tracking, which includes a large and rich schema spanning multiple different attributes, including information about the domain, user intent, and slot descriptions. We train different state-of-the-art models for neural natural language generation on this dataset and show that in many cases, including rich schema information allows our models to produce higher quality outputs both in terms of semantics and diversity. We also conduct experiments comparing model performance on seen versus unseen domains, and present a human evaluation demonstrating high ratings for overall output quality.
2020.inlg-1.35
du-etal-2020-schema
+ 10.18653/v1/2020.inlg-1.35
alexa/schema-guided-nlg
SG-NLG
- 10.18653/v1/2020.inlg-1.35
OMEGA : A probabilistic approach to referring expression generation in a virtual environment
@@ -477,9 +477,9 @@
2020.inlg-1.37
2020.inlg-1.37.Supplementary_Attachment.pdf
stevens-guille-etal-2020-neural
+ 10.18653/v1/2020.inlg-1.37
methodius-project/neural-methodius
WebNLG
- 10.18653/v1/2020.inlg-1.37
From “Before” to “After”: Generating Natural Language Instructions from Image Pairs in a Simple Visual Domain
@@ -511,10 +511,10 @@
Table 2 was a copy-paste of Table 1 (VQA1.0) by mistake. It now displays results for the second dataset (VQA COCO) as expected.
scialom-etal-2020-bert
+ 10.18653/v1/2020.inlg-1.39
COCO
VQG
Visual Question Answering
- 10.18653/v1/2020.inlg-1.39
When an Image Tells a Story: The Role of Visual and Semantic Information for Generating Paragraph Descriptions
@@ -524,9 +524,9 @@
Generating multi-sentence image descriptions is a challenging task, which requires a good model to produce coherent and accurate paragraphs, describing salient objects in the image. We argue that multiple sources of information are beneficial when describing visual scenes with long sequences. These include (i) perceptual information and (ii) semantic (language) information about how to describe what is in the image. We also compare the effects of using two different pooling mechanisms on either a single modality or their combination. We demonstrate that the model which utilises both visual and language inputs can be used to generate accurate and diverse paragraphs when combined with a particular pooling mechanism. The results of our automatic and human evaluation show that learning to embed semantic information along with visual stimuli into the paragraph generation model is not trivial, raising a variety of proposals for future experiments.
2020.inlg-1.40
ilinykh-dobnik-2020-image
+ 10.18653/v1/2020.inlg-1.40
Image Description Sequences
Image Paragraph Captioning
- 10.18653/v1/2020.inlg-1.40
Transformer based Natural Language Generation for Question-Answering
@@ -537,8 +537,8 @@
This paper explores Natural Language Generation within the context of the Question-Answering task. The several works addressing this task have focused only on generating a short answer or a long text span that contains the answer, while reasoning over a Web page or processing structured data. Such answers' lengths are usually not appropriate, as the answer tends to be perceived as either too brief or too long to be read out loud by an intelligent assistant. In this work, we aim at generating a concise answer for a given question using an unsupervised approach that does not require annotated data. Tested over English and French datasets, the proposed approach shows very promising results.
2020.inlg-1.41
akermi-etal-2020-tansformer
- Universal Dependencies
10.18653/v1/2020.inlg-1.41
+ Universal Dependencies
Rapformer: Conditional Rap Lyrics Generation with Denoising Autoencoders
@@ -563,8 +563,8 @@
Large-scale, transformer-based language models such as GPT-2 are pretrained on diverse corpora scraped from the internet. Consequently, they are prone to generating non-normative text (i.e. in violation of social norms). We introduce a technique for fine-tuning GPT-2, using a policy gradient reinforcement learning technique and a normative text classifier to produce reward and punishment values. We evaluate our technique on five data sets using automated and human participant experiments. The normative text classifier is 81-90% accurate when compared to gold-standard human judgements of normative and non-normative generated text. Our normative fine-tuning technique is able to reduce non-normative text by 27-61%, depending on the data set.
2020.inlg-1.43
peng-etal-2020-reducing
- ROCStories
10.18653/v1/2020.inlg-1.43
+ ROCStories
ReviewRobot: Explainable Paper Review Generation based on Knowledge Synthesis
@@ -578,10 +578,10 @@
To assist the human review process, we build a novel ReviewRobot to automatically assign a review score and write comments for multiple categories such as novelty and meaningful comparison. A good review needs to be knowledgeable, namely that the comments should be constructive and informative to help improve the paper; and explainable by providing detailed evidence. ReviewRobot achieves these goals via three steps: (1) We perform domain-specific Information Extraction to construct a knowledge graph (KG) from the target paper under review, a related work KG from the papers cited by the target paper, and a background KG from a large collection of previous papers in the domain. (2) By comparing these three KGs, we predict a review score and detailed structured knowledge as evidence for each review category. (3) We carefully select and generalize human review sentences into templates, and apply these templates to transform the review scores and evidence into natural language comments. Experimental results show that our review score predictor reaches 71.4%-100% accuracy. Human assessment by domain experts shows that 41.7%-70.5% of the comments generated by ReviewRobot are valid and constructive, and better than human-written ones 20% of the time. Thus, ReviewRobot can serve as an assistant for paper reviewers, program chairs and authors.
2020.inlg-1.44
wang-etal-2020-reviewrobot
+ 10.18653/v1/2020.inlg-1.44
EagleW/ReviewRobot
ReviewRobot Dataset
PeerRead
- 10.18653/v1/2020.inlg-1.44
Gradations of Error Severity in Automatic Image Descriptions
diff --git a/data/xml/2020.sigdial.xml b/data/xml/2020.sigdial.xml
index a613544d1b..0098d3257b 100644
--- a/data/xml/2020.sigdial.xml
+++ b/data/xml/2020.sigdial.xml
@@ -59,8 +59,8 @@
2020.sigdial-1.3
reed-etal-2020-learning
- E2E
10.18653/v1/2020.sigdial-1.3
+ E2E
TripPy: A Triple Copy Strategy for Value Independent Neural Dialog State Tracking
@@ -76,8 +76,8 @@
2020.sigdial-1.4
heck-etal-2020-trippy
- MultiWOZ
10.18653/v1/2020.sigdial-1.4
+ MultiWOZ
Conversational Agents for Intelligent Buildings
@@ -130,8 +130,8 @@
2020.sigdial-1.8
chang-etal-2020-convokit
- CornellNLP/Cornell-Conversational-Analysis-Toolkit
10.18653/v1/2020.sigdial-1.8
+ CornellNLP/Cornell-Conversational-Analysis-Toolkit
Commonsense Evidence Generation and Injection in Reading Comprehension
@@ -144,10 +144,10 @@
Humans tackle reading comprehension not only based on the given context itself but also by relying on the commonsense beyond it. To empower the machine with commonsense reasoning, in this paper, we propose a Commonsense Evidence Generation and Injection framework in reading comprehension, named CEGI. The framework injects two kinds of auxiliary commonsense evidence into comprehensive reading to equip the machine with the ability of rational thinking. Specifically, we build two evidence generators: one aims to generate textual evidence via a language model; the other aims to extract factual evidence (automatically aligned text-triples) from a commonsense knowledge graph after graph completion. This evidence incorporates contextual commonsense and serves as additional input to the reasoning model. Thereafter, we propose a deep contextual encoder to extract semantic relationships among the paragraph, question, option, and evidence. Finally, we employ a capsule network to extract different linguistic units (word and phrase) from the relations, and dynamically predict the optimal option based on the extracted units. Experiments on the CosmosQA dataset demonstrate that the proposed CEGI model outperforms the current state-of-the-art approaches and achieves the highest accuracy (83.6%) on the leaderboard.
2020.sigdial-1.9
liu-etal-2020-commonsense
+ 10.18653/v1/2020.sigdial-1.9
CoS-E
CommonsenseQA
ConceptNet
- 10.18653/v1/2020.sigdial-1.9
Identifying Collaborative Conversations using Latent Discourse Behaviors
@@ -195,8 +195,8 @@
2020.sigdial-1.13
mcneill-kennington-2020-learning
- ImageNet
10.18653/v1/2020.sigdial-1.13
+ ImageNet
Learning and Reasoning for Robot Dialog and Navigation Tasks
@@ -239,8 +239,8 @@
2020.sigdial-1.16
platonov-etal-2020-spoken
- CLEVR
10.18653/v1/2020.sigdial-1.16
+ CLEVR
rrSDS: Towards a Robot-ready Spoken Dialogue System
@@ -299,8 +299,8 @@
2020.sigdial-1.21
cervone-riccardi-2020-dialogue
- alecervi/switchboard-coherence-corpus
10.18653/v1/2020.sigdial-1.21
+ alecervi/switchboard-coherence-corpus
Analyzing Speaker Strategy in Referential Communication
@@ -325,10 +325,10 @@
2020.sigdial-1.23
wang-etal-2020-contextualized
+ 10.18653/v1/2020.sigdial-1.23
DailyDialog
IEMOCAP
MELD
- 10.18653/v1/2020.sigdial-1.23
How Self-Attention Improves Rare Class Performance in a Question-Answering Dialogue Agent
@@ -388,9 +388,9 @@
2020.sigdial-1.28
mehri-eskenazi-2020-unsupervised
+ 10.18653/v1/2020.sigdial-1.28
shikib/fed
FED
- 10.18653/v1/2020.sigdial-1.28
Towards Unified Dialogue System Evaluation: A Comprehensive Analysis of Current Evaluation Protocols
@@ -443,8 +443,8 @@
2020.sigdial-1.32
finch-choi-2020-emora
- emora-chat/emora_stdm
10.18653/v1/2020.sigdial-1.32
+ emora-chat/emora_stdm
Boosting Naturalness of Language in Task-oriented Dialogues via Adversarial Training
@@ -479,8 +479,8 @@
2020.sigdial-1.35
kim-etal-2020-beyond
-
10.18653/v1/2020.sigdial-1.35
+
Multi-Action Dialog Policy Learning with Interactive Human Teaching
diff --git a/data/xml/2021.acl.xml b/data/xml/2021.acl.xml
index b0e55d7268..7615c7d1a6 100644
--- a/data/xml/2021.acl.xml
+++ b/data/xml/2021.acl.xml
@@ -2847,6 +2847,7 @@
chiahsuan156/KaggleDBQA
KaggleDBQA
+ Spider-Realistic
WikiSQL
@@ -3201,6 +3202,7 @@
cao-etal-2021-lgesql
rhythmcao/text2sql-lgesql
+ Spider-Realistic
Multi-stage Pre-training over Simplified Multimodal Pre-training Models
diff --git a/data/xml/2021.cl.xml b/data/xml/2021.cl.xml
index cf620b504b..5bdd3c67e4 100644
--- a/data/xml/2021.cl.xml
+++ b/data/xml/2021.cl.xml
@@ -159,6 +159,7 @@
choi-etal-2021-ryansql
kakaoenterprise/RYANSQL
+ Spider-Realistic
WikiSQL
diff --git a/data/xml/2021.eacl.xml b/data/xml/2021.eacl.xml
index 6ca7cf47e8..478af034ae 100644
--- a/data/xml/2021.eacl.xml
+++ b/data/xml/2021.eacl.xml
@@ -2745,7 +2745,7 @@
2021.eacl-main.204
sun-etal-2021-cross
10.18653/v1/2021.eacl-main.204
- hwijeen/langrank
+ hwijeen/langrank
PHASE: Learning Emotional Phase-aware Representations for Suicide Ideation Detection on Social Media
diff --git a/data/xml/2021.emnlp.xml b/data/xml/2021.emnlp.xml
index ffde7ab1ad..2ed1438f80 100644
--- a/data/xml/2021.emnlp.xml
+++ b/data/xml/2021.emnlp.xml
@@ -11063,6 +11063,7 @@
10.18653/v1/2021.emnlp-main.702
ygan/spider-dk
+ Spider-Realistic
What happens if you treat ordinal ratings as interval data? Human evaluations in NLP are even more under-powered than you think
@@ -11154,6 +11155,7 @@
yandex-research/sparqling-queries
BREAK
+ Spider-Realistic
Time-aware Graph Neural Network for Entity Alignment between Temporal Knowledge Graphs
@@ -12235,6 +12237,7 @@
ElementAI/picard
CoSQL
+ Spider-Realistic
Exploiting Twitter as Source of Large Corpora of Weakly Similar Pairs for Semantic Sentence Embeddings
diff --git a/data/xml/2021.gem.xml b/data/xml/2021.gem.xml
index a2ae49f2bf..9b7a582888 100644
--- a/data/xml/2021.gem.xml
+++ b/data/xml/2021.gem.xml
@@ -207,7 +207,6 @@
TurkCorpus
WebNLG
WikiLingua
- XSum
Reusable Templates and Guides For Documenting Datasets and Models for Natural Language Processing and Generation: A Case Study of the HuggingFace and GEM Data and Model Cards
diff --git a/data/xml/2021.inlg.xml b/data/xml/2021.inlg.xml
index 0e964de256..34314093ba 100644
--- a/data/xml/2021.inlg.xml
+++ b/data/xml/2021.inlg.xml
@@ -27,9 +27,9 @@
Text generation from semantic graphs is traditionally performed with deterministic methods, which generate a unique description given an input graph. However, the generation problem admits a range of acceptable textual outputs, exhibiting lexical, syntactic and semantic variation. To address this disconnect, we present two main contributions. First, we propose a stochastic graph-to-text model, incorporating a latent variable in an encoder-decoder model, and its use in an ensemble. Second, to assess the diversity of the generated sentences, we propose a new automatic evaluation metric which jointly evaluates output diversity and quality in a multi-reference setting. We evaluate the models on WebNLG datasets in English and Russian, and show that an ensemble of stochastic models produces diverse sets of generated sentences while retaining similar quality to state-of-the-art models.
2021.inlg-1.1
han-etal-2021-generating
+ 10.18653/v1/2021.inlg-1.1
Jiuzhouh/Multi-Score
WebNLG
- 10.18653/v1/2021.inlg-1.1
Neural Methodius Revisited: Do Discourse Relations Help with Pre-Trained Models Too?
@@ -41,8 +41,8 @@
Recent developments in natural language generation (NLG) have bolstered arguments in favor of re-introducing explicit coding of discourse relations in the input to neural models. In the Methodius corpus, a meaning representation (MR) is hierarchically structured and includes discourse relations. Meanwhile pre-trained language models have been shown to implicitly encode rich linguistic knowledge which provides an excellent resource for NLG. By virtue of synthesizing these lines of research, we conduct extensive experiments on the benefits of using pre-trained models and discourse relation information in MRs, focusing on the improvement of discourse coherence and correctness. We redesign the Methodius corpus; we also construct another Methodius corpus in which MRs are not hierarchically structured but flat. We report experiments on different versions of the corpora, which probe when, where, and how pre-trained models benefit from MRs with discourse relation information in them. We conclude that discourse relations significantly improve NLG when data is limited.
2021.inlg-1.2
maskharashvili-etal-2021-neural
- aleksadre/methodiusneuralinlg2021
10.18653/v1/2021.inlg-1.2
+ aleksadre/methodiusneuralinlg2021
Exploring Input Representation Granularity for Generating Questions Satisfying Question-Answer Congruence
@@ -55,8 +55,8 @@
In question generation, the question produced has to be well-formed and meaningfully related to the answer serving as input. Neural generation methods have predominantly leveraged the distributional semantics of words as representations of meaning and generated questions one word at a time. In this paper, we explore the viability of form-based and more fine-grained encodings, such as character or subword representations for question generation. We start from the typical seq2seq architecture using word embeddings presented by De Kuthy et al. (2020), who generate questions from text so that the answer given in the input text matches not just in meaning but also in form, satisfying question-answer congruence. We show that models trained on character and subword representations substantially outperform the published results based on word embeddings, and they do so with fewer parameters. Our approach eliminates two important problems of the word-based approach: the encoding of rare or out-of-vocabulary words and the incorrect replacement of words with semantically-related ones. The character-based model substantially improves on the published results, both in terms of BLEU scores and regarding the quality of the generated question. Going beyond the specific task, this result adds to the evidence weighing different form- and meaning-based representations for natural language processing tasks.
2021.inlg-1.3
kannan-etal-2021-exploring
- SQuAD
10.18653/v1/2021.inlg-1.3
+ SQuAD
Towards Zero-Shot Multilingual Synthetic Question and Answer Generation for Cross-Lingual Reading Comprehension
@@ -68,13 +68,13 @@
We propose a simple method to generate multilingual question and answer pairs on a large scale through the use of a single generative model. These synthetic samples can be used to improve the zero-shot performance of multilingual QA models on target languages. Our proposed multi-task training of the generative model only requires labeled training samples in English, thus removing the need for such samples in the target languages, making it applicable to far more languages than those with labeled data. Human evaluations indicate the majority of such samples are grammatically correct and sensible. Experimental results show our proposed approach can achieve large gains on the XQuAD dataset, reducing the gap between zero-shot and supervised performance of smaller QA models on various languages.
2021.inlg-1.4
shakeri-etal-2021-towards
+ 10.18653/v1/2021.inlg-1.4
C4
MLQA
SQuAD
TyDi QA
XQuAD
mC4
- 10.18653/v1/2021.inlg-1.4
Chefbot: A Novel Framework for the Generation of Commonsense-enhanced Responses for Task-based Dialogue Systems
@@ -95,8 +95,8 @@
We address the task of antonym prediction in a context, which is a fill-in-the-blanks problem. This task setting is unique and practical because it requires both contrastiveness with the other word and naturalness as text when filling the blank. We propose methods for fine-tuning pre-trained masked language models (BERT) for context-aware antonym prediction. The experimental results demonstrate that these methods have positive impacts on the prediction of antonyms within a context. Moreover, human evaluation reveals that more than 85% of predictions using the proposed method are acceptable as antonyms.
2021.inlg-1.6
niwa-etal-2021-predicting
- SemEval-2018 Task 9: Hypernym Discovery
10.18653/v1/2021.inlg-1.6
+ SemEval-2018 Task 9: Hypernym Discovery
Examining Covert Gender Bias: A Case Study in Turkish and English Machine Translation Models
@@ -107,8 +107,8 @@
As Machine Translation (MT) has become increasingly more powerful, accessible, and widespread, the potential for the perpetuation of bias has grown alongside its advances. While overt indicators of bias have been studied in machine translation, we argue that covert biases expose a problem that is further entrenched. Through the use of the gender-neutral language Turkish and the gendered language English, we examine cases of both overt and covert gender bias in MT models. Specifically, we introduce a method to investigate asymmetrical gender markings. We also assess bias in the attribution of personhood and examine occupational and personality stereotypes through overt bias indicators in MT models. Our work explores a deeper layer of bias in MT models and demonstrates the continued need for language-specific, interdisciplinary methodology in MT model development.
2021.inlg-1.7
ciora-etal-2021-examining
- NurIren/Gender-Bias-in-TR-to-EN-MT-Models
10.18653/v1/2021.inlg-1.7
+ NurIren/Gender-Bias-in-TR-to-EN-MT-Models
WeaSuL: Weakly Supervised Dialogue Policy Learning: Reward Estimation for Multi-turn Dialogue
@@ -132,9 +132,9 @@
Incorporating external knowledge sources effectively in conversations is a longstanding problem in open-domain dialogue research. The existing literature on open-domain knowledge selection is limited and makes certain brittle assumptions on knowledge sources to simplify the overall task, such as the existence of a single relevant knowledge sentence per context. In this work, we evaluate the existing state of open-domain conversation knowledge selection, showing where the existing methodologies regarding data and evaluation are flawed. We then improve on them by proposing a new framework for collecting relevant knowledge, and create an augmented dataset based on the Wizard of Wikipedia (WOW) corpus, which we call WOW++. WOW++ averages 8 relevant knowledge sentences per dialogue context, embracing the inherent ambiguity of open-domain dialogue knowledge selection. We then benchmark various knowledge ranking algorithms on this augmented dataset with both intrinsic evaluation and extrinsic measures of response quality, showing that neural rerankers that use WOW++ can outperform rankers trained on standard datasets.
2021.inlg-1.9
eric-etal-2021-multi
+ 10.18653/v1/2021.inlg-1.9
alexa/wow-plus-plus
Wizard of Wikipedia
- 10.18653/v1/2021.inlg-1.9
Self-Training for Compositional Neural NLG in Task-Oriented Dialogue
@@ -146,8 +146,8 @@
Neural approaches to natural language generation in task-oriented dialogue have typically required large amounts of annotated training data to achieve satisfactory performance, especially when generating from compositional inputs. To address this issue, we show that self-training enhanced with constrained decoding yields large gains in data efficiency on a conversational weather dataset that employs compositional meaning representations. In particular, our experiments indicate that self-training with constrained decoding can enable sequence-to-sequence models to achieve satisfactory quality using vanilla decoding with five to ten times less data than with an ordinary supervised baseline; moreover, by leveraging pretrained models, data efficiency can be increased further to fifty times. We confirm the main automatic results with human evaluations and show that they extend to an enhanced, compositional version of the E2E dataset. The end result is an approach that makes it possible to achieve acceptable performance on compositional NLG tasks using hundreds rather than tens of thousands of training samples.
2021.inlg-1.10
li-etal-2021-self
- znculee/treenlg-bart
10.18653/v1/2021.inlg-1.10
+ znculee/treenlg-bart
Generating Racing Game Commentary from Vision, Language, and Structured Data
@@ -184,8 +184,8 @@
The task of Sentence Ordering refers to rearranging a set of given sentences in a coherent ordering. Prior work (Prabhumoye et al., 2020) models this as an optimal graph traversal (with sentences as nodes, and edges as local constraints) using topological sorting. However, such an approach has major limitations – it cannot handle the presence of cycles in the resulting graphs and considers only the binary presence/absence of edges rather than a more granular score. In this work, we propose an alternate formulation of this task as a classic combinatorial optimization problem popularly known as the Traveling Salesman Problem (TSP). Compared to the previous approach of using topological sorting, our proposed technique gracefully handles the presence of cycles and is more expressive since it takes into account real-valued constraint/edge scores rather than just the presence/absence of edges. Our experiments demonstrate improved handling of such cyclic cases in resulting graphs. Additionally, we highlight how model accuracy can be sensitive to the ordering of input sentences when using such graph-based formulations. Finally, we note that our approach requires only lightweight fine-tuning of a classification layer built on a pretrained BERT sentence encoder to identify local relationships.
2021.inlg-1.13
keswani-jhamtani-2021-formulating
- vkeswani/bertsp
10.18653/v1/2021.inlg-1.13
+ vkeswani/bertsp
Underreporting of errors in NLG output, and what to do about it
@@ -217,8 +217,8 @@
2021.inlg-1.15
2021.inlg-1.15.Supplementary_Attachment.zip
chen-etal-2021-neural-referential
- WebNLG
10.18653/v1/2021.inlg-1.15
+ WebNLG
HI-CMLM: Improve CMLM with Hybrid Decoder Input
@@ -247,8 +247,8 @@
Choosing the most suitable classifier in a linguistic context is a well-known problem in the production of Mandarin and many other languages. The present paper proposes a solution based on BERT, compares this solution to previous neural and rule-based models, and argues that the BERT model performs particularly well on those difficult cases where the classifier adds information to the text.
2021.inlg-1.17
jarnfors-etal-2021-using
- Chinese Classifier
10.18653/v1/2021.inlg-1.17
+ Chinese Classifier
Enriching the E2E dataset
@@ -260,9 +260,9 @@
This study introduces an enriched version of the E2E dataset, one of the most popular language resources for data-to-text NLG. We extract intermediate representations for popular pipeline tasks such as discourse ordering, text structuring, lexicalization and referring expression generation, enabling researchers to rapidly develop and evaluate their data-to-text pipeline systems. The intermediate representations are extracted by aligning non-linguistic and text representations through a process called delexicalization, which consists in replacing input referring expressions to entities/attributes with placeholders. The enriched dataset is publicly available.
2021.inlg-1.18
castro-ferreira-etal-2021-enriching
+ 10.18653/v1/2021.inlg-1.18
ThiagoCF05/EnrichedE2E
WebNLG
- 10.18653/v1/2021.inlg-1.18
Goal-Oriented Script Construction
@@ -276,9 +276,9 @@
lyu-etal-2021-goal
Corrected the Acknowledgement section.
+ 10.18653/v1/2021.inlg-1.19
veronica320/wikihow-gosc
WikiHow
- 10.18653/v1/2021.inlg-1.19
Single Example Can Improve Zero-Shot Data Generation
@@ -291,8 +291,8 @@
Sub-tasks of intent classification, such as robustness to distribution shift, adaptation to specific user groups and personalization, out-of-domain detection, require extensive and flexible datasets for experiments and evaluation. As collecting such datasets is time- and labor-consuming, we propose to use text generation methods to gather datasets. The generator should be trained to generate utterances that belong to the given intent. We explore two approaches to the generation of task-oriented utterances: in the zero-shot approach, the model is trained to generate utterances from seen intents and is further used to generate utterances for intents unseen during training. In the one-shot approach, the model is presented with a single utterance from a test intent. We perform a thorough automatic and human evaluation of the intrinsic properties of the two generation approaches. The attributes of the generated data are close to those of the original test sets, collected via crowd-sourcing.
2021.inlg-1.20
burnyshev-etal-2021-single
- SGD
10.18653/v1/2021.inlg-1.20
+ SGD
SAPPHIRE: Approaches for Enhanced Concept-to-Text Generation
@@ -305,9 +305,9 @@
We motivate and propose a suite of simple but effective improvements for concept-to-text generation called SAPPHIRE: Set Augmentation and Post-hoc PHrase Infilling and REcombination. We demonstrate their effectiveness on generative commonsense reasoning, a.k.a. the CommonGen task, through experiments using both BART and T5 models. Through extensive automatic and human evaluation, we show that SAPPHIRE noticeably improves model performance. An in-depth qualitative analysis illustrates that SAPPHIRE effectively addresses many issues of the baseline model generations, including lack of commonsense, insufficient specificity, and poor fluency.
2021.inlg-1.21
feng-etal-2021-sapphire
+ 10.18653/v1/2021.inlg-1.21
styfeng/sapphire
CommonGen
- 10.18653/v1/2021.inlg-1.21
Contextualizing Variation in Text Style Transfer Datasets
@@ -318,9 +318,9 @@
Text style transfer involves rewriting the content of a source sentence in a target style. Despite there being a number of style tasks with available data, there has been limited systematic discussion of how text style datasets relate to each other. This understanding, however, is likely to have implications for selecting multiple data sources for model training. While it is prudent to consider inherent stylistic properties when determining these relationships, we also must consider how a style is realized in a particular dataset. In this paper, we conduct several empirical analyses of existing text style datasets. Based on our results, we propose a categorization of stylistic and dataset properties to consider when utilizing or comparing text style datasets.
2021.inlg-1.22
schoch-etal-2021-contextualizing
+ 10.18653/v1/2021.inlg-1.22
GYAFC
Penn Treebank
- 10.18653/v1/2021.inlg-1.22
Generation Challenges: Results of the Accuracy Evaluation Shared Task
@@ -330,8 +330,8 @@
The Shared Task on Evaluating Accuracy focused on techniques (both manual and automatic) for evaluating the factual accuracy of texts produced by neural NLG systems, in a sports-reporting domain. Four teams submitted evaluation techniques for this task, using very different approaches and techniques. The best-performing submissions did encouragingly well at this difficult task. However, all automatic submissions struggled to detect factual errors which are semantically or pragmatically complex (for example, based on incorrect computation or inference).
2021.inlg-1.23
thomson-reiter-2021-generation
- ehudreiter/accuracysharedtask
10.18653/v1/2021.inlg-1.23
+ ehudreiter/accuracysharedtask
The ReproGen Shared Task on Reproducibility of Human Evaluations in NLG: Overview and Results
@@ -354,9 +354,9 @@
We present our Charles-UPF submission for the Shared Task on Evaluating Accuracy in Generated Texts at INLG 2021. Our system can detect the errors automatically using a combination of a rule-based natural language generation (NLG) system and pretrained language models (LMs). We first utilize a rule-based NLG system to generate sentences with facts that can be derived from the input. For each sentence we evaluate, we select a subset of facts which are relevant by measuring semantic similarity to the sentence in question. Finally, we finetune a pretrained language model on annotated data along with the relevant facts for fine-grained error detection. On the test set, we achieve 69% recall and 75% precision with a model trained on a mixture of human-annotated and synthetic data.
2021.inlg-1.25
kasner-etal-2021-text
+ 10.18653/v1/2021.inlg-1.25
kasnerz/accuracysharedtask_cuni-upf
RotoWire
- 10.18653/v1/2021.inlg-1.25
Shared Task in Evaluating Accuracy: Leveraging Pre-Annotations in the Validation Process
@@ -386,9 +386,9 @@
The present paper summarizes an attempt we made to meet a shared task challenge on grounding machine-generated summaries of NBA matchups (https://github.com/ehudreiter/accuracySharedTask.git). In the first half, we discuss methods; in the second, we report results, together with a discussion of what features may have had an effect on the performance.
2021.inlg-1.28
nomoto-2021-grounding
+ 10.18653/v1/2021.inlg-1.28
ehudreiter/accuracysharedtask
RotoWire
- 10.18653/v1/2021.inlg-1.28
Reproducing a Comparison of Hedged and Non-hedged NLG Texts
@@ -397,8 +397,8 @@
This paper describes an attempt to reproduce an earlier experiment, previously conducted by the author, that compares hedged and non-hedged NLG texts as part of the ReproGen shared challenge. This reproduction effort was only able to partially replicate results from the original study. The analysis from this reproduction effort suggests that whilst it is possible to replicate the procedural aspects of a previous study, replicating the results can prove more challenging, as differences in participant type can have a potential impact.
2021.inlg-1.29
mahamood-2021-reproducing
- saad-mahamood/reprohum2021
10.18653/v1/2021.inlg-1.29
+ saad-mahamood/reprohum2021
Another PASS: A Reproduction Study of the Human Evaluation of a Football Report Generation System
@@ -442,12 +442,12 @@
We propose a shared task on summarizing real-life scenario dialogues, DialogSum Challenge, to encourage researchers to address challenges in dialogue summarization, which has been less studied by the summarization community. Real-life scenario dialogue summarization has wide potential applications in chatbots and personal assistants. It contains unique challenges such as special discourse structure, coreference, pragmatics, and social common sense, which require specific representation learning technologies to deal with. We carefully annotate a large-scale dialogue summarization dataset based on multiple public dialogue corpora, opening the door to all kinds of summarization models.
2021.inlg-1.33
chen-etal-2021-dialogsum-challenge
+ 10.18653/v1/2021.inlg-1.33
DREAM
DailyDialog
DialogSum
MuTual
SAMSum Corpus
- 10.18653/v1/2021.inlg-1.33
Quality Evaluation of the Low-Resource Synthetically Generated Code-Mixed Hinglish Text
@@ -496,8 +496,8 @@
Understanding speaker’s feelings and producing appropriate responses with emotion connection is a key communicative skill for empathetic dialogue systems. In this paper, we propose a simple technique called Affective Decoding for empathetic response generation. Our method can effectively incorporate emotion signals during each decoding step, and can additionally be augmented with an auxiliary dual emotion encoder, which learns separate embeddings for the speaker and listener given the emotion base of the dialogue. Extensive empirical studies show that our models are perceived to be more empathetic by human evaluations, in comparison to several strong mainstream methods for empathetic responding.
2021.inlg-1.37
zeng-etal-2021-affective
- zenggo/affective-decoding-4-empathetic-dialog
10.18653/v1/2021.inlg-1.37
+ zenggo/affective-decoding-4-empathetic-dialog
Controllable Sentence Simplification with a Unified Text-to-Text Transfer Transformer
@@ -507,11 +507,11 @@
Recently, a large pre-trained language model called T5 (A Unified Text-to-Text Transfer Transformer) has achieved state-of-the-art performance in many NLP tasks. However, no study has used this pre-trained model on Text Simplification. Therefore, in this paper, we explore the use of T5 fine-tuning on Text Simplification, combined with a controllable mechanism to regulate the system outputs that can help generate adapted text for different target audiences. Our experiments show that our model achieves remarkable results, with gains of between +0.69 and +1.41 over the current state-of-the-art (BART+ACCESS). We argue that using a pre-trained model such as T5, trained on several tasks with large amounts of data, can help improve Text Simplification.
2021.inlg-1.38
sheang-saggion-2021-controllable
+ 10.18653/v1/2021.inlg-1.38
kimchengsheang/ts_t5
ASSET
TurkCorpus
WikiLarge
- 10.18653/v1/2021.inlg-1.38
SEPRG: Sentiment aware Emotion controlled Personalized Response Generation
@@ -534,9 +534,9 @@
Data-to-text (D2T) generation in the biomedical domain is a promising - yet mostly unexplored - field of research. Here, we apply neural models for D2T generation to a real-world dataset consisting of package leaflets of European medicines. We show that fine-tuned transformers are able to generate realistic, multi-sentence text from data in the biomedical domain, yet have important limitations. We also release a new dataset (BioLeaflets) for benchmarking D2T generation models in the biomedical domain.
2021.inlg-1.40
yermakov-etal-2021-biomedical
+ 10.18653/v1/2021.inlg-1.40
bayer-science-for-a-better-life/data2text-bioleaflets
BioLeaflets
- 10.18653/v1/2021.inlg-1.40
Decoding, Fast and Slow: A Case Study on Balancing Trade-Offs in Incremental, Character-level Pragmatic Reasoning
@@ -548,8 +548,8 @@
Recent work has adopted models of pragmatic reasoning for the generation of informative language in, e.g., image captioning. We propose a simple but highly effective relaxation of fully rational decoding, based on an existing incremental and character-level approach to pragmatically informative neural image captioning. We implement a mixed, ‘fast’ and ‘slow’, speaker that applies pragmatic reasoning occasionally (only word-initially), while unrolling the language model. In our evaluation, we find that increased informativeness through pragmatic decoding generally lowers quality and, somewhat counter-intuitively, increases repetitiveness in captions. Our mixed speaker, however, achieves a good balance between quality and informativeness.
2021.inlg-1.41
zarriess-etal-2021-decoding
- COCO
10.18653/v1/2021.inlg-1.41
+ COCO
GraphPlan: Story Generation by Planning with Event Graph
@@ -571,8 +571,8 @@
An important part of constructing multiple-choice questions (MCQs) for reading comprehension assessment is the distractors, the incorrect but preferably plausible answer options. In this paper, we present a new BERT-based method for automatically generating distractors using only a small-scale dataset. We also release a new dataset of such Swedish MCQs (used for training the model), and propose a methodology for assessing the generated distractors. Evaluation shows that from a student’s perspective, our method generated one or more plausible distractors for more than 50% of the MCQs in our test set. From a teacher’s perspective, about 50% of the generated distractors were deemed appropriate. We also do a thorough analysis of the results.
2021.inlg-1.43
kalpakchi-boye-2021-bert
- dkalpakchi/swequad-mc
10.18653/v1/2021.inlg-1.43
+ dkalpakchi/swequad-mc
Exploring Structural Encoding for Data-to-Text Generation
@@ -582,8 +582,8 @@
Due to efficient end-to-end training and fluency in generated texts, several encoder-decoder framework-based models have recently been proposed for data-to-text generation. Appropriate encoding of input data is a crucial part of such encoder-decoder models. However, only a few research works have concentrated on proper encoding methods. This paper presents a novel encoder-decoder based data-to-text generation model where the proposed encoder carefully encodes input data according to the underlying structure of the data. The effectiveness of the proposed encoder is evaluated both extrinsically and intrinsically by shuffling input data without changing the meaning of that data. For selecting appropriate content information in encoded data from the encoder, the proposed model incorporates attention gates in the decoder. With extensive experiments on the WikiBio and E2E datasets, we show that our model outperforms the state-of-the-art models and several standard baseline systems. Analysis of the model through component ablation tests and human evaluation endorses the proposed model as a well-grounded system.
2021.inlg-1.44
mahapatra-garain-2021-exploring
- WikiBio
10.18653/v1/2021.inlg-1.44
+ WikiBio
Attention Is Indeed All You Need: Semantically Attention-Guided Decoding for Data-to-Text NLG
@@ -593,9 +593,9 @@
Ever since neural models were adopted in data-to-text language generation, they have invariably been reliant on extrinsic components to improve their semantic accuracy, because the models normally do not exhibit the ability to generate text that reliably mentions all of the information provided in the input. In this paper, we propose a novel decoding method that extracts interpretable information from encoder-decoder models’ cross-attention, and uses it to infer which attributes are mentioned in the generated text, which is subsequently used to rescore beam hypotheses. Using this decoding method with T5 and BART, we show on three datasets its ability to dramatically reduce semantic errors in the generated outputs, while maintaining their state-of-the-art quality.
2021.inlg-1.45
juraska-walker-2021-attention
+ 10.18653/v1/2021.inlg-1.45
jjuraska/data2text-nlg
ViGGO
- 10.18653/v1/2021.inlg-1.45
diff --git a/data/xml/2021.nlp4prog.xml b/data/xml/2021.nlp4prog.xml
index e8e8c7716b..3109d78976 100644
--- a/data/xml/2021.nlp4prog.xml
+++ b/data/xml/2021.nlp4prog.xml
@@ -152,6 +152,7 @@
hirupert/sede
SEDE
ATIS
+ Spider-Realistic
WikiSQL
diff --git a/data/xml/2021.sigdial.xml b/data/xml/2021.sigdial.xml
index b25b893a1f..7dbd82b9bd 100644
--- a/data/xml/2021.sigdial.xml
+++ b/data/xml/2021.sigdial.xml
@@ -58,13 +58,13 @@
2021.sigdial-1.3
kottur-etal-2021-dialogstitch
+ 10.18653/v1/2021.sigdial-1.3
facebookresearch/dialogstitch
CLEVR
CLEVR-Dialog
DailyDialog
VisDial
Wizard of Wikipedia
- 10.18653/v1/2021.sigdial-1.3
Individual Interaction Styles: Evidence from a Spoken Chat Corpus
@@ -89,8 +89,8 @@
2021.sigdial-1.5
liang-etal-2021-evaluation
- KaihuiLiang/physical-activity-counseling
10.18653/v1/2021.sigdial-1.5
+ KaihuiLiang/physical-activity-counseling
Improving Named Entity Recognition in Spoken Dialog Systems by Context and Speech Pattern Modeling
@@ -146,8 +146,8 @@
2021.sigdial-1.9
tanaka-etal-2021-arta
- ahclab/arta_corpus
10.18653/v1/2021.sigdial-1.9
+ ahclab/arta_corpus
Integrated taxonomy of errors in chat-oriented dialogue systems
@@ -160,8 +160,8 @@
2021.sigdial-1.10
higashinaka-etal-2021-integrated
- ryuichiro-higashinaka/taxonomy-of-errors
10.18653/v1/2021.sigdial-1.10
+ ryuichiro-higashinaka/taxonomy-of-errors
Effective Social Chatbot Strategies for Increasing User Initiative
@@ -188,10 +188,10 @@
2021.sigdial-1.12
papangelis-etal-2021-generative
+ 10.18653/v1/2021.sigdial-1.12
ATIS
CIFAR-10
SNIPS
- 10.18653/v1/2021.sigdial-1.12
Commonsense-Focused Dialogues for Response Generation: An Empirical Study
@@ -208,13 +208,13 @@
2021.sigdial-1.13
zhou-etal-2021-commonsense
+ 10.18653/v1/2021.sigdial-1.13
alexa/commonsense-dialogues
Commonsense-Dialogues
ATOMIC
DailyDialog
FED
MuTual
- 10.18653/v1/2021.sigdial-1.13
Velocidapter: Task-oriented Dialogue Comprehension Modeling Pairing Synthetic Text Generation with Domain Adaptation
@@ -227,10 +227,10 @@
2021.sigdial-1.14
aksu-etal-2021-velocidapter
+ 10.18653/v1/2021.sigdial-1.14
cuthalionn/velocidapter
RACE
TriviaQA
- 10.18653/v1/2021.sigdial-1.14
An Analysis of State-of-the-Art Models for Situated Interactive MultiModal Conversations (SIMMC)
@@ -246,8 +246,8 @@
2021.sigdial-1.15
kottur-etal-2021-analysis
- SIMMC
10.18653/v1/2021.sigdial-1.15
+ SIMMC
A Simple yet Effective Method for Sentence Ordering
@@ -282,11 +282,11 @@
2021.sigdial-1.18
xing-carenini-2021-improving
+ 10.18653/v1/2021.sigdial-1.18
lxing532/Dialogue-Topic-Segmenter
DailyDialog
Doc2Dial
doc2dial
- 10.18653/v1/2021.sigdial-1.18
Fundamental Exploration of Evaluation Metrics for Persona Characteristics of Text Utterances
@@ -310,9 +310,9 @@
2021.sigdial-1.20
zhao-kawahara-2021-multi
+ 10.18653/v1/2021.sigdial-1.20
ZHAOTING/dialog-processing
DailyDialog
- 10.18653/v1/2021.sigdial-1.20
Contrastive Response Pairs for Automatic Evaluation of Non-task-oriented Neural Conversational Models
@@ -340,8 +340,8 @@
2021.sigdial-1.22
tian-etal-2021-bert
- SNLI
10.18653/v1/2021.sigdial-1.22
+ SNLI
Hi-DST: A Hierarchical Approach for Scalable and Extensible Dialogue State Tracking
@@ -352,11 +352,11 @@
2021.sigdial-1.23
dey-desarkar-2021-hi
+ 10.18653/v1/2021.sigdial-1.23
suvodipdey/hi-dst
MultiWOZ
SGD
SQuAD
- 10.18653/v1/2021.sigdial-1.23
Dialogue State Tracking with Multi-Level Fusion of Predicted Dialogue States and Conversations
@@ -370,9 +370,9 @@
2021.sigdial-1.24
zhou-etal-2021-dialogue
+ 10.18653/v1/2021.sigdial-1.24
helloacl/DST-DCPDS
MultiWOZ
- 10.18653/v1/2021.sigdial-1.24
Recent Neural Methods on Dialogue State Tracking for Task-Oriented Dialogue Systems: A Survey
@@ -384,9 +384,9 @@
2021.sigdial-1.25
balaraman-etal-2021-recent
+ 10.18653/v1/2021.sigdial-1.25
MultiWOZ
SGD
- 10.18653/v1/2021.sigdial-1.25
Scikit-talk: A toolkit for processing real-world conversational speech data
@@ -457,8 +457,8 @@
2021.sigdial-1.30
si-etal-2021-telling
- CRD3
10.18653/v1/2021.sigdial-1.30
+ CRD3
Summarizing Behavioral Change Goals from SMS Exchanges to Support Health Coaches
@@ -500,6 +500,7 @@
2021.sigdial-1.33
ghosal-etal-2021-cider
+ 10.18653/v1/2021.sigdial-1.33
declare-lab/CIDER
ConceptNet
DREAM
@@ -509,7 +510,6 @@
MultiNLI
SQuAD
SWAG
- 10.18653/v1/2021.sigdial-1.33
Where Are We in Discourse Relation Recognition?
@@ -521,8 +521,8 @@
2021.sigdial-1.34
atwell-etal-2021-discourse
- Penn Treebank
10.18653/v1/2021.sigdial-1.34
+ Penn Treebank
Annotation Inconsistency and Entity Bias in MultiWOZ
@@ -549,13 +549,13 @@
2021.sigdial-1.36
mahajan-shaikh-2021-need
+ 10.18653/v1/2021.sigdial-1.36
CRD3
Interview
MELD
Molweni
OpenSubtitles
Serial Speakers
- 10.18653/v1/2021.sigdial-1.36
How Should Agents Ask Questions For Situated Learning? An Annotated Dialogue Corpus
@@ -569,9 +569,9 @@
2021.sigdial-1.37
gervits-etal-2021-agents
+ 10.18653/v1/2021.sigdial-1.37
USArmyResearchLab/ARL-HuRDL
HuRDL
- 10.18653/v1/2021.sigdial-1.37
How Will I Argue? A Dataset for Evaluating Recommender Systems for Argumentations
@@ -583,8 +583,8 @@
2021.sigdial-1.38
brenneis-etal-2021-will
- hhucn/argumentation-attitude-dataset
10.18653/v1/2021.sigdial-1.38
+ hhucn/argumentation-attitude-dataset
From Argument Search to Argumentative Dialogue: A Topic-independent Approach to Argument Acquisition for Dialogue Systems
@@ -599,8 +599,8 @@
2021.sigdial-1.39
rach-etal-2021-argument
- csacro/from-argument-search-to-argumentative-dialogue
10.18653/v1/2021.sigdial-1.39
+ csacro/from-argument-search-to-argumentative-dialogue
What to Fact-Check: Guiding Check-Worthy Information Detection in News Articles through Argumentative Discourse Structure
@@ -612,8 +612,8 @@
2021.sigdial-1.40
alhindi-etal-2021-fact
- tariq60/whattofactcheck
10.18653/v1/2021.sigdial-1.40
+ tariq60/whattofactcheck
How “open” are the conversations with open-domain chatbots? A proposal for Speech Event based evaluation
@@ -635,8 +635,8 @@
2021.sigdial-1.42
ultes-maier-2021-blending
- MultiWOZ
10.18653/v1/2021.sigdial-1.42
+ MultiWOZ
Diversity as a By-Product: Goal-oriented Language Generation Leads to Linguistic Variation
@@ -702,8 +702,8 @@
2021.sigdial-1.47
lin-etal-2021-domain
- MultiWOZ
10.18653/v1/2021.sigdial-1.47
+ MultiWOZ
A Practical 2-step Approach to Assist Enterprise Question-Answering Live Chat
@@ -727,8 +727,8 @@
2021.sigdial-1.49
parthasarathi-etal-2021-brief
- ppartha03/Semantic-Loss-Dialogue-Generation
10.18653/v1/2021.sigdial-1.49
+ ppartha03/Semantic-Loss-Dialogue-Generation
Do Encoder Representations of Generative Dialogue Models have sufficient summary of the Information about the task ?
@@ -740,8 +740,8 @@
2021.sigdial-1.50
parthasarathi-etal-2021-encoder
- ppartha03/Dialogue-Probe-Tasks-Public
10.18653/v1/2021.sigdial-1.50
+ ppartha03/Dialogue-Probe-Tasks-Public
GenSF: Simultaneous Adaptation of Generative Pre-trained Models and Slot Filling
@@ -752,8 +752,8 @@
2021.sigdial-1.51
mehri-eskenazi-2021-gensf
- shikib/generative_slot_filling
10.18653/v1/2021.sigdial-1.51
+ shikib/generative_slot_filling
Schema-Guided Paradigm for Zero-Shot Dialog
@@ -764,9 +764,9 @@
2021.sigdial-1.52
mehri-eskenazi-2021-schema
+ 10.18653/v1/2021.sigdial-1.52
Shikib/schema_attention_model
STAR
- 10.18653/v1/2021.sigdial-1.52
Coreference-Aware Dialogue Summarization
@@ -780,9 +780,9 @@
Fixed typos in Section 3 and updated Table 3.
+ 10.18653/v1/2021.sigdial-1.53
seq-to-mind/coref_dial_summ
SAMSum Corpus
- 10.18653/v1/2021.sigdial-1.53
Weakly Supervised Extractive Summarization with Attention
@@ -807,8 +807,8 @@
2021.sigdial-1.55
manuvinakurike-etal-2021-incremental
- CNN/Daily Mail
10.18653/v1/2021.sigdial-1.55
+ CNN/Daily Mail
Mitigating Topic Bias when Detecting Decisions in Dialogue
@@ -835,8 +835,8 @@
2021.sigdial-1.57
bang-etal-2021-assessing
- HLTCHKUST/chatbot-political-prudence-test
10.18653/v1/2021.sigdial-1.57
+ HLTCHKUST/chatbot-political-prudence-test
Large-Scale Quantitative Evaluation of Dialogue Agents’ Response Strategies against Offensive Users
@@ -848,8 +848,8 @@
2021.sigdial-1.58
li-etal-2021-large
- lithiumh/offensive
10.18653/v1/2021.sigdial-1.58
+ lithiumh/offensive
diff --git a/data/xml/2022.acl.xml b/data/xml/2022.acl.xml
index 2f11e8e8e8..5882539bf1 100644
--- a/data/xml/2022.acl.xml
+++ b/data/xml/2022.acl.xml
@@ -2279,6 +2279,7 @@
ConceptNet
MultiNLI
SParC
+ Spider-Realistic
WikiSQL
diff --git a/data/xml/2022.coling.xml b/data/xml/2022.coling.xml
index da7371e668..27a564eacc 100644
--- a/data/xml/2022.coling.xml
+++ b/data/xml/2022.coling.xml
@@ -1773,6 +1773,7 @@
Most attempts at the Text-to-SQL task using an encoder-decoder approach show a dramatic decline in performance for new databases. For the popular Spider dataset, despite models achieving 70% accuracy on its development or test sets, the same models show a huge decline, below 20% accuracy, for unseen databases. The root causes of this problem are complex, and they cannot be easily fixed by adding more manually created training data. In this paper we address the problem and propose a solution that is a hybrid system using an automated training-data augmentation technique. Our system consists of a rule-based component and a deep learning component that interact to understand crucial information in a given query and produce correct SQL as a result. It achieves a double-digit percentage improvement for databases that are not part of the Spider corpus.
2022.coling-1.137
popescu-etal-2022-addressing
+ Spider-Realistic
Mintaka: A Complex, Natural, and Multilingual Dataset for End-to-End Question Answering
diff --git a/data/xml/2022.findings.xml b/data/xml/2022.findings.xml
index 099873e70b..bee54618e9 100644
--- a/data/xml/2022.findings.xml
+++ b/data/xml/2022.findings.xml
@@ -1566,6 +1566,7 @@
2022.findings-acl.99.software.zip
hui-etal-2022-s2sql
10.18653/v1/2022.findings-acl.99
+ Spider-Realistic
Constructing Open Cloze Tests Using Generation and Discrimination Capabilities of Transformers
@@ -3673,6 +3674,7 @@
zheng-etal-2022-hie
10.18653/v1/2022.findings-acl.236
CoSQL
+ Spider-Realistic
CRASpell: A Contextual Typo Robust Approach to Improve Chinese Spelling Correction
diff --git a/data/xml/2022.naacl.xml b/data/xml/2022.naacl.xml
index 289470ab16..194f6f0e34 100644
--- a/data/xml/2022.naacl.xml
+++ b/data/xml/2022.naacl.xml
@@ -1088,6 +1088,7 @@
10.18653/v1/2022.naacl-main.68
jzbjyb/omnitab
+ Spider-Realistic
WikiSQL
WikiTableQuestions
@@ -5658,7 +5659,7 @@
li-etal-2022-quantifying
10.18653/v1/2022.naacl-main.346
- facebookresearch/task_bench
+ belindal/taskbench500
SuperGLUE
diff --git a/data/xml/2022.sigdial.xml b/data/xml/2022.sigdial.xml
index cea695463f..64d1c48d4c 100644
--- a/data/xml/2022.sigdial.xml
+++ b/data/xml/2022.sigdial.xml
@@ -31,9 +31,9 @@
2022.sigdial-1.1
ohashi-higashinaka-2022-post
+ 10.18653/v1/2022.sigdial-1.1
nu-dialogue/post-processing-networks
MultiWOZ
- 10.18653/v1/2022.sigdial-1.1
Reducing Model Churn: Stable Re-training of Conversational Agents
@@ -45,9 +45,9 @@
2022.sigdial-1.2
hidey-etal-2022-reducing
+ 10.18653/v1/2022.sigdial-1.2
google/stable-retraining-conversational-agents
TOPv2
- 10.18653/v1/2022.sigdial-1.2
Knowledge-Grounded Conversational Data Augmentation with Generative Conversational Networks
@@ -60,8 +60,8 @@
2022.sigdial-1.3
lin-etal-2022-knowledge
- Topical-Chat
10.18653/v1/2022.sigdial-1.3
+ Topical-Chat
Guiding the Release of Safer E2E Conversational AI through Value Sensitive Design
@@ -89,8 +89,8 @@
Prior work has demonstrated that data augmentation is useful for improving dialogue state tracking. However, there are many types of user utterances, while the prior method only considered the simplest one for augmentation, raising concerns about poor generalization capability. In order to better cover diverse dialogue acts and control the generation quality, this paper proposes controllable user dialogue act augmentation (CUDA-DST) to augment user utterances with diverse behaviors. With the augmented data, different state trackers gain improvement and show better robustness, achieving state-of-the-art performance on MultiWOZ 2.1.
2022.sigdial-1.5
lai-etal-2022-controllable
- miulab/cuda-dst
10.18653/v1/2022.sigdial-1.5
+ miulab/cuda-dst
Developing an argument annotation scheme based on a semantic classification of arguments
@@ -113,9 +113,9 @@
Depression is a serious mental illness that impacts the way people communicate, especially through their emotions, and, allegedly, the way they interact with others. This work examines depression signals in dialogs, a less studied setting that suffers from data sparsity. We hypothesize that depression and emotion can inform each other, and we propose to explore the influence of dialog structure through topic and dialog act prediction. We investigate a Multi-Task Learning (MTL) approach, where all tasks mentioned above are learned jointly with dialog-tailored hierarchical modeling. We experiment on the DAIC and DailyDialog corpora – both contain dialogs in English – and show important improvements over the state of the art on depression detection (at best 70.6% F1), which demonstrates the correlation of depression with emotion and dialog organization and the power of MTL to leverage information from different sources.
2022.sigdial-1.7
li-etal-2022-multi
+ 10.18653/v1/2022.sigdial-1.7
chuyuanli/mtl4depr
DailyDialog
- 10.18653/v1/2022.sigdial-1.7
To laugh or not to laugh? The use of laughter to mark discourse structure
@@ -136,10 +136,10 @@
Despite considerable advances in open-domain neural dialogue systems, their evaluation remains a bottleneck. Several automated metrics have been proposed to evaluate these systems; however, they mostly focus on a single notion of quality, or, when they do combine several sub-metrics, they are computationally expensive. This paper attempts to solve the latter: QualityAdapt leverages the Adapter framework for the task of Dialogue Quality Estimation. Using well-defined semi-supervised tasks, we train adapters for different subqualities and score generated responses with AdapterFusion. This compositionality provides an easy-to-adapt metric for the task at hand that incorporates multiple subqualities. It also reduces computational costs, as individual predictions of all subqualities are obtained in a single forward pass. This approach achieves comparable results to state-of-the-art metrics on several datasets, whilst keeping the previously mentioned advantages.
2022.sigdial-1.9
mendonca-etal-2022-qualityadapt
+ 10.18653/v1/2022.sigdial-1.9
johndmendonca/qualityadapt
DailyDialog
FED
- 10.18653/v1/2022.sigdial-1.9
Graph Neural Network Policies and Imitation Learning for Multi-Domain Task-Oriented Dialogues
@@ -202,8 +202,8 @@
2022.sigdial-1.14
torres-foncesca-etal-2022-symbol
- COCO
10.18653/v1/2022.sigdial-1.14
+ COCO
Towards Personality-Aware Chatbots
@@ -269,8 +269,8 @@
2022.sigdial-1.19
saha-etal-2022-edu
- sougata-ub/edu-ap
10.18653/v1/2022.sigdial-1.19
+ sougata-ub/edu-ap
Using Transition Duration to Improve Turn-taking in Conversational Agents
@@ -297,12 +297,12 @@
2022.sigdial-1.21
wu-etal-2022-dg2
+ 10.18653/v1/2022.sigdial-1.21
CoQA
Doc2Dial
QuAC
ShARC
doc2dial
- 10.18653/v1/2022.sigdial-1.21
When can I Speak? Predicting initiation points for spoken dialogue agents
@@ -314,8 +314,8 @@
2022.sigdial-1.22
li-etal-2022-speak
- siyan-sylvia-li/icarus_final
10.18653/v1/2022.sigdial-1.22
+ siyan-sylvia-li/icarus_final
Using Interaction Style Dimensions to Characterize Spoken Dialog Corpora
@@ -350,8 +350,8 @@
2022.sigdial-1.25
xue-etal-2022-building
- OpenDialKG
10.18653/v1/2022.sigdial-1.25
+ OpenDialKG
Generating Meaningful Topic Descriptions with Sentence Embeddings and LDA
@@ -391,8 +391,8 @@
2022.sigdial-1.28
lin-etal-2022-gentus
- MultiWOZ
10.18653/v1/2022.sigdial-1.28
+ MultiWOZ
AARGH! End-to-end Retrieval-Generation for Task-Oriented Dialog
@@ -403,9 +403,9 @@
2022.sigdial-1.29
nekvinda-dusek-2022-aargh
+ 10.18653/v1/2022.sigdial-1.29
tomiinek/aargh
MultiWOZ
- 10.18653/v1/2022.sigdial-1.29
A Systematic Evaluation of Response Selection for Open Domain Dialogue
@@ -440,9 +440,9 @@
2022.sigdial-1.32
chi-rudnicky-2022-structured
+ 10.18653/v1/2022.sigdial-1.32
chijames/structured_dialogue_discourse_parsing
Molweni
- 10.18653/v1/2022.sigdial-1.32
“Do you follow me?”: A Survey of Recent Approaches in Dialogue State Tracking
@@ -454,10 +454,10 @@
2022.sigdial-1.33
jacqmin-etal-2022-follow
+ 10.18653/v1/2022.sigdial-1.33
CrossWOZ
MultiWOZ
SGD
- 10.18653/v1/2022.sigdial-1.33
MultiWOZ 2.4: A Multi-Domain Task-Oriented Dialogue Dataset with Essential Annotation Corrections to Improve State Tracking Evaluation
@@ -469,9 +469,9 @@
2022.sigdial-1.34
ye-etal-2022-multiwoz
+ 10.18653/v1/2022.sigdial-1.34
smartyfh/MultiWOZ2.4
MultiWOZ
- 10.18653/v1/2022.sigdial-1.34
The Duration of a Turn Cannot be Used to Predict When It Ends
@@ -493,10 +493,10 @@
2022.sigdial-1.36
tran-litman-2022-getting
+ 10.18653/v1/2022.sigdial-1.36
Doc2Dial
MultiDoc2Dial
doc2dial
- 10.18653/v1/2022.sigdial-1.36
Neural Generation Meets Real People: Building a Social, Informative Open-Domain Dialogue Agent
@@ -538,9 +538,9 @@
2022.sigdial-1.38
bhatnagar-etal-2022-deepcon
+ 10.18653/v1/2022.sigdial-1.38
MuST-C
SAMSum Corpus
- 10.18653/v1/2022.sigdial-1.38
ICM : Intent and Conversational Mining from Conversation Logs
@@ -576,8 +576,8 @@
2022.sigdial-1.41
svikhnushina-etal-2022-ieval
- sea94/ieval
10.18653/v1/2022.sigdial-1.41
+ sea94/ieval
Unsupervised Domain Adaptation on Question-Answering System with Conversation Data
@@ -589,10 +589,10 @@
2022.sigdial-1.42
adiba-etal-2022-unsupervised
+ 10.18653/v1/2022.sigdial-1.42
CNN/Daily Mail
Doc2Dial
doc2dial
- 10.18653/v1/2022.sigdial-1.42
UniDU: Towards A Unified Generative Dialogue Understanding Framework
@@ -623,8 +623,8 @@
2022.sigdial-1.44
cai-etal-2022-advancing
- cycrab/JSA-TOD
10.18653/v1/2022.sigdial-1.44
+ cycrab/JSA-TOD
Redwood: Using Collision Detection to Grow a Large-Scale Intent Classification Dataset
@@ -635,12 +635,12 @@
2022.sigdial-1.45
larson-leach-2022-redwood
+ 10.18653/v1/2022.sigdial-1.45
gxlarson/redwood
ATIS
BANKING77
CLINC150
Talk2Car
- 10.18653/v1/2022.sigdial-1.45
Dialogue Evaluation with Offline Reinforcement Learning
@@ -685,8 +685,8 @@
2022.sigdial-1.48
stevens-guille-etal-2022-generating
- symonjorystevens-guille/penngen
10.18653/v1/2022.sigdial-1.48
+ symonjorystevens-guille/penngen
Toward Self-Learning End-to-End Task-oriented Dialog Systems
@@ -713,9 +713,9 @@
2022.sigdial-1.50
stoyanchev-etal-2022-combining
+ 10.18653/v1/2022.sigdial-1.50
SGD
SNLI
- 10.18653/v1/2022.sigdial-1.50
How Much Does Prosody Help Turn-taking? Investigations using Voice Activity Projection Models
@@ -726,8 +726,8 @@
2022.sigdial-1.51
ekstedt-skantze-2022-much
- erikekstedt/conv_ssl
10.18653/v1/2022.sigdial-1.51
+ erikekstedt/conv_ssl
What makes you change your mind? An empirical investigation in online group decision-making conversations
@@ -739,8 +739,8 @@
2022.sigdial-1.52
karadzhov-etal-2022-makes
- DeliData
10.18653/v1/2022.sigdial-1.52
+ DeliData
Dialogue Term Extraction using Transfer Learning and Topological Data Analysis
@@ -755,9 +755,9 @@
2022.sigdial-1.53
vukovic-etal-2022-dialogue
+ 10.18653/v1/2022.sigdial-1.53
MultiWOZ
SGD
- 10.18653/v1/2022.sigdial-1.53
Evaluating N-best Calibration of Natural Language Understanding for Dialogue Systems
@@ -769,8 +769,8 @@
2022.sigdial-1.54
khojah-etal-2022-evaluating
- ranimkhojah/confidence-estimation-benchmark
10.18653/v1/2022.sigdial-1.54
+ ranimkhojah/confidence-estimation-benchmark
LAD: Language Models as Data for Zero-Shot Dialog
@@ -782,11 +782,11 @@
2022.sigdial-1.55
mehri-etal-2022-lad
+ 10.18653/v1/2022.sigdial-1.55
BANKING77
CLINC150
HWU64
STAR
- 10.18653/v1/2022.sigdial-1.55
Improving Bot Response Contradiction Detection via Utterance Rewriting
@@ -799,10 +799,10 @@
2022.sigdial-1.56
jin-etal-2022-improving
+ 10.18653/v1/2022.sigdial-1.56
jind11/utterance-rewriting
CANARD
DailyDialog
- 10.18653/v1/2022.sigdial-1.56
Comparison of Lexical Alignment with a Teachable Robot in Human-Robot and Human-Human-Robot Interactions
@@ -828,10 +828,10 @@
The goal of dialogue relation extraction (DRE) is to identify the relation between two entities in a given dialogue. During conversations, speakers may expose their relations to certain entities by explicit or implicit clues; such evidence is called a “trigger”. However, trigger annotations may not always be available for the target data, so it is challenging to leverage such information for enhancing the performance. Therefore, this paper proposes to learn how to identify triggers from data with trigger annotations and then transfer the trigger-finding capability to other datasets for better performance. The experiments show that the proposed approach is capable of improving relation extraction performance on unseen relations and also demonstrate the transferability of our proposed trigger-finding model across different domains and datasets.
2022.sigdial-1.58
lin-etal-2022-trend
+ 10.18653/v1/2022.sigdial-1.58
miulab/trend
DDRel
DialogRE
- 10.18653/v1/2022.sigdial-1.58
User Satisfaction Modeling with Domain Adaptation in Task-oriented Dialogue Systems
@@ -843,9 +843,9 @@
User Satisfaction Estimation (USE) is crucial in helping measure the quality of a task-oriented dialogue system. However, the complex nature of implicit responses poses challenges in detecting user satisfaction, and most datasets are limited in size or not available to the public due to user privacy policies. Unlike task-oriented dialogue, large-scale annotated chitchat with emotion labels is publicly available. Therefore, we present a novel user satisfaction model with domain adaptation (USMDA) to utilize this chitchat. We adopt a dialogue Transformer encoder to capture contextual features from the dialogue, and we reduce domain discrepancy to learn dialogue-related invariant features. Moreover, USMDA jointly learns satisfaction signals in the chitchat context with user satisfaction estimation, and user actions in task-oriented dialogue with dialogue action recognition. Experimental results on two benchmarks show that our proposed framework for the USE task outperforms existing unsupervised domain adaptation methods. To the best of our knowledge, this is the first work to study user satisfaction estimation with unsupervised domain adaptation from chitchat to task-oriented dialogue.
2022.sigdial-1.59
pan-etal-2022-user
+ 10.18653/v1/2022.sigdial-1.59
EmoryNLP
SGD
- 10.18653/v1/2022.sigdial-1.59
N-best Response-based Analysis of Contradiction-awareness in Neural Response Generation Models
@@ -859,8 +859,8 @@
Avoiding the generation of responses that contradict the preceding context is a significant challenge in dialogue response generation. One feasible method is post-processing, such as filtering out contradicting responses from a resulting n-best response list. In this scenario, the quality of the n-best list considerably affects the occurrence of contradictions because the final response is chosen from this n-best list. This study quantitatively analyzes the contextual contradiction-awareness of neural response generation models using the consistency of the n-best lists. Particularly, we used polar questions as stimulus inputs for concise and quantitative analyses. Our tests illustrate the contradiction-awareness of recent neural response generation models and methodologies, followed by a discussion of their properties and limitations.
2022.sigdial-1.60
sato-etal-2022-n
- shiki-sato/nbest-contradiction-analysis
10.18653/v1/2022.sigdial-1.60
+ shiki-sato/nbest-contradiction-analysis
A Visually-Aware Conversational Robot Receptionist
@@ -894,9 +894,9 @@
We demonstrate EMMA, an embodied multimodal agent which has been developed for the Alexa Prize SimBot challenge. The agent acts within a 3D simulated environment for household tasks. EMMA is a unified and multimodal generative model aimed at solving embodied tasks. In contrast to previous work, our approach treats multiple multimodal tasks as a single multimodal conditional text generation problem, where a model learns to output text given both language and visual input. Furthermore, we showcase that a single generative agent can solve tasks with visual inputs of varying length, such as answering questions about static images, or executing actions given a sequence of previous frames and dialogue utterances. The demo system will allow users to interact conversationally with EMMA in embodied dialogues in different 3D environments from the TEACh dataset.
2022.sigdial-1.62
suglia-etal-2022-demonstrating
+ 10.18653/v1/2022.sigdial-1.62
AI2-THOR
ALFRED
- 10.18653/v1/2022.sigdial-1.62
GRILLBot: A multi-modal conversational agent for complex real-world tasks
@@ -922,8 +922,8 @@
Robots operating in unexplored environments with human teammates will need to learn unknown concepts on the fly. To this end, we demonstrate a novel system that combines a computational model of question generation with a cognitive robotic architecture. The model supports dynamic production of back-and-forth dialogue for concept learning given observations of an environment, while the architecture supports symbolic reasoning, action representation, one-shot learning and other capabilities for situated interaction. The system is able to learn about new concepts including objects, locations, and actions, using an underlying approach that is generalizable and scalable. We evaluate the system by comparing learning efficiency to a human baseline in a collaborative reference resolution task and show that the system is effective and efficient in learning new concepts, and that it can informatively generate explanations about its behavior.
2022.sigdial-1.64
kane-etal-2022-system
- HuRDL
10.18653/v1/2022.sigdial-1.64
+ HuRDL
diff --git a/data/xml/D18.xml b/data/xml/D18.xml
index c9c86e1d82..1c05944d86 100644
--- a/data/xml/D18.xml
+++ b/data/xml/D18.xml
@@ -5868,6 +5868,7 @@
10.18653/v1/D18-1425
yu-etal-2018-spider
taoyds/spider
+ Spider-Realistic
WikiSQL
diff --git a/data/xml/D19.xml b/data/xml/D19.xml
index bb77dd3e50..c77f18b67d 100644
--- a/data/xml/D19.xml
+++ b/data/xml/D19.xml
@@ -7111,7 +7111,6 @@
NLVR
Visual Genome
Visual Question Answering v2.0
- VizWiz
Phrase Grounding by Soft-Label Chain Conditional Random Field
diff --git a/data/xml/P17.xml b/data/xml/P17.xml
index d90faf0384..1738cc709f 100644
--- a/data/xml/P17.xml
+++ b/data/xml/P17.xml
@@ -2420,7 +2420,6 @@ two word-vectors results in a vector that is only a small angle away from the ve
facebookresearch/DrQA
CBT
DBpedia
- Natural Questions
QUASAR-T
SQuAD
SearchQA
From 500a56e83416da9227dce40af1b71efc0c9b875a Mon Sep 17 00:00:00 2001
From: anthology-assist <126604033+anthology-assist@users.noreply.github.com>
Date: Fri, 1 Dec 2023 15:41:13 -0600
Subject: [PATCH 12/12] Ingestion: sigdial 2023 workshop tllm (#2862)
---
data/xml/2023.sigdial.xml | 3 +-
data/xml/2023.tllm.xml | 95 ++++++++++++++++++++++++++++++++++++++
data/yaml/venues/tllm.yaml | 3 ++
3 files changed, 100 insertions(+), 1 deletion(-)
create mode 100644 data/xml/2023.tllm.xml
create mode 100644 data/yaml/venues/tllm.yaml
diff --git a/data/xml/2023.sigdial.xml b/data/xml/2023.sigdial.xml
index 120830841f..cb81fe1c3e 100644
--- a/data/xml/2023.sigdial.xml
+++ b/data/xml/2023.sigdial.xml
@@ -2,7 +2,7 @@
- Proceedings of the 24th Meeting of the Special Interest Group on Discourse and Dialogue
+ Proceedings of the 24th Annual Meeting of the Special Interest Group on Discourse and Dialogue
SvetlanaStoyanchev
ShafiqJoty
DavidSchlangen
@@ -780,6 +780,7 @@
2023.icard-1
2023.cs4oa-1
2023.mmnlg-1
+ 2023.tllm-1
2023.yrrsds-1
diff --git a/data/xml/2023.tllm.xml b/data/xml/2023.tllm.xml
new file mode 100644
index 0000000000..2426ec0b40
--- /dev/null
+++ b/data/xml/2023.tllm.xml
@@ -0,0 +1,95 @@
+
+
+
+
+ Proceedings of the 1st Workshop on Taming Large Language Models: Controllability in the era of Interactive Assistants!
+ DevamanyuHazarika
+ Xiangru RobertTang
+ DiJin
+ Association for Computational Linguistics
+ Prague, Czech Republic
+ September
+ 2023
+ tllm
+ ws
+
+
+ CST5: Data Augmentation for Code-Switched Semantic Parsing
+ AnmolAgarwal
+ JigarGupta
+ RahulGoel
+ ShyamUpadhyay
+ PankajJoshi
+ RengarajanAravamudhan
+ 1-10
+ Extending semantic parsers to code-switched input has been a challenging problem, primarily due to a lack of supervised training data. In this work, we introduce CST5, a new data augmentation technique that fine-tunes a T5 model using a small seed set (≈100 utterances) to generate code-switched utterances from English utterances. We show that CST5 generates high quality code-switched data, both intrinsically (per human evaluation) and extrinsically by comparing baseline models which are trained without data augmentation to models which are trained with augmented data. Empirically we observe that using CST5, one can achieve the same semantic parsing performance by using up to 20x less labeled data. To aid further research in this area, we are also releasing (a) Hinglish-TOP, the largest human-annotated code-switched semantic parsing dataset to date, containing 10k human-annotated Hindi-English (Hinglish) code-switched utterances, and (b) over 170K CST5-generated code-switched utterances from the TOPv2 dataset. Human evaluation shows that both the human-annotated data and the CST5-generated data are of good quality.
+ 2023.tllm-1.1
+ agarwal-etal-2023-cst5
+
+
+ PandaGPT: One Model To Instruction-Follow Them All
+ YixuanSu
+ TianLan
+ HuayangLi
+ JialuXu
+ YanWang
+ DengCai
+ 11-23
+ We present PandaGPT, an approach to emPower large lANguage moDels with visual and Auditory instruction-following capabilities. Our pilot experiments show that PandaGPT can perform complex tasks such as detailed image description generation, writing stories inspired by videos, and answering questions about audios. More interestingly, PandaGPT can take multimodal inputs simultaneously and compose their semantics naturally. For example, PandaGPT can connect how objects look in an image/video and how they sound in an audio. To do so, PandaGPT combines the multimodal encoders from ImageBind and the large language models from Vicuna. Notably, only aligned image-text pairs are required for the training of PandaGPT. Thanks to the strong capability of ImageBind in embedding data from different modalities into the same space, PandaGPT displays emergent, i.e. zero-shot, cross-modal behaviors for data other than image and text (e.g., video, audio, depth, thermal, and IMU). We hope that PandaGPT serves as an initial step toward building AGI that can perceive and understand inputs in different modalities holistically, as we humans do.
+ 2023.tllm-1.2
+ su-etal-2023-pandagpt
+
+
+ Emotion-Conditioned Text Generation through Automatic Prompt Optimization
+ Yarik MenchacaResendiz
+ RomanKlinger
+ 24-30
+ Conditional natural language generation methods often require either expensive fine-tuning or training a large language model from scratch. Both are unlikely to lead to good results without a substantial amount of data and computational resources. Prompt learning without changing the parameters of a large language model presents a promising alternative. It is a cost-effective approach, while still achieving competitive results. While this procedure is now established for zero- and few-shot text classification and structured prediction, it has received limited attention in conditional text generation. We present the first automatic prompt optimization approach for emotion-conditioned text generation with instruction-fine-tuned models. Our method uses an iterative optimization procedure that changes the prompt by adding, removing, or replacing tokens. As the objective function, we require only a text classifier that measures the realization of the conditional variable in the generated text. We evaluate the method on emotion-conditioned text generation with a focus on event reports and compare it to manually designed prompts that also act as the seed for the optimization procedure. The optimized prompts achieve 0.75 macro-average F1 to fulfill the emotion condition, in contrast to manually designed seed prompts with only 0.22 macro-average F1.
+ 2023.tllm-1.3
+ resendiz-klinger-2023-emotion
+
+
+ Mitigating Harms of LLMs via Knowledge Distillation for a Virtual Museum Tour Guide
+ AshleyLewis
+ MichaelWhite
+ 31-45
+ LLMs are known to be very powerful, exhibiting both great benefits and great risk. We seek to leverage the benefits, in particular the ability to be fluent, conversational dialogue agents, while minimizing the risks, such as hallucination and toxic content. In this work we use knowledge distillation to create a virtual museum tour guide dialogue agent, employing ChatGPT as a teacher model for a smaller student model, T5-large. We find the T5 model shows competitive performance, significantly reduces instances of hallucination, and shows promise for reducing toxic content.
+ 2023.tllm-1.4
+ lewis-white-2023-mitigating
+
+
+ Evaluating Large Language Models for Document-grounded Response Generation in Information-Seeking Dialogues
+ NorbertBraunschweiler
+ RamaDoddipatla
+ SimonKeizer
+ SvetlanaStoyanchev
+ 46-55
+ In this paper, we investigate the use of large language models (LLMs) like ChatGPT for document-grounded response generation in the context of information-seeking dialogues. For evaluation, we use the MultiDoc2Dial corpus of task-oriented dialogues in four social service domains previously used in the DialDoc 2022 Shared Task. Information-seeking dialogue turns are grounded in multiple documents providing relevant information. We generate dialogue completion responses by prompting a ChatGPT model, using two methods: ChatCompletion and LlamaIndex. ChatCompletion uses knowledge from ChatGPT model pre-training while LlamaIndex also extracts relevant information from documents. Observing that document-grounded response generation via LLMs cannot be adequately assessed by automatic evaluation metrics as the generated responses are significantly more verbose, we perform a human evaluation where annotators rate the output of the shared task winning system, the two ChatGPT variants’ outputs, and human responses. While both ChatGPT variants are more likely to include information not present in the relevant segments, possibly indicating the presence of hallucinations, they are rated higher than both the shared task winning system and human responses.
+ 2023.tllm-1.5
+ braunschweiler-etal-2023-evaluating
+
+
+ Enhancing Pipeline-Based Conversational Agents with Large Language Models
+ MinaFoosherian
+ HendrikPurwins
+ PurnaRathnayake
+ TouhidulAlam
+ RuiTeimao
+ Klaus-DieterThoben
+ 56-67
+ The latest advancements in AI and deep learning have led to a breakthrough in large language model (LLM)-based agents such as GPT-4. However, many commercial conversational agent development tools are pipeline-based and have limitations in holding a human-like conversation. This paper investigates the capabilities of LLMs to enhance pipeline-based conversational agents during two phases: 1) in the design and development phase and 2) during operations. In 1) LLMs can aid in generating training data, extracting entities and synonyms, localization, and persona design. In 2) LLMs can assist in contextualization, intent classification to prevent conversational breakdown and handle out-of-scope questions, auto-correcting utterances, rephrasing responses, formulating disambiguation questions, summarization, and enabling closed question-answering capabilities. We conducted informal experiments with GPT-4 in the private banking domain to demonstrate the scenarios above with a practical example. Companies may be hesitant to replace their pipeline-based agents with LLMs entirely due to privacy concerns and the need for deep integration within their existing ecosystems. A hybrid approach in which LLMs are integrated into the pipeline-based agents allows them to save the time and cost of building and running agents by capitalizing on the capabilities of LLMs while retaining the integration and privacy safeguards of their existing systems.
+ 2023.tllm-1.6
+ foosherian-etal-2023-enhancing
+
+
+ Style Locality for Controllable Generation with kNN Language Models
+ GillesNawezi
+ LucieFlek
+ CharlesWelch
+ 68-75
+ Recent language models have been improved by the addition of external memory. Nearest neighbor language models retrieve similar contexts to assist in word prediction. The addition of locality levels allows a model to learn how to weight neighbors based on their relative location to the current text in source documents, and has been shown to further improve model performance. Nearest neighbor models have been explored for controllable generation but have not examined the use of locality levels. We present a novel approach for this purpose and evaluate it using automatic and human evaluation on politeness, formality, supportiveness, and toxicity textual data. We find that our model is successfully able to control style and provides a better fluency-style trade-off than previous work.
+ 2023.tllm-1.7
+ nawezi-etal-2023-style
+
+
+
diff --git a/data/yaml/venues/tllm.yaml b/data/yaml/venues/tllm.yaml
new file mode 100644
index 0000000000..d695a7177b
--- /dev/null
+++ b/data/yaml/venues/tllm.yaml
@@ -0,0 +1,3 @@
+acronym: TLLM
+name: 'Workshop on Taming Large Language Models: Controllability in the era of Interactive
+ Assistants!'