Merge pull request #22 from RTIInternational/joss-paper
JOSS paper
pmbaumgartner authored Jun 23, 2021
2 parents 2f4f6ab + d255dfc commit d9ec813
Showing 10 changed files with 212 additions and 9 deletions.
4 changes: 4 additions & 0 deletions docker-compose.yml
@@ -24,6 +24,10 @@ services:
     build:
       context: ./gobbli/augment/bert

+  marian:
+    build:
+      context: ./gobbli/augment/marian
+
   transformer:
     build:
       context: ./gobbli/model/transformer
2 changes: 1 addition & 1 deletion gobbli/augment/bert/Dockerfile
@@ -1,6 +1,6 @@
 FROM pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime

-RUN pip install transformers==2.3.0
+RUN pip install transformers==2.3.0 sentencepiece==0.1.86

 COPY ./src /code/bert
 WORKDIR /code/bert
2 changes: 1 addition & 1 deletion gobbli/augment/marian/Dockerfile
@@ -1,6 +1,6 @@
 FROM pytorch/pytorch:1.3-cuda10.1-cudnn7-runtime

-RUN pip install transformers==2.9.1
+RUN pip install transformers==2.9.1 sentencepiece==0.1.86

 COPY ./src /code/marian
 WORKDIR /code/marian
4 changes: 4 additions & 0 deletions gobbli/model/spacy/src/requirements.txt
@@ -6,6 +6,10 @@ pandas==0.25.0
 # https://github.com/explosion/spacy-transformers/pull/120
 spacy==2.2.1
 spacy-transformers==0.5.1
+# Resolve nested package version conflicts
+sentencepiece==0.1.86
+urllib3>=1.25.4,<1.27
+requests==2.25.1

 # We're using the PyTorch image with CUDA 10.1, but spaCy doesn't have an extra
 # requirements specifier for CUDA 10.1 at the time of this writing (it only has 10.0).
1 change: 1 addition & 0 deletions gobbli/model/transformer/src/requirements.txt
@@ -1,3 +1,4 @@
 # These are additional requirements needed on top of the pytorch image
 pandas==0.25.0
 transformers==2.8.0
+sentencepiece==0.1.86
10 changes: 5 additions & 5 deletions gobbli/model/use/model.py
@@ -2,7 +2,7 @@
 import shutil
 import tempfile
 from pathlib import Path
-from typing import Any, Dict
+from typing import Any, Dict, List

 import numpy as np

@@ -14,12 +14,12 @@
 from gobbli.util import assert_in, download_archive, escape_line_delimited_texts


-def _read_embeddings(output_file: Path) -> np.ndarray:
-    embeddings = []
+def _read_embeddings(output_file: Path) -> List[np.ndarray]:
+    embeddings = []  # type: List[np.ndarray]
     with open(output_file, "r") as f:
         for line in f:
-            embeddings.append(json.loads(line))
-    return np.vstack(embeddings)
+            embeddings.append(np.array(json.loads(line)))
+    return embeddings


 USE_MODEL_ARCHIVES = {
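
One note on the `_read_embeddings` change above: it is behavioral, not cosmetic. `np.vstack` collapses the per-document embeddings into a single 2-D array, while the new version returns one array per document, which also tolerates embeddings of unequal length. A minimal sketch of the difference (the JSON lines here are made-up values for illustration):

```python
import json
import numpy as np

# Two embeddings as the USE container would write them, one JSON array
# per line (values are hypothetical).
lines = ["[0.1, 0.2, 0.3]", "[0.4, 0.5, 0.6]"]

# Old behavior: one (n_docs, embed_dim) matrix.
stacked = np.vstack([json.loads(line) for line in lines])
print(stacked.shape)  # (2, 3)

# New behavior: a list of 1-D arrays, one per document.
per_doc = [np.array(json.loads(line)) for line in lines]
print(per_doc[0].shape)  # (3,)
```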
4 changes: 2 additions & 2 deletions meta.json
@@ -4,6 +4,6 @@
     "download_url": "",
     "author": "RTI International",
     "maintainer": "Jason Nance",
-    "version": "0.2.3",
+    "version": "0.2.4",
     "description": "Uniform interface to deep learning approaches via Docker containers."
   }
 }
3 changes: 3 additions & 0 deletions paper/README.md
@@ -0,0 +1,3 @@
# Journal of Open Source Software Paper

This section of the repository contains materials for a paper submitted to [JOSS](https://joss.theoj.org).
152 changes: 152 additions & 0 deletions paper/paper.bib
@@ -0,0 +1,152 @@
@article{Pang:2008,
title={Opinion Mining and Sentiment Analysis},
author={Pang, Bo and Lee, Lillian},
journal={Foundations and Trends in Information Retrieval},
volume={2},
number={1-2},
pages={1--135},
year={2008},
publisher={Now Publishers Inc.}
}

@incollection{Aggarwal:2012,
title={A Survey of Text Classification Algorithms},
author={Aggarwal, Charu C and Zhai, ChengXiang},
booktitle={Mining Text Data},
pages={163--222},
year={2012},
publisher={Springer},
doi={10.1007/978-1-4614-3223-4_6}
}

@article{Weiss:2016,
title={A survey of transfer learning},
author={Weiss, Karl and Khoshgoftaar, Taghi M and Wang, DingDing},
journal={Journal of Big Data},
volume={3},
number={1},
pages={9},
year={2016},
publisher={SpringerOpen}
}

@incollection{Torrey:2010,
title={Transfer Learning},
author={Torrey, Lisa and Shavlik, Jude},
booktitle={Handbook of Research on Machine Learning Applications and Trends: Algorithms, Methods, and Techniques},
pages={242--264},
year={2010},
publisher={IGI Global}
}

@article{Rajpurkar:2016,
title={SQuAD: 100,000+ Questions for Machine Comprehension of Text},
author={Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and Liang, Percy},
journal={arXiv preprint arXiv:1606.05250},
year={2016},
doi={10.18653/v1/d16-1264}
}

@article{Sun:2019,
title={ERNIE: Enhanced Representation through Knowledge Integration},
author={Sun, Yu and Wang, Shuohuan and Li, Yukun and Feng, Shikun and Chen, Xuyi and Zhang, Han and Tian, Xin and Zhu, Danxiang and Tian, Hao and Wu, Hua},
journal={arXiv preprint arXiv:1904.09223},
year={2019}
}

@article{Raffel:2019,
title={Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
author={Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
journal={arXiv preprint arXiv:1910.10683},
year={2019}
}

@article{Liu:2019,
title={Multi-Task Deep Neural Networks for Natural Language Understanding},
author={Liu, Xiaodong and He, Pengcheng and Chen, Weizhu and Gao, Jianfeng},
journal={arXiv preprint arXiv:1901.11504},
year={2019},
doi={10.18653/v1/p19-1441}
}

@article{Wolf:2019,
title={HuggingFace's Transformers: State-of-the-art Natural Language Processing},
author={Wolf, Thomas and Debut, Lysandre and Sanh, Victor and Chaumond, Julien and Delangue, Clement and Moi, Anthony and Cistac, Pierric and Rault, Tim and Louf, Rémi and Funtowicz, Morgan and Brew, Jamie},
journal={arXiv preprint arXiv:1910.03771},
year={2019}
}

@misc{Howard:2018,
title={fastai},
author={Howard, Jeremy and others},
year={2018},
publisher={GitHub},
howpublished={\url{https://github.com/fastai/fastai}},
}

@article{Wei:2019,
title={EDA: Easy Data Augmentation Techniques for Boosting Performance on Text Classification Tasks},
author={Wei, Jason W and Zou, Kai},
journal={arXiv preprint arXiv:1901.11196},
year={2019},
doi={10.18653/v1/d19-1670}
}

@article{Shleifer:2019,
title={Low Resource Text Classification with ULMFit and Backtranslation},
author={Shleifer, Sam},
journal={arXiv preprint arXiv:1903.09244},
year={2019}
}

@misc{Teixeira:2018,
title={Streamlit},
author={Teixeira, Thiago and Treuille, Adrien and others},
year={2018},
publisher={GitHub},
howpublished={\url{https://github.com/streamlit/streamlit}},
}

@inproceedings{Williams:2018,
author={Williams, Adina and Nangia, Nikita and Bowman, Samuel},
title={A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference},
booktitle={Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
year={2018},
publisher={Association for Computational Linguistics},
pages={1112--1122},
location={New Orleans, Louisiana},
url={http://aclweb.org/anthology/N18-1101},
doi={10.18653/v1/n18-1101}
}

@article{Bowman:2015,
title={A large annotated corpus for learning natural language inference},
author={Bowman, Samuel R and Angeli, Gabor and Potts, Christopher and Manning, Christopher D},
journal={arXiv preprint arXiv:1508.05326},
year={2015},
doi={10.18653/v1/d15-1075}
}

@inproceedings{Raina:2007,
title={Self-taught learning: transfer learning from unlabeled data},
author={Raina, Rajat and Battle, Alexis and Lee, Honglak and Packer, Benjamin and Ng, Andrew Y},
booktitle={Proceedings of the 24th international conference on Machine learning},
pages={759--766},
year={2007}
}

@article{Devlin:2018,
title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
journal={arXiv preprint arXiv:1810.04805},
year={2018}
}

@article{Kobayashi:2018,
title={Contextual Augmentation: Data Augmentation by Words with Paradigmatic Relations},
author={Kobayashi, Sosuke},
journal={arXiv preprint arXiv:1805.06201},
year={2018},
doi={10.18653/v1/n18-2072}
}
39 changes: 39 additions & 0 deletions paper/paper.md
@@ -0,0 +1,39 @@
---
title: 'gobbli: A uniform interface to deep learning for text in Python'
tags:
- Python
- deep learning
- data science
- classification
- natural language processing
authors:
- name: Jason Nance
orcid: 0000-0003-4127-3198
affiliation: 1
- name: Peter Baumgartner
orcid: 0000-0003-3117-6239
affiliation: 1
affiliations:
- name: RTI International
index: 1
date: 20 May 2020
bibliography: paper.bib
---

# Summary

Machine learning has long been used to address natural language processing (NLP) tasks like sentiment analysis [@Pang:2008] and document classification [@Aggarwal:2012]. Traditional approaches to these tasks require numerous labeled examples from the specific domains in which they will be applied, and they can learn about language only from that training data, which is often limited in size and diversity. In the last few years, transfer learning [@Weiss:2016] has caused a paradigm shift in NLP. Rather than training distinct task-specific models from scratch, a transfer learning model first learns the rules of language from a large, diverse text corpus during an extensive self-supervised training regimen. Self-supervised tasks are formulated such that unlabeled data can be used to train supervised models [@Raina:2007]; one example is masked language modeling, in which a subset of words in a document is masked out and the model predicts the masked words [@Devlin:2018]. The transfer learning model thus learns a rich representation of language, which can then be fine-tuned to solve specific problems. According to @Torrey:2010, this approach mimics how humans reuse their general understanding of language across tasks. Transfer learning has not only rapidly advanced the state of the art in classification but has also enabled near-human performance on more advanced tasks like question answering [@Rajpurkar:2016; @Raffel:2019] and natural language inference [@Rajpurkar:2016; @Williams:2018; @Bowman:2015].
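
The masked-language-modeling objective described above is easy to see in action. The snippet below is illustrative only (it is not part of ``gobbli``): it uses the ``transformers`` fill-mask pipeline, and the model name and sentence are arbitrary choices.

```python
from transformers import pipeline

# Hide one word and ask a pretrained model to predict it -- the
# self-supervised task described above. No labels are required.
fill_mask = pipeline("fill-mask", model="bert-base-uncased")
for prediction in fill_mask("The weather today is [MASK] and sunny."):
    print(prediction["token_str"], round(prediction["score"], 3))
```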

While the performance gains on benchmark tasks are undeniable, applied researchers face challenges using transfer learning models to solve new problems. A wide variety of models are being developed by disparate research teams using different technologies [@Sun:2019; @Raffel:2019; @Liu:2019]. A practitioner may therefore be required to learn a new programming language, a deep learning library, a containerization technology, and a model interface whenever they want to evaluate the feasibility of a new model on a custom task. ``gobbli`` was developed to address this problem.

``gobbli`` is a Python library intended to bridge state-of-the-art research in natural language processing and application to real-world problems. The library defines a simple interface for training classification models, producing predictions, and generating embeddings. Several models implementing the interface are available, each using a programmatically created Docker container to abstract away differences in underlying deep learning libraries and model hyperparameters. This approach lets users evaluate models and compare performance across model types without adapting their dataset and use case to each model. Compared to other deep learning libraries used for NLP, like ``transformers`` [@Wolf:2019] and ``fastai`` [@Howard:2018], ``gobbli`` emphasizes simplicity and interoperability rather than customization and performance, with the goal of making deep learning more accessible to applied researchers.
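
As a sketch of that interface (adapted from the project's quickstart; class and parameter names reflect the ``gobbli`` documentation at the time of writing and may have changed since):

```python
from gobbli.io import PredictInput, TrainInput
from gobbli.model.majority import MajorityClassifier

# The same TrainInput/PredictInput objects work for every wrapped model;
# MajorityClassifier is a trivial baseline requiring no GPU.
train_input = TrainInput(
    X_train=["This is a training document.", "Another training document."],
    y_train=["0", "1"],
    X_valid=["A validation document."],
    y_valid=["1"],
)

clf = MajorityClassifier()
clf.build()  # prepare any containers/weights the model needs
train_output = clf.train(train_input)

predict_input = PredictInput(
    X=["Which class is this document?"],
    labels=train_output.labels,
    checkpoint=train_output.checkpoint,
)
predict_output = clf.predict(predict_input)
```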

Beyond its model wrappers, ``gobbli`` provides several helpful utilities for NLP practitioners. Data augmentation has emerged as a popular technique to improve model performance when training data is limited. Multiple methods for data augmentation are implemented in ``gobbli``, including backtranslation [@Shleifer:2019], word replacement [@Wei:2019], and contextual augmentation [@Kobayashi:2018]. These methods can be used independently of ``gobbli`` models for interoperability with other modeling libraries. ``gobbli`` also bundles a set of interactive web applications built using Streamlit [@Teixeira:2018] which can be used to explore a dataset, visualize embeddings, evaluate model performance, and explain model predictions without writing any code.
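
To make the backtranslation idea concrete: ``gobbli`` runs the translation models inside a Docker container (the ``marian`` service added in this changeset), but the underlying round trip can be sketched directly with Marian models from ``transformers``. This sketch uses the current ``transformers`` API rather than the 2.9.1 pin in the Dockerfile above; the model names are the standard Helsinki-NLP checkpoints, and these tokenizers are why ``sentencepiece`` is pinned across the images in this commit.

```python
from transformers import MarianMTModel, MarianTokenizer

def translate(texts, model_name):
    # MarianTokenizer depends on sentencepiece -- the package pinned
    # in the Docker images above.
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    batch = tokenizer(texts, return_tensors="pt", padding=True)
    return tokenizer.batch_decode(model.generate(**batch),
                                  skip_special_tokens=True)

# Backtranslation: English -> French -> English yields a paraphrase
# usable as an augmented training example.
french = translate(["The quick brown fox jumps over the lazy dog."],
                   "Helsinki-NLP/opus-mt-en-fr")
print(translate(french, "Helsinki-NLP/opus-mt-fr-en"))
```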

``gobbli`` was developed from experiences on client contracts and internal projects at RTI International. It is intended for use by anyone solving problems using applied NLP, including researchers, students, and industry practitioners.

# Acknowledgments

Work on ``gobbli`` was funded by RTI International.

# References
