From a5cdd09abba3c9e55302e3ae0bb9435023a5d375 Mon Sep 17 00:00:00 2001
From: xhlulu
Date: Wed, 17 Mar 2021 01:55:42 -0400
Subject: [PATCH] Re-add acknowledgement, bump version for release

---
 README.md | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 setup.py  |  2 +-
 2 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a00c6c9..44ea563 100644
--- a/README.md
+++ b/README.md
@@ -168,4 +168,71 @@ mt.translate(
 )
 ```
 
-Learn more in the [huggingface docs](https://huggingface.co/transformers/main_classes/model.html#transformers.generation_utils.GenerationMixin.generate).
\ No newline at end of file
+Learn more in the [huggingface docs](https://huggingface.co/transformers/main_classes/model.html#transformers.generation_utils.GenerationMixin.generate).
+
+
+## Acknowledgement
+
+`dl-translate` is built on top of Huggingface's implementation of multilingual BART (mBART-50), finetuned for many-to-many translation across 50 languages, which is [documented here](https://huggingface.co/transformers/master/model_doc/mbart.html). The original paper was written by Tang et al. from Facebook AI Research; you can [find it here](https://arxiv.org/pdf/2008.00401.pdf) and cite it using the following:
+```
+@article{tang2020multilingual,
+  title={Multilingual translation with extensible multilingual pretraining and finetuning},
+  author={Tang, Yuqing and Tran, Chau and Li, Xian and Chen, Peng-Jen and Goyal, Naman and Chaudhary, Vishrav and Gu, Jiatao and Fan, Angela},
+  journal={arXiv preprint arXiv:2008.00401},
+  year={2020}
+}
+```
+
+`dlt` is a wrapper with useful `utils` to save you time. For comparison, huggingface's `transformers` documentation shows the following snippet for the same task:
+```python
+from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
+
+article_hi = "संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है"
+article_ar = "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا."
+
+model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
+tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
+
+# translate Hindi to French
+tokenizer.src_lang = "hi_IN"
+encoded_hi = tokenizer(article_hi, return_tensors="pt")
+generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"])
+tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+# => "Le chef de l 'ONU affirme qu 'il n 'y a pas de solution militaire en Syria."
+
+# translate Arabic to English
+tokenizer.src_lang = "ar_AR"
+encoded_ar = tokenizer(article_ar, return_tensors="pt")
+generated_tokens = model.generate(**encoded_ar, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
+tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+# => "The Secretary-General of the United Nations says there is no military solution in Syria."
+```
+
+With `dlt`, you can run:
+```python
+import dl_translate as dlt
+
+article_hi = "संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है"
+article_ar = "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا."
+
+mt = dlt.TranslationModel()
+translated_fr = mt.translate(article_hi, source=dlt.lang.HINDI, target=dlt.lang.FRENCH)
+translated_en = mt.translate(article_ar, source=dlt.lang.ARABIC, target=dlt.lang.ENGLISH)
+```
+
+Notice you don't have to think about tokenizers, conditional generation, pretrained models, or regional codes; you can just tell the model what to translate!
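+
+Since sources and targets are plain language names, it is also easy to build on this. Below is a minimal sketch, reusing only the `dlt.lang` constants and the `translate` call shown above, that sends the same Arabic article to several target languages:
+```python
+import dl_translate as dlt
+
+mt = dlt.TranslationModel()
+article_ar = "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا."
+
+# dlt.lang exposes a readable constant for each supported language,
+# so looping over target languages is just looping over names.
+for target in [dlt.lang.ENGLISH, dlt.lang.FRENCH, dlt.lang.HINDI]:
+    print(target, "->", mt.translate(article_ar, source=dlt.lang.ARABIC, target=target))
+```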
+
+If you are experienced with `huggingface`'s ecosystem, the example above should feel familiar enough that you may not need this library. However, if you've never heard of huggingface or mBART, then I hope using this library will give you enough motivation to [learn more about them](https://github.com/huggingface/transformers) :)
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 2fecca8..500336c 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="dl-translate",
-    version="0.1.0rc3",
+    version="0.1.0",
     author="Xing Han Lu",
     author_email="github@xinghanlu.com",
     description="A deep learning-based translation library built on Huggingface transformers",