diff --git a/README.md b/README.md index 6f88457..0e75903 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # PyThaiNLP Corpus +[![Build Status](https://travis-ci.org/PyThaiNLP/pythainlp-corpus.svg?branch=2.1)](https://travis-ci.org/PyThaiNLP/pythainlp-corpus) Corpora and language models for [PyThaiNLP](https://github.com//PyThaiNLP/pythainlp). diff --git a/db.json b/db.json index 3b905e1..c4e589a 100644 --- a/db.json +++ b/db.json @@ -1,134 +1,220 @@ { "test": { "name": "test", - "file_name": "test.txt", - "version": "0.1", - "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/test-0.1/test.txt", - "text": "It's test file.", - "md5": "ff1f76282b7adcb310ffad3ecd867c3d", - "about": "-", - "homepage": "https://github.com/PyThaiNLP/pythainlp-corpus/", - "authors": "Wannaphong Phatthiyaphaibun" + "latest_version": "0.1", + "description": "It's a test file.", + "long_description": "A handy dummy corpus used for testing (like in unit testing) purpose.", + "url": "https://github.com/PyThaiNLP/pythainlp-corpus/", + "project_urls": { + "project_page": "https://www.thainlp.org/", + "source": "https://github.com/PyThaiNLP/pythainlp-corpus/" + }, + "license": "cc-by-sa-4.0", + "authors": [ + "Wannaphong Phatthiyaphaibun" + ], + "author_email": "wannaphong@kkumail.com", + "versions": { + "0.1": { + "filename": "test.txt", + "download_url": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/test-0.1/test.txt", + "md5": "ff1f76282b7adcb310ffad3ecd867c3d" + } + } }, "crfcut": { "name": "crfcut", - "file_name": "sentenceseg-ted.model", - "version": "0.1", - "download": "https://github.com/vistec-AI/ted_crawler/blob/master/models/sentenceseg-ted.model?raw=true", - "text": "Thai sentence segmentation with CRF trained on TED dataset", - "md5": "-", - "about": "-", - "homepage": "https://github.com/vistec-AI/ted_crawler/", - "authors": "Charin Polpanumas" + "latest_version": "0.1", + "description": "Thai sentence segmentation with CRF trained 
on TED dataset", + "long_description": "-", + "url": "https://github.com/vistec-AI/ted_crawler/", + "authors": [ + "Charin Polpanumas" + ], + "author_email": "", + "license": "cc-by-sa-4.0", + "versions": { + "0.1": { + "filename": "sentenceseg-ted.model", + "download_url": "https://github.com/vistec-AI/ted_crawler/blob/master/models/sentenceseg-ted.model?raw=true", + "md5": "-" + } + } }, "g2p": { "name": "g2p", - "file_name": "wiktionary-11-2-2020.tsv", - "version": "0.1", - "download": "https://raw.githubusercontent.com/PyThaiNLP/lexicon-thai/master/G2P/wiktionary-11-2-2020.tsv", - "text": "Grapheme to Phoneme (G2P) ภาษาไทย", - "md5": "-", - "about": "ข้อมูลดึงมาจากวิกิพจนานุกรมภาษาไทย (Thai Wiktionary)", - "homepage": "https://github.com/PyThaiNLP/lexicon-thai/tree/master/G2P", - "authors": "Wannaphong Phatthiyaphaibun" + "latest_version": "0.1", + "description": "Grapheme to Phoneme (G2P) ภาษาไทย", + "long_description": "ข้อมูลดึงมาจากวิกิพจนานุกรมภาษาไทย (Thai Wiktionary)", + "url": "https://github.com/PyThaiNLP/lexicon-thai/tree/master/G2P", + "authors": [ + "Wannaphong Phatthiyaphaibun" + ], + "author_email": "wannaphong@kkumail.com", + "license": "cc-by-sa-4.0", + "versions": { + "0.1": { + "filename": "wiktionary-11-2-2020.tsv", + "download_url": "https://raw.githubusercontent.com/PyThaiNLP/lexicon-thai/master/G2P/wiktionary-11-2-2020.tsv", + "md5": "-" + } + } }, "thai-g2p": { "name": "thai-g2p", - "file_name": "thaig2p-0.1.tar", - "version": "0.1", - "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thaig2p-v0.1/thaig2p-0.1.tar", - "text": "Thai Grapheme to Phoneme (G2P)", - "md5": "-", - "about": "Thai Grapheme to Phoneme (G2P) in PyTorch", - "homepage": "https://github.com/wannaphong/thai-g2p/", - "authors": "Wannaphong Phatthiyaphaibun" + "latest_version": "0.1", + "description": "Thai Grapheme to Phoneme (G2P)", + "long_description": "Thai Grapheme to Phoneme (G2P) in PyTorch", + "url": 
"https://github.com/wannaphong/thai-g2p/", + "authors": [ + "Wannaphong Phatthiyaphaibun" + ], + "author_email": "wannaphong@kkumail.com", + "license": "apache-2.0", + "versions": { + "0.1": { + "filename": "thaig2p-0.1.tar", + "download_url": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thaig2p-v0.1/thaig2p-0.1.tar", + "md5": "-" + } + } }, "thai2fit_wv": { "name": "thai2fit_wv", - "file_name": "thai2vec.bin", - "version": "0.1", - "download": "https://www.dropbox.com/s/yuq0gp1eges8j5n/thai2vec.bin?dl=1", - "text": "thai2vec word embeddings", - "md5": "-", - "about": "-", - "homepage": "https://github.com/cstorm125/thai2fit/", - "authors": "Charin Polpanumas" + "latest_version": "0.1", + "description": "thai2vec word embeddings", + "long_description": "-", + "url": "https://github.com/cstorm125/thai2fit/", + "authors": [ + "Charin Polpanumas" + ], + "author_email": "", + "license": "", + "versions": { + "0.1": { + "filename": "thai2vec.bin", + "download_url": "https://www.dropbox.com/s/yuq0gp1eges8j5n/thai2vec.bin?dl=1", + "md5": "-" + } + } }, "thai2rom-dataset": { "name": "thai2rom-dataset", - "file_name": "thai2rom.csv", - "version": "0.1", - "download": "https://raw.githubusercontent.com/wannaphong/thai-romanization/master/dataset/data.csv", - "text": "-", - "md5": "-", - "about": "-", - "homepage": "https://github.com/wannaphongcom/thai-romanization/", - "authors": "Wannaphong Phatthiyaphaibun" + "latest_version": "0.1", + "description": "-", + "long_description": "-", + "url": "https://github.com/wannaphongcom/thai-romanization/", + "authors": [ + "Wannaphong Phatthiyaphaibun" + ], + "author_email": "wannaphong@kkumail.com", + "license": "apache-2.0", + "versions": { + "0.1": { + "filename": "thai2rom.csv", + "download_url": "https://raw.githubusercontent.com/wannaphong/thai-romanization/master/dataset/data.csv", + "md5": "-" + } + } }, "thai2rom-pytorch": { "name": "thai2rom-pytorch", - "file_name": "thai2rom-pytorch.tar", - "version": 
"0.1", - "download": "https://raw.githubusercontent.com/c4n/thai-romanization/master/notebook/thai2rom-pytorch.tar", - "text": "-", - "md5": "-", - "about": "LSTM encoder-decoder model", - "homepage": "https://github.com/c4n/thai-romanization/", - "authors": "Can Udomcharoenchaikit" + "latest_version": "0.1", + "description": "-", + "long_description": "LSTM encoder-decoder model", + "url": "https://github.com/c4n/thai-romanization/", + "authors": [ + "Can Udomcharoenchaikit" + ], + "author_email": "", + "license": "apache-2.0", + "versions": { + "0.1": { + "filename": "thai2rom-pytorch.tar", + "download_url": "https://raw.githubusercontent.com/c4n/thai-romanization/master/notebook/thai2rom-pytorch.tar", + "md5": "-" + } + } }, "thai2rom-pytorch-attn": { "name": "thai2rom-pytorch-attn", - "file_name": "thai2rom-pytorch-attn-v0.1.tar", - "version": "0.1", - "download": "https://raw.githubusercontent.com/artificiala/thai-romanization/master/notebook/thai2rom-pytorch-attn-v0.1.tar", - "text": "-", - "md5": "-", - "about": "LSTM encoder-decoder model with attention mechanism", - "homepage": "https://github.com/artificiala/thai-romanization/", - "authors": "Chakri Lowphansirikul" + "latest_version": "0.1", + "description": "-", + "long_description": "LSTM encoder-decoder model with attention mechanism", + "url": "https://github.com/artificiala/thai-romanization/", + "authors": [ + "Chakri Lowphansirikul" + ], + "author_email": "", + "license": "apache-2.0", + "versions": { + "0.1": { + "filename": "thai2rom-pytorch-attn-v0.1.tar", + "download_url": "https://raw.githubusercontent.com/artificiala/thai-romanization/master/notebook/thai2rom-pytorch-attn-v0.1.tar", + "md5": "-" + } + } }, - "thainer-1-3": { + "thainer": { "name": "thainer", - "file_name": "thainer.model", - "version": "1.3", - "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.3/data.model", - "text": "Thai Named Entity Recognition For PyThaiNLP", - "md5": "-", - "about": 
"-", - "homepage": "https://github.com/wannaphong/thai-ner/", - "authors": "Wannaphong Phatthiyaphaibun" - }, - "thainer-1-4": { - "name": "thainer", - "file_name": "thai-ner-1-4.crfsuite", - "version": "1.4", - "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.4/thai-ner-1-4.crfsuite", - "text": "Thai Named Entity Recognition For PyThaiNLP", - "md5": "-", - "about": "-", - "homepage": "https://github.com/wannaphong/thai-ner/", - "authors": "Wannaphong Phatthiyaphaibun" + "latest_version": "1.4", + "description": "Thai Named Entity Recognition For PyThaiNLP", + "long_description": "-", + "url": "https://github.com/wannaphong/thai-ner/", + "authors": [ + "Wannaphong Phatthiyaphaibun" + ], + "author_email": "wannaphong@kkumail.com", + "license": "apache-2.0", + "versions": { + "1.3": { + "filename": "thai-ner-1-3.crfsuite", + "download_url": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.3/data.model", + "md5": "-" + }, + "1.4": { + "filename": "thai-ner-1-4.crfsuite", + "download_url": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.4/thai-ner-1-4.crfsuite", + "md5": "-" + } + } }, "wiki_itos_lstm": { "name": "wiki_itos_lstm", - "file_name": "itos_lstm.pkl", - "version": "0.32", - "download": "https://www.dropbox.com/s/87p5ugshid4mbcm/thwiki_itos.pkl?dl=1", - "text": "ULMFit index to text for LSTM", - "md5": "-", - "about": "-", - "homepage": "https://github.com/cstorm125/thai2fit/", - "authors": "Charin Polpanumas" + "latest_version": "0.32", + "description": "ULMFit index to text for LSTM", + "long_description": "-", + "url": "https://github.com/cstorm125/thai2fit/", + "authors": [ + "Charin Polpanumas" + ], + "author_email": "", + "license": "cc-by-sa-4.0", + "versions": { + "0.32": { + "filename": "itos_lstm.pkl", + "download_url": "https://www.dropbox.com/s/87p5ugshid4mbcm/thwiki_itos.pkl?dl=1", + "md5": "-" + } + } }, "wiki_lm_lstm": { "name": "wiki_lm_lstm", - 
"file_name": "thwiki_model_lstm.pth", - "version": "0.32", - "download": "https://www.dropbox.com/s/7za2o1nmq8s3fex/thwiki_lm.pth?dl=1", - "text": "Wiki-pretrained ULMFit language model for LSTM", - "md5": "-", - "about": "-", - "homepage": "https://github.com/cstorm125/thai2fit/", - "authors": "Charin Polpanumas" + "latest_version": "0.32", + "description": "Wiki-pretrained ULMFit language model for LSTM", + "long_description": "-", + "url": "https://github.com/cstorm125/thai2fit/", + "authors": [ + "Charin Polpanumas" + ], + "author_email": "", + "license": "cc-by-sa-4.0", + "versions": { + "0.32": { + "filename": "thwiki_model_lstm.pth", + "download_url": "https://www.dropbox.com/s/7za2o1nmq8s3fex/thwiki_lm.pth?dl=1", + "md5": "-" + } + } } -} +} \ No newline at end of file diff --git a/templates/details.html b/templates/details.html index 5856dfa..45f4d5d 100644 --- a/templates/details.html +++ b/templates/details.html @@ -6,15 +6,16 @@

{{ corpus['name'] }}

- version {{ corpus['version'] }} + version {{ corpus['latest_version'] }}

{{ corpus['text'] }}


-

About : {{ corpus['about'] }}

-

HomePage : {{ corpus['homepage'] }}

-

Authors : {{ corpus['authors'] }}

+

Description : {{ corpus['description'] }}

+

Long Description : {{ corpus['long_description'] }}

+

HomePage : {{ corpus['url'] }}{# db.json stores the corpus homepage under "url" #}

+

Authors : {{ corpus['authors']|join(', ') }}{# "authors" is a list in db.json; show every entry #}


Download and Use

Download

@@ -27,7 +28,7 @@

Use

if get_file('{{ corpus['name'] }}') is None than you not download {{ corpus['name'] }}.

File Details

-

File Name : {{ corpus['file_name'] }}

-

md5 : {{ corpus['md5'] }}

-

Link Download : {{ corpus['download'] }}

-{% endblock %} \ No newline at end of file +

File Name : {{ corpus['versions'][corpus['latest_version']]['filename'] }}{# per-file metadata lives in db.json under versions[<latest_version>] #}

+

md5 : {{ corpus['versions'][corpus['latest_version']]['md5'] }}

+

Link Download : {{ corpus['versions'][corpus['latest_version']]['download_url'] }}

+{% endblock %} diff --git a/templates/list-corpus.html b/templates/list-corpus.html index 0ea66bb..60a0ca5 100644 --- a/templates/list-corpus.html +++ b/templates/list-corpus.html @@ -10,13 +10,13 @@

List Corpus & Models

{% for corpus in listcorpus %}

{{ corpus['name'] }}

- {% if corpus['text']|length < 50 %} -

{{ corpus['text'] }}

+ {% if corpus['description']|length <= 50 %}{# <= 50: a description of exactly 50 chars is not truncated by [:50] #} +

{{ corpus['description'] }}

{% else %} -

{{ corpus['text'][:50]+' ...' }}

+

{{ corpus['description'][:50]+' ...' }}

{% endif %}

View details »

{% endfor %} -{% endblock %} \ No newline at end of file +{% endblock %}