From d99868fdb25c202ff535860195a4c3f992412d32 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 30 May 2020 16:31:06 +0700 Subject: [PATCH 01/14] Delete old thainer --- db.json | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/db.json b/db.json index 3b905e1..ae21c61 100644 --- a/db.json +++ b/db.json @@ -87,18 +87,7 @@ "homepage": "https://github.com/artificiala/thai-romanization/", "authors": "Chakri Lowphansirikul" }, - "thainer-1-3": { - "name": "thainer", - "file_name": "thainer.model", - "version": "1.3", - "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.3/data.model", - "text": "Thai Named Entity Recognition For PyThaiNLP", - "md5": "-", - "about": "-", - "homepage": "https://github.com/wannaphong/thai-ner/", - "authors": "Wannaphong Phatthiyaphaibun" - }, - "thainer-1-4": { + "thainer": { "name": "thainer", "file_name": "thai-ner-1-4.crfsuite", "version": "1.4", From afd713017c909a55db3e909c408cda15fd4283c5 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sat, 30 May 2020 16:48:19 +0700 Subject: [PATCH 02/14] Update db.json --- db.json | 110 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 81 insertions(+), 29 deletions(-) diff --git a/db.json b/db.json index ae21c61..b347dfe 100644 --- a/db.json +++ b/db.json @@ -3,45 +3,61 @@ "name": "test", "file_name": "test.txt", "version": "0.1", - "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/test-0.1/test.txt", "text": "It's test file.", - "md5": "ff1f76282b7adcb310ffad3ecd867c3d", "about": "-", "homepage": "https://github.com/PyThaiNLP/pythainlp-corpus/", - "authors": "Wannaphong Phatthiyaphaibun" + "authors": "Wannaphong Phatthiyaphaibun", + "release":{ + "0.1":{ + "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/test-0.1/test.txt", + "md5": "ff1f76282b7adcb310ffad3ecd867c3d" + } + } }, "crfcut": { "name": "crfcut", "file_name": "sentenceseg-ted.model", "version": "0.1", - "download": "https://github.com/vistec-AI/ted_crawler/blob/master/models/sentenceseg-ted.model?raw=true", "text": "Thai sentence segmentation with CRF trained on TED dataset", - "md5": "-", "about": "-", "homepage": "https://github.com/vistec-AI/ted_crawler/", - "authors": "Charin Polpanumas" + "authors": "Charin Polpanumas", + "release":{ + "0.1":{ + "download": "https://github.com/vistec-AI/ted_crawler/blob/master/models/sentenceseg-ted.model?raw=true", + "md5": "-" + } + } }, "g2p": { "name": "g2p", "file_name": "wiktionary-11-2-2020.tsv", "version": "0.1", - "download": "https://raw.githubusercontent.com/PyThaiNLP/lexicon-thai/master/G2P/wiktionary-11-2-2020.tsv", "text": "Grapheme to Phoneme (G2P) ภาษาไทย", - "md5": "-", "about": "ข้อมูลดึงมาจากวิกิพจนานุกรมภาษาไทย (Thai Wiktionary)", "homepage": "https://github.com/PyThaiNLP/lexicon-thai/tree/master/G2P", - "authors": "Wannaphong Phatthiyaphaibun" + "authors": "Wannaphong Phatthiyaphaibun", + "release":{ + "0.1":{ + "download": "https://raw.githubusercontent.com/PyThaiNLP/lexicon-thai/master/G2P/wiktionary-11-2-2020.tsv", + "md5": "-" + } + } }, "thai-g2p": { "name": "thai-g2p", "file_name": "thaig2p-0.1.tar", "version": "0.1", - "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thaig2p-v0.1/thaig2p-0.1.tar", "text": "Thai Grapheme to Phoneme (G2P)", - "md5": "-", "about": "Thai Grapheme to Phoneme (G2P) in PyTorch", "homepage": "https://github.com/wannaphong/thai-g2p/", - "authors": "Wannaphong Phatthiyaphaibun" + "authors": "Wannaphong Phatthiyaphaibun", + "release":{ + "0.1":{ + "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thaig2p-v0.1/thaig2p-0.1.tar", + "md5": "-" + } + } }, "thai2fit_wv": { "name": "thai2fit_wv", @@ -49,10 +65,15 @@ "version": "0.1", "download": "https://www.dropbox.com/s/yuq0gp1eges8j5n/thai2vec.bin?dl=1", "text": "thai2vec word embeddings", - "md5": "-", "about": "-", "homepage": "https://github.com/cstorm125/thai2fit/", - "authors": "Charin Polpanumas" + "authors": "Charin Polpanumas", + "release":{ + "0.1":{ + "download": "https://www.dropbox.com/s/yuq0gp1eges8j5n/thai2vec.bin?dl=1", + "md5": "-" + } + } }, "thai2rom-dataset": { "name": "thai2rom-dataset", @@ -60,10 +81,15 @@ "version": "0.1", "download": "https://raw.githubusercontent.com/wannaphong/thai-romanization/master/dataset/data.csv", "text": "-", - "md5": "-", "about": "-", "homepage": "https://github.com/wannaphongcom/thai-romanization/", - "authors": "Wannaphong Phatthiyaphaibun" + "authors": "Wannaphong Phatthiyaphaibun", + "release":{ + "0.1":{ + "download": "https://raw.githubusercontent.com/wannaphong/thai-romanization/master/dataset/data.csv", + "md5": "-" + } + } }, "thai2rom-pytorch": { "name": "thai2rom-pytorch", @@ -71,10 +97,15 @@ "version": "0.1", "download": "https://raw.githubusercontent.com/c4n/thai-romanization/master/notebook/thai2rom-pytorch.tar", "text": "-", - "md5": "-", "about": "LSTM encoder-decoder model", "homepage": "https://github.com/c4n/thai-romanization/", - "authors": "Can Udomcharoenchaikit" + "authors": "Can Udomcharoenchaikit", + "release":{ + "0.1":{ + "download": "https://raw.githubusercontent.com/c4n/thai-romanization/master/notebook/thai2rom-pytorch.tar", + "md5": "-" + } + } }, "thai2rom-pytorch-attn": { "name": "thai2rom-pytorch-attn", @@ -82,42 +113,63 @@ "version": "0.1", "download": "https://raw.githubusercontent.com/artificiala/thai-romanization/master/notebook/thai2rom-pytorch-attn-v0.1.tar", "text": "-", - "md5": "-", "about": "LSTM encoder-decoder model with attention mechanism", "homepage": "https://github.com/artificiala/thai-romanization/", - "authors": "Chakri Lowphansirikul" + "authors": "Chakri Lowphansirikul", + "release":{ + "0.1":{ + "download": "https://raw.githubusercontent.com/artificiala/thai-romanization/master/notebook/thai2rom-pytorch-attn-v0.1.tar", + "md5": "-" + } + } }, "thainer": { "name": "thainer", "file_name": "thai-ner-1-4.crfsuite", "version": "1.4", - "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.4/thai-ner-1-4.crfsuite", "text": "Thai Named Entity Recognition For PyThaiNLP", - "md5": "-", "about": "-", "homepage": "https://github.com/wannaphong/thai-ner/", - "authors": "Wannaphong Phatthiyaphaibun" + "authors": "Wannaphong Phatthiyaphaibun", + "release":{ + "1.3":{ + "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.3/data.model", + "md5": "-" + }, + "1.4":{ + "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.4/thai-ner-1-4.crfsuite", + "md5": "-" + } + } }, "wiki_itos_lstm": { "name": "wiki_itos_lstm", "file_name": "itos_lstm.pkl", "version": "0.32", - "download": "https://www.dropbox.com/s/87p5ugshid4mbcm/thwiki_itos.pkl?dl=1", "text": "ULMFit index to text for LSTM", - "md5": "-", "about": "-", "homepage": "https://github.com/cstorm125/thai2fit/", - "authors": "Charin Polpanumas" + "authors": "Charin Polpanumas", + "release":{ + "0.32":{ + "download": "https://www.dropbox.com/s/87p5ugshid4mbcm/thwiki_itos.pkl?dl=1", + "md5": "-" + } + } }, "wiki_lm_lstm": { "name": "wiki_lm_lstm", "file_name": "thwiki_model_lstm.pth", "version": "0.32", - "download": "https://www.dropbox.com/s/7za2o1nmq8s3fex/thwiki_lm.pth?dl=1", "text": "Wiki-pretrained ULMFit language model for LSTM", - "md5": "-", "about": "-", "homepage": "https://github.com/cstorm125/thai2fit/", - "authors": "Charin Polpanumas" + "authors": "Charin Polpanumas", + "release":{ + "0.32":{ + "download": "https://www.dropbox.com/s/7za2o1nmq8s3fex/thwiki_lm.pth?dl=1", + "md5": "-" + } + } } } From e3bb9f0ec8b275973d760a0f478e2cfda7065b22 Mon Sep 17 00:00:00 2001 From: bact Date: Sat, 30 May 2020 18:36:18 +0100 Subject: [PATCH 03/14] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b766043..6f88457 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,13 @@ Corpora and language models for [PyThaiNLP](https://github.com//PyThaiNLP/pythainlp). -All corpora and data created by PyThaiNLP project use [Creative Commons Attribution-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-sa/4.0/), which mean anyone can use them in other projects for whatever purposes - without the need to seek permission. For other corpus that may included with PyThaiNLP distribution, please refer to Corpus License. +- All corpora and data created by PyThaiNLP project use [Creative Commons Attribution-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-sa/4.0/), which mean anyone can use them in other projects for whatever purposes - without the need to seek permission. +- For corpora that may included with PyThaiNLP distribution (for example, when you `pip install pythainlp`), please refer to PyThaiNLP module's [Corpus License](https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/corpus_license.md). ## Branches - `master` for test and dev - `2.2` for PyThaiNLP 2.2 - `2.1` for PyThaiNLP 2.1 -- `1.7` for PyThaiNLP 1.7 - `2.0` for PyThaiNLP 2.0 +- `1.7` for PyThaiNLP 1.7 From 5d010b0ab1cf8a840eafc0667c264b574e48b908 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 31 May 2020 13:28:18 +0700 Subject: [PATCH 04/14] Update db.json --- db.json | 154 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 99 insertions(+), 55 deletions(-) diff --git a/db.json b/db.json index b347dfe..ace4c7c 100644 --- a/db.json +++ b/db.json @@ -2,11 +2,15 @@ "test": { "name": "test", "file_name": "test.txt", - "version": "0.1", - "text": "It's test file.", - "about": "-", - "homepage": "https://github.com/PyThaiNLP/pythainlp-corpus/", - "authors": "Wannaphong Phatthiyaphaibun", + "latest_version": "0.1", + "description": "It's test file.", + "long_description": "-", + "project_url": "https://github.com/PyThaiNLP/pythainlp-corpus/", + "license": "Creative Commons Attribution-ShareAlike 4.0 International License", + "authors": [ + "Wannaphong Phatthiyaphaibun" + ], + "author_email": "wannaphong@kkumail.com", "release":{ "0.1":{ "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/test-0.1/test.txt", @@ -17,11 +21,15 @@ "crfcut": { "name": "crfcut", "file_name": "sentenceseg-ted.model", - "version": "0.1", - "text": "Thai sentence segmentation with CRF trained on TED dataset", - "about": "-", - "homepage": "https://github.com/vistec-AI/ted_crawler/", - "authors": "Charin Polpanumas", + "latest_version": "0.1", + "description": "Thai sentence segmentation with CRF trained on TED dataset", + "long_description": "-", + "project_url": "https://github.com/vistec-AI/ted_crawler/", + "authors": [ + "Charin Polpanumas" + ], + "author_email": "", + "license":"", "release":{ "0.1":{ "download": "https://github.com/vistec-AI/ted_crawler/blob/master/models/sentenceseg-ted.model?raw=true", @@ -32,11 +40,15 @@ "g2p": { "name": "g2p", "file_name": "wiktionary-11-2-2020.tsv", - "version": "0.1", - "text": "Grapheme to Phoneme (G2P) ภาษาไทย", - "about": "ข้อมูลดึงมาจากวิกิพจนานุกรมภาษาไทย (Thai Wiktionary)", - "homepage": "https://github.com/PyThaiNLP/lexicon-thai/tree/master/G2P", - "authors": "Wannaphong Phatthiyaphaibun", + "latest_version": "0.1", + "description": "Grapheme to Phoneme (G2P) ภาษาไทย", + "long_description": "ข้อมูลดึงมาจากวิกิพจนานุกรมภาษาไทย (Thai Wiktionary)", + "project_url": "https://github.com/PyThaiNLP/lexicon-thai/tree/master/G2P", + "authors": [ + "Wannaphong Phatthiyaphaibun" + ], + "author_email": "wannaphong@kkumail.com", + "license":"", "release":{ "0.1":{ "download": "https://raw.githubusercontent.com/PyThaiNLP/lexicon-thai/master/G2P/wiktionary-11-2-2020.tsv", @@ -47,11 +59,15 @@ "thai-g2p": { "name": "thai-g2p", "file_name": "thaig2p-0.1.tar", - "version": "0.1", - "text": "Thai Grapheme to Phoneme (G2P)", - "about": "Thai Grapheme to Phoneme (G2P) in PyTorch", - "homepage": "https://github.com/wannaphong/thai-g2p/", - "authors": "Wannaphong Phatthiyaphaibun", + "latest_version": "0.1", + "description": "Thai Grapheme to Phoneme (G2P)", + "long_description": "Thai Grapheme to Phoneme (G2P) in PyTorch", + "project_url": "https://github.com/wannaphong/thai-g2p/", + "authors": [ + "Wannaphong Phatthiyaphaibun" + ], + "author_email": "wannaphong@kkumail.com", + "license":"", "release":{ "0.1":{ "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thaig2p-v0.1/thaig2p-0.1.tar", @@ -62,12 +78,16 @@ "thai2fit_wv": { "name": "thai2fit_wv", "file_name": "thai2vec.bin", - "version": "0.1", + "latest_version": "0.1", "download": "https://www.dropbox.com/s/yuq0gp1eges8j5n/thai2vec.bin?dl=1", - "text": "thai2vec word embeddings", - "about": "-", - "homepage": "https://github.com/cstorm125/thai2fit/", - "authors": "Charin Polpanumas", + "description": "thai2vec word embeddings", + "long_description": "-", + "project_url": "https://github.com/cstorm125/thai2fit/", + "authors": [ + "Charin Polpanumas" + ], + "author_email": "", + "license":"", "release":{ "0.1":{ "download": "https://www.dropbox.com/s/yuq0gp1eges8j5n/thai2vec.bin?dl=1", @@ -78,12 +98,16 @@ "thai2rom-dataset": { "name": "thai2rom-dataset", "file_name": "thai2rom.csv", - "version": "0.1", + "latest_version": "0.1", "download": "https://raw.githubusercontent.com/wannaphong/thai-romanization/master/dataset/data.csv", - "text": "-", - "about": "-", - "homepage": "https://github.com/wannaphongcom/thai-romanization/", - "authors": "Wannaphong Phatthiyaphaibun", + "description": "-", + "long_description": "-", + "project_url": "https://github.com/wannaphongcom/thai-romanization/", + "authors": [ + "Wannaphong Phatthiyaphaibun" + ], + "author_email": "wannaphong@kkumail.com", + "license":"", "release":{ "0.1":{ "download": "https://raw.githubusercontent.com/wannaphong/thai-romanization/master/dataset/data.csv", @@ -94,12 +118,16 @@ "thai2rom-pytorch": { "name": "thai2rom-pytorch", "file_name": "thai2rom-pytorch.tar", - "version": "0.1", + "latest_version": "0.1", "download": "https://raw.githubusercontent.com/c4n/thai-romanization/master/notebook/thai2rom-pytorch.tar", - "text": "-", - "about": "LSTM encoder-decoder model", - "homepage": "https://github.com/c4n/thai-romanization/", - "authors": "Can Udomcharoenchaikit", + "description": "-", + "long_description": "LSTM encoder-decoder model", + "project_url": "https://github.com/c4n/thai-romanization/", + "authors": [ + "Can Udomcharoenchaikit" + ], + "author_email": "", + "license":"", "release":{ "0.1":{ "download": "https://raw.githubusercontent.com/c4n/thai-romanization/master/notebook/thai2rom-pytorch.tar", @@ -110,12 +138,16 @@ "thai2rom-pytorch-attn": { "name": "thai2rom-pytorch-attn", "file_name": "thai2rom-pytorch-attn-v0.1.tar", - "version": "0.1", + "latest_version": "0.1", "download": "https://raw.githubusercontent.com/artificiala/thai-romanization/master/notebook/thai2rom-pytorch-attn-v0.1.tar", - "text": "-", - "about": "LSTM encoder-decoder model with attention mechanism", - "homepage": "https://github.com/artificiala/thai-romanization/", - "authors": "Chakri Lowphansirikul", + "description": "-", + "long_description": "LSTM encoder-decoder model with attention mechanism", + "project_url": "https://github.com/artificiala/thai-romanization/", + "authors": [ + "Chakri Lowphansirikul" + ], + "author_email": "", + "license":"", "release":{ "0.1":{ "download": "https://raw.githubusercontent.com/artificiala/thai-romanization/master/notebook/thai2rom-pytorch-attn-v0.1.tar", @@ -126,11 +158,15 @@ "thainer": { "name": "thainer", "file_name": "thai-ner-1-4.crfsuite", - "version": "1.4", - "text": "Thai Named Entity Recognition For PyThaiNLP", - "about": "-", - "homepage": "https://github.com/wannaphong/thai-ner/", - "authors": "Wannaphong Phatthiyaphaibun", + "latest_version": "1.4", + "description": "Thai Named Entity Recognition For PyThaiNLP", + "long_description": "-", + "project_url": "https://github.com/wannaphong/thai-ner/", + "authors": [ + "Wannaphong Phatthiyaphaibun" + ], + "author_email": "wannaphong@kkumail.com", + "license":"", "release":{ "1.3":{ "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.3/data.model", @@ -145,11 +181,15 @@ "wiki_itos_lstm": { "name": "wiki_itos_lstm", "file_name": "itos_lstm.pkl", - "version": "0.32", - "text": "ULMFit index to text for LSTM", - "about": "-", - "homepage": "https://github.com/cstorm125/thai2fit/", - "authors": "Charin Polpanumas", + "latest_version": "0.32", + "description": "ULMFit index to text for LSTM", + "long_description": "-", + "project_url": "https://github.com/cstorm125/thai2fit/", + "authors": [ + "Charin Polpanumas" + ], + "author_email": "", + "license":"", "release":{ "0.32":{ "download": "https://www.dropbox.com/s/87p5ugshid4mbcm/thwiki_itos.pkl?dl=1", @@ -160,11 +200,15 @@ "wiki_lm_lstm": { "name": "wiki_lm_lstm", "file_name": "thwiki_model_lstm.pth", - "version": "0.32", - "text": "Wiki-pretrained ULMFit language model for LSTM", - "about": "-", - "homepage": "https://github.com/cstorm125/thai2fit/", - "authors": "Charin Polpanumas", + "latest_version": "0.32", + "description": "Wiki-pretrained ULMFit language model for LSTM", + "long_description": "-", + "project_url": "https://github.com/cstorm125/thai2fit/", + "authors": [ + "Charin Polpanumas" + ], + "author_email": "", + "license":"", "release":{ "0.32":{ "download": "https://www.dropbox.com/s/7za2o1nmq8s3fex/thwiki_lm.pth?dl=1", From 5065eb0a7432807fdf33b9ba7d70efc5e1bebdd4 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 31 May 2020 13:32:44 +0700 Subject: [PATCH 05/14] Update db.json --- db.json | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/db.json b/db.json index ace4c7c..4c8dcda 100644 --- a/db.json +++ b/db.json @@ -1,7 +1,6 @@ { "test": { "name": "test", - "file_name": "test.txt", "latest_version": "0.1", "description": "It's test file.", "long_description": "-", @@ -13,6 +12,7 @@ "author_email": "wannaphong@kkumail.com", "release":{ "0.1":{ + "filename": "test.txt", "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/test-0.1/test.txt", "md5": "ff1f76282b7adcb310ffad3ecd867c3d" } @@ -20,7 +20,6 @@ }, "crfcut": { "name": "crfcut", - "file_name": "sentenceseg-ted.model", "latest_version": "0.1", "description": "Thai sentence segmentation with CRF trained on TED dataset", "long_description": "-", @@ -32,6 +31,7 @@ "license":"", "release":{ "0.1":{ + "filename": "sentenceseg-ted.model", "download": "https://github.com/vistec-AI/ted_crawler/blob/master/models/sentenceseg-ted.model?raw=true", "md5": "-" } @@ -39,7 +39,6 @@ }, "g2p": { "name": "g2p", - "file_name": "wiktionary-11-2-2020.tsv", "latest_version": "0.1", "description": "Grapheme to Phoneme (G2P) ภาษาไทย", "long_description": "ข้อมูลดึงมาจากวิกิพจนานุกรมภาษาไทย (Thai Wiktionary)", @@ -51,6 +50,7 @@ "license":"", "release":{ "0.1":{ + "filename": "wiktionary-11-2-2020.tsv", "download": "https://raw.githubusercontent.com/PyThaiNLP/lexicon-thai/master/G2P/wiktionary-11-2-2020.tsv", "md5": "-" } @@ -58,7 +58,6 @@ }, "thai-g2p": { "name": "thai-g2p", - "file_name": "thaig2p-0.1.tar", "latest_version": "0.1", "description": "Thai Grapheme to Phoneme (G2P)", "long_description": "Thai Grapheme to Phoneme (G2P) in PyTorch", @@ -70,6 +69,7 @@ "license":"", "release":{ "0.1":{ + "file_name": "thaig2p-0.1.tar", "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thaig2p-v0.1/thaig2p-0.1.tar", "md5": "-" } @@ -77,9 +77,7 @@ }, "thai2fit_wv": { "name": "thai2fit_wv", - "file_name": "thai2vec.bin", "latest_version": "0.1", - "download": "https://www.dropbox.com/s/yuq0gp1eges8j5n/thai2vec.bin?dl=1", "description": "thai2vec word embeddings", "long_description": "-", "project_url": "https://github.com/cstorm125/thai2fit/", @@ -90,6 +88,7 @@ "license":"", "release":{ "0.1":{ + "file_name": "thai2vec.bin", "download": "https://www.dropbox.com/s/yuq0gp1eges8j5n/thai2vec.bin?dl=1", "md5": "-" } @@ -97,9 +96,7 @@ }, "thai2rom-dataset": { "name": "thai2rom-dataset", - "file_name": "thai2rom.csv", "latest_version": "0.1", - "download": "https://raw.githubusercontent.com/wannaphong/thai-romanization/master/dataset/data.csv", "description": "-", "long_description": "-", "project_url": "https://github.com/wannaphongcom/thai-romanization/", @@ -110,6 +107,7 @@ "license":"", "release":{ "0.1":{ + "file_name": "thai2rom.csv", "download": "https://raw.githubusercontent.com/wannaphong/thai-romanization/master/dataset/data.csv", "md5": "-" } @@ -117,9 +115,7 @@ }, "thai2rom-pytorch": { "name": "thai2rom-pytorch", - "file_name": "thai2rom-pytorch.tar", "latest_version": "0.1", - "download": "https://raw.githubusercontent.com/c4n/thai-romanization/master/notebook/thai2rom-pytorch.tar", "description": "-", "long_description": "LSTM encoder-decoder model", "project_url": "https://github.com/c4n/thai-romanization/", @@ -130,6 +126,7 @@ "license":"", "release":{ "0.1":{ + "file_name": "thai2rom-pytorch.tar", "download": "https://raw.githubusercontent.com/c4n/thai-romanization/master/notebook/thai2rom-pytorch.tar", "md5": "-" } @@ -137,9 +134,7 @@ }, "thai2rom-pytorch-attn": { "name": "thai2rom-pytorch-attn", - "file_name": "thai2rom-pytorch-attn-v0.1.tar", "latest_version": "0.1", - "download": "https://raw.githubusercontent.com/artificiala/thai-romanization/master/notebook/thai2rom-pytorch-attn-v0.1.tar", "description": "-", "long_description": "LSTM encoder-decoder model with attention mechanism", "project_url": "https://github.com/artificiala/thai-romanization/", @@ -150,6 +145,7 @@ "license":"", "release":{ "0.1":{ + "file_name": "thai2rom-pytorch-attn-v0.1.tar", "download": "https://raw.githubusercontent.com/artificiala/thai-romanization/master/notebook/thai2rom-pytorch-attn-v0.1.tar", "md5": "-" } @@ -157,7 +153,6 @@ }, "thainer": { "name": "thainer", - "file_name": "thai-ner-1-4.crfsuite", "latest_version": "1.4", "description": "Thai Named Entity Recognition For PyThaiNLP", "long_description": "-", @@ -169,10 +164,12 @@ "license":"", "release":{ "1.3":{ + "file_name": "thai-ner-1-3.crfsuite", "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.3/data.model", "md5": "-" }, "1.4":{ + "file_name": "thai-ner-1-4.crfsuite", "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.4/thai-ner-1-4.crfsuite", "md5": "-" } @@ -180,7 +177,6 @@ }, "wiki_itos_lstm": { "name": "wiki_itos_lstm", - "file_name": "itos_lstm.pkl", "latest_version": "0.32", "description": "ULMFit index to text for LSTM", "long_description": "-", @@ -192,6 +188,7 @@ "license":"", "release":{ "0.32":{ + "file_name": "itos_lstm.pkl", "download": "https://www.dropbox.com/s/87p5ugshid4mbcm/thwiki_itos.pkl?dl=1", "md5": "-" } @@ -199,7 +196,6 @@ }, "wiki_lm_lstm": { "name": "wiki_lm_lstm", - "file_name": "thwiki_model_lstm.pth", "latest_version": "0.32", "description": "Wiki-pretrained ULMFit language model for LSTM", "long_description": "-", @@ -211,6 +207,7 @@ "license":"", "release":{ "0.32":{ + "file_name": "thwiki_model_lstm.pth", "download": "https://www.dropbox.com/s/7za2o1nmq8s3fex/thwiki_lm.pth?dl=1", "md5": "-" } From 58c1cf6c69c5dbc2ad2f78d22734e3b002637b49 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 31 May 2020 13:42:52 +0700 Subject: [PATCH 06/14] Add license --- db.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/db.json b/db.json index 4c8dcda..9a74d95 100644 --- a/db.json +++ b/db.json @@ -47,7 +47,7 @@ "Wannaphong Phatthiyaphaibun" ], "author_email": "wannaphong@kkumail.com", - "license":"", + "license":"Creative Commons Attribution-ShareAlike 4.0 International License", "release":{ "0.1":{ "filename": "wiktionary-11-2-2020.tsv", @@ -66,7 +66,7 @@ "Wannaphong Phatthiyaphaibun" ], "author_email": "wannaphong@kkumail.com", - "license":"", + "license":"Apache License 2.0", "release":{ "0.1":{ "file_name": "thaig2p-0.1.tar", @@ -104,7 +104,7 @@ "Wannaphong Phatthiyaphaibun" ], "author_email": "wannaphong@kkumail.com", - "license":"", + "license":"Apache License 2.0", "release":{ "0.1":{ "file_name": "thai2rom.csv", @@ -123,7 +123,7 @@ "Can Udomcharoenchaikit" ], "author_email": "", - "license":"", + "license":"Apache License 2.0", "release":{ "0.1":{ "file_name": "thai2rom-pytorch.tar", @@ -142,7 +142,7 @@ "Chakri Lowphansirikul" ], "author_email": "", - "license":"", + "license":"Apache License 2.0", "release":{ "0.1":{ "file_name": "thai2rom-pytorch-attn-v0.1.tar", @@ -161,7 +161,7 @@ "Wannaphong Phatthiyaphaibun" ], "author_email": "wannaphong@kkumail.com", - "license":"", + "license":"Apache License 2.0", "release":{ "1.3":{ "file_name": "thai-ner-1-3.crfsuite", From 62de0d15e4cd57adf62a405295a481f0b31a1539 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 31 May 2020 14:17:55 +0700 Subject: [PATCH 07/14] Update db.json --- db.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/db.json b/db.json index 9a74d95..05c3593 100644 --- a/db.json +++ b/db.json @@ -12,7 +12,7 @@ "author_email": "wannaphong@kkumail.com", "release":{ "0.1":{ - "filename": "test.txt", + "file_name": "test.txt", "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/test-0.1/test.txt", "md5": "ff1f76282b7adcb310ffad3ecd867c3d" } @@ -31,7 +31,7 @@ "license":"", "release":{ "0.1":{ - "filename": "sentenceseg-ted.model", + "file_name": "sentenceseg-ted.model", "download": "https://github.com/vistec-AI/ted_crawler/blob/master/models/sentenceseg-ted.model?raw=true", "md5": "-" } @@ -50,7 +50,7 @@ "license":"Creative Commons Attribution-ShareAlike 4.0 International License", "release":{ "0.1":{ - "filename": "wiktionary-11-2-2020.tsv", + "file_name": "wiktionary-11-2-2020.tsv", "download": "https://raw.githubusercontent.com/PyThaiNLP/lexicon-thai/master/G2P/wiktionary-11-2-2020.tsv", "md5": "-" } From a5c4a19da96ed87bf693b0f4e234b36ccbc2d610 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 31 May 2020 17:06:36 +0700 Subject: [PATCH 08/14] Update details.html --- templates/details.html | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/templates/details.html b/templates/details.html index 5856dfa..16fd606 100644 --- a/templates/details.html +++ b/templates/details.html @@ -6,15 +6,15 @@

{{ corpus['name'] }}

- version {{ corpus['version'] }} + version {{ corpus['latest_version'] }}

{{ corpus['text'] }}


-

About : {{ corpus['about'] }}

-

HomePage : {{ corpus['homepage'] }}

-

Authors : {{ corpus['authors'] }}

+

About : {{ corpus['description'] }}

+

HomePage : {{ corpus['project_url'] }}

+

Authors : {{ corpus['authors'][0] }}


Download and Use

Download

@@ -27,7 +27,7 @@

Use

if get_file('{{ corpus['name'] }}') is None than you not download {{ corpus['name'] }}.

File Details

-

File Name : {{ corpus['file_name'] }}

-

md5 : {{ corpus['md5'] }}

-

Link Download : {{ corpus['download'] }}

-{% endblock %} \ No newline at end of file +

File Name : {{ corpus[corpus['release']['latest_version']]['file_name'] }}

+

md5 : {{ corpus[corpus['release']['latest_version']]['md5'] }}

+

Link Download : {{ corpus['download'] }}

+{% endblock %} From a4a22d8050928623733900fe0076c6a81c9cfb4f Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 31 May 2020 17:07:39 +0700 Subject: [PATCH 09/14] Update list-corpus.html --- templates/list-corpus.html | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/templates/list-corpus.html b/templates/list-corpus.html index 0ea66bb..60a0ca5 100644 --- a/templates/list-corpus.html +++ b/templates/list-corpus.html @@ -10,13 +10,13 @@

List Corpus & Models

{% for corpus in listcorpus %}

{{ corpus['name'] }}

- {% if corpus['text']|length < 50 %} -

{{ corpus['text'] }}

+ {% if corpus['description']|length < 50 %} +

{{ corpus['description'] }}

{% else %} -

{{ corpus['text'][:50]+' ...' }}

+

{{ corpus['description'][:50]+' ...' }}

{% endif %}

View details »

{% endfor %} -{% endblock %} \ No newline at end of file +{% endblock %} From cf684dfbb3f0fda6a312133d689ebd302a841fe5 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 31 May 2020 17:08:35 +0700 Subject: [PATCH 10/14] Update details.html --- templates/details.html | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/templates/details.html b/templates/details.html index 16fd606..20e2d79 100644 --- a/templates/details.html +++ b/templates/details.html @@ -12,7 +12,8 @@

{{ corpus['name'] }}

{{ corpus['text'] }}


-

About : {{ corpus['description'] }}

+

Description : {{ corpus['description'] }}

+

Long Description : {{ corpus['long_description'] }}

HomePage : {{ corpus['project_url'] }}

Authors : {{ corpus['authors'][0] }}


From e523bd1ce3cff62b67d83801669e404aaa229e55 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 31 May 2020 17:10:55 +0700 Subject: [PATCH 11/14] Update details.html --- templates/details.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/details.html b/templates/details.html index 20e2d79..dd72406 100644 --- a/templates/details.html +++ b/templates/details.html @@ -29,6 +29,6 @@

Use


File Details

File Name : {{ corpus[corpus['release']['latest_version']]['file_name'] }}

-

md5 : {{ corpus[corpus['release']['latest_version']]['md5'] }}

+

md5 : {{ corpus['release']['latest_version']['md5'] }}

Link Download : {{ corpus['download'] }}

{% endblock %} From a2059d58d2bc4e70d76130008ffdb2baa7c278f3 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 31 May 2020 17:13:00 +0700 Subject: [PATCH 12/14] Update details.html --- templates/details.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/details.html b/templates/details.html index dd72406..45f4d5d 100644 --- a/templates/details.html +++ b/templates/details.html @@ -28,7 +28,7 @@

Use

if get_file('{{ corpus['name'] }}') is None than you not download {{ corpus['name'] }}.

File Details

-

File Name : {{ corpus[corpus['release']['latest_version']]['file_name'] }}

+

File Name : {{ corpus['release']['latest_version']['file_name'] }}

md5 : {{ corpus['release']['latest_version']['md5'] }}

-

Link Download : {{ corpus['download'] }}

+

Link Download : {{ corpus['download'] }}

{% endblock %} From 517fdd1a9591ed9b3d3be9d9df0908643c08b555 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Sun, 31 May 2020 17:13:28 +0700 Subject: [PATCH 13/14] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 6f88457..0e75903 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ # PyThaiNLP Corpus +[![Build Status](https://travis-ci.org/PyThaiNLP/pythainlp-corpus.svg?branch=2.1)](https://travis-ci.org/PyThaiNLP/pythainlp-corpus) Corpora and language models for [PyThaiNLP](https://github.com//PyThaiNLP/pythainlp). From 50bec646b5fa8d13fc2d7beef973d67c8be47abf Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Sun, 31 May 2020 13:26:35 +0100 Subject: [PATCH 14/14] Update field names and use license abbreviated names --- db.json | 148 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 76 insertions(+), 72 deletions(-) diff --git a/db.json b/db.json index 05c3593..c4e589a 100644 --- a/db.json +++ b/db.json @@ -2,18 +2,22 @@ "test": { "name": "test", "latest_version": "0.1", - "description": "It's test file.", - "long_description": "-", - "project_url": "https://github.com/PyThaiNLP/pythainlp-corpus/", - "license": "Creative Commons Attribution-ShareAlike 4.0 International License", + "description": "It's a test file.", + "long_description": "A handy dummy corpus used for testing (like in unit testing) purpose.", + "url": "https://github.com/PyThaiNLP/pythainlp-corpus/", + "project_urls": { + "project_page": "https://www.thainlp.org/", + "source": "https://github.com/PyThaiNLP/pythainlp-corpus/" + }, + "license": "cc-by-sa-4.0", "authors": [ "Wannaphong Phatthiyaphaibun" ], "author_email": "wannaphong@kkumail.com", - "release":{ - "0.1":{ - "file_name": "test.txt", - "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/test-0.1/test.txt", + "versions": { + "0.1": { + "filename": "test.txt", + "download_url": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/test-0.1/test.txt", "md5": "ff1f76282b7adcb310ffad3ecd867c3d" } } @@ -23,16 +27,16 @@ "latest_version": "0.1", "description": "Thai sentence segmentation with CRF trained on TED dataset", "long_description": "-", - "project_url": "https://github.com/vistec-AI/ted_crawler/", + "url": "https://github.com/vistec-AI/ted_crawler/", "authors": [ "Charin Polpanumas" ], "author_email": "", - "license":"", - "release":{ - "0.1":{ - "file_name": "sentenceseg-ted.model", - "download": "https://github.com/vistec-AI/ted_crawler/blob/master/models/sentenceseg-ted.model?raw=true", + "license": "cc-by-sa-4.0", + "versions": { + "0.1": { + "filename": "sentenceseg-ted.model", + "download_url": "https://github.com/vistec-AI/ted_crawler/blob/master/models/sentenceseg-ted.model?raw=true", "md5": "-" } } @@ -42,16 +46,16 @@ "latest_version": "0.1", "description": "Grapheme to Phoneme (G2P) ภาษาไทย", "long_description": "ข้อมูลดึงมาจากวิกิพจนานุกรมภาษาไทย (Thai Wiktionary)", - "project_url": "https://github.com/PyThaiNLP/lexicon-thai/tree/master/G2P", + "url": "https://github.com/PyThaiNLP/lexicon-thai/tree/master/G2P", "authors": [ "Wannaphong Phatthiyaphaibun" ], "author_email": "wannaphong@kkumail.com", - "license":"Creative Commons Attribution-ShareAlike 4.0 International License", - "release":{ - "0.1":{ - "file_name": "wiktionary-11-2-2020.tsv", - "download": "https://raw.githubusercontent.com/PyThaiNLP/lexicon-thai/master/G2P/wiktionary-11-2-2020.tsv", + "license": "cc-by-sa-4.0", + "versions": { + "0.1": { + "filename": "wiktionary-11-2-2020.tsv", + "download_url": "https://raw.githubusercontent.com/PyThaiNLP/lexicon-thai/master/G2P/wiktionary-11-2-2020.tsv", "md5": "-" } } @@ -61,16 +65,16 @@ "latest_version": "0.1", "description": "Thai Grapheme to Phoneme (G2P)", "long_description": "Thai Grapheme to Phoneme (G2P) in PyTorch", - "project_url": "https://github.com/wannaphong/thai-g2p/", + "url": "https://github.com/wannaphong/thai-g2p/", "authors": [ "Wannaphong Phatthiyaphaibun" ], "author_email": "wannaphong@kkumail.com", - "license":"Apache License 2.0", - "release":{ - "0.1":{ - "file_name": "thaig2p-0.1.tar", - "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thaig2p-v0.1/thaig2p-0.1.tar", + "license": "apache-2.0", + "versions": { + "0.1": { + "filename": "thaig2p-0.1.tar", + "download_url": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thaig2p-v0.1/thaig2p-0.1.tar", "md5": "-" } } @@ -80,16 +84,16 @@ "latest_version": "0.1", "description": "thai2vec word embeddings", "long_description": "-", - "project_url": "https://github.com/cstorm125/thai2fit/", + "url": "https://github.com/cstorm125/thai2fit/", "authors": [ "Charin Polpanumas" ], "author_email": "", - "license":"", - "release":{ - "0.1":{ - "file_name": "thai2vec.bin", - "download": "https://www.dropbox.com/s/yuq0gp1eges8j5n/thai2vec.bin?dl=1", + "license": "", + "versions": { + "0.1": { + "filename": "thai2vec.bin", + "download_url": "https://www.dropbox.com/s/yuq0gp1eges8j5n/thai2vec.bin?dl=1", "md5": "-" } } @@ -99,16 +103,16 @@ "latest_version": "0.1", "description": "-", "long_description": "-", - "project_url": "https://github.com/wannaphongcom/thai-romanization/", + "url": "https://github.com/wannaphongcom/thai-romanization/", "authors": [ "Wannaphong Phatthiyaphaibun" ], "author_email": "wannaphong@kkumail.com", - "license":"Apache License 2.0", - "release":{ - "0.1":{ - "file_name": "thai2rom.csv", - "download": "https://raw.githubusercontent.com/wannaphong/thai-romanization/master/dataset/data.csv", + "license": "apache-2.0", + "versions": { + "0.1": { + "filename": "thai2rom.csv", + "download_url": "https://raw.githubusercontent.com/wannaphong/thai-romanization/master/dataset/data.csv", "md5": "-" } } @@ -118,16 +122,16 @@ "latest_version": "0.1", "description": "-", "long_description": "LSTM encoder-decoder model", - "project_url": "https://github.com/c4n/thai-romanization/", + "url": "https://github.com/c4n/thai-romanization/", "authors": [ "Can Udomcharoenchaikit" ], "author_email": "", - "license":"Apache License 2.0", - "release":{ - "0.1":{ - "file_name": "thai2rom-pytorch.tar", - "download": "https://raw.githubusercontent.com/c4n/thai-romanization/master/notebook/thai2rom-pytorch.tar", + "license": "apache-2.0", + "versions": { + "0.1": { + "filename": "thai2rom-pytorch.tar", + "download_url": "https://raw.githubusercontent.com/c4n/thai-romanization/master/notebook/thai2rom-pytorch.tar", "md5": "-" } } @@ -137,16 +141,16 @@ "latest_version": "0.1", "description": "-", "long_description": "LSTM encoder-decoder model with attention mechanism", - "project_url": "https://github.com/artificiala/thai-romanization/", + "url": "https://github.com/artificiala/thai-romanization/", "authors": [ "Chakri Lowphansirikul" ], "author_email": "", - "license":"Apache License 2.0", - "release":{ - "0.1":{ - "file_name": "thai2rom-pytorch-attn-v0.1.tar", - "download": "https://raw.githubusercontent.com/artificiala/thai-romanization/master/notebook/thai2rom-pytorch-attn-v0.1.tar", + "license": "apache-2.0", + "versions": { + "0.1": { + "filename": "thai2rom-pytorch-attn-v0.1.tar", + "download_url": "https://raw.githubusercontent.com/artificiala/thai-romanization/master/notebook/thai2rom-pytorch-attn-v0.1.tar", "md5": "-" } } @@ -156,21 +160,21 @@ "latest_version": "1.4", "description": "Thai Named Entity Recognition For PyThaiNLP", "long_description": "-", - "project_url": "https://github.com/wannaphong/thai-ner/", + "url": "https://github.com/wannaphong/thai-ner/", "authors": [ "Wannaphong Phatthiyaphaibun" ], "author_email": "wannaphong@kkumail.com", - "license":"Apache License 2.0", - "release":{ - "1.3":{ - "file_name": "thai-ner-1-3.crfsuite", - "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.3/data.model", + "license": "apache-2.0", + "versions": { + "1.3": { + "filename": "thai-ner-1-3.crfsuite", + "download_url": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.3/data.model", "md5": "-" }, - "1.4":{ - "file_name": "thai-ner-1-4.crfsuite", - "download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.4/thai-ner-1-4.crfsuite", + "1.4": { + "filename": "thai-ner-1-4.crfsuite", + "download_url": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.4/thai-ner-1-4.crfsuite", "md5": "-" } } @@ -180,16 +184,16 @@ "latest_version": "0.32", "description": "ULMFit index to text for LSTM", "long_description": "-", - "project_url": "https://github.com/cstorm125/thai2fit/", + "url": "https://github.com/cstorm125/thai2fit/", "authors": [ "Charin Polpanumas" ], "author_email": "", - "license":"", - "release":{ - "0.32":{ - "file_name": "itos_lstm.pkl", - "download": "https://www.dropbox.com/s/87p5ugshid4mbcm/thwiki_itos.pkl?dl=1", + "license": "cc-by-sa-4.0", + "versions": { + "0.32": { + "filename": "itos_lstm.pkl", + "download_url": "https://www.dropbox.com/s/87p5ugshid4mbcm/thwiki_itos.pkl?dl=1", "md5": "-" } } @@ -199,18 +203,18 @@ "latest_version": "0.32", "description": "Wiki-pretrained ULMFit language model for LSTM", "long_description": "-", - "project_url": "https://github.com/cstorm125/thai2fit/", + "url": "https://github.com/cstorm125/thai2fit/", "authors": [ "Charin Polpanumas" ], "author_email": "", - "license":"", - "release":{ - "0.32":{ - "file_name": "thwiki_model_lstm.pth", - "download": "https://www.dropbox.com/s/7za2o1nmq8s3fex/thwiki_lm.pth?dl=1", + "license": "cc-by-sa-4.0", + "versions": { + "0.32": { + "filename": "thwiki_model_lstm.pth", + "download_url": "https://www.dropbox.com/s/7za2o1nmq8s3fex/thwiki_lm.pth?dl=1", "md5": "-" } } } -} +} \ No newline at end of file