Skip to content

Commit

Permalink
Merge pull request #17 from PyThaiNLP/fix-thainer
Browse files Browse the repository at this point in the history
Update corpus version
  • Loading branch information
wannaphong authored Jun 7, 2020
2 parents 6132fe2 + 50bec64 commit 8e6499d
Show file tree
Hide file tree
Showing 4 changed files with 201 additions and 113 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# PyThaiNLP Corpus
[![Build Status](https://travis-ci.org/PyThaiNLP/pythainlp-corpus.svg?branch=2.1)](https://travis-ci.org/PyThaiNLP/pythainlp-corpus)

Corpora and language models for [PyThaiNLP](https://github.com/PyThaiNLP/pythainlp).

Expand Down
288 changes: 187 additions & 101 deletions db.json
Original file line number Diff line number Diff line change
@@ -1,134 +1,220 @@
{
"test": {
"name": "test",
"file_name": "test.txt",
"version": "0.1",
"download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/test-0.1/test.txt",
"text": "It's test file.",
"md5": "ff1f76282b7adcb310ffad3ecd867c3d",
"about": "-",
"homepage": "https://github.com/PyThaiNLP/pythainlp-corpus/",
"authors": "Wannaphong Phatthiyaphaibun"
"latest_version": "0.1",
"description": "It's a test file.",
"long_description": "A handy dummy corpus used for testing (like in unit testing) purpose.",
"url": "https://github.com/PyThaiNLP/pythainlp-corpus/",
"project_urls": {
"project_page": "https://www.thainlp.org/",
"source": "https://github.com/PyThaiNLP/pythainlp-corpus/"
},
"license": "cc-by-sa-4.0",
"authors": [
"Wannaphong Phatthiyaphaibun"
],
"author_email": "[email protected]",
"versions": {
"0.1": {
"filename": "test.txt",
"download_url": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/test-0.1/test.txt",
"md5": "ff1f76282b7adcb310ffad3ecd867c3d"
}
}
},
"crfcut": {
"name": "crfcut",
"file_name": "sentenceseg-ted.model",
"version": "0.1",
"download": "https://github.com/vistec-AI/ted_crawler/blob/master/models/sentenceseg-ted.model?raw=true",
"text": "Thai sentence segmentation with CRF trained on TED dataset",
"md5": "-",
"about": "-",
"homepage": "https://github.com/vistec-AI/ted_crawler/",
"authors": "Charin Polpanumas"
"latest_version": "0.1",
"description": "Thai sentence segmentation with CRF trained on TED dataset",
"long_description": "-",
"url": "https://github.com/vistec-AI/ted_crawler/",
"authors": [
"Charin Polpanumas"
],
"author_email": "",
"license": "cc-by-sa-4.0",
"versions": {
"0.1": {
"filename": "sentenceseg-ted.model",
"download_url": "https://github.com/vistec-AI/ted_crawler/blob/master/models/sentenceseg-ted.model?raw=true",
"md5": "-"
}
}
},
"g2p": {
"name": "g2p",
"file_name": "wiktionary-11-2-2020.tsv",
"version": "0.1",
"download": "https://raw.githubusercontent.com/PyThaiNLP/lexicon-thai/master/G2P/wiktionary-11-2-2020.tsv",
"text": "Grapheme to Phoneme (G2P) ภาษาไทย",
"md5": "-",
"about": "ข้อมูลดึงมาจากวิกิพจนานุกรมภาษาไทย (Thai Wiktionary)",
"homepage": "https://github.com/PyThaiNLP/lexicon-thai/tree/master/G2P",
"authors": "Wannaphong Phatthiyaphaibun"
"latest_version": "0.1",
"description": "Grapheme to Phoneme (G2P) ภาษาไทย",
"long_description": "ข้อมูลดึงมาจากวิกิพจนานุกรมภาษาไทย (Thai Wiktionary)",
"url": "https://github.com/PyThaiNLP/lexicon-thai/tree/master/G2P",
"authors": [
"Wannaphong Phatthiyaphaibun"
],
"author_email": "[email protected]",
"license": "cc-by-sa-4.0",
"versions": {
"0.1": {
"filename": "wiktionary-11-2-2020.tsv",
"download_url": "https://raw.githubusercontent.com/PyThaiNLP/lexicon-thai/master/G2P/wiktionary-11-2-2020.tsv",
"md5": "-"
}
}
},
"thai-g2p": {
"name": "thai-g2p",
"file_name": "thaig2p-0.1.tar",
"version": "0.1",
"download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thaig2p-v0.1/thaig2p-0.1.tar",
"text": "Thai Grapheme to Phoneme (G2P)",
"md5": "-",
"about": "Thai Grapheme to Phoneme (G2P) in PyTorch",
"homepage": "https://github.com/wannaphong/thai-g2p/",
"authors": "Wannaphong Phatthiyaphaibun"
"latest_version": "0.1",
"description": "Thai Grapheme to Phoneme (G2P)",
"long_description": "Thai Grapheme to Phoneme (G2P) in PyTorch",
"url": "https://github.com/wannaphong/thai-g2p/",
"authors": [
"Wannaphong Phatthiyaphaibun"
],
"author_email": "[email protected]",
"license": "apache-2.0",
"versions": {
"0.1": {
"filename": "thaig2p-0.1.tar",
"download_url": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thaig2p-v0.1/thaig2p-0.1.tar",
"md5": "-"
}
}
},
"thai2fit_wv": {
"name": "thai2fit_wv",
"file_name": "thai2vec.bin",
"version": "0.1",
"download": "https://www.dropbox.com/s/yuq0gp1eges8j5n/thai2vec.bin?dl=1",
"text": "thai2vec word embeddings",
"md5": "-",
"about": "-",
"homepage": "https://github.com/cstorm125/thai2fit/",
"authors": "Charin Polpanumas"
"latest_version": "0.1",
"description": "thai2vec word embeddings",
"long_description": "-",
"url": "https://github.com/cstorm125/thai2fit/",
"authors": [
"Charin Polpanumas"
],
"author_email": "",
"license": "",
"versions": {
"0.1": {
"filename": "thai2vec.bin",
"download_url": "https://www.dropbox.com/s/yuq0gp1eges8j5n/thai2vec.bin?dl=1",
"md5": "-"
}
}
},
"thai2rom-dataset": {
"name": "thai2rom-dataset",
"file_name": "thai2rom.csv",
"version": "0.1",
"download": "https://raw.githubusercontent.com/wannaphong/thai-romanization/master/dataset/data.csv",
"text": "-",
"md5": "-",
"about": "-",
"homepage": "https://github.com/wannaphongcom/thai-romanization/",
"authors": "Wannaphong Phatthiyaphaibun"
"latest_version": "0.1",
"description": "-",
"long_description": "-",
"url": "https://github.com/wannaphongcom/thai-romanization/",
"authors": [
"Wannaphong Phatthiyaphaibun"
],
"author_email": "[email protected]",
"license": "apache-2.0",
"versions": {
"0.1": {
"filename": "thai2rom.csv",
"download_url": "https://raw.githubusercontent.com/wannaphong/thai-romanization/master/dataset/data.csv",
"md5": "-"
}
}
},
"thai2rom-pytorch": {
"name": "thai2rom-pytorch",
"file_name": "thai2rom-pytorch.tar",
"version": "0.1",
"download": "https://raw.githubusercontent.com/c4n/thai-romanization/master/notebook/thai2rom-pytorch.tar",
"text": "-",
"md5": "-",
"about": "LSTM encoder-decoder model",
"homepage": "https://github.com/c4n/thai-romanization/",
"authors": "Can Udomcharoenchaikit"
"latest_version": "0.1",
"description": "-",
"long_description": "LSTM encoder-decoder model",
"url": "https://github.com/c4n/thai-romanization/",
"authors": [
"Can Udomcharoenchaikit"
],
"author_email": "",
"license": "apache-2.0",
"versions": {
"0.1": {
"filename": "thai2rom-pytorch.tar",
"download_url": "https://raw.githubusercontent.com/c4n/thai-romanization/master/notebook/thai2rom-pytorch.tar",
"md5": "-"
}
}
},
"thai2rom-pytorch-attn": {
"name": "thai2rom-pytorch-attn",
"file_name": "thai2rom-pytorch-attn-v0.1.tar",
"version": "0.1",
"download": "https://raw.githubusercontent.com/artificiala/thai-romanization/master/notebook/thai2rom-pytorch-attn-v0.1.tar",
"text": "-",
"md5": "-",
"about": "LSTM encoder-decoder model with attention mechanism",
"homepage": "https://github.com/artificiala/thai-romanization/",
"authors": "Chakri Lowphansirikul"
"latest_version": "0.1",
"description": "-",
"long_description": "LSTM encoder-decoder model with attention mechanism",
"url": "https://github.com/artificiala/thai-romanization/",
"authors": [
"Chakri Lowphansirikul"
],
"author_email": "",
"license": "apache-2.0",
"versions": {
"0.1": {
"filename": "thai2rom-pytorch-attn-v0.1.tar",
"download_url": "https://raw.githubusercontent.com/artificiala/thai-romanization/master/notebook/thai2rom-pytorch-attn-v0.1.tar",
"md5": "-"
}
}
},
"thainer-1-3": {
"thainer": {
"name": "thainer",
"file_name": "thainer.model",
"version": "1.3",
"download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.3/data.model",
"text": "Thai Named Entity Recognition For PyThaiNLP",
"md5": "-",
"about": "-",
"homepage": "https://github.com/wannaphong/thai-ner/",
"authors": "Wannaphong Phatthiyaphaibun"
},
"thainer-1-4": {
"name": "thainer",
"file_name": "thai-ner-1-4.crfsuite",
"version": "1.4",
"download": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.4/thai-ner-1-4.crfsuite",
"text": "Thai Named Entity Recognition For PyThaiNLP",
"md5": "-",
"about": "-",
"homepage": "https://github.com/wannaphong/thai-ner/",
"authors": "Wannaphong Phatthiyaphaibun"
"latest_version": "1.4",
"description": "Thai Named Entity Recognition For PyThaiNLP",
"long_description": "-",
"url": "https://github.com/wannaphong/thai-ner/",
"authors": [
"Wannaphong Phatthiyaphaibun"
],
"author_email": "[email protected]",
"license": "apache-2.0",
"versions": {
"1.3": {
"filename": "thai-ner-1-3.crfsuite",
"download_url": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.3/data.model",
"md5": "-"
},
"1.4": {
"filename": "thai-ner-1-4.crfsuite",
"download_url": "https://github.com/PyThaiNLP/pythainlp-corpus/releases/download/thainer-1.4/thai-ner-1-4.crfsuite",
"md5": "-"
}
}
},
"wiki_itos_lstm": {
"name": "wiki_itos_lstm",
"file_name": "itos_lstm.pkl",
"version": "0.32",
"download": "https://www.dropbox.com/s/87p5ugshid4mbcm/thwiki_itos.pkl?dl=1",
"text": "ULMFit index to text for LSTM",
"md5": "-",
"about": "-",
"homepage": "https://github.com/cstorm125/thai2fit/",
"authors": "Charin Polpanumas"
"latest_version": "0.32",
"description": "ULMFit index to text for LSTM",
"long_description": "-",
"url": "https://github.com/cstorm125/thai2fit/",
"authors": [
"Charin Polpanumas"
],
"author_email": "",
"license": "cc-by-sa-4.0",
"versions": {
"0.32": {
"filename": "itos_lstm.pkl",
"download_url": "https://www.dropbox.com/s/87p5ugshid4mbcm/thwiki_itos.pkl?dl=1",
"md5": "-"
}
}
},
"wiki_lm_lstm": {
"name": "wiki_lm_lstm",
"file_name": "thwiki_model_lstm.pth",
"version": "0.32",
"download": "https://www.dropbox.com/s/7za2o1nmq8s3fex/thwiki_lm.pth?dl=1",
"text": "Wiki-pretrained ULMFit language model for LSTM",
"md5": "-",
"about": "-",
"homepage": "https://github.com/cstorm125/thai2fit/",
"authors": "Charin Polpanumas"
"latest_version": "0.32",
"description": "Wiki-pretrained ULMFit language model for LSTM",
"long_description": "-",
"url": "https://github.com/cstorm125/thai2fit/",
"authors": [
"Charin Polpanumas"
],
"author_email": "",
"license": "cc-by-sa-4.0",
"versions": {
"0.32": {
"filename": "thwiki_model_lstm.pth",
"download_url": "https://www.dropbox.com/s/7za2o1nmq8s3fex/thwiki_lm.pth?dl=1",
"md5": "-"
}
}
}
}
}
17 changes: 9 additions & 8 deletions templates/details.html
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@
<div class="d-flex align-items-center p-3 my-3 text-white-50 bg-purple rounded box-shadow">
<div class="lh-100">
<h1 class="mb-0 text-white lh-100">{{ corpus['name'] }}</h1>
<small>version {{ corpus['version'] }}</small>
<small>version {{ corpus['latest_version'] }}</small>
</div>
</div>
</div>
<p>{{ corpus['text'] }}</p>
<br>
<p>About : {{ corpus['about'] }}</p>
<p><b>HomePage</b> : <a rel="nofollow" target="_blank" href="{{ corpus['homepage'] }}">{{ corpus['homepage'] }}</a></p>
<p><b>Authors</b> : {{ corpus['authors'] }}</p>
{# Corpus metadata. Keys follow the db.json entry schema: "description",
   "long_description", "url" (project homepage) and "authors" (a list of names). #}
<p>Description : {{ corpus['description'] }}</p>
<p>Long Description : {{ corpus['long_description'] }}</p>
<p><b>HomePage</b> : <a rel="nofollow" target="_blank" href="{{ corpus['url'] }}">{{ corpus['url'] }}</a></p>
{# "authors" is a list, so show every author rather than only the first one. #}
<p><b>Authors</b> : {{ corpus['authors']|join(', ') }}</p>
<hr>
<h2>Download and Use</h2>
<h3>Download</h3>
Expand All @@ -27,7 +28,7 @@ <h3>Use</h3>
if <code>get_file('{{ corpus['name'] }}')</code> is <code>None</code>, then you have not downloaded <code>{{ corpus['name'] }}</code> yet.
<hr>
<h2>File Details</h2>
<p>File Name : {{ corpus['file_name'] }}</p>
<p>md5 : {{ corpus['md5'] }}</p>
<p>Link Download : <a rel="nofollow" target="_blank" href="{{ corpus['download'] }}">{{ corpus['download'] }}</a></p>
{% endblock %}
{# File details for the latest release. In db.json each corpus has a "versions"
   map keyed by version string; "latest_version" names the entry to display.
   Each version entry provides "filename", "md5" and "download_url". #}
{% set latest = corpus['versions'][corpus['latest_version']] %}
<p>File Name : {{ latest['filename'] }}</p>
<p>md5 : {{ latest['md5'] }}</p>
<p>Link Download : <a rel="nofollow" target="_blank" href="{{ latest['download_url'] }}">{{ latest['download_url'] }}</a></p>
{% endblock %}
8 changes: 4 additions & 4 deletions templates/list-corpus.html
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ <h1>List Corpus & Models</h1>
{% for corpus in listcorpus %}
<div class="col-md-4">
<h2>{{ corpus['name'] }}</h2>
{% if corpus['text']|length < 50 %}
<p>{{ corpus['text'] }}</p>
{% if corpus['description']|length < 50 %}
<p>{{ corpus['description'] }}</p>
{% else %}
<p>{{ corpus['text'][:50]+' ...' }}</p>
<p>{{ corpus['description'][:50]+' ...' }}</p>
{% endif %}
<p><a class="btn btn-secondary" href="./{{ corpus['name'] }}.html" role="button">View details &raquo;</a></p>
</div>
{% endfor %}
</div></div>
{% endblock %}
{% endblock %}

0 comments on commit 8e6499d

Please sign in to comment.