From c793a1265a535252bb0a5c623bc14c2ed5326f8b Mon Sep 17 00:00:00 2001
From: krikit
Date: Thu, 27 Dec 2018 20:44:06 +0900
Subject: [PATCH 1/2] version from 0.1 to 0.2 #41

---
 include/khaiii/khaiii_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/khaiii/khaiii_api.h b/include/khaiii/khaiii_api.h
index 9018c58..0a99e1c 100644
--- a/include/khaiii/khaiii_api.h
+++ b/include/khaiii/khaiii_api.h
@@ -12,7 +12,7 @@
 // constants //
 ///////////////
 #define KHAIII_VERSION_MAJOR 0
-#define KHAIII_VERSION_MINOR 1
+#define KHAIII_VERSION_MINOR 2
 #define _MAC2STR(m) #m
 #define _JOIN_VER(x,y) _MAC2STR(x) "." _MAC2STR(y)  // NOLINT
 #define KHAIII_VERSION _JOIN_VER(KHAIII_VERSION_MAJOR,KHAIII_VERSION_MINOR)  // NOLINT

From 6c9bf1bf4fcb0a4803a108f9d2dc447049eb2b94 Mon Sep 17 00:00:00 2001
From: krikit
Date: Thu, 27 Dec 2018 20:44:47 +0900
Subject: [PATCH 2/2] =?UTF-8?q?pytorch=20=EC=9D=98=EC=A1=B4=EC=84=B1=20?=
 =?UTF-8?q?=EC=98=A4=EB=A5=98=20=EC=88=98=EC=A0=95.=20numpy,=20tqdm=20?=
 =?UTF-8?q?=EC=9D=98=EC=A1=B4=EC=84=B1=EB=8F=84=20=EC=A0=9C=EA=B1=B0=20#41?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docker/Dockerfile           |  2 -
 requirements.txt            |  2 -
 rsc/lib/vocabulary.py       | 74 +++----------------------------------
 src/main/python/setup.py.in |  2 +-
 4 files changed, 6 insertions(+), 74 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 604f8da..b818a81 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -8,9 +8,7 @@
 RUN pip install cython
 RUN pip install --upgrade pip
 RUN pip install -r requirements.txt
-RUN pip install cmake
 
 RUN mkdir build
-
 WORKDIR /workspace/khaiii/build
 RUN cmake ..
diff --git a/requirements.txt b/requirements.txt
index b37d2c3..384dcd4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1 @@
 cmake>=3.10
-numpy
-tqdm
diff --git a/rsc/lib/vocabulary.py b/rsc/lib/vocabulary.py
index 08e5ab1..f07fb1c 100644
--- a/rsc/lib/vocabulary.py
+++ b/rsc/lib/vocabulary.py
@@ -11,22 +11,16 @@
 ###########
 # imports #
 ###########
-import re
 import codecs
-from collections import defaultdict
 import copy
 import logging
 import os
-import torch
-from torch import nn
-import numpy as np
-from tqdm import tqdm
 
 
 #########
 # types #
 #########
-class Vocabulary(object):
+class Vocabulary:
     """
     vocabulary class
     """
@@ -69,6 +63,9 @@ def __getitem__(self, key):
     def __len__(self):
         return len(self.dic)
 
+    '''
+    # 리소스 빌드 시 pytorch 의존성 제거를 위해 임시로 메서드를 제거합니다.
+    # 추후 학습 코드를 추가할 때 이 부분을 리팩토링 합니다.
     def get_embedding(self, dim, padding_idx=None):
         """
         embedding을 리턴합니다.
@@ -79,6 +76,7 @@ def get_embedding(self, dim, padding_idx=None):
         if padding_idx:
             return nn.Embedding(len(self), dim, padding_idx=padding_idx)
         return nn.Embedding(len(self), dim)
+    '''  # pylint: disable=pointless-string-statement
 
     def padding_idx(self):
         """
@@ -117,65 +115,3 @@ def _load(self, path, cutoff=1):
                 self.rev.append(entry)
                 append_num += 1
         logging.info('%s: %d entries, %d cutoff', os.path.basename(path), append_num, cutoff_num)
-
-
-class PreTrainedVocabulary(Vocabulary):
-    """
-    pre-train된 word2vec를 사용하는 경우, vector에 있는 어휘로
-    사전을 구성하도록 합니다.
-    """
-    def __init__(self, path):    #pylint: disable=super-init-not-called
-        """
-        Args:
-            path: file path
-        """
-        # simple : 사과/N , none : 사과
-        # 읽어들인 glove의 키 타입을 보고 판단해놓는다.
-        self.glove_key_type = None
-        self.dic, self.vectors = self._load_glove(path)
-        self.rev = {val:key for key, val in self.dic.items()}
-        assert len(self.dic) == len(self.rev)
-        logging.info('%s: %d entries, %d dim - not trainable',
-                     os.path.basename(path), len(self.dic), self.vectors.size(1))
-
-    def get_embedding(self, dim, padding_idx=None):
-        """
-        pre-training된 벡터가 세팅된 embedding을 리턴합니다.
-        """
-        assert dim == self.vectors.size(1)
-        embed = super().get_embedding(dim, padding_idx)
-        embed.weight = nn.Parameter(self.vectors, requires_grad=False)
-        return embed
-
-    def _load_glove(self, path):
-        """
-        pre-trained GloVe (텍스트 포맷) 워드 벡터를 읽어들인다.
-        Args:
-            path: 워드 벡터 경로
-        """
-        unk = None
-        vecs = []
-        for line in tqdm(codecs.open(path, 'r', encoding='UTF-8')):
-            cols = line.split(' ')
-            word = cols[0]
-            vec = np.array([float(_) for _ in cols[1:]])
-            if vec.size == 0:    # format error
-                continue
-            if word == '�':
-                unk = vec
-                continue
-            vecs.append((word, vec))
-            if self.glove_key_type is None:
-                if re.search('/[A-Z]$', word) is None:
-                    self.glove_key_type = 'none'
-                else:
-                    self.glove_key_type = 'simple'
-        if unk is None:
-            unk = [0] * len(vecs[0][1])
-        padding = [0] * len(vecs[0][1])
-        vecs.sort(key=lambda x: x[0])
-        vecs.insert(0, ('�', unk))
-        vecs.insert(1, ('�', padding))
-        vocab = defaultdict(int)
-        vocab.update({word: idx for idx, (word, _) in enumerate(vecs)})
-        return vocab, torch.Tensor([vec for _, vec in vecs])
diff --git a/src/main/python/setup.py.in b/src/main/python/setup.py.in
index e492f5d..e11b1ff 100644
--- a/src/main/python/setup.py.in
+++ b/src/main/python/setup.py.in
@@ -89,7 +89,7 @@ setup(
     packages=['khaiii', ],
     include_package_data=True,
     install_requires=[],
-    setup_requires=['numpy', 'pytest-runner', 'tqdm'],
+    setup_requires=['cmake>=3.10', 'pytest-runner'],
     tests_require=['pytest', ],
     zip_safe=False,
     cmdclass={'build': CustomBuild}
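A minimal sketch (not part of these patches) of how training-side code could still obtain an embedding layer once Vocabulary.get_embedding() is commented out by PATCH 2/2: build the nn.Embedding outside rsc/lib/vocabulary.py, so the resource-build scripts keep no pytorch, numpy, or tqdm dependency. The helper name, the availability of torch in the training environment, and the usage values below are assumptions, not part of the patch.

    # Illustrative only -- assumes torch is installed in the training environment.
    from torch import nn

    def build_embedding(vocab, dim, padding_idx=None):
        """Build an nn.Embedding sized to a Vocabulary, following the logic of the
        commented-out get_embedding(); len(vocab) works because Vocabulary defines
        __len__()."""
        if padding_idx is not None:
            return nn.Embedding(len(vocab), dim, padding_idx=padding_idx)
        return nn.Embedding(len(vocab), dim)

    # Hypothetical usage (constructor argument and dimension are illustrative):
    #     vocab = Vocabulary('some_vocab_file')
    #     embed = build_embedding(vocab, dim=100, padding_idx=vocab.padding_idx())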