Skip to content

Commit

Permalink
For short strings, it is more accurate to use all ngrams rather than a sample
Browse files Browse the repository at this point in the history
  • Loading branch information
maparent committed May 7, 2021
1 parent a1598f1 commit 96dd84a
Showing 1 changed file with 11 additions and 0 deletions.
11 changes: 11 additions & 0 deletions langdetect/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,17 @@ def _detect_block(self):
if not ngrams:
raise LangDetectException(ErrorCode.CantDetectError, 'No features in text.')

if len(ngrams) < self.ITERATION_LIMIT / 3:
# More accurate to take them all
prob = self._init_probability()
for i, ngram in enumerate(ngrams):
self._update_lang_prob(prob, ngram, self.alpha)
if i % 5 == 0:
self._normalize_prob(prob)
self._normalize_prob(prob)
self.langprob = prob
return

self.langprob = [0.0] * len(self.langlist)

self.random.seed(self.seed)
Expand Down

0 comments on commit 96dd84a

Please sign in to comment.