initial commit

etern · Dec 17, 2023 · cc62e56 · cc62e56
commit cc62e56
Show file tree

Hide file tree

Showing 10 changed files with 715 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+.DS_Store
diff --git a/README.org b/README.org
@@ -0,0 +1,14 @@
+#+STARTUP: showall
+#+TITLE: Ci Dict
+#+OPTIONS: num:nil ^:{} toc:nil
+
+alfred workflow词典，针对英汉查询做了些优化
+
+- 即时答案用本地词典，速度快
+- Shift预览有道词典，信息全
+- 回车打开有道网页（或本地词典），随心配
+
+* 参考
+- 本地查词基于[[https://github.com/tonyseek/macdict][macdict]]
+- 单词发音取自[[https://github.com/wensonsmith/YoudaoTranslator][YoudaoTranslator]]
+- 相似单词建议基于[[https://github.com/ahupp/bktree/blob/master/bktree.py][bktree]]
diff --git a/assets/translate-star.png b/assets/translate-star.png
diff --git a/assets/translate.png b/assets/translate.png
diff --git a/bktree.py b/bktree.py
@@ -0,0 +1,151 @@
+"""
+
+This module implements Burkhard-Keller Trees (bk-tree).  bk-trees
+allow fast lookup of words that lie within a specified distance of a
+query word.  For example, this might be used by a spell checker to
+find near matches to a misspelled word.
+
+The implementation is based on the description in this article:
+
+http://blog.notdot.net/2007/4/Damn-Cool-Algorithms-Part-1-BK-Trees
+
+Licensed under the PSF license: http://www.python.org/psf/license/
+
+- Adam Hupp <[email protected]>
+
+"""
+
+# http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#Python
+def levenshtein(s, t):
+    m, n = len(s), len(t)
+    d = [range(n+1)]
+    d += [[i] for i in range(1,m+1)]
+    for i in range(0,m):
+        for j in range(0,n):
+            cost = 1
+            if s[i] == t[j]: cost = 0
+
+            d[i+1].append( min(d[i][j+1]+1, # deletion
+                               d[i+1][j]+1, #insertion
+                               d[i][j]+cost) #substitution
+                           )
+    return d[m][n]
+
+
+class BKTree:
+    def __init__(self, words, distfn=levenshtein):
+        """
+        Create a new BK-tree from the given distance function and
+        words.
+
+        Arguments:
+
+        distfn: a binary function that returns the distance between
+        two words.  Return value is a non-negative integer.  the
+        distance function must be a metric space.
+
+        words: an iterable.  produces values that can be passed to
+        distfn
+
+        """
+        self.distfn = distfn
+
+        root = next(words)
+        self.tree = (root, {})
+
+        for i in words:
+            self._add_word(self.tree, i)
+
+    def _add_word(self, parent, word):
+        pword, children = parent
+        d = self.distfn(word, pword)
+        if d in children:
+            self._add_word(children[d], word)
+        else:
+            children[d] = (word, {})
+
+    def query(self, word, n):
+        """
+        Return all words in the tree that are within a distance of `n'
+        from `word`.
+
+        Arguments:
+
+        word: a word to query on
+
+        n: a non-negative integer that specifies the allowed distance
+        from the query word.
+
+        Return value is a list of tuples (distance, word), sorted in
+        ascending order of distance.
+
+        """
+        def rec(parent):
+            pword, children = parent
+            d = self.distfn(word, pword)
+            results = []
+            if d <= n:
+                results.append( (d, pword) )
+
+            for i in range(d-n, d+n+1):
+                child = children.get(i)
+                if child is not None:
+                    results.extend(rec(child))
+            return results
+
+        # sort by distance
+        return sorted(rec(self.tree))
+
+
+def brute_query(word, words, distfn, n):
+    """A brute force distance query
+
+    Arguments:
+
+    word: the word to query for
+
+    words: a iterable that produces words to test
+
+    distfn: a binary function that returns the distance between a
+    `word' and an item in `words'.
+
+    n: an integer that specifies the distance of a matching word
+
+    """
+    return [i for i in words
+            if distfn(i, word) <= n]
+
+
+def maxdepth(tree, count=0):
+    _, children = t
+    if len(children):
+        return max(maxdepth(i, c+1) for i in children.values())
+    else:
+        return c
+
+
+def dict_words(dictfile="/usr/share/dict/american-english"):
+    "Return an iterator that produces words in the given dictionary."
+    return filter(len, map(str.strip, open(dictfile)))
+
+
+def timeof(fn, *args):
+    import time
+    t = time.time()
+    res = fn(*args)
+    print("time: ", (time.time() - t))
+    return res
+
+
+
+if __name__ == "__main__":
+
+    # Demo: build a tree from every word read from 'big.txt', then print
+    # the (distance, word) pairs within distance 2 of "abc".
+    tree = BKTree(dict_words('big.txt'))
+
+    print(tree.query("abc", 2))
+
+#     dist = 1
+#     for i in ["book", "cat", "backlash", "scandal"]:
+#         w = set(tree.query(i, dist)) - set([i])
+#         print "words within %d of %s: %r" % (dist, i, w)
+
diff --git a/dict.py b/dict.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+
+import sys
+import os
+import json
+import subprocess
+import pickle
+import string
+import re
+from typing import List, Tuple
+from pathlib import Path
+from bktree import BKTree, dict_words
+import macdict
+
+
+def parse_Oxford_Chinese_Dictionary(content) -> List[Tuple[str, str]]:
+    """content in plain text, parse to structrued data
+    不同词典库，格式不一样，此函数只解析 牛津英汉汉英词典
+    """
+    entries = []
+    pinyin = r"([a-z]*[āɑ̄ēīōūǖáɑ́éíóúǘǎɑ̌ěǐǒǔǚàɑ̀èìòùǜü]+[a-z]*)+"
+    # (synoym) 词义 pīnyīn
+    pattern = re.compile(r";? ?(\([a-zA-Z, ]+\))? ?(«[a-zA-Z, ]+»)? [\u4e00-\u9fff…]+ " + pinyin)
+    for m in pattern.finditer(content):
+        entries.append(m.group(0))
+    lines = []
+    for ent in entries:
+        if ent.startswith(';') and lines:
+            lines[-1] += ent
+        else:
+            lines.append(ent)
+    results = []
+    for text in lines:
+        text = re.sub(pinyin, "", text)
+        text = re.sub(" +", " ", text)
+        title = ','.join(re.findall(r"[\u4e00-\u9fff…]+", text))
+        results.append((title, text))
+    return results
+
+
+def alfred_item(title, subtitle, arg=None, is_suggestion=False):
+    """https://www.alfredapp.com/help/workflows/inputs/script-filter/json/"""
+    arg = arg or title
+    item = {
+        "arg": arg,
+        "title": title,
+        "subtitle": subtitle or "👻本地查不到，按shift或enter网络查询",
+        "valid": True,
+        "quicklookurl": f"https://youdao.com/result?word={arg}&lang=en",
+        "icon": { "path": "assets/translate-star.png" if is_suggestion else "assets/translate.png" },
+        "mods": {
+            "cmd": { "subtitle": "🔊 ", "arg": arg, "valid": True },
+            "alt": { "subtitle": "📣 ", "arg": arg, "valid": True }
+        },
+        "text": {
+            "copy": title
+        }
+    }
+    return item
+
+
+class Suggester:
+    def __init__(self, cache_dir=None):
+        cache_dir = cache_dir or os.getenv("alfred_workflow_data", "./dict_cache")
+        self.cache_dir = Path(cache_dir)
+        if self.cache_dir.exists() and (self.cache_dir / 'z.pkl').exists():
+            return
+        self.cache_dir.mkdir(exist_ok=True, parents=True)
+        atoz = string.ascii_lowercase
+        trees = self._load_bktrees(atoz)
+        for ch, tree in zip(atoz, trees):
+            with open(self.cache_dir / f"{ch}.pkl", "wb") as f:
+                pickle.dump(tree, f)
+
+    @staticmethod
+    def _load_bktrees(initials) -> List[BKTree]:
+        trees = []
+        for ch in initials:
+            tree = BKTree((w for w in dict_words("/usr/share/dict/words")
+                           if w[0].lower() == ch.lower()))
+            trees.append(tree)
+        return trees
+
+    def suggest(self, word: str, max_count:int = 10) -> List[str]:
+        if len(word) < 2:
+            return []
+        if word[0].lower() not in string.ascii_lowercase:
+            return []
+        cache_file = self.cache_dir / f"{word[0]}.pkl"
+        with open(cache_file, "rb") as f:
+            tree = pickle.load(f)
+        results = tree.query(word, 2)
+        return [s for i, s in results[:max_count] if s != word]
+
+
+def lookup(word: str) -> str:
+    content = macdict.lookup_word(word) or ''
+    _, *rest = content.split('|')
+    return '|'.join(rest)
+
+
+def lookup_parsed(word) -> List[Tuple[str, str]]:
+    page = lookup(word)
+    parsed = parse_Oxford_Chinese_Dictionary(page)
+    if not parsed:
+        parsed = [(word, page)]
+    return parsed
+
+
+def lookup_render(word) -> str:
+    entries = lookup_parsed(word)
+    return ';'.join(t for t, _ in entries)
+
+
+def main():
+    try:
+        word = sys.argv[1]
+    except IndexError:
+        print('You did not enter any terms to look up in the Dictionary.')
+        sys.exit()
+    entries = lookup_parsed(word)
+    items = [alfred_item(w, m, word) for w, m in entries[:5]] or [alfred_item(word, '')]
+    max_suggestions = os.getenv('max_suggestions', '0')
+    max_suggestions = int(max_suggestions) if max_suggestions.isdigit() else 0
+    if max_suggestions > 0:
+        words = Suggester().suggest(word)[:max_suggestions]
+        meanings = [lookup_render(w) for w in words]
+        items += [alfred_item(w, m, is_suggestion=True) for w, m in zip(words, meanings) if m]
+    print(json.dumps({"items": items}, ensure_ascii=False))
+
+
+# Run the lookup only when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()
diff --git a/icon.png b/icon.png