
Commit

Merge pull request #16 from hayd/de

text passing py3 tests

hayd committed Dec 11, 2014
2 parents 72a5a3e + 8d42094 commit ddfd919
Showing 19 changed files with 189 additions and 132 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -24,7 +24,7 @@ script:
    # TODO perhaps split build into tests and examples?
    # For now only the passing python 3 tests are run on the 3.4 build
    - if [ "$TRAVIS_PYTHON_VERSION" == "3.4" ]; then
-       nosetests test/test_graph.py test/test_metrics.py; else
+       nosetests --ignore-files=test_examples\|test_db\|test_vector\|test_web; else
        nosetests --exclude=test_05vector_07slp --with-coverage --cover-package=pattern;
      fi

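Note on the change: nose's `--ignore-files` option takes a regular expression to match against file names, and the backslashes in the YAML only shield the pipes from the shell, so nose receives a plain alternation. A quick sketch of the effect (file names illustrative):

```python
import re

# What nosetests receives after shell unescaping:
#   --ignore-files=test_examples|test_db|test_vector|test_web
ignore = re.compile(r"test_examples|test_db|test_vector|test_web")

assert ignore.search("test_db.py")          # skipped on the 3.4 build
assert not ignore.search("test_graph.py")   # still collected and run
```

The earlier whitelist of two known-good files becomes a blacklist of the modules still failing on Python 3.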
2 changes: 1 addition & 1 deletion pattern/db/__init__.py
@@ -2134,7 +2134,7 @@ def csv_header_encode(field, type=STRING):
      # csv_header_encode("age", INTEGER) => "age (INTEGER)".
      t = re.sub(r"^varchar\(.*?\)", "string", (type or ""))
      t = t and " (%s)" % t or ""
-     s = "%s%s" % (encode_utf8(field or ""), t.upper())
+     s = "%s%s" % (field or "", t.upper())
      return s


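Dropping `encode_utf8()` is safe on Python 3, where every `str` is already unicode and interpolating a `bytes` value would render as `b'...'` instead of the field name. A simplified sketch of the fixed function (the `varchar` rewriting omitted):

```python
def csv_header_encode(field, type="string"):
    # e.g., csv_header_encode("age", "integer") => "age (INTEGER)"
    t = " (%s)" % type if type else ""
    return "%s%s" % (field or "", t.upper())

assert csv_header_encode("age", "integer") == "age (INTEGER)"
assert csv_header_encode("name", None) == "name"
```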
30 changes: 18 additions & 12 deletions pattern/text/__init__.py
@@ -399,10 +399,13 @@ def _read(path, encoding="utf-8", comment=";;;"):
      # From file or buffer.
      f = path
      for i, line in enumerate(f):
-         line = line.strip(codecs.BOM_UTF8) if i == 0 and isinstance(
-             line, str) else line
+         line = (line.strip(codecs.BOM_UTF8)
+                 if i == 0 and isinstance(line, bytes)
+                 else line)
+
          line = line.strip()
-         line = decode_utf8(line, encoding)
+         line = line.decode(encoding) if isinstance(line, bytes) else line
+
          if not line or (comment and line.startswith(comment)):
              continue
          yield line
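Both replacements follow one rule: only `bytes` values get BOM-stripped and decoded, because files opened in binary mode yield `bytes` on Python 3 (on Python 2, `str` was the byte type, hence the old check). A standalone sketch of the pattern, with `read_lines` as a hypothetical name:

```python
import codecs

def read_lines(f, encoding="utf-8", comment=";;;"):
    # f may yield bytes (binary mode) or str (text mode).
    for i, line in enumerate(f):
        if i == 0 and isinstance(line, bytes):
            line = line.strip(codecs.BOM_UTF8)  # drop a UTF-8 byte-order mark
        line = line.strip()
        if isinstance(line, bytes):
            line = line.decode(encoding)
        if not line or (comment and line.startswith(comment)):
            continue
        yield line
```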
@@ -424,6 +427,7 @@ def load(self):
          # Arnold NNP x
          dict.update(self, (x.split(" ")[:2] for x in _read(self._path)))
 
+
  #--- FREQUENCY -----------------------------------------------------------
 
 
@@ -859,7 +863,7 @@ def __init__(self, lexicon={}, frequency={}, model=None, morphology=None, contex
          The given default tags are used for unknown words.
          Unknown words that start with a capital letter are tagged NNP (except for German).
          Unknown words that contain only digits and punctuation are tagged CD.
-         Optionally, morphological and contextual rules (or a language model) can be used 
+         Optionally, morphological and contextual rules (or a language model) can be used
          to improve the tags of unknown words.
          The given language can be used to discern between
          Germanic and Romance languages for phrase chunking.
@@ -1727,7 +1731,7 @@ def commandline(parse=Parser().parse):
      # The output can be either slash-formatted string or XML.
      if "xml" in arguments:
          s = Tree(s, s.tags).xml
-     print(encode_utf8(s))
+     print(s)
 
  #### VERBS ###############################################################

@@ -2153,9 +2157,11 @@ def tenses(self, verb, parse=True):
          for id1, id2 in self._default.items():
              if id2 in a:
                  a.add(id1)
-         a = (TENSES[id][:-2] for id in a)
-         a = Tenses(sorted(a))
-         return a
+         t = (TENSES[id][:-2] for id in a)
+         # TODO fix this hack
+         t = Tenses(sorted(t, key=lambda x: (x[0] or '', x[1] or 0, x[2] or '',
+                                             x[3] or '', x[4] or '')))
+         return t
 
      def find_lemma(self, verb):
          # Must be overridden in a subclass.
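The `key` behind the `# TODO fix this hack` comment exists because Python 3 refuses to order tuples that mix `None` with strings or integers, which Python 2 sorted silently (`None` first). Substituting a falsy value of the matching type per field restores a total order; with made-up tense tuples:

```python
tenses = [("past", 3, "singular", "indicative", "imperfective"),
          (None, None, None, None, None)]
key = lambda x: (x[0] or '', x[1] or 0, x[2] or '', x[3] or '', x[4] or '')

# sorted(tenses) raises TypeError on Python 3; with the key it works:
print(sorted(tenses, key=key)[0])  # the all-None tuple still sorts first
```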
@@ -2289,14 +2295,14 @@ def load(self, path=None):
          self._language = xml.attrib.get("language", self._language)
          # Average scores of all word senses per part-of-speech tag.
          for w in words:
-             words[w] = dict((pos, map(avg, zip(*psi)))
+             words[w] = dict((pos, [avg(x) for x in zip(*psi)])
                  for pos, psi in words[w].items())
          # Average scores of all part-of-speech tags.
          for w, pos in words.items():
-             words[w][None] = map(avg, zip(*pos.values()))
+             words[w][None] = [avg(x) for x in zip(*pos.values())]
          # Average scores of all synonyms per synset.
          for id, psi in synsets.items():
-             synsets[id] = map(avg, zip(*psi))
+             synsets[id] = [avg(x) for x in zip(*psi)]
          dict.update(self, words)
          dict.update(self.labeler, labels)
          dict.update(self._synsets, synsets)
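All three rewrites address the same trap: on Python 3, `map()` returns a one-shot iterator, so an averaged score list stored in a dict would read back empty the second time. A list comprehension materializes the values once:

```python
avg = lambda xs: sum(xs) / len(xs)   # stand-in for pattern's avg() helper
psi = [(0.5, 0.1), (0.7, 0.3)]       # per-sense scores (illustrative)

lazy = map(avg, zip(*psi))
list(lazy)
print(list(lazy))                    # [] -- the iterator is already exhausted

eager = [avg(x) for x in zip(*psi)]
print(eager, eager)                  # [0.6, 0.2] twice -- reusable
```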
@@ -2628,7 +2634,7 @@ def suggest(self, w):
  def _module(language):
      """ Returns the given language module (e.g., "en" => pattern.en).
      """
-     return _modules.setdefault(language, __import__(language, globals(), {}, [], -1))
+     return _modules.setdefault(language, __import__(language, globals(), {}, [], 1))
 
 
  def _multilingual(function, *args, **kwargs):
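The final argument to `__import__` is the relative-import level: `-1` meant Python 2's "relative, then absolute" search, which Python 3 removed (levels must now be non-negative), while `1` requests an explicit relative import from the enclosing package, so `"en"` resolves to `pattern.text.en`. The same call can be spelled more readably with `importlib` (assuming it runs from within the `pattern.text` package):

```python
import importlib

# Equivalent to __import__("en", globals(), {}, [], 1) inside pattern.text:
module = importlib.import_module(".en", package="pattern.text")
```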
12 changes: 7 additions & 5 deletions pattern/text/en/inflect.py
@@ -48,7 +48,7 @@
  # Based on the Ruby Linguistics module by Michael Granger:
  # http://www.deveiate.org/projects/Linguistics/wiki/English
 
- RE_ARTICLE = map(lambda x: (re.compile(x[0]), x[1]), (
+ RE_ARTICLE = [(re.compile(x[0]), x[1]) for x in (
      # exceptions: an hour, an honor
      ("euler|hour(?!i)|heir|honest|hono", "an"),
      # Abbreviations:
@@ -67,7 +67,7 @@
      # y like "i": an yclept, a year
      (r"y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)", "an"),
      (r"", "a")  # guess "a"
- ))
+ )]
 
 
  def definite_article(word):
@@ -85,14 +85,16 @@ def indefinite_article(word):
      if rule.search(word) is not None:
          return article
 
- DEFINITE, INDEFINITE = \
-     "definite", "indefinite"
+ DEFINITE, INDEFINITE = "definite", "indefinite"
 
 
  def article(word, function=INDEFINITE):
      """Returns the indefinite (a or an) or definite (the) article for the given
      word."""
-     return function == DEFINITE and definite_article(word) or indefinite_article(word)
+     if function == DEFINITE:
+         return definite_article(word)
+     else:
+         return indefinite_article(word)
 
  _article = article

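Two Python 3 fixes in this file: `RE_ARTICLE` becomes a real list, since a module-level `map()` iterator would be consumed by the first `indefinite_article()` call and empty ever after, and the `and/or` idiom in `article()` gives way to an explicit `if/else`. The old idiom misfires whenever its middle operand is falsy, which makes it fragile as well as hard to read:

```python
def pick(condition, a, b):
    return condition and a or b        # pre-ternary Python 2 idiom

print(pick(True, "", "fallback"))      # "fallback" -- not the intended ""
print("" if True else "fallback")      # "" -- the conditional expression is safe
```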
5 changes: 5 additions & 0 deletions pattern/text/en/modality.py
@@ -5,6 +5,11 @@
  # License: BSD (see LICENSE.txt for details).
  # http://www.clips.ua.ac.be/pages/pattern
 
+ try:
+     basestring
+ except NameError:  # Python 3
+     basestring = str
+

### LIST FUNCTIONS #######################################################

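This shim is the standard recipe for keeping Python 2 `isinstance(x, basestring)` checks alive on Python 3, where `str` is the only text type (the same lines appear in the wordnet module below):

```python
try:
    basestring                  # exists on Python 2
except NameError:               # Python 3
    basestring = str

print(isinstance(u"word", basestring))  # True on either version
```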
42 changes: 27 additions & 15 deletions pattern/text/en/wordnet/__init__.py
@@ -31,6 +31,7 @@
  # Note that pywordnet has been included in nltk upstream
  # TODO ensure these are fixed upstream (so we can use that?)
 
+ import codecs  # TODO use this exclusively for opening?
  import os
  import sys
  import glob
@@ -53,9 +54,9 @@

  try:
      basestring
- except NameError:
+ except NameError:  # python 3
      basestring = str
-
+     unicode = str
 
  VERSION = ""
  s = open(os.path.join(MODULE, CORPUS, "dict", "index.noun")).read(2048)
@@ -215,22 +216,25 @@ def antonym(self):
      def meronyms(self):
          """ Yields a list of synsets that are semantic members/parts of this synset, for example:
              synsets("house")[0].meronyms() =>
-             [Synset("library"), 
-              Synset("loft"), 
+             [Synset("library"),
+              Synset("loft"),
               Synset("porch")
              ]
          """
-         p = self._synset.getPointers(wn.MEMBER_HOLONYM)
-         p += self._synset.getPointers(wn.PART_HOLONYM)
-         return [Synset(p.getTarget()) for p in p]
+         p1 = self._synset.getPointers(wn.MEMBER_HOLONYM)
+         p2 = self._synset.getPointers(wn.PART_HOLONYM)
+         return ([Synset(p.getTarget()) for p in p1] +
+                 [Synset(p.getTarget()) for p in p2])
 
+
      def holonyms(self):
          """ Yields a list of synsets of which this synset is a member/part, for example:
              synsets("tree")[0].holonyms() => Synset("forest").
          """
-         p = self._synset.getPointers(wn.MEMBER_MERONYM)
-         p += self._synset.getPointers(wn.PART_MERONYM)
-         return [Synset(p.getTarget()) for p in p]
+         p1 = self._synset.getPointers(wn.MEMBER_MERONYM)
+         p2 = self._synset.getPointers(wn.PART_MERONYM)
+         return ([Synset(p.getTarget()) for p in p1] +
+                 [Synset(p.getTarget()) for p in p2])
 
      def hyponyms(self, recursive=False, depth=None):
          """ Yields a list of semantically more specific synsets, for example:
@@ -277,7 +281,11 @@ def hypernym(self):
synsets("train")[0].hypernym => Synset("public transport").
"""
p = self._synset.getPointers(wn.HYPERNYM)
return len(p) > 0 and Synset(p[0].getTarget()) or None
try:
first = p[0] if isinstance(p, tuple) else next(p)
return Synset(first.getTarget())
except StopIteration:
return None

def similar(self):
""" Returns a list of similar synsets for adjectives and adverbs, for example:
@@ -386,14 +394,18 @@ def map32(id, pos=NOUN):
"""
global _map32_cache
if not _map32_cache:
_map32_cache = open(
os.path.join(MODULE, "dict", "index.32")).readlines()
_map32_cache = codecs.open(os.path.join(MODULE, "dict", "index.32"))\
.readlines()
_map32_cache = (x for x in _map32_cache if x[0] != ";") # comments
_map32_cache = dict(x.strip().split(" ") for x in _map32_cache)
_map32_cache = (x.strip().split(b" ", 1) for x in _map32_cache)
_map32_cache = dict(x for x in _map32_cache if len(x) == 2)

k = pos in _map32_pos2 and pos or _map32_pos1.get(pos, "x")
k += str(id).lstrip("0")
k = _map32_cache.get(k, None)
k = _map32_cache.get(k.encode("utf-8"), None)

if k is not None:
k = k.decode("utf-8")
return int(k[1:]), _map32_pos2[k[0]]
return None

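After this change the cache maps `bytes` to `bytes`, since the index file is read without decoding; a lookup therefore encodes its key and decodes the hit, and the `len(x) == 2` filter drops malformed lines instead of letting `dict()` raise. The access pattern in miniature (the entry is hypothetical):

```python
cache = {b"n1740": b"n00001740"}   # hypothetical index.32 entry

k = ("n" + "1740").encode("utf-8")
v = cache.get(k, None)
if v is not None:
    v = v.decode("utf-8")
    print(int(v[1:]), v[0])        # 1740 n
```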
6 changes: 3 additions & 3 deletions pattern/text/en/wordnet/pywordnet/wordnet.py
@@ -394,15 +394,15 @@ def __init__(self, pos, offset, line):
          self.lexname = Lexname.lexnames and Lexname.lexnames[
              int(tokens[1])] or []
          (self._senseTuples, remainder) = _partition(
-             tokens[4:], 2, string.atoi(tokens[3], 16))
+             tokens[4:], 2, int(tokens[3], 16))
          (self._pointerTuples, remainder) = _partition(
              remainder[1:], 4, int(remainder[0]))
          if pos == VERB:
              (vfTuples, remainder) = _partition(
                  remainder[1:], 3, int(remainder[0]))
 
              def extractVerbFrames(index, vfTuples):
-                 return tuple(map(lambda t: string.atoi(t[1]), filter(lambda t, i=index: string.atoi(t[2], 16) in (0, i), vfTuples)))
+                 return tuple(map(lambda t: int(t[1]), filter(lambda t, i=index: int(t[2], 16) in (0, i), vfTuples)))
              senseVerbFrames = []
              for index in range(1, len(self._senseTuples) + 1):
                  senseVerbFrames.append(extractVerbFrames(index, vfTuples))
@@ -752,7 +752,7 @@ def __init__(self, sourceOffset, pointerTuple):
          self.targetOffset = int(offset)
          self.pos = _normalizePOS(pos)
          """part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB"""
-         indices = string.atoi(indices, 16)
+         indices = int(indices, 16)
          self.sourceIndex = indices >> 8
          self.targetIndex = indices & 255

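`string.atoi()` no longer exists on Python 3; the built-in `int()` with an explicit base is the drop-in replacement for WordNet's hexadecimal fields, as with the pointer indices unpacked above:

```python
indices = int("010f", 16)   # was: string.atoi("010f", 16)
print(indices >> 8)         # source index: 1
print(indices & 255)        # target index: 15
```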
2 changes: 1 addition & 1 deletion pattern/text/fr/__init__.py
@@ -172,7 +172,7 @@ def load(self, path=None):
          _Sentiment.load(self, path)
          # Map "précaire" to "precaire" (without diacritics, +1% accuracy).
          if not path:
-             for w, pos in dict.items(self):
+             for w, pos in list(dict.items(self)):
                  w0 = w
                  if not w.endswith((u"à", u"è", u"é", u"ê", u"ï")):
                      w = w.replace(u"à", "a")
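The `list()` wrapper matters because the loop inserts diacritic-free spellings back into the same dictionary: `dict.items()` is a live view on Python 3, and mutating the dict while iterating over it raises `RuntimeError: dictionary changed size during iteration`. Iterating a snapshot avoids that:

```python
d = {u"précaire": "JJ"}
for w, pos in list(d.items()):   # snapshot; bare d.items() would raise here
    d[w.replace(u"é", "e")] = pos
print(sorted(d))                 # ['precaire', 'précaire']
```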
24 changes: 12 additions & 12 deletions pattern/text/search.py
@@ -153,15 +153,15 @@ def combinations(iterable, n):

  def product(*args, **kwargs):
      """ Yields all permutations with replacement:
-         list(product("cat", repeat=2)) => 
-         [("c", "c"), 
-          ("c", "a"), 
-          ("c", "t"), 
-          ("a", "c"), 
-          ("a", "a"), 
-          ("a", "t"), 
-          ("t", "c"), 
-          ("t", "a"), 
+         list(product("cat", repeat=2)) =>
+         [("c", "c"),
+          ("c", "a"),
+          ("c", "t"),
+          ("a", "c"),
+          ("a", "a"),
+          ("a", "t"),
+          ("t", "c"),
+          ("t", "a"),
           ("t", "t")]
      """
      p = [[]]
@@ -196,7 +196,7 @@ def variations(iterable, optional=lambda x: False):
          v = tuple(iterable[i] for i in range(len(v)) if not v[i])
          a.add(v)
      # Longest-first.
-     return sorted(a, cmp=lambda x, y: len(y) - len(x))
+     return sorted(a, key=len, reverse=True)
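`sorted()` lost its `cmp` argument in Python 3; longest-first ordering is expressed directly with a key function:

```python
variations = [("a",), ("a", "b"), ()]
print(sorted(variations, key=len, reverse=True))  # [('a', 'b'), ('a',), ()]
```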

#### TAXONOMY ############################################################

@@ -626,7 +626,7 @@ def match(self, word):
          Some part-of-speech-tags can also contain wildcards: NN*, VB*, JJ*, RB*, PR*.
          If the given word contains spaces (e.g., proper noun),
          the entire chunk will also be compared.
-         For example: Constraint(words=["Mac OS X*"]) 
+         For example: Constraint(words=["Mac OS X*"])
          matches the word "Mac" if the word occurs in a Chunk("Mac OS X 10.5").
          """
      # If the constraint has a custom function it must return True.
@@ -918,7 +918,7 @@ def match(self, sentence, start=0, _v=None, _u=None):
              _u[id(sequence)] = False
          # Return the leftmost-longest.
          if len(a) > 0:
-             return sorted(a)[0][-1]
+             return sorted(a, key=lambda x: x[:2])[0][-1]
 
      def _variations(self):
          v = variations(
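The explicit `key=lambda x: x[:2]` is needed because the tuples in `a` end in match objects, which Python 3 cannot compare if the leading fields tie; sorting on the first two fields alone (which, per the comment, encode the leftmost-longest preference) keeps the selection well defined:

```python
class Match:                 # stand-in for the unorderable last element
    pass

candidates = [(1, 5, Match()), (0, 7, Match())]
best = sorted(candidates, key=lambda x: x[:2])[0][-1]  # leftmost wins
print(type(best).__name__)   # Match
```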