
Commit

Merge pull request #16 from hayd/de

text passing py3 tests

hayd committed Dec 11, 2014
2 parents 72a5a3e + 8d42094 commit ddfd919
Showing 19 changed files with 189 additions and 132 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -24,7 +24,7 @@ script:
    # TODO perhaps split build into tests and examples?
    # For now only the passing python 3 tests are run on the 3.4 build
    - if [ "$TRAVIS_PYTHON_VERSION" == "3.4" ]; then
-       nosetests test/test_graph.py test/test_metrics.py; else
+       nosetests --ignore-files=test_examples\|test_db\|test_vector\|test_web; else
        nosetests --exclude=test_05vector_07slp --with-coverage --cover-package=pattern;
      fi

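Note on the change: nose's `--ignore-files` option takes a regular expression to match against file names, and the backslashes in the YAML only shield the pipes from the shell, so nose receives a plain alternation. A quick sketch of the effect (file names illustrative):

```python
import re

# What nosetests receives after shell unescaping:
#   --ignore-files=test_examples|test_db|test_vector|test_web
ignore = re.compile(r"test_examples|test_db|test_vector|test_web")

assert ignore.search("test_db.py")          # skipped on the 3.4 build
assert not ignore.search("test_graph.py")   # still collected and run
```

The earlier whitelist of two known-good files becomes a blacklist of the modules still failing on Python 3.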
2 changes: 1 addition & 1 deletion pattern/db/__init__.py
@@ -2134,7 +2134,7 @@ def csv_header_encode(field, type=STRING):
      # csv_header_encode("age", INTEGER) => "age (INTEGER)".
      t = re.sub(r"^varchar\(.*?\)", "string", (type or ""))
      t = t and " (%s)" % t or ""
-     s = "%s%s" % (encode_utf8(field or ""), t.upper())
+     s = "%s%s" % (field or "", t.upper())
      return s


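Dropping `encode_utf8()` is safe on Python 3, where every `str` is already unicode and interpolating a `bytes` value would render as `b'...'` instead of the field name. A simplified sketch of the fixed function (the `varchar` rewriting omitted):

```python
def csv_header_encode(field, type="string"):
    # e.g., csv_header_encode("age", "integer") => "age (INTEGER)"
    t = " (%s)" % type if type else ""
    return "%s%s" % (field or "", t.upper())

assert csv_header_encode("age", "integer") == "age (INTEGER)"
assert csv_header_encode("name", None) == "name"
```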
30 changes: 18 additions & 12 deletions pattern/text/__init__.py
@@ -399,10 +399,13 @@ def _read(path, encoding="utf-8", comment=";;;"):
      # From file or buffer.
      f = path
      for i, line in enumerate(f):
-         line = line.strip(codecs.BOM_UTF8) if i == 0 and isinstance(
-             line, str) else line
+         line = (line.strip(codecs.BOM_UTF8)
+                 if i == 0 and isinstance(line, bytes)
+                 else line)
+
          line = line.strip()
-         line = decode_utf8(line, encoding)
+         line = line.decode(encoding) if isinstance(line, bytes) else line
+
          if not line or (comment and line.startswith(comment)):
              continue
          yield line
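Both replacements follow one rule: only `bytes` values get BOM-stripped and decoded, because files opened in binary mode yield `bytes` on Python 3 (on Python 2, `str` was the byte type, hence the old check). A standalone sketch of the pattern, with `read_lines` as a hypothetical name:

```python
import codecs

def read_lines(f, encoding="utf-8", comment=";;;"):
    # f may yield bytes (binary mode) or str (text mode).
    for i, line in enumerate(f):
        if i == 0 and isinstance(line, bytes):
            line = line.strip(codecs.BOM_UTF8)  # drop a UTF-8 byte-order mark
        line = line.strip()
        if isinstance(line, bytes):
            line = line.decode(encoding)
        if not line or (comment and line.startswith(comment)):
            continue
        yield line
```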
@@ -424,6 +427,7 @@ def load(self):
          # Arnold NNP x
          dict.update(self, (x.split(" ")[:2] for x in _read(self._path)))
 
+
  #--- FREQUENCY -----------------------------------------------------------
 
 
@@ -859,7 +863,7 @@ def __init__(self, lexicon={}, frequency={}, model=None, morphology=None, contex
          The given default tags are used for unknown words.
          Unknown words that start with a capital letter are tagged NNP (except for German).
          Unknown words that contain only digits and punctuation are tagged CD.
-         Optionally, morphological and contextual rules (or a language model) can be used 
+         Optionally, morphological and contextual rules (or a language model) can be used
          to improve the tags of unknown words.
          The given language can be used to discern between
          Germanic and Romance languages for phrase chunking.
@@ -1727,7 +1731,7 @@ def commandline(parse=Parser().parse):
      # The output can be either slash-formatted string or XML.
      if "xml" in arguments:
          s = Tree(s, s.tags).xml
-     print(encode_utf8(s))
+     print(s)
 
  #### VERBS ###############################################################

@@ -2153,9 +2157,11 @@ def tenses(self, verb, parse=True):
          for id1, id2 in self._default.items():
              if id2 in a:
                  a.add(id1)
-         a = (TENSES[id][:-2] for id in a)
-         a = Tenses(sorted(a))
-         return a
+         t = (TENSES[id][:-2] for id in a)
+         # TODO fix this hack
+         t = Tenses(sorted(t, key=lambda x: (x[0] or '', x[1] or 0, x[2] or '',
+                                             x[3] or '', x[4] or '')))
+         return t
 
      def find_lemma(self, verb):
          # Must be overridden in a subclass.
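The `key` behind the `# TODO fix this hack` comment exists because Python 3 refuses to order tuples that mix `None` with strings or integers, which Python 2 sorted silently (`None` first). Substituting a falsy value of the matching type per field restores a total order; with made-up tense tuples:

```python
tenses = [("past", 3, "singular", "indicative", "imperfective"),
          (None, None, None, None, None)]
key = lambda x: (x[0] or '', x[1] or 0, x[2] or '', x[3] or '', x[4] or '')

# sorted(tenses) raises TypeError on Python 3; with the key it works:
print(sorted(tenses, key=key)[0])  # the all-None tuple still sorts first
```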
@@ -2289,14 +2295,14 @@ def load(self, path=None):
          self._language = xml.attrib.get("language", self._language)
          # Average scores of all word senses per part-of-speech tag.
          for w in words:
-             words[w] = dict((pos, map(avg, zip(*psi)))
+             words[w] = dict((pos, [avg(x) for x in zip(*psi)])
                  for pos, psi in words[w].items())
          # Average scores of all part-of-speech tags.
          for w, pos in words.items():
-             words[w][None] = map(avg, zip(*pos.values()))
+             words[w][None] = [avg(x) for x in zip(*pos.values())]
          # Average scores of all synonyms per synset.
          for id, psi in synsets.items():
-             synsets[id] = map(avg, zip(*psi))
+             synsets[id] = [avg(x) for x in zip(*psi)]
          dict.update(self, words)
          dict.update(self.labeler, labels)
          dict.update(self._synsets, synsets)
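All three rewrites address the same trap: on Python 3, `map()` returns a one-shot iterator, so an averaged score list stored in a dict would read back empty the second time. A list comprehension materializes the values once:

```python
avg = lambda xs: sum(xs) / len(xs)   # stand-in for pattern's avg() helper
psi = [(0.5, 0.1), (0.7, 0.3)]       # per-sense scores (illustrative)

lazy = map(avg, zip(*psi))
list(lazy)
print(list(lazy))                    # [] -- the iterator is already exhausted

eager = [avg(x) for x in zip(*psi)]
print(eager, eager)                  # [0.6, 0.2] twice -- reusable
```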
@@ -2628,7 +2634,7 @@ def suggest(self, w):
  def _module(language):
      """ Returns the given language module (e.g., "en" => pattern.en).
      """
-     return _modules.setdefault(language, __import__(language, globals(), {}, [], -1))
+     return _modules.setdefault(language, __import__(language, globals(), {}, [], 1))
 
 
  def _multilingual(function, *args, **kwargs):
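The final argument to `__import__` is the relative-import level: `-1` meant Python 2's "relative, then absolute" search, which Python 3 removed (levels must now be non-negative), while `1` requests an explicit relative import from the enclosing package, so `"en"` resolves to `pattern.text.en`. The same call can be spelled more readably with `importlib` (assuming it runs from within the `pattern.text` package):

```python
import importlib

# Equivalent to __import__("en", globals(), {}, [], 1) inside pattern.text:
module = importlib.import_module(".en", package="pattern.text")
```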
12 changes: 7 additions & 5 deletions pattern/text/en/inflect.py
@@ -48,7 +48,7 @@
  # Based on the Ruby Linguistics module by Michael Granger:
  # http://www.deveiate.org/projects/Linguistics/wiki/English
 
- RE_ARTICLE = map(lambda x: (re.compile(x[0]), x[1]), (
+ RE_ARTICLE = [(re.compile(x[0]), x[1]) for x in (
      # exceptions: an hour, an honor
      ("euler|hour(?!i)|heir|honest|hono", "an"),
      # Abbreviations:
@@ -67,7 +67,7 @@
      # y like "i": an yclept, a year
      (r"y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)", "an"),
      (r"", "a")  # guess "a"
- ))
+ )]
 
 
  def definite_article(word):
@@ -85,14 +85,16 @@ def indefinite_article(word):
      if rule.search(word) is not None:
          return article
 
- DEFINITE, INDEFINITE = \
-     "definite", "indefinite"
+ DEFINITE, INDEFINITE = "definite", "indefinite"
 
 
  def article(word, function=INDEFINITE):
      """Returns the indefinite (a or an) or definite (the) article for the given
      word."""
-     return function == DEFINITE and definite_article(word) or indefinite_article(word)
+     if function == DEFINITE:
+         return definite_article(word)
+     else:
+         return indefinite_article(word)
 
  _article = article

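Two Python 3 fixes in this file: `RE_ARTICLE` becomes a real list, since a module-level `map()` iterator would be consumed by the first `indefinite_article()` call and empty ever after, and the `and/or` idiom in `article()` gives way to an explicit `if/else`. The old idiom misfires whenever its middle operand is falsy, which makes it fragile as well as hard to read:

```python
def pick(condition, a, b):
    return condition and a or b        # pre-ternary Python 2 idiom

print(pick(True, "", "fallback"))      # "fallback" -- not the intended ""
print("" if True else "fallback")      # "" -- the conditional expression is safe
```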
5 changes: 5 additions & 0 deletions pattern/text/en/modality.py
@@ -5,6 +5,11 @@
  # License: BSD (see LICENSE.txt for details).
  # http://www.clips.ua.ac.be/pages/pattern
 
+ try:
+     basestring
+ except NameError:  # Python 3
+     basestring = str
+

### LIST FUNCTIONS #######################################################

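This shim is the standard recipe for keeping Python 2 `isinstance(x, basestring)` checks alive on Python 3, where `str` is the only text type (the same lines appear in the wordnet module below):

```python
try:
    basestring                  # exists on Python 2
except NameError:               # Python 3
    basestring = str

print(isinstance(u"word", basestring))  # True on either version
```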
42 changes: 27 additions & 15 deletions pattern/text/en/wordnet/__init__.py
@@ -31,6 +31,7 @@
  # Note that pywordnet has been included in nltk upstream
  # TODO ensure these are fixed upstream (so we can use that?)
 
+ import codecs  # TODO use this exclusively for opening?
  import os
  import sys
  import glob
@@ -53,9 +54,9 @@

  try:
      basestring
- except NameError:
+ except NameError:  # python 3
      basestring = str
-
+     unicode = str
 
  VERSION = ""
  s = open(os.path.join(MODULE, CORPUS, "dict", "index.noun")).read(2048)
@@ -215,22 +216,25 @@ def antonym(self):
      def meronyms(self):
          """ Yields a list of synsets that are semantic members/parts of this synset, for example:
              synsets("house")[0].meronyms() =>
-             [Synset("library"), 
-              Synset("loft"), 
+             [Synset("library"),
+              Synset("loft"),
               Synset("porch")
              ]
          """
-         p = self._synset.getPointers(wn.MEMBER_HOLONYM)
-         p += self._synset.getPointers(wn.PART_HOLONYM)
-         return [Synset(p.getTarget()) for p in p]
+         p1 = self._synset.getPointers(wn.MEMBER_HOLONYM)
+         p2 = self._synset.getPointers(wn.PART_HOLONYM)
+         return ([Synset(p.getTarget()) for p in p1] +
+                 [Synset(p.getTarget()) for p in p2])
 
+
      def holonyms(self):
          """ Yields a list of synsets of which this synset is a member/part, for example:
              synsets("tree")[0].holonyms() => Synset("forest").
          """
-         p = self._synset.getPointers(wn.MEMBER_MERONYM)
-         p += self._synset.getPointers(wn.PART_MERONYM)
-         return [Synset(p.getTarget()) for p in p]
+         p1 = self._synset.getPointers(wn.MEMBER_MERONYM)
+         p2 = self._synset.getPointers(wn.PART_MERONYM)
+         return ([Synset(p.getTarget()) for p in p1] +
+                 [Synset(p.getTarget()) for p in p2])
 
      def hyponyms(self, recursive=False, depth=None):
          """ Yields a list of semantically more specific synsets, for example:
@@ -277,7 +281,11 @@ def hypernym(self):
synsets("train")[0].hypernym => Synset("public transport").
"""
p = self._synset.getPointers(wn.HYPERNYM)
return len(p) > 0 and Synset(p[0].getTarget()) or None
try:
first = p[0] if isinstance(p, tuple) else next(p)
return Synset(first.getTarget())
except StopIteration:
return None

def similar(self):
""" Returns a list of similar synsets for adjectives and adverbs, for example:
@@ -386,14 +394,18 @@ def map32(id, pos=NOUN):
"""
global _map32_cache
if not _map32_cache:
_map32_cache = open(
os.path.join(MODULE, "dict", "index.32")).readlines()
_map32_cache = codecs.open(os.path.join(MODULE, "dict", "index.32"))\
.readlines()
_map32_cache = (x for x in _map32_cache if x[0] != ";") # comments
_map32_cache = dict(x.strip().split(" ") for x in _map32_cache)
_map32_cache = (x.strip().split(b" ", 1) for x in _map32_cache)
_map32_cache = dict(x for x in _map32_cache if len(x) == 2)

k = pos in _map32_pos2 and pos or _map32_pos1.get(pos, "x")
k += str(id).lstrip("0")
k = _map32_cache.get(k, None)
k = _map32_cache.get(k.encode("utf-8"), None)

if k is not None:
k = k.decode("utf-8")
return int(k[1:]), _map32_pos2[k[0]]
return None

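After this change the cache maps `bytes` to `bytes`, since the index file is read without decoding; a lookup therefore encodes its key and decodes the hit, and the `len(x) == 2` filter drops malformed lines instead of letting `dict()` raise. The access pattern in miniature (the entry is hypothetical):

```python
cache = {b"n1740": b"n00001740"}   # hypothetical index.32 entry

k = ("n" + "1740").encode("utf-8")
v = cache.get(k, None)
if v is not None:
    v = v.decode("utf-8")
    print(int(v[1:]), v[0])        # 1740 n
```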
6 changes: 3 additions & 3 deletions pattern/text/en/wordnet/pywordnet/wordnet.py
@@ -394,15 +394,15 @@ def __init__(self, pos, offset, line):
          self.lexname = Lexname.lexnames and Lexname.lexnames[
              int(tokens[1])] or []
          (self._senseTuples, remainder) = _partition(
-             tokens[4:], 2, string.atoi(tokens[3], 16))
+             tokens[4:], 2, int(tokens[3], 16))
          (self._pointerTuples, remainder) = _partition(
              remainder[1:], 4, int(remainder[0]))
          if pos == VERB:
              (vfTuples, remainder) = _partition(
                  remainder[1:], 3, int(remainder[0]))
 
              def extractVerbFrames(index, vfTuples):
-                 return tuple(map(lambda t: string.atoi(t[1]), filter(lambda t, i=index: string.atoi(t[2], 16) in (0, i), vfTuples)))
+                 return tuple(map(lambda t: int(t[1]), filter(lambda t, i=index: int(t[2], 16) in (0, i), vfTuples)))
              senseVerbFrames = []
              for index in range(1, len(self._senseTuples) + 1):
                  senseVerbFrames.append(extractVerbFrames(index, vfTuples))
@@ -752,7 +752,7 @@ def __init__(self, sourceOffset, pointerTuple):
          self.targetOffset = int(offset)
          self.pos = _normalizePOS(pos)
          """part of speech -- one of NOUN, VERB, ADJECTIVE, ADVERB"""
-         indices = string.atoi(indices, 16)
+         indices = int(indices, 16)
          self.sourceIndex = indices >> 8
          self.targetIndex = indices & 255

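`string.atoi()` no longer exists on Python 3; the built-in `int()` with an explicit base is the drop-in replacement for WordNet's hexadecimal fields, as with the pointer indices unpacked above:

```python
indices = int("010f", 16)   # was: string.atoi("010f", 16)
print(indices >> 8)         # source index: 1
print(indices & 255)        # target index: 15
```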
2 changes: 1 addition & 1 deletion pattern/text/fr/__init__.py
@@ -172,7 +172,7 @@ def load(self, path=None):
          _Sentiment.load(self, path)
          # Map "précaire" to "precaire" (without diacritics, +1% accuracy).
          if not path:
-             for w, pos in dict.items(self):
+             for w, pos in list(dict.items(self)):
                  w0 = w
                  if not w.endswith((u"à", u"è", u"é", u"ê", u"ï")):
                      w = w.replace(u"à", "a")
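The `list()` wrapper matters because the loop inserts diacritic-free spellings back into the same dictionary: `dict.items()` is a live view on Python 3, and mutating the dict while iterating over it raises `RuntimeError: dictionary changed size during iteration`. Iterating a snapshot avoids that:

```python
d = {u"précaire": "JJ"}
for w, pos in list(d.items()):   # snapshot; bare d.items() would raise here
    d[w.replace(u"é", "e")] = pos
print(sorted(d))                 # ['precaire', 'précaire']
```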
24 changes: 12 additions & 12 deletions pattern/text/search.py
@@ -153,15 +153,15 @@ def combinations(iterable, n):

  def product(*args, **kwargs):
      """ Yields all permutations with replacement:
-         list(product("cat", repeat=2)) => 
-         [("c", "c"), 
-          ("c", "a"), 
-          ("c", "t"), 
-          ("a", "c"), 
-          ("a", "a"), 
-          ("a", "t"), 
-          ("t", "c"), 
-          ("t", "a"), 
+         list(product("cat", repeat=2)) =>
+         [("c", "c"),
+          ("c", "a"),
+          ("c", "t"),
+          ("a", "c"),
+          ("a", "a"),
+          ("a", "t"),
+          ("t", "c"),
+          ("t", "a"),
           ("t", "t")]
      """
      p = [[]]
@@ -196,7 +196,7 @@ def variations(iterable, optional=lambda x: False):
          v = tuple(iterable[i] for i in range(len(v)) if not v[i])
          a.add(v)
      # Longest-first.
-     return sorted(a, cmp=lambda x, y: len(y) - len(x))
+     return sorted(a, key=len, reverse=True)
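`sorted()` lost its `cmp` argument in Python 3; longest-first ordering is expressed directly with a key function:

```python
variations = [("a",), ("a", "b"), ()]
print(sorted(variations, key=len, reverse=True))  # [('a', 'b'), ('a',), ()]
```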

#### TAXONOMY ############################################################

@@ -626,7 +626,7 @@ def match(self, word):
          Some part-of-speech-tags can also contain wildcards: NN*, VB*, JJ*, RB*, PR*.
          If the given word contains spaces (e.g., proper noun),
          the entire chunk will also be compared.
-         For example: Constraint(words=["Mac OS X*"]) 
+         For example: Constraint(words=["Mac OS X*"])
          matches the word "Mac" if the word occurs in a Chunk("Mac OS X 10.5").
          """
      # If the constraint has a custom function it must return True.
@@ -918,7 +918,7 @@ def match(self, sentence, start=0, _v=None, _u=None):
              _u[id(sequence)] = False
          # Return the leftmost-longest.
          if len(a) > 0:
-             return sorted(a)[0][-1]
+             return sorted(a, key=lambda x: x[:2])[0][-1]
 
      def _variations(self):
          v = variations(
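The explicit `key=lambda x: x[:2]` is needed because the tuples in `a` end in match objects, which Python 3 cannot compare if the leading fields tie; sorting on the first two fields alone (which, per the comment, encode the leftmost-longest preference) keeps the selection well defined:

```python
class Match:                 # stand-in for the unorderable last element
    pass

candidates = [(1, 5, Match()), (0, 7, Match())]
best = sorted(candidates, key=lambda x: x[:2])[0][-1]  # leftmost wins
print(type(best).__name__)   # Match
```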