-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtest.py
93 lines (81 loc) · 4.41 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
from counter import counter_s, counter_nos
from counter import analysis, hirsch_index
import numpy as np
from numpy.testing import assert_array_equal
from levenshtein_numerics import levenshtein_word
from levenshtein_distance import levenshtein_phrase_distance
def testCounter():
    """Compare both counters against a known matrix for a nine-message corpus.

    ``counter_s`` and ``counter_nos`` are each run on the same message dict,
    and ``analysis`` of either result must equal the same four-row matrix.
    """
    messages = {
        1: 'A B C D',
        2: 'A B C E A B C D',
        3: 'A B C D E',
        4: 'C E A B',
        5: 'A E E E',
        6: 'B C B C',
        7: 'C B E E',
        8: 'B C E',
        9: 'E A B C E',
    }
    expected = [[4, 4, 9], [3, 1, 1], [2, 3, 5], [1, 2, 2]]

    with_splitting = analysis(counter_s(messages))
    without_splitting = analysis(counter_nos(messages))

    for matrix in (with_splitting, without_splitting):
        assert_array_equal(matrix, expected)
def testCountOnlyOneOfSamePhrasePerMessage():
    """Check the first result row when one message repeats a phrase twice.

    For a single message containing 'This is a second' two times, both
    counters must produce ``[4, 1, 1]`` as row 0 of the analysis matrix.
    """
    text = 'This is a second This is a second'
    expected_row = [4, 1, 1]

    with_splitting = analysis(counter_s({1: text}))
    without_splitting = analysis(counter_nos({1: text}))

    assert_array_equal(with_splitting[0], expected_row)
    assert_array_equal(without_splitting[0], expected_row)
def testNotCountSinglePhrase():
    """Check expected rows for two small corpora, with both counters.

    Corpus one pairs 'This is a long phrase' with its prefix 'This is a';
    row 0 of the analysis must be ``[3, 1, 2]``.  Corpus two repeats the long
    phrase verbatim around an unrelated message; row 2 must be ``[3, 0, 0]``.
    """
    long_phrase = 'This is a long phrase'

    # counter_s is checked on both corpora first, then counter_nos.
    for counter in (counter_s, counter_nos):
        prefix_matrix = analysis(counter({1: long_phrase, 2: 'This is a'}))
        assert_array_equal(prefix_matrix[0], [3, 1, 2])

        repeat_matrix = analysis(
            counter({1: long_phrase, 2: 'A new Phrase', 3: long_phrase})
        )
        assert_array_equal(repeat_matrix[2], [3, 0, 0])
def testNotCountIfFullyMarked():
    """Check that 'is a' is not counted when it is already marked everywhere.

    Both counters are run on the same four messages; row 2 of each analysis
    matrix must be ``[2, 0, 0]``.

    The no-sentence-splitting matrix was previously built with ``counter_s``
    by mistake, so ``counter_nos`` was never exercised by this test.
    """
    messages = {
        1: 'This is a phrase',
        2: 'This is a phrase',
        3: 'Here is a',
        4: 'Here is a sentence',
    }
    # Each counter gets its own copy, as the original passed separate dicts.
    matrix_s = analysis(counter_s(dict(messages)))
    matrix_nos = analysis(counter_nos(dict(messages)))

    # Don't count 'is a', because it's marked in every message
    assert_array_equal(matrix_s[2], [2, 0, 0])
    assert_array_equal(matrix_nos[2], [2, 0, 0])
def testCountSinglePhraseIfInLongerPhrase():
    """Check row 2 when a short message is a prefix of a repeated longer one.

    With 'This is a long phrase' in messages 1 and 3 and 'This is a' in
    message 2, both counters must yield ``[3, 1, 1]`` as row 2.
    """
    long_phrase = 'This is a long phrase'
    short_phrase = 'This is a'
    expected_row = [3, 1, 1]

    with_splitting = analysis(
        counter_s({1: long_phrase, 2: short_phrase, 3: long_phrase})
    )
    without_splitting = analysis(
        counter_nos({1: long_phrase, 2: short_phrase, 3: long_phrase})
    )

    assert_array_equal(with_splitting[2], expected_row)
    assert_array_equal(without_splitting[2], expected_row)
def testSingleWords():
    """Check that a message without repeated words yields an empty analysis.

    For both counters, the analysis of the single-message corpus must have
    shape ``(0, 3)``.
    """
    text = 'This is a sentence with no word repetition at all.'
    empty_shape = (0, 3)

    result_s = counter_s({1: text})
    result_nos = counter_nos({1: text})

    assert_array_equal(analysis(result_s).shape, empty_shape)
    assert_array_equal(analysis(result_nos).shape, empty_shape)
def testSentenceSplitting():
    """Check the first two analysis rows of ``counter_s`` on punctuated text.

    The three messages contain full stops, a comma and the abbreviation
    'Mr.'; the expected rows are annotated with the phrase they refer to.
    """
    messages = {
        1: ('Here it is. A phrase, but Mr. Smith said this is an'
            ' example of only one phrase.'),
        2: 'Although it is a phrase here again.',
        3: 'It is a phrase. Mr Smith said it.',
    }
    result = analysis(counter_s(messages))

    expected_rows = (
        [4, 1, 2],  # it is a phrase
        [3, 1, 2],  # Mr Smith said
    )
    for index, expected in enumerate(expected_rows):
        assert_array_equal(result[index], expected)
def testNoSentenceSplitting():
    """Check the first two analysis rows of ``counter_nos`` on punctuated text.

    Uses the same three messages as the sentence-splitting test; row 0 is
    expected to differ from the ``counter_s`` result (``[4, 1, 3]`` here).
    """
    messages = {
        1: ('Here it is. A phrase, but Mr. Smith said this is an'
            ' example of only one phrase.'),
        2: 'Although it is a phrase here again.',
        3: 'It is a phrase. Mr Smith said it.',
    }
    result = analysis(counter_nos(messages))

    # Different results for no-sentence-splitting!
    expected_rows = (
        [4, 1, 3],  # it is a phrase
        [3, 1, 2],  # Mr Smith said
    )
    for index, expected in enumerate(expected_rows):
        assert_array_equal(result[index], expected)
def testHirschIndex():
    """Sanity-check ``hirsch_index`` on the ``counter_nos`` output.

    The index must be strictly positive and strictly smaller than the total
    number of whitespace-separated words across all messages.
    """
    messages = {
        1: ('Here it is. A phrase, but Mr. Smith said this is an'
            ' example of only one phrase.'),
        2: 'Although it is a phrase here again.',
        3: 'It is a phrase. Mr Smith said it.',
    }
    index = hirsch_index(counter_nos(messages))

    total_words = sum(len(text.split()) for text in messages.values())
    assert 0 < index < total_words
def testLevenshteinWord():
    """Check that ``levenshtein_word`` does not return -1 for two UTF-8 words."""
    source = "sport".encode('utf-8')
    target = "support".encode('utf-8')

    distance = levenshtein_word(source, target)
    assert distance != -1
def testLevenshteinPhrase():
    """Check the phrase distance of two sentences, with and without CYTHON.

    Both the default call and the call with ``CYTHON=True`` must return 8.
    """
    first = "Well it's true that we love one another."
    second = "I love Jack White like a little brother."
    expected_distance = 8

    default_distance = levenshtein_phrase_distance(first, second)
    assert default_distance == expected_distance

    cython_distance = levenshtein_phrase_distance(first, second, CYTHON=True)
    assert cython_distance == expected_distance