-
Notifications
You must be signed in to change notification settings - Fork 0
/
jtfidf.go
155 lines (122 loc) · 2.76 KB
/
jtfidf.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
// Copyright 2019 ramenjuniti.
/*
Package jtfidf provides calculations of TF (Term Frequency), IDF (Inverse Document Frequency) and TF-IDF values for Japanese documents.
Dependencies
This package uses [kagome](https://github.com/ikawaha/kagome) as Morphological Analyzer.
About how to calculate the TF-IDF value
The calculation of the TF-IDF value in this package uses the IDF value plus 1.
This is to prevent the TF-IDF value from becoming 0.
*/
package jtfidf
import (
"math"
"github.com/ikawaha/kagome/tokenizer"
)
// splitTerm tokenizes d with a freshly created kagome tokenizer and returns
// the surface form of every token except the first and the last one.
//
// NOTE(review): the dropped first and last tokens are presumably the
// tokenizer's begin/end-of-sentence markers — confirm against the kagome
// documentation. The slice expression below panics if Tokenize ever yields
// fewer than two tokens.
func splitTerm(d string) []string {
	t := tokenizer.New()
	all := t.Tokenize(d)

	// Keep only the tokens strictly between the first and the last one.
	inner := all[1 : len(all)-1]

	surfaces := make([]string, 0, len(inner))
	for _, tok := range inner {
		surfaces = append(surfaces, tok.Surface)
	}
	return surfaces
}
// AllTf returns the TF (term frequency) value of every term in d.
//
// The document is split into terms with splitTerm. Each distinct term is
// mapped to its number of occurrences divided by the total number of terms
// in d. A document that yields no terms produces an empty, non-nil map.
func AllTf(d string) map[string]float64 {
	terms := splitTerm(d)
	total := float64(len(terms))

	// First pass: raw occurrence counts per distinct term.
	counts := map[string]int{}
	for _, term := range terms {
		counts[term]++
	}

	// Second pass: normalise each count by the document length.
	tfs := make(map[string]float64, len(counts))
	for term, c := range counts {
		tfs[term] = float64(c) / total
	}
	return tfs
}
// Tf returns the TF (term frequency) value of term t in document d.
//
// The result is the number of terms in d (as produced by splitTerm) equal to
// t, divided by the total number of terms. It is 0 when d yields no terms.
func Tf(t, d string) float64 {
	terms := splitTerm(d)
	if len(terms) == 0 {
		// Nothing to count against; avoid dividing by zero.
		return 0
	}

	hits := 0
	for i := range terms {
		if terms[i] == t {
			hits++
		}
	}
	return float64(hits) / float64(len(terms))
}
// AllIdf returns the IDF (inverse document frequency) value of every term
// that occurs in at least one document of ds.
//
// For a term contained in df of the n documents, the value is ln(n / df).
// Each document is split into terms with splitTerm exactly once. An empty ds
// produces an empty, non-nil map.
func AllIdf(ds []string) map[string]float64 {
	n := len(ds)

	// docFreq[term] is the number of documents that contain term at least once.
	docFreq := make(map[string]int)
	for _, d := range ds {
		// seen makes sure a term is counted only once per document, no matter
		// how often it repeats inside that document.
		seen := make(map[string]struct{})
		for _, term := range splitTerm(d) {
			if _, dup := seen[term]; dup {
				continue
			}
			seen[term] = struct{}{}
			docFreq[term]++
		}
	}

	idfs := make(map[string]float64, len(docFreq))
	for term, df := range docFreq {
		// df >= 1 here because every key was observed in some document.
		idfs[term] = math.Log(float64(n) / float64(df))
	}
	return idfs
}
// Idf returns the IDF (inverse document frequency) value of term t in ds.
//
// With n = len(ds) and df = the number of documents whose terms (as produced
// by splitTerm) include t, the result is ln(n / df). It is 0 when no document
// contains t, which also covers an empty ds.
func Idf(t string, ds []string) float64 {
	df := 0
	for _, d := range ds {
		for _, term := range splitTerm(d) {
			if term == t {
				// One hit is enough: a document counts at most once.
				df++
				break
			}
		}
	}

	if df == 0 {
		return 0
	}
	return math.Log(float64(len(ds)) / float64(df))
}
// AllTfidf returns the TF-IDF values of every term in every document of ds.
//
// The result has one map per document, in the same order as ds. Each map
// assigns to every term of that document its TF (from AllTf) multiplied by
// its IDF over ds (from AllIdf) plus 1.
func AllTfidf(ds []string) []map[string]float64 {
	idfs := AllIdf(ds)

	out := make([]map[string]float64, 0, len(ds))
	for i := range ds {
		tfs := AllTf(ds[i])
		weighted := make(map[string]float64, len(tfs))
		for term, tf := range tfs {
			// IDF is offset by 1 so the product is not forced to 0 when IDF is 0.
			weighted[term] = tf * (idfs[term] + 1)
		}
		out = append(out, weighted)
	}
	return out
}
// Tfidf returns the TF-IDF value of term t for document d within ds.
//
// It is the TF of t in d (see Tf) multiplied by the IDF of t over ds
// (see Idf) plus 1.
func Tfidf(t, d string, ds []string) float64 {
	tf := Tf(t, d)
	idf := Idf(t, ds)
	return tf * (idf + 1)
}