-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwctrmvec.h
225 lines (188 loc) · 9.28 KB
/
wctrmvec.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
/****************************** -*- C++ -*- *****************************/
/* */
/* WordClust -- Word Clustering */
/* Version 2.00 */
/* by Ralf Brown */
/* */
/* File: wctrmvec.h term vector declarations */
/* LastEdit: 21sep2018 */
/* */
/* (c) Copyright 1999,2000,2001,2002,2003,2005,2006,2008,2009,2010, */
/* 2015,2016,2017,2018 Carnegie Mellon University */
/* This program may be redistributed and/or modified under the */
/* terms of the GNU General Public License, version 3, or an */
/* alternative license agreement as detailed in the accompanying */
/* file LICENSE. You should also have received a copy of the */
/* GPL (file COPYING) along with this program. If not, see */
/* http://www.gnu.org/licenses/ */
/* */
/* This program is distributed in the hope that it will be */
/* useful, but WITHOUT ANY WARRANTY; without even the implied */
/* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR */
/* PURPOSE. See the GNU General Public License for more details. */
/* */
/************************************************************************/
#ifndef __WCTRMVEC_H_INCLUDED
#define __WCTRMVEC_H_INCLUDED
#include "framepac/list.h"
#include "framepac/vecsim.h"
#include "wcparam.h"
//----------------------------------------------------------------------
// Bookkeeping record attached to a term vector: the corpus and parameter
//   set the vector was built with, plus left/right context constraints.
// The parameters are held by reference, so the WcParameters object handed
//   to the constructor must outlive this record.
class WcTermVectorInfo
{
public:
   // bind the record to corpus 'c' and parameters 'p'; the two constraint
   //   lists are left default-constructed
   WcTermVectorInfo(const WcWordCorpus* c, const WcParameters& p)
      : m_corpus(c),
        m_params(p)
   {
   }

   // read access
   const WcWordCorpus* corpus() const
   {
      return m_corpus ;
   }
   const WcParameters& params() const
   {
      return m_params ;
   }
   const Fr::List* leftConstraint() const
   {
      return m_left_constraint ;
   }
   const Fr::List* rightConstraint() const
   {
      return m_right_constraint ;
   }

   // write access
   void setCorpus(const WcWordCorpus* corp)
   {
      m_corpus = corp ;
   }
   // the constraint setters are defined out-of-line
   void leftConstraint(const Fr::List* c) ;
   void rightConstraint(const Fr::List* c) ;

protected:
   const WcWordCorpus* m_corpus ;            // corpus pointer as supplied by the caller
   const WcParameters& m_params ;            // bound once at construction
   Fr::ListPtr         m_left_constraint ;
   Fr::ListPtr         m_right_constraint ;
} ;
//----------------------------------------------------------------------
// Sparse term vector (index type IdxT, float weights) which may carry a
//   WcTermVectorInfo record in the underlying vector's user-data slot.
// Instances are obtained through the static create() factories; the
//   constructors, destructor and operator new/delete are protected.
// The info record is only attached by the constructor taking a corpus and
//   parameters (the hash-table constructor is defined out-of-line), so
//   info() may be null for vectors made with create(cap).
template <typename IdxT>
class WcTermVectorSparse : public Fr::SparseVector<IdxT,float>
{
public:
   typedef Fr::SparseVector<IdxT,float> super ;
public:
   // factories
   static WcTermVectorSparse* create(size_t cap = 0) { return new WcTermVectorSparse(cap) ; }
   static WcTermVectorSparse* create(const WcWordCorpus* c, const WcParameters& p, size_t cap = 0)
      { return new WcTermVectorSparse(c,p,cap) ; }
   static WcTermVectorSparse* create(const WcIDCountHashTable* counts, const WcWordCorpus* c,
				      const WcParameters& p)
      { return new WcTermVectorSparse(counts,c,p) ; }

   // the attached info record, or whatever userData() holds if none was attached
   WcTermVectorInfo* info() const { return reinterpret_cast<WcTermVectorInfo*>(this->userData()) ; }
public:
   // manipulators
   void weightTerms(WcDecayType decay, double null_weight) ;

   // accessors
   // returns nullptr when no info record is attached (same behavior as
   //   WcTermVectorDense::corpus()) instead of dereferencing a null info()
   const WcWordCorpus* corpus() const { return info() ? info()->corpus() : nullptr ; }
   // PRECONDITION: info() is non-null.  A reference must be returned, so
   //   there is no safe fallback here; only call this on vectors that
   //   were created together with a corpus and parameters.
   const WcParameters& params() const { return info()->params() ; }

   // extra context to separate out variant senses while clustering
   // the getters fall back to the empty list, and the setters do nothing,
   //   when no info record is attached
   const Fr::List* leftConstraint() const
      { return info() ? info()->leftConstraint() : Fr::List::emptyList() ; }
   const Fr::List* rightConstraint() const
      { return info() ? info()->rightConstraint() : Fr::List::emptyList() ; }
   void leftConstraint(const Fr::List* c) { if (info()) info()->leftConstraint(c) ; }
   void rightConstraint(const Fr::List* c) { if (info()) info()->rightConstraint(c) ; }

protected: // creation/destruction
   // storage comes from the class-wide allocator; the requested size is ignored
   void* operator new(size_t) { return s_allocator.allocate() ; }
   void operator delete(void* blk, size_t) { s_allocator.release(blk) ; }
   WcTermVectorSparse(size_t capacity = 0) : super(capacity) {}
   WcTermVectorSparse(const WcWordCorpus* c, const WcParameters& p, size_t cap = 0) : super(cap)
      { this->setUserData(new WcTermVectorInfo(c,p)) ; }
   WcTermVectorSparse(const WcIDCountHashTable* counts, const WcWordCorpus*, const WcParameters&) ;
   // the vector owns its info record: free it and clear the slot
   ~WcTermVectorSparse()
      { delete reinterpret_cast<WcTermVectorInfo*>(this->userData()) ; this->setUserData(nullptr) ; }

protected: // implementation functions for virtual methods
   friend class FramepaC::Object_VMT<WcTermVectorSparse> ;

   // *** destroying ***
   static void free_(Fr::Object* obj) { delete static_cast<WcTermVectorSparse*>(obj) ; }

private:
   static Fr::Allocator s_allocator ;
   static const char s_typename[] ;
} ;
//----------------------------------------------------------------------
// Dense term vector (uint32_t indices, float weights) which may carry a
//   WcTermVectorInfo record in the underlying vector's user-data slot.
// Instances are obtained through the static create() factories; the
//   constructors, destructor and operator new/delete are protected.
class WcTermVectorDense : public Fr::DenseVector<uint32_t,float>
{
public:
   using super = Fr::DenseVector<uint32_t,float> ;
   using context_coll = Fr::ContextVectorCollection<WcWordCorpus::ID,uint32_t,float,false> ;

public:
   // factories
   static WcTermVectorDense* create(size_t cap = 0)
   {
      return new WcTermVectorDense(cap) ;
   }
   static WcTermVectorDense* create(const WcWordCorpus* c, const WcParameters& p, size_t cap = 0)
   {
      return new WcTermVectorDense(c,p,cap) ;
   }
   static WcTermVectorDense* create(const WcIDCountHashTable* counts, const WcWordCorpus* c,
				     const WcParameters& p)
   {
      return new WcTermVectorDense(counts,c,p) ;
   }

   // the user-data slot viewed as an info record
   WcTermVectorInfo* info() const
   {
      return reinterpret_cast<WcTermVectorInfo*>(this->userData()) ;
   }

   // manipulators
   void weightTerms(WcDecayType decay, double null_weight) ;
   void incr(const Vector<uint32_t,float>* other, float weight)
   {
      // go through a Vector pointer: an unqualified incr() call here would
      //   name this very function
      ((Vector<uint32_t,float>*)this)->incr(other,weight) ;
   }

   // corpus recorded in the info record; nullptr when there is no record
   const WcWordCorpus* corpus() const
   {
      return info() ? info()->corpus() : nullptr ;
   }

   // extra context to separate out variant senses while clustering
   // without an info record the getters yield the empty list and the
   //   setters do nothing
   const Fr::List* leftConstraint() const
   {
      return info() ? info()->leftConstraint() : Fr::List::emptyList() ;
   }
   const Fr::List* rightConstraint() const
   {
      return info() ? info()->rightConstraint() : Fr::List::emptyList() ;
   }
   void leftConstraint(const Fr::List* c)
   {
      if (info())
	 info()->leftConstraint(c) ;
   }
   void rightConstraint(const Fr::List* c)
   {
      if (info())
	 info()->rightConstraint(c) ;
   }

protected: // creation/destruction
   // storage comes from the class-wide allocator; the requested size is ignored
   void* operator new(size_t) { return s_allocator.allocate() ; }
   void operator delete(void* blk, size_t) { s_allocator.release(blk) ; }

   WcTermVectorDense(size_t cap = 0) : super(cap) { }
   WcTermVectorDense(const WcWordCorpus* c, const WcParameters& p, size_t cap = 0) : super(cap)
   {
      this->setUserData(new WcTermVectorInfo(c,p)) ;
   }
   WcTermVectorDense(const WcIDCountHashTable* counts, const WcWordCorpus*, const WcParameters&) ;

   // release the info record held in the user-data slot, then clear the slot
   ~WcTermVectorDense()
   {
      delete reinterpret_cast<WcTermVectorInfo*>(this->userData()) ;
      this->setUserData(nullptr) ;
   }

protected: // implementation functions for virtual methods
   friend class FramepaC::Object_VMT<WcTermVectorDense> ;

   // *** destroying ***
   static void free_(Fr::Object* obj)
   {
      delete static_cast<WcTermVectorDense*>(obj) ;
   }

private:
   static Fr::Allocator s_allocator ;
   static const char s_typename[] ;
} ;
//----------------------------------------------------------------------
// Common handle type for term vectors.  It is declared as a sparse vector
//   over WcWordCorpus::ID, but the hash-table create() below may instead
//   hand back a pointer to a WcTermVectorDense cast to WcTermVector*, so
//   code in this class consults isSparseVector() before deciding how to
//   treat 'this'.
// NOTE(review): WcTermVectorDense is not related to this class by
//   inheritance anywhere in this header, so the dense case relies on
//   pointer reinterpretation -- confirm that every caller goes through
//   sparseVector()/denseVector() (or isSparseVector()) before using
//   sparse-only members.
class WcTermVector : public WcTermVectorSparse<WcWordCorpus::ID>
{
public:
typedef WcTermVectorSparse<WcWordCorpus::ID> super ;
typedef WcTermVectorSparse<WcWordCorpus::ID> sparse_type ;
typedef WcTermVectorDense dense_type ;
typedef Fr::ContextVectorCollection<WcWordCorpus::ID,uint32_t,float,false> context_coll ;
public:
// make an empty sparse vector with the given capacity; the object is built
//   by the sparse base-class factory and the pointer downcast
static WcTermVector* create(size_t cap = 0) { return static_cast<WcTermVector*>(super::create(cap)) ; }
// build a vector from a table of counts: a dense vector when the
//   parameters report a context collection, a sparse one otherwise
static WcTermVector* create(const WcIDCountHashTable* counts, const WcWordCorpus* c,
const WcParameters& p)
{
if (p.contextCollection())
return (WcTermVector*)(WcTermVectorDense::create(counts,c,p)) ;
else
return static_cast<WcTermVector*>(super::create(counts,c,p)) ;
}
// forward incr(other,weight) to the sparse base-class version for sparse
//   vectors, or through a Vector<uint32_t,float> pointer otherwise
// (isSparseVector() is not declared in this header; presumably it is
//   inherited from the FramepaC vector classes)
void incr(const Vector<uint32_t,float>* other, float weight)
{
if (isSparseVector())
this->sparse_type::incr(other,weight) ;
else
((Vector<uint32_t,float>*)this)->incr(other,weight) ;
}
// view this object as its concrete representation; each accessor returns
//   nullptr when the object is of the other kind.  The casts also drop
//   the const qualifier of 'this'.
sparse_type* sparseVector() const { return isSparseVector() ? (sparse_type*)this : nullptr ; }
dense_type* denseVector() const { return isSparseVector() ? nullptr : (dense_type*)this ; }
protected:
// construction and destruction are restricted to this class and its
//   descendants; use the create() factories
WcTermVector() : super() {}
~WcTermVector() {}
} ;
//----------------------------------------------------------------------------
// Similarity measure registered under the canonical name "SplitCosine".
// The constructor snapshots the corpus's left and total context sizes; the
//   similarity computation itself is defined out-of-line.
template <typename IdxT, typename ValT>
class VectorMeasureSplitCosine : public Fr::SimilarityMeasure<IdxT, ValT>
{
public:
   using super = Fr::SimilarityMeasure<IdxT, ValT> ;

public:
   // 'corpus' is dereferenced immediately and must not be null; it is
   //   not retained past construction
   VectorMeasureSplitCosine(const WcWordCorpus* corpus)
      : m_left_context(corpus->leftContextSize()),
        m_total_context(corpus->totalContextSize())
   {
   }
   virtual ~VectorMeasureSplitCosine() {}

   // similarity score for the pair (v1,v2)
   virtual double similarity(const Fr::Vector<IdxT,ValT>* v1,
			      const Fr::Vector<IdxT,ValT>* v2) const ;

protected:
   virtual const char* myCanonicalName() const
   {
      return "SplitCosine" ;
   }

protected:
   unsigned m_left_context ;    // corpus->leftContextSize() at construction
   unsigned m_total_context ;   // corpus->totalContextSize() at construction
} ;
//----------------------------------------------------------------------------
#endif /* !__WCTRMVEC_H_INCLUDED */
// end of file wctrmvec.h //