-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwordclus.h
187 lines (148 loc) · 6.7 KB
/
wordclus.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
/****************************** -*- C++ -*- *****************************/
/* */
/* WordClust -- Word Clustering */
/* Version 2.00 */
/* by Ralf Brown */
/* */
/* File: wordclus.h word clustering (declarations) */
/* LastEdit: 21sep2018 */
/* */
/* (c) Copyright 1999,2000,2001,2002,2003,2005,2006,2008,2009,2010, */
/* 2015,2016,2017,2018 Carnegie Mellon University */
/* This program may be redistributed and/or modified under the */
/* terms of the GNU General Public License, version 3, or an */
/* alternative license agreement as detailed in the accompanying */
/* file LICENSE. You should also have received a copy of the */
/* GPL (file COPYING) along with this program. If not, see */
/* http://www.gnu.org/licenses/ */
/* */
/* This program is distributed in the hope that it will be */
/* useful, but WITHOUT ANY WARRANTY; without even the implied */
/* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR */
/* PURPOSE. See the GNU General Public License for more details. */
/* */
/************************************************************************/
#ifndef __WORDCLUS_H_INCLUDED
#define __WORDCLUS_H_INCLUDED
#include "framepac/cluster.h"
#include "framepac/file.h"
#include "framepac/hashtable.h"
#include "framepac/list.h"
#include "framepac/threshold.h"
#include "framepac/wordcorpus.h"
/************************************************************************/
/* Manifest Constants */
/************************************************************************/
// Version string of this WordClust release (a string literal).
#define WORDCLUST_VERSION "2.00beta"
// Boolean constant.  NOTE(review): presumably the intended default for the
// `sort_output` arguments of the WcOutput* functions declared in this header,
// which currently spell out `true` themselves -- confirm against the callers.
#define WcSORT_OUTPUT true
// Small integer codes 0..2 naming word attributes.  NOTE(review): whether
// these are bit positions or plain indices is not visible in this header --
// confirm at the points of use.
#define WcATTR_STOPWORD 0
#define WcATTR_DESIRED 1
#define WcATTR_DELETABLE 2
/************************************************************************/
/* Types */
/************************************************************************/
// Corpus type used throughout WordClust: Fr::WordCorpusXL when the build
// defines HUGE_CORPUS, Fr::WordCorpus otherwise.
#if defined(HUGE_CORPUS)
using WcWordCorpus = Fr::WordCorpusXL ;
#else
using WcWordCorpus = Fr::WordCorpus ;
#endif /* HUGE_CORPUS */
// forward declarations: these classes are not defined in this header; the
// incomplete types are sufficient because the declarations below only use
// them through pointers and references
class WcConfig ;
class WcParameters ;
class WcTermVector ;
//--------------------------------------------------------------------------
// Kinds of decay function.  The enumerators carry the values 0 through 3 in
// declaration order.
// NOTE(review): what quantity decays (presumably a context weight as a
// function of distance) is not visible in this header -- confirm at the
// points of use.
enum WcDecayType
   {
   Decay_None        = 0,
   Decay_Reciprocal  = 1,
   Decay_Linear      = 2,
   Decay_Exponential = 3
   } ;
//--------------------------------------------------------------------------
//--------------------------------------------------------------------------
namespace Fr
{

// Explicit specializations of the name-based hashing/comparison members of
// HashTable<unsigned,size_t>.  Both are acknowledged placeholders (see the
// FIXME markers).

// Stores the length of `keyname` in `*namelen` and returns the address held
// in `keyname`, converted to size_t, as the hash value; the characters of the
// string do not contribute to the result.
template <>
inline size_t HashTable<unsigned,size_t>::hashVal(const char* keyname, size_t* namelen) const
{
   *namelen = strlen(keyname) ;
   return reinterpret_cast<size_t>(keyname) ; //FIXME
}

// Never reports a match: all three arguments are ignored and the result is
// always false.
template <>
inline bool HashTable<unsigned,size_t>::isEqual(const char* keyname, size_t keylen, unsigned other)
{
   static_cast<void>(keyname) ;
   static_cast<void>(keylen) ;
   static_cast<void>(other) ;
   return false ; //FIXME
}

// Hash table with unsigned keys and size_t values.
using WcIDCountHashTable = HashTable<unsigned,size_t> ;

} // end namespace Fr

// make the alias usable without the Fr:: qualifier
using Fr::WcIDCountHashTable ;
//--------------------------------------------------------------------------
typedef bool WcGlobalFilterFunc(const Fr::Array* tvs, const WcParameters* params, void* user_data) ;
typedef bool WcVectorFilterFunc(const WcTermVector *tv, const WcParameters *params,
const Fr::SymHashTable *keys, void *user_data) ;
typedef void WcClusterFilterFunc(Fr::ClusterInfo* clust, const WcParameters *params, void *user_data) ;
// rules for cluster filter func:
// any member vectors which are to be discarded from cluster should be set to nullptr
// cluster label may be modified
// call shrink_to_fit() if any vectors were discarded
// Function type of a post-processing step: takes a ClusterInfo pointer plus
// an opaque caller-supplied pointer and returns a ClusterInfo pointer.
// NOTE(review): whether the result may be a different object than `clusters`,
// and who owns it, is not visible in this header -- confirm.
using WcClusterPostprocFunc = Fr::ClusterInfo*(Fr::ClusterInfo* clusters,
                                               void* user_data) ;
// Function type of a word-frequency processing hook.
//   params       parameter set, passed by non-const reference so the hook is
//                able to modify it
//   corpus_size  size of the corpus.  NOTE(review): unit (tokens vs. types)
//                is not visible in this header -- confirm.
using WcWordFreqProcFunc = void(class WcParameters &params, size_t corpus_size) ;
// Function type of a scoring function over a pair of word IDs: receives the
// corpus, the two IDs, a co-occurrence count, and an opaque caller-supplied
// pointer, and returns a double.
// NOTE(review): the type name suggests a mutual-information score; scale and
// range of the result are not visible in this header.
using WcMIScoreFuncID = double(const WcWordCorpus*,
                               WcWordCorpus::ID word1,
                               WcWordCorpus::ID word2,
                               size_t cooccur,
                               void* udata) ;
/************************************************************************/
/************************************************************************/
// Global boolean switches; only declared here, so the definitions must be
// provided by some translation unit of the program.
// NOTE(review): the names suggest "assign to nearest cluster" and "use an RMS
// variant of cosine similarity" -- confirm against the code that reads them.
extern bool use_nearest ;
extern bool use_RMS_cosine ;
//----------------------------------------------------------------------
// configuration
// Takes a configuration object and returns nothing.  NOTE(review): presumably
// copies the settings in `config` into WordClust's global state -- confirm.
void apply_WordClus_configuration(WcConfig *config) ;
// Takes the name of a term-weights file and returns nothing, so failures are
// not reported through the return value.  NOTE(review): file format and error
// handling are not visible in this header.
void WcLoadTermWeights(const char *weights_file) ;
// Setter and getter for the word-delimiter specification, exchanged as a C
// string.  NOTE(review): whether the setter copies `delim`, and who owns the
// string returned by the getter, is not visible here -- confirm.
void WcSetWordDelimiters(const char *delim) ;
const char *WcWordDelimiters() ;
// file access
// Returns a list built from `listfile` and/or standard input, as selected by
// `use_stdin`.  NOTE(review): precedence between the two sources and
// ownership of the returned list are not visible in this header -- confirm.
Fr::List *WcLoadFileList(bool use_stdin, const char *listfile) ;
// corpus generation and loading
// Prepares corpus parsing; `delims` is optional and defaults to nullptr.
// NOTE(review): presumably nullptr selects a built-in delimiter set -- confirm.
void init_corpus_parsing(const char* delims = nullptr) ;
// Returns a list derived from the file named by `listfile`.
// NOTE(review): ownership of the result is not visible here.
Fr::List *load_file_list(const char* listfile) ;
// Operates on `corpus`, with `reverse` selecting a variant; returns a bool.
// NOTE(review): presumably true on success -- confirm.
bool generate_indices(WcWordCorpus *corpus, bool reverse) ;
// The three functions below each return a corpus pointer.
// NOTE(review): presumably the caller owns the result and nullptr signals
// failure -- confirm.
// `filename` is optional (defaults to nullptr).
WcWordCorpus* new_corpus(const WcParameters* params, const char *filename = nullptr) ;
// `params` is optional (defaults to nullptr).
WcWordCorpus* load_corpus(const Fr::List *filelist, const WcParameters *params = nullptr) ;
// Both arguments are required.
WcWordCorpus* load_or_generate_corpus(const char *filename, const WcParameters* params) ;
// preprocessing
// Returns a pointer to a WcWordIDPairTable computed from the read-only corpus
// and parameters.  The `class` keyword in the return type also serves as the
// forward declaration of WcWordIDPairTable, which is not defined in this
// header.  NOTE(review): ownership of the returned table is not visible here.
class WcWordIDPairTable *WcComputeMutualInfo(const WcWordCorpus* corpus,
const WcParameters* params) ;
// Takes the seed table by non-const pointer, so it is able to modify it in
// place.  NOTE(review): what qualifies as an "auto cluster" is not visible in
// this header.
void WcRemoveAutoClustersFromSeeds(Fr::ObjHashTable* seeds) ;
// the actual clustering
// Returns a ClusterInfo pointer computed from the hash table `ht`, the
// parameters, and the corpus.  Optional arguments:
//   seeds    defaults to nullptr
//   measure  vector measure over (word ID, float) vectors; defaults to nullptr
//   verbose  defaults to false
// NOTE(review): presumably a null `measure` makes the function choose one
// from `params`, and the caller owns the result -- confirm.
Fr::ClusterInfo* cluster_vectors(Fr::SymHashTable* ht, const WcParameters* params,
const WcWordCorpus* corpus, Fr::SymHashTable* seeds = nullptr,
Fr::VectorMeasure<WcWordCorpus::ID,float>* measure = nullptr,
bool verbose = false) ;
// top-level processing functions
// Processes `corpus` and returns a bool.  IMPORTANT: per the note on the first
// parameter, the corpus object is deleted by this function, so the caller must
// not use or free `corpus` after the call.
// Three open files are passed by reference (outfp, tokfp, tagfp), each paired
// with a file name (outfilename, tokfilename, tagfilename).
// NOTE(review): presumably these are the cluster, token, and tagged-corpus
// outputs written by the WcOutput* functions, and the result is true on
// success -- confirm.
bool WcProcessCorpus(WcWordCorpus* corpus, // deletes corpus to save memory!
Fr::VectorMeasure<WcWordCorpus::ID,float>* measure,
Fr::CFile& outfp, Fr::CFile& tokfp, Fr::CFile& tagfp,
const WcParameters* global_params,
const char* outfilename, const char *tokfilename,
const char* tagfilename) ;
// output of results
// Takes read-only clusters and the file `outfp`; returns nothing.
//   seed_file           required; NOTE(review): role not visible in this header
//   sort_output         defaults to true
//   output_filename     defaults to nullptr
//   skip_auto_clusters  defaults to false
void WcOutputClusters(const Fr::ClusterInfo* clusters, Fr::CFile& outfp,
const char* seed_file, bool sort_output = true,
const char* output_filename = nullptr,
bool skip_auto_clusters = false) ;
// Takes read-only clusters and the file `tokfp`; returns nothing.
//   sort_output             defaults to true
//   output_filename         defaults to nullptr
//   skip_auto_clusters      defaults to false
//   suppress_auto_brackets  defaults to false; NOTE(review): presumably omits
//                           bracket decoration on automatically named
//                           clusters -- confirm
void WcOutputTokenFile(const Fr::ClusterInfo* cluster_list, Fr::CFile& tokfp,
bool sort_output = true,
const char *output_filename = nullptr,
bool skip_auto_clusters = false,
bool suppress_auto_brackets = false) ;
// Takes read-only clusters and the file `tagfp`; returns nothing.
//   sort_output         defaults to true
//   output_filename     defaults to nullptr
//   skip_auto_clusters  defaults to false
// NOTE(review): no corpus is passed in, so the text being tagged must come
// from elsewhere (presumably global state or the clusters) -- confirm.
void WcOutputTaggedCorpus(const Fr::ClusterInfo* cluster_list, Fr::CFile& tagfp,
bool sort_output = true,
const char* output_filename = nullptr,
bool skip_auto_clusters = false) ;
// cleanup
// Takes no arguments and returns nothing.  NOTE(review): presumably discards
// the delimiter specification installed via WcSetWordDelimiters() -- confirm.
void WcClearWordDelimiters() ;
//----------------------------------------------------------------------------
// Character-encoding accessors: the getter returns a pointer to a std::locale
// and the setter takes an encoding name as a C string.
// NOTE(review): ownership/lifetime of the returned locale and the set of
// accepted encoding names are not visible in this header.
std::locale* WcCurrentCharEncoding() ;
void WcSetCharEncoding(const char *char_enc) ;
// Overload pair for the lowercase-output flag: the zero-argument form returns
// a bool, the one-argument form accepts a bool and returns nothing.
bool WcLowercaseOutput() ;
void WcLowercaseOutput(bool lc) ;
#endif /* !__WORDCLUS_H_INCLUDED */
// end of file wordclus.h //