-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGaussClust_lite.sh
executable file
·517 lines (459 loc) · 23.1 KB
/
GaussClust_lite.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
#!/bin/sh
##########################################################################################
# File: GaussClust_lite.sh #
version="v0.1.1" #
# Author: Justin C. Bagley #
# Date: created by Justin Bagley Wed Jan 4 09:09:34 2017 -0600 #
# Last update: April 28, 2017 #
# Copyright (c) 2016-2019 Justin C. Bagley. All rights reserved. #
# Please report bugs to <[email protected]> #
# #
# Description: #
# SHELL SCRIPT FOR AUTOMATING GAUSSIAN MIXTURE MODELING ON CONTINUOUS, DISCRETE, OR #
# COMBINED GENETIC/MORPHOLOGICAL/ECOLOGICAL DATA, FOR SPECIES DELIMITATION AND #
# CLASSIFICATION #
# #
##########################################################################################
echo "INFO | $(date) | Running GaussClust_lite on test input files generated by 'bgmmSensTest.sh' ... "
############ SCRIPT OPTIONS
## OPTION DEFAULTS ##
NUM_NMDS_DIMS=4
CALL_UNSUPERGMM=1
RANGE_NBCLUST=0
CALL_DISCRIMINANT=1
MY_PROBS_MATRIX=probs.txt
CALL_BGMM=0
## PARSE THE OPTIONS ##
while getopts 'k:u:r:d:b:p:c:' opt ; do
case $opt in
k) NUM_NMDS_DIMS=$OPTARG ;;
u) CALL_UNSUPERGMM=$OPTARG ;;
r) RANGE_NBCLUST=$OPTARG ;;
d) CALL_DISCRIMINANT=$OPTARG ;;
b) CALL_BGMM=$OPTARG ;;
p) MY_PROBS_MATRIX=$OPTARG ;;
c) NUM_COMPONENTS=$OPTARG ;;
esac
done
Usage="Usage: $(basename "$0") [Help: -h help] [Options: -k u r d b p c] [stdin:] <inputFile>
## Help:
-h help text (also: -help)
## Options:
-k nmdsDimensions (def: 4) specify number of dimensions, k, to retain during NMDS on Gower
distances
-u unsuperGMM (def: 1) 0=no unsupervised GMM is carried out; 1=conduct unsupervised GMM
using 'Rmixmod' R package, for comparative or individual purposes
-r rangeNumClusters (def: 0) optional numeric listing of a range, x:y, of the number of
clusters to be modeled over during unsupervised GMM in Rmixmod
-d GMMDiscrim (def: 1) 0=(semi-)supervised, GMM-based discriminant analysis is not carried
out with 'mixmodelLearn' in Rmixmod; 1=conduct discriminant analysis
-b beliefBasedMM (def: 0) 0=no mixture modeling is carried out using the 'bgmm' R package;
1=calls 'supervised' GMM analysis, 2=calls 'semisupervised' GMM analysis, 3=calls both
supervised and semisupervised analyses, 4=calls 'belief' GMM analysis, and 5=calls
'soft' GMM analysis in bgmm
-p probsMatrix (def: probs.txt) specify matrix of plausibilities, or weights of the prior
probabilities for labeled observations, for bgmm; also sets belief matrix
-c numComponents (specify number of components (e.g. Gaussian components) or 'clusters'
to assign individuals to during regular GMM (single value, rather than a range; see -r
above or bgmm modeling)
## Input:
Input file: Script expects as <inputFile> a single plain-text data table with a header and
several columns of information followed by columns containing single-type or mixed data
(e.g. categorical, discrete, or continuous data for different morphological characters
measured) for the sample. The first column will be named 'samples' and typically contain
sample IDs/codes for each individual (preferably with a species-specific abbreviation
followed by a museum voucher number or individual code). The second column is headed as
'type' and specifies whether each individual ID is 'known' or 'unknown'. The third column
contains labels (e.g. four-letter codes) for each known individual (e.g. by species), and
'NA' values for samples of unknown type, assigning individuals to species. The example
input file contains a header with four-letter codes for each datacolumn, but users can
make the names a little longer if needed.
CITATION
Bagley, J.C. 2017. GaussClust v0.1.1. GitHub package, Available at:
<http://github.com/justincbagley/GaussClust>.
Created by Justin Bagley Tue Dec 13 16:39:37 2016 -0600
Copyright (c) 2016-2019 Justin C. Bagley. All rights reserved.
"
verboseUsage="Usage: $(basename "$0") [Help: -h help H Help] [Options: -k u r d b p c] [stdin:] <inputFile>
## Help:
-h help text (also: -help)
-H verbose help text (also: -Help)
## Options:
-k nmdsDimensions (def: 4) specify number of dimensions, k, to retain during NMDS on Gower
distances
-u unsuperGMM (def: 1) 0=no unsupervised GMM is carried out; 1=conduct unsupervised GMM
using 'Rmixmod' R package, for comparative or individual purposes
-r rangeNumClusters (def: 0) optional numeric listing of a range, x:y, of the number of
clusters to be modeled over during unsupervised GMM in Rmixmod
-d GMMDiscrim (def: 1) 0=(semi-)supervised, GMM-based discriminant analysis is not carried
out with 'mixmodelLearn' in Rmixmod; 1=conduct discriminant analysis
-b beliefBasedMM (def: 0) 0=no mixture modeling is carried out using the 'bgmm' R package;
1=calls 'supervised' GMM analysis, 2=calls 'semisupervised' GMM analysis, 3=calls both
supervised and semisupervised analyses, 4=calls 'belief' GMM analysis, and 5=calls
'soft' GMM analysis in bgmm
-p probsMatrix (def: probs.txt) specify matrix of plausibilities, or weights of the prior
probabilities for labeled observations, for bgmm; also sets belief matrix
-c numComponents (specify number of components (e.g. Gaussian components) or 'clusters'
to assign individuals to during regular GMM (single value, rather than a range; see -r
above or bgmm modeling)
## Input:
Input file: Script expects as <inputFile> a single plain-text data table with a header and
several columns of information followed by columns containing single-type or mixed data
(e.g. categorical, discrete, or continuous data for different morphological characters
measured) for the sample. The first column will be named 'samples' and typically contain
sample IDs/codes for each individual (preferably with a species-specific abbreviation
followed by a museum voucher number or individual code). The second column is headed as
'type' and specifies whether each individual ID is 'known' or 'unknown'. The third column
contains labels (e.g. four-letter codes) for each known individual (e.g. by species), and
'NA' values for samples of unknown type, assigning individuals to species. The example
input file contains a header with four-letter codes for each datacolumn, but users can
make the names a little longer if needed.
DETAILS
The -k flag sets the number of k dimensions to be retained during NMDS, which affects both
regular Gaussian mixture modeling and also the different models that are implemented in
the bgmm R package. Like file name, there is no default value; however, k=4 is recommended
by the authors based on discussion in Edwards and Knowles (Proc. Roy. Soc. B. 2014) and
Hausdorf and Hennig (Syst. Biol. 2014). (By contrast, k=2 would be normal for most other
ecological data, but may not contain sufficient information for interspecific datasets.)
The -u flag calls the unsupervised Gaussian mixture modeling method implemented in the
'mixmodCluster' function of the Rmixmod R package. See the Rmixmod R site and documentation
for additional information on this package (available at:
https://cran.r-project.org/web/packages/Rmixmod/index.html). Set this flag to '0' to
skip this analysis.
The -r flag gives the user the ability to conduct unsupervised modeling (called using -u
above) over a range of nbCluster values. In the case that rangeNumClusters is specified
(e.g. as '5:20'), Rmixmod will calculate unsupervised GMMs over this range and select the
best model using the Bayesian information criterion (BIC). If a range of values is not
specified for -r, then a GMM analysis in Rmixmod will use the number of components/clusters
specified using the -c flag (see below).
The -d flag calls the supervised or semi-supervised discriminant analysis method implemented
in the 'mixmodLearn' and 'mixmodPredict' functions of Rmixmod. The discriminant analysis is
based on GMMs and is conducted in a two-step (A, Learning; B, Prediction) procedure, which
estimates a discriminant function from known labeled data and uses it to predict (classify)
unknown samples that correspondto the same knowns, i.e. species or clusters. Set this flag
to '0' to skip this analysis.
The -b flag allows users to request four belief-based Gaussian mixture modeling options
available in the 'bgmm' R package. Passing the script a value of '1' calls the 'supervised'
function for supervised GMM analysis; passing a '2' calls the 'semisupervised' function
for semisupervised GMM analysis; a '3' calls both the supervised and semisupervised functions;
a '4' calls the 'belief' function for belief-based GMM analysis; and a '5' calls the 'soft'
function for soft-labeled GMM analysis. See the bgmm R site and documentation for more
information (available at: https://cran.r-project.org/web/packages/bgmm/index.html). Set this
flag to '0' to skip the bgmm analysis altogether.
The -p flag specifies the filename of the bgmm 'B' matrix file in the working dir.
The -c flag specifies the number of components or 'clusters' that will be modeled during
regular GMM or bgmm modeling (except see other option available using -r flag above). This
corresponds to 'k' or the number of columns in 'B', based on definitions in the bgmm
documentation.
CITATION
Bagley, J.C. 2017. GaussClust v0.1.1. GitHub package, Available at:
<http://github.com/justincbagley/GaussClust>.
REFERENCES
Biecek P, Szczurek E, Vingron M, Tiuryn J (2012) The R package bgmm: mixture modeling with
uncertain knowledge. Journal of Statistical Software, 47, 1-31.
Edwards DL, Knowles LL (2014) Species detection and individual assignment in species
delimitation: can integrative data increase efficacy? Proceedings of the Royal Society
B, 281, 20132765.
Gower JC (1971) A general coefficient of similarity and some of its properties. Biometrics,
27, 857–871. doi:10.2307/2528823
Hausdorf B, Hennig C (2010). Species delimitation using dominant and codominant multilocus
markers. Systematic Biology, 59, 491-503.
Lebret R, Iovleff S, Langrognet F, Biernacki C, Celeux G, Govaert G (2015) Rmixmod: the R
package of the model-based unsupervised, supervised, and semi-supervised classification
Mixmod Library. Journal of Statistical Software, 67(6). doi:10.18637/jss.v067.i06
Szczurek E, Biecek P, Tiuryn J, Vingron M (2010) Introducing knowledge into differential
expression analysis. Journal of Computational Biology, 17, 953-967.
Created by Justin Bagley Tue Dec 13 16:39:37 2016 -0600
Copyright (c) 2016-2019 Justin C. Bagley. All rights reserved.
"
## SCRIPT USAGE ##
##--Check for mandatory positional parameters and echo usage, then wait for commands...
shift $((OPTIND-1))
if [ $# -lt 1 ]; then
echo "$Usage"
exit 1
fi
if [[ "$1" == "-h" ]] || [[ "$1" == "-help" ]]; then
echo "$Usage";
exit
fi
if [[ "$1" == "-H" ]] || [[ "$1" == "--Help" ]]; then
echo "$verboseUsage";
exit
fi
if [[ "$1" == "-v" ]] || [[ "$1" == "--version" ]]; then
echo "$(basename $0) ${version}";
exit
fi
## Make input file a mandatory parameter, and set the path variable to the current dir:
MY_INPUT_FILE="$1"
MY_PATH=`pwd -P`
##--FIX issues with echoing shell text containing dollar signs to R:
MY_POINTS_VAR=$(echo "\$points") ## Make points variable with '$points' text for Rscript...
MY_TYPE_VAR=$(echo "\$type") ## Make type variable with '$type' text for Rscript...
MY_SAMP_VAR=$(echo "\$samples") ## Same as above but for '$samples'...
MY_SPECIES_VAR=$(echo "\$species") ## Same as above but for '$species'...
MY_STRESS_VAR=$(echo "\$stress") ## Same as above but for '$stress'...
MY_NMDS1_VAR=$(echo "\$nmds_1")
MY_NMDS2_VAR=$(echo "\$nmds_2")
MY_NMDS3_VAR=$(echo "\$nmds_3")
MY_NMDS4_VAR=$(echo "\$nmds_4")
MY_TIJ_VAR=$(echo "\$tij")
MY_TAB_VAR=$(echo "\t")
############ MAKE R SCRIPT
echo "
#!/usr/bin/env Rscript
#################################### GaussClust.R ########################################
############ CONDUCT SETUP, READ IN AND PLOT THE DATA
setwd('$MY_PATH')
#
##--Load needed library, R code, or package stuff. Install packages if not present.
packages <- c('bgmm', 'Rmixmod', 'StatMatch', 'MASS', 'ggfortify', 'vegan')
if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
install.packages(setdiff(packages, rownames(installed.packages())))
}
library(bgmm)
library(Rmixmod)
library(StatMatch)
library(MASS)
library(ggfortify)
library(vegan)
##--Read in the data:
mydata_names <- read.table('$MY_INPUT_FILE', h=T)
str(mydata_names)
mydata <- mydata_names[,-c(1:3)]
str(mydata)
##--Graph of pairwise data plots when there are small numbers of characters (else basic R
##--function and windows cannot handle the plots resulting from calling 'plot' function):
if( dim(mydata_names)[2] < 10 ){
print('Making pairwise plots of the data... ')
pdf('pairwise_data_plots.pdf')
plot(mydata_names)
dev.off()} else {print('Cannot do pairwise plots of the data (too much data)... ')
}
############ ANALYSIS
############ I. SCALE / STANDARDIZE DATA USING NMDS
##--Estimate Gower distances from the original data. Here, we are starting from a single
##--data frame, and the code was originally written with a single morphological data matrix
##--in mind. However, multiple datasets from different data types could be used. Either way,
##--Gower distances are ideal because they allow us to get an estimate of the similarities
##--and dissimilarities between all individuals/samples, while allowing for missing data
##--('NA' observations) in the dataset.
mydata_gower <- gower.dist(mydata)
##--Use ggfortify to visualize the Gower distnces:
pdf('gower_dist_gg_autoplot.pdf')
autoplot(mydata_gower)
dev.off()
##--Conduct NMDS using 'metaMDS' function in vegan package, retaining k dimensions:
mydata_gower_metaMDS <- metaMDS(mydata_gower, k=$NUM_NMDS_DIMS)
summary(mydata_gower_metaMDS)
#
nmds_stress <- mydata_gower_metaMDS$MY_STRESS_VAR * 100
nmds_stress
#
mydata_gower_metaMDS$MY_POINTS_VAR
metaMDS_points <- as.data.frame(mydata_gower_metaMDS$MY_POINTS_VAR)
metaMDS_points
row.names(metaMDS_points) <- mydata_names[,1]
names(metaMDS_points) <- c('nmds_1', 'nmds_2', 'nmds_3', 'nmds_4')
#
pdf('gower_nmds_plot.pdf')
plot(metaMDS_points)
dev.off()
pdf('gower_nmds_plot_1vs2.pdf')
plot(metaMDS_points$MY_NMDS1_VAR, metaMDS_points$MY_NMDS2_VAR, xlab='NMDS dim 1', ylab='NMDS dim 2')
text(-0.12,0.12, round(c(mydata_gower_metaMDS$MY_STRESS_VAR * 100), digits=2), col='red')
dev.off()
pdf('gower_nmds_plot_1vs3.pdf')
plot(metaMDS_points$MY_NMDS1_VAR, metaMDS_points$MY_NMDS3_VAR, xlab='NMDS dim 1', ylab='NMDS dim 3')
text(-0.12,0.12, round(c(mydata_gower_metaMDS$MY_STRESS_VAR * 100), digits=2), col='red')
dev.off()
pdf('gower_nmds_plot_2vs3.pdf')
plot(metaMDS_points$MY_NMDS2_VAR, metaMDS_points$MY_NMDS3_VAR, xlab='NMDS dim 2', ylab='NMDS dim 3')
text(-0.12,0.12, round(c(mydata_gower_metaMDS$MY_STRESS_VAR * 100), digits=2), col='red')
dev.off()
##--Save each dimension of values retained from NMDS into a separate variable, and then
##--in a data frame (extension 'df'):
nmds_1 <- metaMDS_points[,1]
nmds_2 <- metaMDS_points[,2]
nmds_3 <- metaMDS_points[,3]
nmds_4 <- metaMDS_points[,4]
nmds_dims_df = data.frame(nmds_1, nmds_2, nmds_3, nmds_4)
############ II. PREP AND CHECK DATA FOR GMM ANALYSES
##--Make data frame containing the individual sample names as well as columns of points
##--from NMDS dimensions retained during STEP I above. Also save the new data frame(s) to
##--file(s) in the working dir; we may want it handy in case we need it later...
sample_names <- mydata_names[,1]
type <- mydata_names[,2]
species <- mydata_names[,3]
mydata_names_df <- data.frame(sample_names, type, species, nmds_1, nmds_2, nmds_3, nmds_4)
write.table(mydata_names_df, file='mydata_names_df.txt')
##--Subset the NMDS points by 'known' and 'unknown' individuals. We also stop to write the
##--resulting new data frames back to file in working dir--in case of subsequent checks:
attach(mydata_names_df)
known_0 <- mydata_names_df[ which(mydata_names_df$MY_TYPE_VAR=='known'), ]
detach(mydata_names_df)
row.names(known_0) <- known_0[,1]
known_0
write.table(known_0, file='known_0.txt')
str(known_0)
#
knowns <- known_0[,-c(1:3)]
knowns
dim(knowns)[1]
write.table(knowns, file='knowns.txt')
str(knowns)
#
known_labels <- subset(mydata_names_df$MY_SPECIES_VAR, type=='known')
length(known_labels)
known_labels
#
attach(mydata_names_df)
unknown_0 <- mydata_names_df[ which(mydata_names_df$MY_TYPE_VAR=='unknown'), ]
detach(mydata_names_df)
row.names(unknown_0) <- unknown_0[,1]
unknown_0 <- unknown_0[,-c(1:3)]
write.table(unknown_0, file='unknown_0.txt')
str(unknown_0)
#
unknown_labels <- subset(mydata_names_df$MY_SPECIES_VAR, type=='unknown')
length(unknown_labels)
unknown_labels
##--Remove names from data frame of unknowns and place in var 'X' (bgmm unknowns var). Also
##--read in the belief matrix (B) containing prior probabilities for the knowns, with 0.95
##--probability for 'known' labeled individuals and all other cells receiving probs of
##--0.05/k (where k is number of components or clusters, and number of columns in B).
##--***IMPORTANT***: matrix B is generated by the user prior to running this script and
##--ONLY contains individuals classified as 'knowns'.
row.names(unknown_0) <- c(1:dim(unknown_0)[1])
unknowns <- unknown_0
X <- unknowns
X
dim(X)
#
B <- read.table('$MY_PROBS_MATRIX', header=TRUE, sep='$MY_TAB_VAR')
names(B) <- c(0:$NUM_COMPONENTS)
row.names(B) <- B[,1]
B <- B[,-c(1)]
B
dim(B)
############ III. CONDUCT UNSUPERVISED GMM CLUSTERING ANALYSIS IN RMIXMOD
if( $CALL_UNSUPERGMM == '0' ){print('Skipping unsupervised GMM analysis... ')} else {if($CALL_UNSUPERGMM == '1' ){
date()
print('Conducting unsupervised GMM analysis of the data using Rmixmod... ')
#
if( $RANGE_NBCLUST == '0'){print('Running unsupervised GMM using a single cluster number... ')
mydata_gower_gmm <-mixmodCluster(nmds_dims_df, nbCluster=$NUM_COMPONENTS)
summary(mydata_gower_gmm)
pdf('gower_gmm_result.pdf') ## SAVE THIS PLOT!
plot(mydata_gower_gmm)
dev.off()
mydata_gower_gmm['partition']
} else {print('Running unsupervised GMMs over a range of values for nbCluster, then selecting the best model using BIC... ')
mydata_gower_gmm <-mixmodCluster(nmds_dims_df, nbCluster=$RANGE_NBCLUST)
summary(mydata_gower_gmm)
pdf('gower_gmm_result.pdf') ## SAVE THESE PLOTS--THEY'RE AWESOME!!!
plot(mydata_gower_gmm)
plot(mydata_gower_gmm, c(1, 2))
plot(mydata_gower_gmm, c(1, 3))
plot(mydata_gower_gmm, c(1, 4))
plot(mydata_gower_gmm, c(2, 3))
plot(mydata_gower_gmm, c(2, 4))
plot(mydata_gower_gmm, c(3, 4))
dev.off()
mydata_gower_gmm['partition']}
}
}
####### IV. (SEMI-)SUPERVISED GMM-BASED DISCRIMINANT ANALYSIS IN RMIXMOD:
if( $CALL_DISCRIMINANT == '0' ){print('Skipping GMM-based discriminant analysis in Rmixmod... ')} else {if($CALL_DISCRIMINANT == '1' ){
## Analysis:
## A. Learning:
mydata_known_learn <- mixmodLearn(as.data.frame(knowns), as.factor(known_labels), nbCVBlocks = 10)
mydata_known_learn['bestResult']
graphics.off()
quartz()
pdf('superRmixmod_learn_results.pdf') ## SAVE THESE PLOTS--THEY'RE AWESOME!!!
plot(mydata_known_learn)
plot(mydata_known_learn, c(1, 2))
plot(mydata_known_learn, c(1, 3))
plot(mydata_known_learn, c(1, 4))
plot(mydata_known_learn, c(2, 3))
plot(mydata_known_learn, c(2, 4))
plot(mydata_known_learn, c(3, 4))
dev.off()
#
## B. Prediction:
## My prior experience with this (supervised prediction on lizard morphological dataset)
## suggests that prediction success when going from a set of knowns (1:1 or partial coverage)
## to unknowns is usually not so good (~20%). Nevertheless, here goes:
mydata_unknown_prediction <- mixmodPredict(data = X, classificationRule = mydata_known_learn['bestResult'])
summary(mydata_unknown_prediction)
mean(as.integer(unknown_labels) == mydata_unknown_prediction['partition'])
}
}
####### V. (SEMI-)SUPERVISED BELIEF-BASED GMM ANALYSIS IN BGMM:
if( $CALL_BGMM == '0' ){print('Skipping belief-based GMM analysis in bgmm... ')} else if($CALL_BGMM == '1' ){
supervisedModel <- supervised(as.data.frame(knowns), class = as.factor(known_labels))
supervisedModel
##--Commented out because plotting supervised model results is not working currently.
# pdf('bgmm_supervised_result.pdf')
# plot(supervisedModel)
# dev.off()
} else if($CALL_BGMM == '2' ){
semisupervisedModel <- semisupervised(as.data.frame(X), as.data.frame(knowns), class = as.factor(known_labels), k = $NUM_COMPONENTS, P = B)
semisupervisedModel
pdf('bgmm_semisupervised_result.pdf')
plot(semisupervisedModel)
dev.off()
tij_2 <- as.data.frame(semisupervisedModel$MY_TIJ_VAR)
write.table(tij_2, file='bgmm_semisupervised_posteriorProbs.txt', sep='$MY_TAB_VAR')} else if($CALL_BGMM == '3' ){
supervisedModel <- supervised(as.data.frame(knowns), class = as.factor(known_labels))
supervisedModel
##--Commented out because plotting supervised model results is not working currently.
# pdf('bgmm_supervised_result.pdf')
# plot(supervisedModel)
# dev.off()
semisupervisedModel <- semisupervised(as.data.frame(X), as.data.frame(knowns), class = as.factor(known_labels), k = $NUM_COMPONENTS, P = B)
semisupervisedModel
pdf('bgmm_semisupervised_result.pdf')
plot(semisupervisedModel)
dev.off()
tij_3 <- as.data.frame(semisupervisedModel$MY_TIJ_VAR)
write.table(tij_3, file='bgmm_semisupervised_posteriorProbs.txt', sep='$MY_TAB_VAR')} else if($CALL_BGMM == '4' ){
modelBelief <- belief(X, knowns, B=as.matrix(B))
modelBelief
pdf('bgmm_belief_result.pdf')
plot(modelBelief)
dev.off()
tij_4 <- as.data.frame(modelBelief$MY_TIJ_VAR, row.names=c(1:length(modelBelief$MY_TIJ_VAR)), col.names=c(levels(known_labels)))
write.table(tij_4, file='bgmm_belief_posteriorProbs.txt', sep='$MY_TAB_VAR')}else if($CALL_BGMM == '5' ){
modelSoft <- soft(X, knowns, P=as.matrix(B))
modelSoft
pdf('bgmm_soft_result.pdf')
plot(modelSoft)
dev.off()
tij_5 <- as.data.frame(modelSoft$MY_TIJ_VAR, row.names=c(1:length(modelSoft$MY_TIJ_VAR)), col.names=c(levels(known_labels)))
write.table(tij_5, file='bgmm_soft_posteriorProbs.txt', sep='$MY_TAB_VAR')
}
##--Save R workspace to file in wd:
save.image(file='GaussClustlite.workspace.RData')
######################################### END ############################################
" > GaussClust.r
############ FINAL STEPS:
R CMD BATCH ./GaussClust.R
##--Cleanup:
mkdir R
mv ./*.pdf ./*.Rout ./*.RData ./R/
if [ "$CALL_BGMM" -gt "0" ]; then
mv ./*_posteriorProbs.txt ./R/
fi
## Place Rscript generated by GaussCLust into 'R' folder within run folder:
mv ./GaussClust.r ./R/
## Place text files generated by GaussClust into 'txt' folder:
mkdir txt
mv ./known_0.txt ./knowns.txt ./unknown_0.txt ./mydata_names_df.txt ./txt/
#
#
#
######################################### END ############################################
exit 0