-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprocessRealData.R
119 lines (94 loc) · 5.31 KB
/
processRealData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
library(fda) # for growth dataset
library("fda.usc") # for tecator dataset
# returns a list of lists where every element describes a different dataset. all datasets are smoothed
processRealData = function(){
list_of_lists = list()
list_index = 1
### GROWTH CURVES from fda package
# A list containing the heights of 39 boys and 54 girls from age 1 to 18 and the ages at which they were collected.
# --- Smooth, penalizing the Nth derivative (Lfdobj) --
# A standard non-monotone smoothing using function smooth.basis
# with a penalty on the Nth derivative (Lfdobj) in order to estimate a
# smooth acceleration function
# This gives a smoother estimate of the acceleration functions
# Smoothing code follows https://github.com/cran/fda/blob/master/demo/growth.R with modification to simultaneous smoothing of both genders
attach(growth)
true_group<- c(rep(TRUE,39),rep(FALSE,54)) # TRUE = boy, FALSE = girl
(ageRng <- range(age))
agefine <- seq(ageRng[1],ageRng[2],length=101)
knots <- age
norder <- 6
nbasis <- length(knots) + norder - 2
hgtbasis <- create.bspline.basis(range(knots), nbasis, norder, knots)
# raw_data_list<- list(data = cbind(growth$hgtm,growth$hgtf), reg_grid = age, true_group = true_group, name = "growth_raw")
# list_of_lists[[1]] = raw_data_list
# list_index = 2
for (Lfdobj in 1:4) {
# loglambda <- seq(-10,-1,by=0.5)
# gcv<-rep(0,length(loglambda))
# for(lam in 1:length(loglambda)){
# fd_par_obj <- fdPar(fdobj=hgtbasis,Lfdobj,lambda=10^loglambda[lam])
# smooth_res <- smooth.basis(growth$age, cbind(growth$hgtm,growth$hgtf),fd_par_obj)
# gcv[lam]=mean(smooth_res$gcv)
# }
# ind_m=which(gcv==min(gcv))
# growfdPar <- fdPar(fdobj=hgtbasis,Lfdobj,lambda=10^loglambda[ind_m])
lambda <- 1e-2
growfdPar <- fdPar(hgtbasis, Lfdobj, lambda)
# Need 'hgtm', 'hgtf', e.g., from attach(growth)
hgtfd <- smooth.basis(growth$age, cbind(growth$hgtm,growth$hgtf), growfdPar)$fd
growthhat <- eval.fd(agefine, hgtfd) # growth
velmhat <- eval.fd(agefine, hgtfd, 1) # velocity
accmhat <- eval.fd(agefine, hgtfd, 2) # acceleration
growthhat_list = list(data = growthhat, reg_grid = agefine, true_group = true_group, name = sprintf("growthhat_der%d",Lfdobj))
list_of_lists[[list_index]] = growthhat_list
velmhat_list = list(data = velmhat, reg_grid = agefine, true_group = true_group, name = sprintf("velmhat_der%d",Lfdobj))
list_of_lists[[list_index+1]] = velmhat_list
accmhat_list = list(data = accmhat, reg_grid = agefine, true_group = true_group, name = sprintf("accmhat_der%d",Lfdobj))
list_of_lists[[list_index+2]]= accmhat_list
list_index = list_index + 3
}
detach(growth)
### Tecator from fda package
# This is from Mahalanobis technometrics paper
# "The classification problem here is to separate meat samples with a high fat content(more than 20%) from samples with a low fat content (less than20%) based on absorbance.
# Among the 215 samples, 77 have high fat content and 138 have low fat content."
# Warning: there are some weird boundary effects here!
# Mahalanobis paper regarding smoothing
# "second-order derivatives of the observed functions produces lower misclassification rates.
# Therefore, the analysis of the original data and their second-order derivatives are carried out.
# In both cases, the original discrete observations and their second differences are converted to
# functional observations using a B- spline basis of order 6 with 20 and 40 basis functions, respectively."
data(tecator)
absorp <- tecator$absorp.fdata
Fat20 <- ifelse(tecator$y$Fat < 20, FALSE, TRUE) # class membership
# bspline parameters
norder <- 6
lambda <- 1e-2
raw_data_list<- list(data = absorp$data, reg_grid = absorp$argvals, true_group = Fat20, name = "tecator_raw")
list_of_lists[[list_index]] = raw_data_list
list_index = list_index+1
for (nbasis in c(20,40)) {
basis <- create.bspline.basis(tecator[["absorp.fdata"]]$rangeval, nbasis, norder)
growfdPar <- fdPar(basis, 0, lambda)
tecator_fd <- smooth.basis(tecator[["absorp.fdata"]]$argvals, t(absorp$data), growfdPar)$fd
grid = absorp$argvals
# grid_chopped = grid[-c(1,length(grid))] #TODO: is this cheating? we have to do it to avoid boundary effects
if (nbasis==20){
# Evaluate on original time grid
list_of_lists[[list_index]] = list(data = eval.fd(grid, tecator_fd), reg_grid = grid, true_group = Fat20, name = "tecator")
list_index = list_index+1
} else{
# Evaluate on original time grid
list_of_lists[[list_index]] = list(data = eval.fd(grid, tecator_fd, 2), reg_grid = grid, true_group = Fat20, name = "tecator_der2")
list_index = list_index+1
}
}
# Lin and Yao's contamination paper regarding tecator data preporcessing:
# we predict the fat content based on the first derivative curves approximated
# by the difference quotient between measurements at adjacent wavelengths, shown in the left panel of Figure 2.
tecatorLY_fd=fdata.deriv(absorp,nderiv=1,method="diff",class.out='fdata')
list_of_lists[[list_index]] = list(data = t(tecatorLY_fd$data), reg_grid = tecatorLY_fd$argvals, true_group = Fat20, name = "tecator_LYdiff1storder")
list_index = list_index+1
return(list_of_lists)
}