-
Notifications
You must be signed in to change notification settings - Fork 6
/
mo_kernel.f90
2883 lines (2442 loc) · 102 KB
/
mo_kernel.f90
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
!> \file mo_kernel.f90
!> \brief Module for kernel regression and kernel density estimation.
!> \details This module provides routines for kernel regression of data as well as kernel density
!> estimation of both probability density functions (PDF) and cumulative density functions (CDF).\n
!> So far only a Gaussian kernel is implemented (Nadaraya-Watson)
!> which can be used for one- and multidimensional data.\n
!> Furthermore, the estimation of the bandwidth needed for kernel methods is provided
!> by either Silverman's rule of thumb or a Cross-Validation approach.\n
!> The Cross-Validation method is an optimization of the bandwidth and
!> is the most costly part of the kernel smoother.
!> Therefore, the bandwidth estimation is not necessarily part of the kernel smoothing
!> but can be determined first and given as an optional argument to the smoother.
!> \author Juliane Mai
!> \date Mar 2013
MODULE mo_kernel
! This module provides functions for kernel regression and kernel density estimation.
! Written Juliane Mai, Mar 2013
! Modified Stephan Thober, Mar 2013
! Matthias Cuntz, Mar 2013
! Matthias Cuntz, May 2014 - sort -> qsort
! Matthias Cuntz, May 2014 - module procedure golden
! Stephan Thober, Jul 2015 - using sort_index in favor of qsort_index
! Matthias Cuntz, Jun 2016 - Romberg integration in cumdensity
! License
! -------
! This file is part of the JAMS Fortran package, distributed under the MIT License.
!
! Copyright (c) 2013-2016 Juliane Mai, Stephan Thober, Matthias Cuntz - mc (at) macu (dot) de
!
! Permission is hereby granted, free of charge, to any person obtaining a copy
! of this software and associated documentation files (the "Software"), to deal
! in the Software without restriction, including without limitation the rights
! to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
! copies of the Software, and to permit persons to whom the Software is
! furnished to do so, subject to the following conditions:
!
! The above copyright notice and this permission notice shall be included in all
! copies or substantial portions of the Software.
!
! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
! AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
! OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
! SOFTWARE.
USE mo_kind, ONLY: i4, sp, dp
USE mo_moment, ONLY: stddev
IMPLICIT NONE
PUBLIC :: kernel_cumdensity ! Kernel smoothing of a CDF (only 1D)
PUBLIC :: kernel_density ! Kernel smoothing of a PDF (only 1D)
PUBLIC :: kernel_density_h ! Bandwith estimation for PDF and CDF (only 1D)
PUBLIC :: kernel_regression ! Kernel regression (1D and ND)
PUBLIC :: kernel_regression_h ! Bandwith estimation for kernel regression (1D and ND)
! ------------------------------------------------------------------
! NAME
! kernel_cumdensity
! PURPOSE
! Approximates the cumulative density function (CDF) to a given 1D data set using a Gaussian kernel.
!
!> \brief Approximates the cumulative density function (CDF).
!
!> \details Approximates the cumulative density function (CDF)
!> to a given 1D data set using a Gaussian kernel.\n
!
!> The bandwith of the kernel can be pre-determined using the function kernel_density_h and
!> specified by the optional argument h.
!> If h is not given the default method to approximate the bandwith h is Silverman''s rule-of-thumb
!> \f[ h = \frac{4}{3}^{0.2} n^{-0.2} \sigma_x \f]
!> where n is the number of given data points and \f$ \sigma \f$ is the standard deviation of the data.\n
!> If the optional argument silverman is set to false, the cross-validation method described
!> by Scott et al. (2005) is applied.
!> Therefore, the likelihood of a given h is maximized using the Nelder-Mead algorithm nelminrange.
!> For large data sets this might be time consuming and should be performed aforehand using the
!> function kernel_density_h.\n
!> The dataset x can be single or double precision. The result will have the same numerical precision.\n
!> If the CDF for other datapoints than x is needed the optional argument xout has to be specified.
!> The result will then be of the same size and precision as xout.
!
! INTENT(IN)
!> \param[in] "real(sp/dp) :: x(:)" \f$ x_i \f$ 1D-array with data points
!
! INTENT(INOUT)
! None
! INTENT(OUT)
! None
!
! INTENT(IN), OPTIONAL
!> \param[in] "real(sp/dp), optional :: h" Bandwith of kernel.\n
!> If present, argument silverman is ignored.
!> If not present, the bandwith will be approximated first.
!> \param[in] "logical, optional :: silverman" By default Silverman''s Rule-of-thumb will be used to approximate
!> the bandwith of the kernel (silverman=true).
!> If silverman=false the Cross-Validation approach is used
!> to estimate the bandwidth.
!> \param[in] "real(sp/dp), optional :: xout(:)" If present, the CDF will be approximated at this arguments,
!> otherwise the CDF is approximated at x.
!> \param[in] "logical, optional :: romberg" If .true., use Romberg converging integration
!> If .true., use 5-point Newton-Cotes with fixed nintegrate points
!> \param[in] "integer(i4), optional :: nintegrate" 2**nintegrate if Romberg or nintegrate number of sampling
!> points for integration between output points.
!> Default: 10 if Romberg, 101 otherwise.
!> \param[in] "real(sp/dp), optional :: epsint" maximum relative error for Romberg integration.
!> Default: 1e-6.
!> \param[in] "logical, optional :: mask(:)" mask x values at calculation.\n
!> if not xout given, then kernel estimates will have nodata value.
!> \param[in] "real(sp/dp), optional :: nodata" if mask and not xout given, then masked data will
!> have nodata kernel estimate.
!
! INTENT(INOUT), OPTIONAL
! None
!
! INTENT(OUT), OPTIONAL
! None
!
! RETURN
!> \return real(sp/dp), allocatable :: kernel_cumdensity(:) — smoothed CDF at either x or xout
!
! RESTRICTIONS
!> \note Data need to be one-dimensional. Multi-dimensional data handling not implemented yet.
!
! EXAMPLE
! ! given data, e.g. temperature
! x = (/ 26.1_dp, 24.5_dp, 24.8_dp, 24.5_dp, 24.1_dp /)
!
! ! estimate bandwidth via cross-validation (time-consuming)
! h = kernel_density_h(x,silverman=.false.)
!
! ! estimate bandwidth with Silverman''s rule of thumb (default)
! h = kernel_density_h(x,silverman=.true.)
!
! ! calc cumulative density with the estimated bandwidth h at given output points xout
! cdf = kernel_cumdensity(x, h=h, xout=xout)
! ! gives cumulative density at xout values, if specified, or at x values, if xout is not present
! ! if bandwidth h is given : silverman (true/false) is ignored
! ! if silverman=.true. and h not present : bandwidth will be estimated using Silverman's rule of thumb
! ! if silverman=.false. and h not present : bandwidth will be estimated using Cross-Validation approach
! -> see also example in test directory
! LITERATURE
! Scott, D. W., & Sain, S. R. (2005).
! Multi-dimensional Density Estimation. Handbook of Statistics, 24, 229-261.
! doi:10.1016/S0169-7161(04)24009-3
! Haerdle, W., & Mueller, M. (2000). Multivariate and semiparametric kernel regression.
! In M. G. Schimek (Ed.), Smoothing and regression: Approaches, computation, and
! application (pp. 357-392). Hoboken, NJ, USA: John Wiley & Sons, Inc. doi:10.1002/9781118150658.ch12s,
! Cambridge University Press, UK, 1996
! HISTORY
!> \author Juliane Mai
!> \date Mar 2013
! Modified, Matthias Cuntz, Mar 2013
! Matthias Cuntz, May 2014 - sort -> qsort
! Matthias Cuntz, Jun 2016 - Romberg integration
! Matthias Cuntz, May 2018 - allocate correct size for kernel_pdf in romberg=.false.
INTERFACE kernel_cumdensity
MODULE PROCEDURE kernel_cumdensity_1d_dp, kernel_cumdensity_1d_sp
END INTERFACE kernel_cumdensity
! ------------------------------------------------------------------
! NAME
! kernel_density
! PURPOSE
! Approximates the probability density function (PDF) to a given 1D data set using a Gaussian kernel.
!
!> \brief Approximates the probability density function (PDF).
!
!> \details Approximates the probability density function (PDF)
!> to a given 1D data set using a Gaussian kernel.\n
!
!> The bandwith of the kernel can be pre-determined using the function kernel_density_h and specified
!> by the optional argument h.
!> If h is not given the default method to approximate the bandwith h is Silverman''s rule-of-thumb
!> \f[ h = \frac{4}{3}^{0.2} n^{-0.2} \sigma_x \f]
!> where n is the number of given data points and \f$ \sigma \f$ is the standard deviation of the data.\n
!> If the optional argument silverman is set to false, the cross-validation method described
!> by Scott et al. (2005) is applied.
!> Therefore, the likelihood of a given h is maximized using the Nelder-Mead algorithm nelminrange.
!> For large data sets this might be time consuming and should be performed aforehand using the function
!> kernel_density_h.\n
!> The dataset x can be single or double precision. The result will have the same numerical precision.\n
!> If the PDF for other datapoints than x is needed the optional argument xout has to be specified.
!> The result will then be of the same size and precision as xout.
!
! INTENT(IN)
!> \param[in] "real(sp/dp) :: x(:)" \f$ x_i \f$ 1D-array with data points
!
! INTENT(INOUT)
! None
! INTENT(OUT)
! None
!
! INTENT(IN), OPTIONAL
!> \param[in] "real(sp/dp), optional :: h" Bandwith of kernel.\n
!> If present, argument silverman is ignored.
!> If not present, the bandwith will be approximated first.
!> \param[in] "logical, optional :: silverman" By default Silverman''s Rule-of-thumb will be used to approximate
!> the bandwith of the kernel (silverman=true).
!> If silverman=false the Cross-Validation approach is used
!> to estimate the bandwidth.
!> \param[in] "real(sp/dp), optional :: xout(:)" If present, the PDF will be approximated at this arguments,
!> otherwise the PDF is approximated at x.
!> \param[in] "logical, optional :: mask(:)" mask x values at calculation.\n
!> if not xout given, then kernel estimates will have nodata value.
!> \param[in] "real(sp/dp), optional :: nodata" if mask and not xout given, then masked data will
!> have nodata kernel estimate.
!
! INTENT(INOUT), OPTIONAL
! None
!
! INTENT(OUT), OPTIONAL
! None
!
! RETURN
!> \return real(sp/dp), allocatable :: kernel_density(:) — smoothed PDF at either x or xout
!
! RESTRICTIONS
!> \note Data need to be one-dimensional. Multi-dimensional data handling not implemented yet.
!
! EXAMPLE
! ! given data, e.g. temperature
! x = (/ 26.1_dp, 24.5_dp, 24.8_dp, 24.5_dp, 24.1_dp /)
!
! ! estimate bandwidth via cross-validation (time-consuming)
! h = kernel_density_h(x,silverman=.false.)
!
! ! estimate bandwidth with Silverman''s rule of thumb (default)
! h = kernel_density_h(x,silverman=.true.)
!
! ! calc cumulative density with the estimated bandwidth h at given output points xout
! pdf = kernel_density(x, h=h, xout=xout)
! ! gives (probability) density at either xout values, if specified, or at x values, if xout is not present
! ! if bandwidth h is given : silverman (true/false) is ignored
! ! if silverman=.true. and h not present : bandwidth will be estimated using Silverman's rule of thumb
! ! if silverman=.false. and h not present : bandwidth will be estimated using Cross-Validation approach
!
! -> see also example in test directory
! LITERATURE
! Scott, D. W., & Sain, S. R. (2005).
! Multi-dimensional Density Estimation. Handbook of Statistics, 24, 229-261.
! doi:10.1016/S0169-7161(04)24009-3
! Haerdle, W., & Mueller, M. (2000). Multivariate and semiparametric kernel regression.
! In M. G. Schimek (Ed.), Smoothing and regression: Approaches, computation, and
! application (pp. 357-392). Hoboken, NJ, USA: John Wiley & Sons, Inc. doi:10.1002/9781118150658.ch12s,
! Cambridge University Press, UK, 1996
! HISTORY
!> \author Juliane Mai
!> \date Mar 2013
! Modified, Stephan Thober, Mar 2013 - mask and nodata
! Matthias Cuntz, Mar 2013
INTERFACE kernel_density
MODULE PROCEDURE kernel_density_1d_dp, kernel_density_1d_sp
END INTERFACE kernel_density
! ------------------------------------------------------------------
! NAME
! kernel_density_h
! PURPOSE
! Approximates the bandwith h of the kernel.
!
!> \brief Approximates the bandwith h of the kernel.
!
!> \details Approximates the bandwith h of the kernel for a given dataset x
!> either using Silverman''s rule-of-thumb or a cross-validation method.\n
!
!> By default the bandwidth h is approximated by Silverman''s rule-of-thumb
!> \f[ h = \frac{4}{3}^{0.2} n^{-0.2} \sigma_x \f]
!> where n is the number of given data points and \f$ \sigma \f$ is the standard deviation of the data.\n
!> If the optional argument silverman is set to false, the cross-validation method described
!> by Scott et al. (2005) is applied.
!> Therefore, the likelihood of a given h is maximized using the Nelder-Mead algorithm nelminrange.
!> For large data sets this might be time consuming and should be performed aforehand using the
!> function kernel_density_h.\n
!> The dataset x can be single or double precision. The result will have the same numerical precision.\n
!> The result of this function can be used as the optional input for kernel_density and kernel_cumdensity.
!
! INTENT(IN)
!> \param[in] "real(sp/dp) :: x(:)" \f$ x_i \f$ 1D-array with data points
!
! INTENT(INOUT)
! None
! INTENT(OUT)
! None
!
! INTENT(IN), OPTIONAL
!> \param[in] "logical, optional :: silverman" By default Silverman''s Rule-of-thumb will be used to approximate
!> the bandwith of the kernel (silverman=true).
!> If silverman=false the Cross-Validation approach is used
!> to estimate the bandwidth.
!> \param[in] "logical, optional :: mask(:)" mask x values at calculation.
!
! INTENT(INOUT), OPTIONAL
! None
!
! INTENT(OUT), OPTIONAL
! None
!
! RETURN
!> \return real(sp/dp), allocatable :: kernel_density_h(:) — approximated bandwith h for kernel smoother
!
! RESTRICTIONS
!> \note Data need to be one-dimensional. Multi-dimensional data handling not implemented yet.
!
! EXAMPLE
! ! given data, e.g. temperature
! x = (/ 26.1_dp, 24.5_dp, 24.8_dp, 24.5_dp, 24.1_dp /)
!
! ! estimate bandwidth via cross-validation (time-consuming)
! h = kernel_density_h(x,silverman=.false.)
!
! ! estimate bandwidth with Silverman''s rule of thumb (default)
! h = kernel_density_h(x,silverman=.true.)
!
! -> see also example in test directory
! LITERATURE
! Scott, D. W., & Sain, S. R. (2005).
! Multi-dimensional Density Estimation. Handbook of Statistics, 24, 229-261.
! doi:10.1016/S0169-7161(04)24009-3
! Haerdle, W., & Mueller, M. (2000). Multivariate and semiparametric kernel regression.
! In M. G. Schimek (Ed.), Smoothing and regression: Approaches, computation, and
! application (pp. 357-392). Hoboken, NJ, USA: John Wiley & Sons, Inc. doi:10.1002/9781118150658.ch12s,
! Cambridge University Press, UK, 1996
! HISTORY
!> \author Juliane Mai
!> \date Mar 2013
! Modified, Matthias Cuntz, Mar 2013
INTERFACE kernel_density_h
MODULE PROCEDURE kernel_density_h_1d_dp, kernel_density_h_1d_sp
END INTERFACE kernel_density_h
! ------------------------------------------------------------------
! NAME
! kernel_regression
! PURPOSE
! Multi-dimensional non-parametric kernel regression using a Gaussian kernel.
!
!> \brief Multi-dimensional non-parametric kernel regression.
!
!> \details Multi-dimensional non-parametric kernel regression using a Gaussian kernel.\n
!
!> The bandwith of the kernel can be pre-determined using the function kernel_regression_h and specified
!> by the optional argument h.
!> If h is not given the default method to approximate the bandwith h is Silverman''s rule-of-thumb
!> \f[ h = \frac{4}{d+2}^{1/(d+4)} n^{-1/(d+4)} \sigma_{x_i} \f]
!> where \f$ n \f$ is the number of given data points, \f$ d \f$ is the number of dimensions,
!> and \f$ \sigma_{x_i} \f$ is the standard deviation of the data of dimension \f$ i \f$.\n
!> If the optional argument silverman is set to false, the cross-validation method described
!> by Scott et al. (2005) is applied.
!> Therefore, the likelihood of a given h is maximized using the Nelder-Mead algorithm nelminrange.
!> For large data sets this might be time consuming and should be performed aforehand
!> using the function kernel_regression_h.\n
!> The dataset (x,y) can be single or double precision. The result will have the same numerical precision.\n
!> If the regression for other datapoints than x is needed the optional argument xout has to be specified.
!> The result will then be of the same size and precision as xout.\n
!> \n
!> The code is adapted from the kernel_regression.py of the JAMS Python library.
!
! INTENT(IN)
!> \param[in] "real(sp/dp) :: x(:)/x(:,:)" 1D or ND array with independent variables
!> \param[in] "real(sp/dp) :: y(:)" 1D array of dependent variables y(i) = f(x(i:)) with unknown f
!
! INTENT(INOUT)
! None
! INTENT(OUT)
! None
!
! INTENT(IN), OPTIONAL
!> \param[in] "real(sp/dp), optional :: h" Bandwith of kernel.\n
!> If present, argument silverman is ignored.
!> If not present, the bandwith will be approximated first.
!> \param[in] "logical, optional :: silverman" By default Silverman''s Rule-of-thumb will be used to approximate the
!> bandwith of the kernel (silverman=true).
!> If silverman=false the Cross-Validation approach is used
!> to estimate the bandwidth.
!> \param[in] "real(sp/dp), optional :: xout(:)/xout(:,:)"
!> If present, the fitted values will be returned for
!> this independent variables,
!> otherwise the fitted values at x are returned.
!> \param[in] "logical, optional :: mask(:)" mask y values at calculation.\n
!> if not xout given, then kernel estimates will have nodata value.
!> \param[in] "real(sp/dp), optional :: nodata" if mask and not xout given, then masked data will
!> have nodata kernel estimate.
!
! INTENT(INOUT), OPTIONAL
! None
!
! INTENT(OUT), OPTIONAL
! None
!
! RETURN
!> \return real(sp/dp), allocatable :: kernel_regression(:) — fitted values at either x or xout
!
! RESTRICTIONS
! None
!
! EXAMPLE
! ! given data, e.g. temperature
! x(:,1) = (/ 26.1_dp, 24.5_dp, 24.8_dp, 14.5_dp, 2.1_dp /)
! x(:,2) = (/ 2.1_dp, 4.5_dp, 6.8_dp, 4.8_dp, 0.1_dp /)
! y = (/ 28.2_dp, 29.0_dp, 31.6_dp, 19.3_dp, 2.2_dp /)
!
! ! estimate bandwidth via cross-validation (time-consuming)
! h = kernel_regression_h(x,y,silverman=.false.)
!
! ! estimate bandwidth with Silverman''s rule of thumb (default)
! h = kernel_regression_h(x,y,silverman=.true.)
!
! fit_y = kernel_regression(x, y, h=h, silverman=.false., xout=xout)
! ! gives fitted values at either xout values, if specified, or at x values, if xout is not present
! ! if bandwidth h is given : silverman (true/false) is ignored
! ! if silverman=.true. and h not present : bandwidth will be estimated using Silverman's rule of thumb
! ! if silverman=.false. and h not present : bandwidth will be estimated using Cross-Validation approach
!
! -> see also example in test directory
! LITERATURE
! Scott, D. W., & Sain, S. R. (2005).
! Multi-dimensional Density Estimation. Handbook of Statistics, 24, 229-261.
! doi:10.1016/S0169-7161(04)24009-3
! Haerdle, W., & Mueller, M. (2000). Multivariate and semiparametric kernel regression.
! In M. G. Schimek (Ed.), Smoothing and regression: Approaches, computation, and
! application (pp. 357-392). Hoboken, NJ, USA: John Wiley & Sons, Inc. doi:10.1002/9781118150658.ch12s,
! Cambridge University Press, UK, 1996
! HISTORY
!> \authors Matthias Cuntz, Juliane Mai
!> \date Mar 2013
INTERFACE kernel_regression
MODULE PROCEDURE kernel_regression_2d_dp, kernel_regression_2d_sp, &
kernel_regression_1d_dp, kernel_regression_1d_sp
END INTERFACE kernel_regression
! ------------------------------------------------------------------
! NAME
! kernel_regression_h
! PURPOSE
! Approximates the bandwith h of the kernel.
!
!> \brief Approximates the bandwith h of the kernel for regression.
!
!> \details Approximates the bandwith h of the kernel for a given dataset x
!> either using Silverman''s rule-of-thumb or a cross-validation method.\n
!
!> By default the bandwidth h is approximated by Silverman''s rule-of-thumb
!> \f[ h = \frac{4}{d+2}^{1/(d+4)} n^{-1/(d+4)} \sigma_{x_i} \f]
!> where \f$ n \f$ is the number of given data points, \f$ d \f$ is the number of dimensions,
!> and \f$ \sigma_{x_i} \f$ is the standard deviation of the data of dimension \f$ i \f$.\n
!> If the optional argument silverman is set to false, the cross-validation method described
!> by Scott et al. (2005) is applied.
!> Therefore, the likelihood of a given h is maximized using the Nelder-Mead algorithm nelminrange.
!> For large data sets this might be time consuming and should be performed aforehand using the function kernel_density_h.\n
!> The dataset x can be single or double precision. The result will have the same numerical precision.\n
!> The result of this function can be used as the optional input for kernel_regression.\n
!> \n
!> The code is adapted from the kernel_regression.py of the JAMS Python library.
!
! INTENT(IN)
!> \param[in] "real(sp/dp) :: x(:)/x(:,:)" 1D or ND array with independent variables
!> \param[in] "real(sp/dp) :: y(:)" 1D array of dependent variables y(i) = f(x(i:)) with unknown f
!
! INTENT(INOUT)
! None
! INTENT(OUT)
! None
!
! INTENT(IN), OPTIONAL
!> \param[in] "logical, optional :: silverman" By default Silverman''s Rule-of-thumb will be used to approximate the
!> bandwith of the kernel (silverman=true).
!> If silverman=false the Cross-Validation approach is used
!> to estimate the bandwidth.
!> \param[in] "logical, optional :: mask(:)" mask x values at calculation.
!
! INTENT(INOUT), OPTIONAL
! None
!
! INTENT(OUT), OPTIONAL
! None
!
! RETURN
!> \return real(sp/dp), allocatable :: kernel_regression_h(:) — approximated bandwith h for kernel regression\n
!> number of bandwidths equals
!> number of independent variables
!
! RESTRICTIONS
! None
!
! EXAMPLE
! ! given data, e.g. temperature
! x(:,1) = (/ 26.1_dp, 24.5_dp, 24.8_dp, 14.5_dp, 2.1_dp /)
! x(:,2) = (/ 2.1_dp, 4.5_dp, 6.8_dp, 4.8_dp, 0.1_dp /)
! y = (/ 28.2_dp, 29.0_dp, 31.6_dp, 19.3_dp, 2.2_dp /)
!
! ! estimate bandwidth via cross-validation (time-consuming)
! h = kernel_regression_h(x,y,silverman=.false.)
!
! ! estimate bandwidth with Silverman''s rule of thumb (default)
! h = kernel_regression_h(x,y,silverman=.true.)
!
! -> see also example in test directory
! LITERATURE
! Scott, D. W., & Sain, S. R. (2005).
! Multi-dimensional Density Estimation. Handbook of Statistics, 24, 229-261.
! doi:10.1016/S0169-7161(04)24009-3
! Haerdle, W., & Mueller, M. (2000). Multivariate and semiparametric kernel regression.
! In M. G. Schimek (Ed.), Smoothing and regression: Approaches, computation, and
! application (pp. 357-392). Hoboken, NJ, USA: John Wiley & Sons, Inc. doi:10.1002/9781118150658.ch12s,
! Cambridge University Press, UK, 1996
! HISTORY
!> \authors Matthias Cuntz, Juliane Mai
!> \date Mar 2013
INTERFACE kernel_regression_h
MODULE PROCEDURE kernel_regression_h_2d_dp, kernel_regression_h_2d_sp, &
kernel_regression_h_1d_dp, kernel_regression_h_1d_sp
END INTERFACE kernel_regression_h
! ------------------------------------------------------------------
INTERFACE nadaraya_watson
MODULE PROCEDURE nadaraya_watson_2d_dp, nadaraya_watson_2d_sp, &
nadaraya_watson_1d_dp, nadaraya_watson_1d_sp
END INTERFACE nadaraya_watson
INTERFACE allocate_globals
MODULE PROCEDURE allocate_globals_2d_dp, allocate_globals_2d_sp, &
allocate_globals_1d_dp, allocate_globals_1d_sp
END INTERFACE allocate_globals
INTERFACE cross_valid_density
MODULE PROCEDURE cross_valid_density_1d_dp, cross_valid_density_1d_sp
END INTERFACE cross_valid_density
INTERFACE cross_valid_regression
MODULE PROCEDURE cross_valid_regression_dp, cross_valid_regression_sp
END INTERFACE cross_valid_regression
INTERFACE mesh
MODULE PROCEDURE mesh_dp, mesh_sp
END INTERFACE mesh
INTERFACE golden
MODULE PROCEDURE golden_dp, golden_sp
END INTERFACE golden
INTERFACE trapzd
MODULE PROCEDURE trapzd_dp, trapzd_sp
END INTERFACE trapzd
INTERFACE polint
MODULE PROCEDURE polint_dp, polint_sp
END INTERFACE polint
PRIVATE
! Module variables which need to be public for optimization of bandwith via cross-validation
real(sp), dimension(:,:), allocatable :: global_x_sp
real(sp), dimension(:,:), allocatable :: global_xout_sp
real(sp), dimension(:), allocatable :: global_y_sp
real(dp), dimension(:,:), allocatable :: global_x_dp
real(dp), dimension(:,:), allocatable :: global_xout_dp
real(dp), dimension(:), allocatable :: global_y_dp
! ------------------------------------------------------------------------------------------------
CONTAINS
! Double-precision implementation of kernel_cumdensity (interface documented above):
! Gaussian-kernel estimate of the cumulative density function of the 1D sample ix,
! evaluated at the sample points themselves or, if given, at the points xout.
! The CDF is built by integrating the kernel PDF between consecutive (sorted) output
! points, either with converging Romberg integration (romberg=.true.) or with a
! fixed-resolution 5-point Newton-Cotes rule (default).
function kernel_cumdensity_1d_dp(ix, h, silverman, xout, romberg, nintegrate, epsint, mask, nodata)
use mo_utils, only: le, linspace
! use mo_quicksort, only: qsort_index !ST: may lead to Segmentation Fault for large arrays > 600 000 entries
! use mo_sort, only: sort_index
use mo_orderpack, only: sort_index ! MC: use orderpack for no NR until somebody complains
use mo_integrate, only: int_regular
implicit none
real(dp), dimension(:), intent(in) :: ix ! 1D data sample
real(dp), optional, intent(in) :: h ! kernel bandwidth; if absent, estimated below
logical, optional, intent(in) :: silverman ! .true.: Silverman rule; .false.: cross-validation
real(dp), dimension(:), optional, intent(in) :: xout ! evaluation points (default: ix)
logical, optional, intent(in) :: romberg ! .true.: Romberg integration (default .false.)
integer(i4), optional, intent(in) :: nintegrate ! integration resolution (see trapzmax below)
real(dp), optional, intent(in) :: epsint ! relative error tolerance for Romberg
logical, dimension(:), optional, intent(in) :: mask ! select which ix enter the estimate
real(dp), optional, intent(in) :: nodata ! fill value for masked-out slots (no xout)
real(dp), dimension(:), allocatable :: kernel_cumdensity_1d_dp ! CDF at xout (or ix) points
! local variables
integer(i4) :: nin, nout
integer(i4) :: ii, jj
real(dp) :: hh ! bandwidth
real(dp), dimension(:), allocatable :: xxout
integer(i4), dimension(:), allocatable :: xindx
! real(dp) :: tmp
real(dp), dimension(:), allocatable :: x
! integration
logical :: doromberg ! Romberg or Newton-Cotes
real(dp), dimension(:), allocatable :: kernel_pdf ! kernel densities at xout
integer(i4) :: trapzmax ! maximum 2**trapzmax points per integration between xouts
real(dp) :: trapzreps ! maximum relative integration error
integer(i4), parameter :: kromb = 5 ! Romberg's method of order 2kromb
real(dp) :: a, b, k1, k2 ! integral limits and corresponding kernel densities
real(dp) :: qromb, dqromb ! interpolated result and change of consecutive calls to trapzd
real(dp), dimension(:), allocatable :: s, hs ! results and stepsize of consecutive calls to trapzd
real(dp) :: lower_x ! lowest support point at min(x) - 3 stddev(x)
real(dp), dimension(1) :: klower_x ! kernel density estimate at lower_x
real(dp) :: delta ! stepsize for Newton-Cotes
! consistency check - mask needs either nodata (to fill masked slots) or xout
if (present(mask) .and. (.not. present(xout)) .and. (.not. present(nodata)) ) then
stop 'kernel_cumdensity_1d_dp: missing nodata value or xout with present(mask)'
end if
! Pack if mask present: only unmasked values enter the density estimate
if (present(mask)) then
nin = count(mask)
allocate(x(nin))
x = pack(ix, mask)
else
nin = size(ix,1)
allocate(x(nin))
x = ix
endif
! choose evaluation points: xout if given, else the (packed) data points
if (present(xout)) then
nout = size(xout,1)
allocate(xxout(nout))
allocate(xindx(nout))
xxout = xout
else
nout = nin
allocate(xxout(nout))
allocate(xindx(nout))
xxout = x
end if
! sort the evaluation points ascending; xindx keeps the permutation so the
! results can be put back into the caller's order at the end
xindx = sort_index(xxout)
xxout = xxout(xindx)
if (present(romberg)) then
doromberg = romberg
else
doromberg = .false.
end if
! maximum 2**nintegrate points for Romberg integration; (4*n+1) points for 5-point Newton-Cotes
if (present(nintegrate)) then
trapzmax = nintegrate
else
if (doromberg) then
trapzmax = 10_i4
else
trapzmax = 101_i4
endif
endif
! maximum relative error for integration
if (present(epsint)) then
trapzreps = epsint
else
trapzreps = 1.0e-6_dp
endif
! determine bandwidth: explicit h wins; otherwise estimate via kernel_density_h
! (Silverman rule by default, cross-validation if silverman=.false.)
if (present(h)) then
hh = h
else
if (present(silverman)) then
hh = kernel_density_h(x, silverman=silverman)
else
hh = kernel_density_h(x, silverman=.true.)
end if
end if
! cumulative integration of pdf with Simpson's and trapezoidal rules as in Numerical recipes (qsimp)
! We do the i=1 step of trapzd ourselves to save kernel evaluations
allocate(kernel_cumdensity_1d_dp(nout))
lower_x = minval(x) - 3.0_dp * stddev(x)
if (doromberg) then
allocate(s(trapzmax+1), hs(trapzmax+1))
! PDF at all output points, reused as endpoint values of each sub-integral
allocate(kernel_pdf(nout))
kernel_pdf = kernel_density(x, hh, xout=xxout)
klower_x = kernel_density(x, hh, xout=(/lower_x/))
! loop through all regression points
do ii = 1, nout
! integrate from previous output point (or lower_x for the first interval)
if (ii==1) then
a = lower_x
k1 = klower_x(1)
else
a = xxout(ii-1)
k1 = kernel_pdf(ii-1)
endif
b = xxout(ii)
k2 = kernel_pdf(ii)
! Romberg integration
! first trapzd manually using kernel_pdf above
jj = 1
s(jj) = 0.5_dp * (b-a) * (k1+k2)
hs(jj) = 1.0_dp
s(jj+1) = s(jj)
hs(jj+1) = 0.25_dp*hs(jj)
do jj=2, trapzmax
call trapzd(kernel_density_1d_dp, x, hh, a, b, s(jj), jj)
if (jj >= kromb) then
! extrapolate stepsize -> 0 with a kromb-point polynomial; stop once the
! extrapolation change dqromb is within the relative tolerance
call polint(hs(jj-kromb+1:jj), s(jj-kromb+1:jj), 0.0_dp, qromb, dqromb)
if (le(abs(dqromb),trapzreps*abs(qromb))) exit
end if
s(jj+1) = s(jj)
hs(jj+1) = 0.25_dp*hs(jj)
end do
! accumulate: CDF(ii) = CDF(ii-1) + integral over [a,b]
if (ii==1) then
kernel_cumdensity_1d_dp(ii) = qromb
else
kernel_cumdensity_1d_dp(ii) = kernel_cumdensity_1d_dp(ii-1) + qromb
endif
end do
deallocate(s, hs)
else
! loop through all regression points
allocate(kernel_pdf(trapzmax))
do ii = 1, nout
! fixed-resolution integration: sample the PDF on trapzmax equidistant points
! per interval and apply the Newton-Cotes rule of int_regular
if (ii .eq. 1_i4) then
delta = (xxout(ii)-lower_x) / real(trapzmax-1,dp)
kernel_pdf = kernel_density(x, hh, xout=linspace(lower_x, xxout(ii), trapzmax))
kernel_cumdensity_1d_dp(1) = int_regular(kernel_pdf, delta)
else
delta = (xxout(ii)-xxout(ii-1)) / real(trapzmax-1,dp)
kernel_pdf = kernel_density(x, hh, xout=linspace(xxout(ii-1), xxout(ii), trapzmax))
kernel_cumdensity_1d_dp(ii) = kernel_cumdensity_1d_dp(ii-1) + int_regular(kernel_pdf, delta)
end if
end do
endif
! scale to range [0,1]
! tmp = 1.0_dp / (kernel_cumdensity_1d_dp(nout) - kernel_cumdensity_1d_dp(1))
! kernel_cumdensity_1d_dp(:) = ( kernel_cumdensity_1d_dp(:) - kernel_cumdensity_1d_dp(1) ) * tmp
kernel_cumdensity_1d_dp = min(kernel_cumdensity_1d_dp, 1.0_dp) ! clip accumulated integral at 1
! resorting: undo the ascending sort so results line up with the caller's point order
kernel_cumdensity_1d_dp(xindx) = kernel_cumdensity_1d_dp(:)
! check whether output has to be unpacked
if (present(mask) .and. (.not. present(xout))) then
! reuse x as scratch: expand results back to size(ix), nodata at masked-out slots
deallocate(x)
nin = size(ix,1)
allocate(x(nin))
x = unpack(kernel_cumdensity_1d_dp, mask, nodata)
deallocate(kernel_cumdensity_1d_dp)
allocate(kernel_cumdensity_1d_dp(nin))
kernel_cumdensity_1d_dp = x
end if
! clean up
deallocate(xxout)
deallocate(xindx)
deallocate(kernel_pdf)
deallocate(x)
end function kernel_cumdensity_1d_dp
function kernel_cumdensity_1d_sp(ix, h, silverman, xout, romberg, nintegrate, epsint, mask, nodata)
  ! Cumulative density function (CDF) of the kernel density estimate of ix,
  ! evaluated at xout (default: at ix itself). Single-precision companion of
  ! kernel_cumdensity_1d_dp.
  !
  !   ix         : sample points
  !   h          : optional kernel bandwidth (else from kernel_density_h)
  !   silverman  : optional; forwarded to kernel_density_h when h is absent
  !   xout       : optional evaluation points (default: the (masked) input ix)
  !   romberg    : optional; .true. -> Romberg integration, else 5-point Newton-Cotes
  !   nintegrate : optional; max 2**nintegrate points per Romberg integration,
  !                or number of Newton-Cotes points per sub-interval
  !   epsint     : optional maximum relative integration error (Romberg)
  !   mask       : optional mask on ix; requires nodata or xout
  !   nodata     : optional fill value used when unpacking masked results
  use mo_utils, only: le, linspace
  ! use mo_quicksort, only: qsort_index !ST: may lead to Segmentation Fault for large arrays > 600 000 entries
  ! use mo_sort, only: sort_index
  use mo_orderpack, only: sort_index ! MC: use orderpack for no NR until somebody complains
  use mo_integrate, only: int_regular
  implicit none
  real(sp), dimension(:), intent(in) :: ix
  real(sp), optional, intent(in) :: h
  logical, optional, intent(in) :: silverman
  real(sp), dimension(:), optional, intent(in) :: xout
  logical, optional, intent(in) :: romberg
  integer(i4), optional, intent(in) :: nintegrate
  real(sp), optional, intent(in) :: epsint
  logical, dimension(:), optional, intent(in) :: mask
  real(sp), optional, intent(in) :: nodata
  real(sp), dimension(:), allocatable :: kernel_cumdensity_1d_sp
  ! local variables
  integer(i4) :: nin, nout
  integer(i4) :: ii, jj
  real(sp) :: hh ! bandwidth
  real(sp), dimension(:), allocatable :: xxout
  integer(i4), dimension(:), allocatable :: xindx
  ! real(sp) :: tmp
  real(sp), dimension(:), allocatable :: x
  ! integration
  logical :: doromberg ! Romberg or Newton-Cotes
  real(sp), dimension(:), allocatable :: kernel_pdf ! kernel densities at xout
  integer(i4) :: trapzmax ! maximum 2**trapzmax points per integration between xouts
  real(sp) :: trapzreps ! maximum relative integration error
  integer(i4), parameter :: kromb = 5 ! Romberg's method of order 2kromb
  real(sp) :: a, b, k1, k2 ! integral limits and corresponding kernel densities
  real(sp) :: qromb, dqromb ! interpolated result and change of consecutive calls to trapzd
  real(sp), dimension(:), allocatable :: s, hs ! results and stepsize of consecutive calls to trapzd
  real(sp) :: lower_x ! lowest support point at min(x) - 3 stddev(x)
  real(sp), dimension(1) :: klower_x ! kernel density estimate at lower_x
  real(sp) :: delta ! stepsize for Newton-Cotes
  ! consistency check - mask needs either nodata or xout
  if (present(mask) .and. (.not. present(xout)) .and. (.not. present(nodata)) ) then
     stop 'kernel_cumdensity_1d_sp: missing nodata value or xout with present(mask)'
  end if
  ! Pack if mask present
  if (present(mask)) then
     nin = count(mask)
     allocate(x(nin))
     x = pack(ix, mask)
  else
     nin = size(ix,1)
     allocate(x(nin))
     x = ix
  endif
  ! output abscissas: xout if given, else the (packed) input itself
  if (present(xout)) then
     nout = size(xout,1)
  else
     nout = nin
  end if
  allocate(xxout(nout))
  allocate(xindx(nout))
  if (present(xout)) then
     xxout = xout
  else
     xxout = x
  end if
  ! sort the output points ascending; xindx remembers the original order
  xindx = sort_index(xxout)
  xxout = xxout(xindx)
  doromberg = .false.
  if (present(romberg)) doromberg = romberg
  ! maximum 2**nintegrate points for Romberg integration; (4*n+1) points for 5-point Newton-Cotes
  if (present(nintegrate)) then
     trapzmax = nintegrate
  else
     if (doromberg) then
        trapzmax = 10_i4
     else
        trapzmax = 101_i4
     endif
  endif
  ! maximum relative error for integration
  if (present(epsint)) then
     trapzreps = epsint
  else
     trapzreps = 1.0e-6_sp
  endif
  ! determine h
  if (present(h)) then
     hh = h
  else
     if (present(silverman)) then
        hh = kernel_density_h(x, silverman=silverman)
     else
        hh = kernel_density_h(x, silverman=.true.)
     end if
  end if
  ! cumulative integration of pdf with Simpson's and trapezoidal rules as in Numerical recipes (qsimp)
  ! We do the i=1 step of trapzd ourselves to save kernel evaluations
  allocate(kernel_pdf(nout))
  allocate(kernel_cumdensity_1d_sp(nout))
  ! BUGFIX: lower_x is needed by BOTH integration branches (the Newton-Cotes
  ! branch uses it for the first sub-interval); it was previously assigned only
  ! inside the Romberg branch and hence undefined for romberg=.false.
  lower_x = minval(x) - 3.0_sp * stddev(x)
  if (doromberg) then
     allocate(s(trapzmax+1), hs(trapzmax+1))
     kernel_pdf = kernel_density(x, hh, xout=xxout)
     klower_x = kernel_density(x, hh, xout=(/lower_x/))
     ! loop through all regression points
     do ii = 1, nout
        if (ii==1) then
           ! first sub-interval starts at the lowest support point
           a = lower_x
           k1 = klower_x(1)
        else
           a = xxout(ii-1)
           k1 = kernel_pdf(ii-1)
        endif
        b = xxout(ii)
        k2 = kernel_pdf(ii)
        ! Romberg integration
        ! first trapzd done manually using kernel_pdf from above
        jj = 1
        s(jj) = 0.5_sp * (b-a) * (k1+k2)
        hs(jj) = 1.0_sp
        s(jj+1) = s(jj)
        hs(jj+1) = 0.25_sp*hs(jj)
        do jj=2, trapzmax
           call trapzd(kernel_density_1d_sp, x, hh, a, b, s(jj), jj)
           if (jj >= kromb) then
              ! extrapolate the last kromb estimates to stepsize -> 0 and
              ! stop refining once the change is within the relative tolerance
              call polint(hs(jj-kromb+1:jj), s(jj-kromb+1:jj), 0.0_sp, qromb, dqromb)
              if (le(abs(dqromb),trapzreps*abs(qromb))) exit
           end if
           s(jj+1) = s(jj)
           hs(jj+1) = 0.25_sp*hs(jj)
        end do
        ! accumulate sub-interval integrals into the cumulative density
        if (ii==1) then
           kernel_cumdensity_1d_sp(ii) = qromb
        else
           kernel_cumdensity_1d_sp(ii) = kernel_cumdensity_1d_sp(ii-1) + qromb
        endif
     end do
     deallocate(s, hs)
  else
     ! Newton-Cotes on trapzmax regularly spaced points per sub-interval;
     ! re-size kernel_pdf accordingly (mirrors the dp version and avoids a
     ! shape mismatch under strict pre-F2003 assignment rules)
     deallocate(kernel_pdf)
     allocate(kernel_pdf(trapzmax))
     ! loop through all regression points
     do ii = 1, nout
        if (ii .eq. 1_i4) then
           delta = (xxout(ii)-lower_x) / real(trapzmax-1,sp)
           kernel_pdf = kernel_density(x, hh, xout=linspace(lower_x, xxout(ii), trapzmax))
           kernel_cumdensity_1d_sp(1) = int_regular(kernel_pdf, delta)
        else
           delta = (xxout(ii)-xxout(ii-1)) / real(trapzmax-1,sp)
           kernel_pdf = kernel_density(x, hh, xout=linspace(xxout(ii-1), xxout(ii), trapzmax))
           kernel_cumdensity_1d_sp(ii) = kernel_cumdensity_1d_sp(ii-1) + int_regular(kernel_pdf, delta)
        end if
     end do
  endif
  ! scale to range [0,1]
  ! tmp = 1.0_sp / (kernel_cumdensity_1d_sp(nout) - kernel_cumdensity_1d_sp(1))
  ! kernel_cumdensity_1d_sp(:) = ( kernel_cumdensity_1d_sp(:) - kernel_cumdensity_1d_sp(1) ) * tmp
  kernel_cumdensity_1d_sp = min(kernel_cumdensity_1d_sp, 1.0_sp)
  ! resorting: map results (computed in sorted order) back to caller's ordering
  kernel_cumdensity_1d_sp(xindx) = kernel_cumdensity_1d_sp(:)
  ! check whether output has to be unpacked
  if (present(mask) .and. (.not. present(xout))) then
     deallocate(x)
     nin = size(ix,1)
     allocate(x(nin))
     x = unpack(kernel_cumdensity_1d_sp, mask, nodata)
     deallocate(kernel_cumdensity_1d_sp)
     allocate(kernel_cumdensity_1d_sp(nin))
     kernel_cumdensity_1d_sp = x
  end if
  ! clean up
  deallocate(xxout)
  deallocate(xindx)
  deallocate(kernel_pdf)
  deallocate(x)