-
Notifications
You must be signed in to change notification settings - Fork 0
/
Sampling_Techniques.cpp
310 lines (262 loc) · 14.2 KB
/
Sampling_Techniques.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
#include "Sampling_Techniques.h"
#include <math.h>
#include <ctime>
#include <sstream>
Sampling::Sampling() {
// Altough our constructor is very simple, I want to talk about one interesting
// problem which I struggled a lot with in the beginning: I wanted to be able to
// generate many different numbers based on some distribution by calling one
// of the main methods multiple times. However, the problem I stubbled upon
// was that if I called the same function (say "gaussian()") multiple times to try
// to generate multiple different gaussian numbers, then it would just generate
// the same number for all calls. I knew it had something to do with the
// initialization of the seed so I tried placing the seed initialization code, on
// line 26, in the main() function, in the gaussian() function itself and pretty much
// everywhere else except in the constructor. when I finally tried to place the line
// here, it suddenly worked. I found out that this is due to the fact that when I
// called the srand() function in the gaussian() function (the method I was calling
// multiple times), it led to initialization of the same seed over and over again,
// resulting in the same generation of numbers. By moving the seed initialization
// in the constructor we ensure that the seed is set only once when an instance
// of the class is created. In this way, it not only fixes the orginal problem
// but it also encapsulates the seed initialization logic of the class.
srand((unsigned) time(nullptr));
}
double Sampling::get_random_prob() {
// This method generates a pseudo-random number within the range [0, 1) which
// we will use to represent some random probability.
double generated_number = int_to_double(rand()) / RAND_MAX;
return generated_number;
}
double Sampling::calculate_factorial(double n) {
// This method simply calculates the factorial of the given number "n".
// The next if-statement just ensures that the user can't provide us with a
// negative number to take the factorial of (since produces an error). We
// return 0 if the user inputs an invalid value for the parameter "n".
if (n < 0) return int_to_double(0);
double factorial_of_n = 1; // initialization.
for (int i = 1; i <= n; ++i) factorial_of_n *= i;
return factorial_of_n;
}
double Sampling::binomial_coefficient(int n, int k) {
// This function calculates the binomial coefficient ("n choose k"), representing
// the number of ways to choose k elements from a set of n distinct elements.
// The binomial coefficient is defined by the following relation, computed
// recursively:
//
// C(n, k) = C(n - 1, k - 1) + C(n - 1, k).
// when k is 0 or equal to n, there is only one way to choose "n choose 0" or
// "n choose n", which is 1, hence, the need for the next line.
if (k == 0 || k == n) return 1;
else return binomial_coefficient(n - 1, k - 1) + binomial_coefficient(n - 1, k);
}
int Sampling::gamma_function(int n) {
// This method is used to calculate the gamma function for a given number ("n")
// which we assume to be a positive number. Note that it is possible to compute
// the gamma function for all complex numbers, I simply haven't made the
// implementation. If you're interested, look up "Lanczos Approximation".
//
// f(n) = (n - 1)!,
// where n is a positive integer.
// we return 0 if the user inputs an invalid value for the parameter "n".
if (n <= 0) return nan_value;
return calculate_factorial(n - 1);
}
// With the next couple of method implementations, we want to generate a random
// number that is based on the specific distribution that the user choose. The
// distributions the user has to choose from are the following:
//
// - Uniform
// - Gaussian
// - Exponential
// - Logarithmic
// - Poisson
// - Bernoulli
// - Binomial
// - Beta
// - Gamma
double Sampling::uniform(int min, int max) {
// This method is designed to generate a pseudo-random floating-point
// whole number. within a specified range, inclusive of the minimum
// value ("min") and exclusive of the maximum value ("max").
// One thing that I found is both very important and interesting to note here
// is that the modulus operator (%) cannot be directly applied to floating-point
// numbers in C++. It can only be applied with integer numbers. I found out about
// this when I initially tried to define the following line in this way:
// "double generated_number = int_to_double(rand()) % (max - min + 1) + min".
// Using this line above instead will produce a compiler error.
// The next if-statement ensures that min is less than or equal to max.
if (min > max) return nan_value;
// Generate a random floating-point whole number between 0 and 1.
int generated_number = rand() % (max - min + 1) + min;
return int_to_double(generated_number);
}
double Sampling::gaussian(double mean, double stddev) {
// We use the Box-Muller transformation to generate a number following a gaussian
// distribution and which got the desired mean ("mean") and standard deviation
// ("stddev").
//
// The transformation states that if we take two random variables with seperate
// values ("u_1" & "u_2"), then an independent random variable ("z") with a
// gaussian distribution can be written as z = √(-2 * ln(u_1)) * cos(2 * π * u_2).
double u_1 = int_to_double(rand()) / RAND_MAX;
double u_2 = int_to_double(rand()) / RAND_MAX;
// Use the Box-Muller transform to generate a number with a Gaussian distribution.
double z_0 = sqrt(-2.0 * log(u_1)) * cos(2.0 * M_PI * u_2);
// Scale and shift the result to match the desired mean and standard deviation.
double generated_number = mean + stddev * z_0;
return generated_number;
}
double Sampling::exponential(double lambda, int max) {
// This method generates a number between 0 and the provided upper boundary
// ("max") that follows an exponential distribution. The hyperparameter
// "lambda" is used for scaling (this hyperparameter is also called the
// "rate parameter" sometimes).
//
// f(λ, "max") = λe^(-λu),
// where λ is the rate parameter, and where u is some random number ∈ [0, "max").
// we return NaN if the user inputs an invalid value for the parameter "lambda".
if (lambda <= 0) return nan_value;
// Generate a number with a uniform distribution from 0 to the provided "max".
double u = uniform(0, max);
// Manipulate the random number, "u", so that it follows an exponential
// distribution.
double generated_number = lambda * exp(-lambda * u);
return generated_number;
}
double Sampling::logarithmic(int rate) {
// This method generates a number that follows a logarithmic distribution.
// The hyperparameter "rate" is used to shape the distribution and how
// quicky or slowly the probability it outputs decreases as the input variable
// increases (in our case, this would be the generated probability "p").
//
// f(k) = [-1 / ln(1 - p)] * [p^k / k],
// where k is our parameter "rate", and where p is some random
// number ∈ [0, "max").
// We do not accept "rate" to be under 1 since we want the expression
// "[p^rate / rate]" to remain well-defined and stable so that we won't
// have division by zero or by negative numbers. We return NaN if the user
// inputs an invalid value for the parameter "rate".
if (rate < 1) return nan_value;
// Generate a floating-point double number from 0 to 1.
double p = get_random_prob();
// Manipulate the random number, "p", so that it follows a logarithmic
// distribution.
double generated_number = (-1 / log(1 - p)) * (pow(p, rate) / rate);
return generated_number;
}
double Sampling::poisson(double lambda, int max) {
// This method generates a number between 0 and the provided upper boundary
// ("max") that follows a poisson distribution. The hyperparameter "lambda"
// is used to represent the averege rate of events. "rate of events" here
// describes the number of events that occur within a fixed interval when
// these events happen with a known constant mean rate.
//
// f(λ) = (λ^u * e-λ) / u!,
// where λ is the rate of events, and where u is some random number ∈ [0, "max").
// We do not accept lambda to be a non-positve number since it would be
// irrational to have a negative value for something that is representating
// the rate of something (in our case, "rate of events"). We return NaN if
// the user inputs an invalid value for the parameter "lambda".
if (lambda <= 0) return nan_value;
// Generate a number with a uniform distribution from 0 to the provided "max".
double u = uniform(0, max);
// Manipulate the random number, "u", so that it follows a poisson distribution.
double generated_number = (pow(lambda, u) * exp(-lambda)) /
calculate_factorial(u);
return generated_number;
}
double Sampling::bernoulli(double proba) {
// This method generates a binary or boolean number (either a 1 or a 0) that
// follows a Bernoulli distribution, given a probability ("proba").
//
// f(proba) = { p for u = 1; 1 - p for u = 0 },
// where p is our parameter "proba", and where u is some random
// number ∈ [0, 1).
// The next if-statement just ensures that the user can't provide us with a
// negative probability (since this would be irrational to have). We return
// NaN if the user inputs an invalid value for the parameter "proba".
if (proba < 0) return nan_value;
double u = get_random_prob(); // Generate a random number from 0 to 1.
// Manipulate the random number, "p", so that it follows a bernoulli
// distribution.
double generated_number;
if (proba < u) generated_number = 1.;
else generated_number = 0;
return generated_number;
}
double Sampling::binomial(int num_trails, double proba) {
// This method generates a number following a binomial distribution, based
// on a given probability of success ("proba") and on a set number of trials
// ("num_trials"). The binomial distribution models the number of successes
// in a fixed number of independent trials (often called "bernoulli trails)"),
// where each trial has the same probability of success.
//
// The algorithm first generates 'num_trials' independent uniform random numbers
// between 0 and 1. If a generated number is less than the specified probability,
// it is considered a success, and the count of successes ("generated_number") is
// incremented.
//
// f(n, p) = [n! / k!(n - k)!] * p^k * (1 - p)^(n - k),
// where n is the parameter "num_trails", p is the parameter "proba",
// and where k is the number of successes we are currently evaluating.
double generated_number = 0; // initilization. This is our "success count".
for (int k = 0; k <= num_trails; ++k) {
double probability = binomial_coefficient(num_trails, k) * pow(proba, k) *
pow(1 - proba, num_trails - k);
// Use a random number between 0 and 1 to decide whether k occurs or not.
double u = get_random_prob(); // Generate a random number from 0 to 1.
if (u < probability) {
generated_number = int_to_double(k);
break;
}
}
return generated_number;
}
double Sampling::beta(double alpha, double beta) {
// This method generates a number following a beta distribution given two
// hyperparameters, "alpha" (α) and "beta" (β). The two hyperparameters
// controls the overall shape of the distribution; α controls the shape of
// the distribution on the left side (near 0). Larger values of α result in
// a distribution that is more focused toward 1. Similarly, β controls the
// shape of the distribution on the right side (near 1). Larger values of β
// result in a distribution that is more focused toward 0.
//
// f(α, β) = u^(α - 1) * (1 - u)^(β - 1),
// where α and β are the parameters "alpha" and "beta", respectively, and
// where u is some random number ∈ [0, 1).
// We do not accept "alpha" and "beta" to be non-positve since we want to
// maintain the distributions coherence and to always keep the it well-defined.
// we return 0 if the user inputs an invalid value for the parameters
// "alpha" and "beta".
if (alpha <= 0 || beta <= 0) return nan_value;
double u = get_random_prob(); // Generate a random number from 0 to 1.
// Manipulate the random number, "p", so that it follows a beta distribution.
double generated_number = pow(u, (alpha - 1)) * pow((1 - u), (beta - 1));
return generated_number;
}
double Sampling::gamma(double alpha, double beta, int max) {
// This method generates a number following a gamma distribution given two
// hyperparameters, "alpha" (α) and "beta" (β). The first hyperparameter
// ("alpha") controls the shape of the distribution and where the second
// hyperparameter ("beta") controls the rate at which the gamma distribution
// decreases (this is similar to the "rate parameter" ("rate") we encountered
// in the exponential distribution).
//
// f(α, β) = [u^(α - 1) * e^(-βu) * β^α] / Γ(α),
// where α and β are the parameters "alpha" and "beta", respectively, where u
// is some random number ∈ [0, "max"), and where Γ is the gamma function.
// The reason behind restricting the hyperparameters ("alpha" and "beta") is
// very similar to the reason provided in the beta distribution: we do not accept
// "alpha" and "beta" to be non-positve since we want to maintain the distributions
// coherence and to always keep the it well-defined. We return 0 if the user inputs
// an invalid value for the parameters "alpha" and "beta".
if (alpha <= 0 || beta <= 0) return nan_value;
// Generate a number with a uniform distribution from 0 to the provided "max".
double u = uniform(0, max);
// Manipulate the random number, "p", so that it follows a gamma distribution.
double gamma_of_alpha = int_to_double(gamma_function(int_to_double(alpha)));
double generated_number = (pow(u, (alpha - 1)) * exp(-beta * u) * pow(beta, alpha)) /
gamma_of_alpha;
return generated_number;
}