Skip to content

Commit 1dcc852

Browse files
committed
Making more changes to incorporate an outlier removal step along with different regression implementations
1 parent bf90f52 commit 1dcc852

39 files changed

+699
-840
lines changed

src/Enums/MatrixDecomposition.cs

+2-1
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,6 @@ public enum MatrixDecomposition
2020
GramSchmidt,
2121
Lu,
2222
Qr,
23-
Svd
23+
Svd,
24+
Normal
2425
}

src/Enums/RegularizationType.cs

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
namespace AiDotNet.Enums;
2+
3+
/// <summary>
/// Names the regularization variants that can be selected.
/// </summary>
/// <remarks>
/// The numeric values are written out explicitly; they are the same values the compiler
/// assigns by default to members declared in this order.
/// NOTE(review): the meaning of each member is inferred from its name only; confirm against
/// the code that consumes this enum.
/// </remarks>
public enum RegularizationType
{
    /// <summary>No regularization selected.</summary>
    None = 0,

    /// <summary>The "L1" variant.</summary>
    L1 = 1,

    /// <summary>The "L2" variant.</summary>
    L2 = 2,

    /// <summary>The "ElasticNet" variant.</summary>
    ElasticNet = 3
}

src/Extensions/MatrixExtensions.cs

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
namespace AiDotNet.Extensions;
2+
3+
/// <summary>
/// Extension methods for <see cref="Matrix{T}"/>.
/// </summary>
public static class MatrixExtensions
{
    /// <summary>
    /// Builds a new matrix that is one column wider than <paramref name="matrix"/>.
    /// Column 0 of the result holds <paramref name="value"/> in every row; the original
    /// columns follow, each shifted one position to the right.
    /// </summary>
    /// <param name="matrix">The source matrix; it is only read.</param>
    /// <param name="value">The value written into the new leading column.</param>
    /// <returns>A new matrix with the same row count and one extra column.</returns>
    public static Matrix<T> AddConstantColumn<T>(this Matrix<T> matrix, T value)
    {
        var widened = new Matrix<T>(matrix.Rows, matrix.Columns + 1);

        for (int row = 0; row < matrix.Rows; row++)
        {
            widened[row, 0] = value;

            // Destination column `target` receives source column `target - 1`.
            for (int target = 1; target <= matrix.Columns; target++)
            {
                widened[row, target] = matrix[row, target - 1];
            }
        }

        return widened;
    }
}

src/Extensions/VectorExtensions.cs

+10
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,14 @@
22

33
/// <summary>
/// Extension methods for <see cref="Vector{T}"/>.
/// </summary>
public static class VectorExtensions
{
    /// <summary>
    /// Copies <paramref name="length"/> consecutive elements of <paramref name="vector"/>,
    /// beginning at index <paramref name="start"/>, into a new vector.
    /// </summary>
    /// <param name="vector">The source vector; it is only read.</param>
    /// <param name="start">Zero-based index of the first element to copy.</param>
    /// <param name="length">Number of elements to copy.</param>
    /// <returns>A new vector of <paramref name="length"/> elements.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="vector"/> is null.</exception>
    /// <exception cref="System.ArgumentOutOfRangeException">
    /// <paramref name="start"/> or <paramref name="length"/> is negative, or the requested
    /// range extends past the end of <paramref name="vector"/>.
    /// </exception>
    public static Vector<T> Slice<T>(this Vector<T> vector, int start, int length)
    {
        // Validate up front so a bad range fails before anything is allocated or partially copied.
        if (vector is null)
        {
            throw new System.ArgumentNullException(nameof(vector));
        }

        if (start < 0 || start > vector.Length)
        {
            throw new System.ArgumentOutOfRangeException(nameof(start), start, "Start index must lie within the vector.");
        }

        // Written as a subtraction so that start + length cannot overflow.
        if (length < 0 || length > vector.Length - start)
        {
            throw new System.ArgumentOutOfRangeException(nameof(length), length, "The requested range extends past the end of the vector.");
        }

        var slicedVector = new Vector<T>(length);
        for (int offset = 0; offset < length; offset++)
        {
            slicedVector[offset] = vector[start + offset];
        }

        return slicedVector;
    }
}

src/Factories/NormalizerFactory.cs

+1-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
using AiDotNet.Normalizers;
2-
3-
namespace AiDotNet.Factories;
1+
namespace AiDotNet.Factories;
42

53
public class NormalizerFactory<T>
64
{

src/Factories/RegressionFactory.cs

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
global using AiDotNet.Regression;
2+
3+
namespace AiDotNet.Factories;
4+
5+
/// <summary>
/// Factory methods that construct <see cref="MultivariateRegression{T}"/> instances.
/// </summary>
/// <remarks>
/// NOTE(review): all three creators build the same type from the arguments they are given and
/// set nothing else. Whatever distinguishes ridge, lasso and elastic-net behavior presumably
/// has to be carried by the supplied options object; confirm that this is intended.
/// </remarks>
public static class RegressionFactory
{
    /// <summary>Creates a <see cref="MultivariateRegression{T}"/> from the given operations and options.</summary>
    /// <param name="numOps">Numeric operations passed straight to the regression constructor.</param>
    /// <param name="options">Options passed straight to the regression constructor.</param>
    public static MultivariateRegression<T> CreateRidgeRegression<T>(INumericOperations<T> numOps, RegressionOptions options)
        => new MultivariateRegression<T>(numOps, options);

    /// <summary>Creates a <see cref="MultivariateRegression{T}"/> from the given operations and options.</summary>
    /// <param name="numOps">Numeric operations passed straight to the regression constructor.</param>
    /// <param name="options">Options passed straight to the regression constructor.</param>
    public static MultivariateRegression<T> CreateLassoRegression<T>(INumericOperations<T> numOps, RegressionOptions options)
        => new MultivariateRegression<T>(numOps, options);

    /// <summary>Creates a <see cref="MultivariateRegression{T}"/> from the given operations and elastic-net options.</summary>
    /// <param name="numOps">Numeric operations passed straight to the regression constructor.</param>
    /// <param name="options">Elastic-net options passed straight to the regression constructor.</param>
    public static MultivariateRegression<T> CreateElasticNetRegression<T>(INumericOperations<T> numOps, ElasticNetRegressionOptions<T> options)
        => new MultivariateRegression<T>(numOps, options);
}

src/Helpers/StatisticsHelper.cs

+121-6
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
namespace AiDotNet.Helpers;
1+
using System.Linq;
2+
3+
namespace AiDotNet.Helpers;
24

35
public static class StatisticsHelper<T>
46
{
@@ -903,21 +905,21 @@ public static T CalculateTValue(int degreesOfFreedom, T confidenceLevel)
903905
return CalculateInverseStudentTCDF(degreesOfFreedom, NumOps.Subtract(NumOps.One, alpha));
904906
}
905907

906-
public static (T FirstQuartile, T ThirdQuartile) CalculateQuartiles(Vector<T> data)
908+
public static (T FirstQuantile, T ThirdQuantile) CalculateQuantiles(Vector<T> data)
907909
{
908910
var sortedData = data.OrderBy(x => x).ToArray();
909911
int n = sortedData.Length;
910912

911-
T Q1 = CalculateQuartile(sortedData, NumOps.FromDouble(0.25));
912-
T Q3 = CalculateQuartile(sortedData, NumOps.FromDouble(0.75));
913+
T Q1 = CalculateQuantile(sortedData, NumOps.FromDouble(0.25));
914+
T Q3 = CalculateQuantile(sortedData, NumOps.FromDouble(0.75));
913915

914916
return (Q1, Q3);
915917
}
916918

917-
public static T CalculateQuartile(T[] sortedData, T quartile)
919+
public static T CalculateQuantile(T[] sortedData, T quantile)
918920
{
919921
int n = sortedData.Length;
920-
T position = NumOps.Multiply(NumOps.FromDouble(n - 1), quartile);
922+
T position = NumOps.Multiply(NumOps.FromDouble(n - 1), quantile);
921923
int index = NumOps.ToInt32(NumOps.Round(position));
922924
T fraction = NumOps.Subtract(position, NumOps.FromDouble(index));
923925

@@ -956,4 +958,117 @@ public static (T skewness, T kurtosis) CalculateSkewnessAndKurtosis(Vector<T> sa
956958

957959
return (skewness, kurtosis);
958960
}
961+
962+
/// <summary>
/// Returns an interval centred on the mean of <paramref name="predicted"/> whose half-width is
/// t-value × standard deviation of <paramref name="predicted"/> × sqrt(1 + 1/n).
/// </summary>
/// <param name="actual">Only its length is used: it supplies n and the n − 1 degrees of freedom.</param>
/// <param name="predicted">Values whose mean and standard deviation define the interval.</param>
/// <param name="confidenceLevel">Passed unchanged to <c>CalculateTValue</c>.</param>
/// <returns>The lower and upper bounds (mean − margin, mean + margin).</returns>
/// <remarks>
/// NOTE(review): nothing checks that the two vectors have the same length; confirm that
/// callers guarantee this.
/// </remarks>
public static (T Lower, T Upper) CalculateToleranceInterval(Vector<T> actual, Vector<T> predicted, T confidenceLevel)
{
    int sampleCount = actual.Length;

    T centre = predicted.Average();
    T spread = CalculateStandardDeviation(predicted);
    T inflation = NumOps.FromDouble(Math.Sqrt(1 + (1.0 / sampleCount)));
    T critical = CalculateTValue(sampleCount - 1, confidenceLevel);

    T halfWidth = NumOps.Multiply(critical, NumOps.Multiply(spread, inflation));
    T lowerBound = NumOps.Subtract(centre, halfWidth);
    T upperBound = NumOps.Add(centre, halfWidth);

    return (lowerBound, upperBound);
}
973+
974+
/// <summary>
/// Returns an interval centred on the mean of <paramref name="predicted"/> whose half-width is
/// t-value × sqrt(mean squared error) × sqrt(1 + 1/n).
/// </summary>
/// <param name="actual">Observed values; supplies n and is compared with <paramref name="predicted"/> for the MSE.</param>
/// <param name="predicted">Predicted values; their mean is the interval centre.</param>
/// <param name="confidenceLevel">Passed unchanged to <c>CalculateTValue</c>.</param>
/// <returns>The lower and upper bounds (mean − margin, mean + margin).</returns>
public static (T Lower, T Upper) CalculateForecastInterval(Vector<T> actual, Vector<T> predicted, T confidenceLevel)
{
    int sampleCount = actual.Length;

    T centre = predicted.Average();
    T meanSquaredError = CalculateMeanSquaredError(actual, predicted);
    T inflation = NumOps.FromDouble(Math.Sqrt(1 + (1.0 / sampleCount)));
    T critical = CalculateTValue(sampleCount - 1, confidenceLevel);

    T rootError = NumOps.Sqrt(meanSquaredError);
    T halfWidth = NumOps.Multiply(critical, NumOps.Multiply(rootError, inflation));
    T lowerBound = NumOps.Subtract(centre, halfWidth);
    T upperBound = NumOps.Add(centre, halfWidth);

    return (lowerBound, upperBound);
}
985+
986+
/// <summary>
/// For each requested quantile q, evaluates the quantiles of the sorted predictions at
/// q − 0.025 and q + 0.025 and reports them as a (Quantile, Lower, Upper) entry.
/// </summary>
/// <param name="actual">Not used by this method.</param>
/// <param name="predicted">Values that are sorted ascending and then queried.</param>
/// <param name="quantiles">The quantile levels to report, in output order.</param>
/// <returns>One entry per element of <paramref name="quantiles"/>, in the same order.</returns>
/// <remarks>
/// NOTE(review): levels within 0.025 of either end produce a query outside [0, 1]; how
/// <c>CalculateQuantile</c> handles that is not visible here; confirm or clamp at the call site.
/// </remarks>
public static List<(T Quantile, T Lower, T Upper)> CalculateQuantileIntervals(Vector<T> actual, Vector<T> predicted, T[] quantiles)
{
    // Half-width of the band placed around each requested quantile level.
    T halfBand = NumOps.FromDouble(0.025);

    // CalculateQuantile takes a sorted T[] (as CalculateQuantiles already supplies), so sort
    // straight into an array instead of wrapping the sorted values in another Vector<T>.
    T[] sortedPredictions = predicted.OrderBy(x => x).ToArray();

    var result = new List<(T Quantile, T Lower, T Upper)>(quantiles.Length);
    foreach (var q in quantiles)
    {
        T lowerQuantile = CalculateQuantile(sortedPredictions, NumOps.Subtract(q, halfBand));
        T upperQuantile = CalculateQuantile(sortedPredictions, NumOps.Add(q, halfBand));
        result.Add((q, lowerQuantile, upperQuantile));
    }

    return result;
}
1000+
1001+
/// <summary>
/// Estimates an interval for the mean of <paramref name="predicted"/> by bootstrap resampling:
/// draws 1000 resamples with replacement, averages each, sorts the averages and cuts an equal
/// tail from both ends.
/// </summary>
/// <param name="actual">Only its length is used: it sets the size of each resample.</param>
/// <param name="predicted">Values that are resampled.</param>
/// <param name="confidenceLevel">Desired coverage, e.g. 0.95; each tail holds (1 − level) / 2 of the resamples.</param>
/// <returns>The resample means found at the lower and upper cut points.</returns>
/// <remarks>
/// The random generator is unseeded, so results differ between calls.
/// NOTE(review): indices are drawn from [0, actual.Length); confirm that callers always pass
/// vectors of equal length.
/// </remarks>
public static (T Lower, T Upper) CalculateBootstrapInterval(Vector<T> actual, Vector<T> predicted, T confidenceLevel)
{
    const int bootstrapSamples = 1000;

    int n = actual.Length;
    var bootstrapMeans = new List<T>(bootstrapSamples);

    Random random = new();
    for (int i = 0; i < bootstrapSamples; i++)
    {
        var sample = new Vector<T>(n);
        for (int j = 0; j < n; j++)
        {
            int index = random.Next(n);
            sample[j] = predicted[index];
        }

        bootstrapMeans.Add(sample.Average());
    }

    bootstrapMeans.Sort();

    // The tails must be sized from alpha = 1 - confidenceLevel. Using confidenceLevel itself
    // (as before) made a 0.95 request return roughly the central 5% of the resample means.
    T alpha = NumOps.Subtract(NumOps.One, confidenceLevel);
    int tailCount = NumOps.ToInt32(NumOps.Divide(NumOps.Multiply(alpha, NumOps.FromDouble(bootstrapSamples)), NumOps.FromDouble(2)));

    // Clamp so an out-of-range confidence level cannot index outside the list or invert the bounds.
    int lowerIndex = Math.Min(Math.Max(tailCount, 0), (bootstrapSamples - 1) / 2);
    int upperIndex = bootstrapSamples - lowerIndex - 1;

    return (bootstrapMeans[lowerIndex], bootstrapMeans[upperIndex]);
}
1025+
1026+
/// <summary>
/// Returns an interval centred on the mean of <paramref name="predicted"/> whose half-width is
/// sqrt(2 × confidenceLevel) × sqrt(mean squared error).
/// </summary>
/// <param name="actual">Observed values, compared with <paramref name="predicted"/> for the MSE.</param>
/// <param name="predicted">Predicted values; their mean is the interval centre.</param>
/// <param name="confidenceLevel">Used directly inside the multiplier, see remarks.</param>
/// <returns>The lower and upper bounds (mean − margin, mean + margin).</returns>
/// <remarks>
/// NOTE(review): the multiplier is computed from the confidence level itself rather than from
/// a distribution critical value, so it varies only between 0 and sqrt(2) for levels in [0, 1].
/// This looks like a placeholder; confirm the intended formula.
/// </remarks>
public static (T Lower, T Upper) CalculateSimultaneousPredictionInterval(Vector<T> actual, Vector<T> predicted, T confidenceLevel)
{
    // The sample count was previously read into a local but never used; it has been dropped.
    T mean = predicted.Average();
    T mse = CalculateMeanSquaredError(actual, predicted);
    T factor = NumOps.Sqrt(NumOps.Multiply(NumOps.FromDouble(2), confidenceLevel));
    T margin = NumOps.Multiply(factor, NumOps.Sqrt(mse));

    return (NumOps.Subtract(mean, margin), NumOps.Add(mean, margin));
}
1036+
1037+
/// <summary>
/// Builds an interval from leave-one-out means of <paramref name="predicted"/>: for every index
/// the mean of all other elements is computed, and the interval is the average of those means
/// ± t-value × their standard deviation.
/// </summary>
/// <param name="actual">Only its length is used: it supplies n and the n − 1 degrees of freedom.</param>
/// <param name="predicted">Values from which each leave-one-out sample is taken.</param>
/// <returns>The lower and upper bounds (estimate − margin, estimate + margin).</returns>
/// <remarks>
/// The confidence level is fixed at 0.95.
/// NOTE(review): the spread is the plain standard deviation of the leave-one-out means, with no
/// jackknife-specific scaling applied here; confirm that this is the intended standard error.
/// </remarks>
public static (T Lower, T Upper) CalculateJackknifeInterval(Vector<T> actual, Vector<T> predicted)
{
    int n = actual.Length;
    var leaveOneOutMeans = new List<T>();

    for (int omitted = 0; omitted < n; omitted++)
    {
        var remaining = new Vector<T>(n - 1);
        int write = 0;

        // Copy everything before the omitted position...
        for (int j = 0; j < omitted; j++)
        {
            remaining[write++] = predicted[j];
        }

        // ...and everything after it.
        for (int j = omitted + 1; j < n; j++)
        {
            remaining[write++] = predicted[j];
        }

        leaveOneOutMeans.Add(remaining.Average());
    }

    var meansVector = new Vector<T>([.. leaveOneOutMeans]);
    T estimate = meansVector.Average();
    T standardError = CalculateStandardDeviation(meansVector);
    T critical = CalculateTValue(n - 1, NumOps.FromDouble(0.95));
    T halfWidth = NumOps.Multiply(critical, standardError);

    T lowerBound = NumOps.Subtract(estimate, halfWidth);
    T upperBound = NumOps.Add(estimate, halfWidth);
    return (lowerBound, upperBound);
}
1063+
1064+
/// <summary>
/// Sorts <paramref name="predicted"/> ascending and returns the elements found after cutting
/// an equal number of values, (1 − confidenceLevel) × n / 2 converted to an integer, from each end.
/// </summary>
/// <param name="predicted">Values to sort; the input vector itself is not modified.</param>
/// <param name="confidenceLevel">Desired coverage, e.g. 0.95.</param>
/// <returns>The sorted values at the lower and upper cut positions.</returns>
public static (T Lower, T Upper) CalculatePercentileInterval(Vector<T> predicted, T confidenceLevel)
{
    var ordered = new Vector<T>([.. predicted.OrderBy(x => x)]);
    int count = ordered.Length;

    T alpha = NumOps.Subtract(NumOps.One, confidenceLevel);
    T tailSize = NumOps.Divide(NumOps.Multiply(alpha, NumOps.FromDouble(count)), NumOps.FromDouble(2.0));

    int lowerIndex = NumOps.ToInt32(tailSize);
    int upperIndex = count - lowerIndex - 1;

    T lowerBound = ordered[lowerIndex];
    T upperBound = ordered[upperIndex];
    return (lowerBound, upperBound);
}
9591074
}

src/Interfaces/IOutlierRemoval.cs

+2-8
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,6 @@
11
namespace AiDotNet.Interfaces;
22

3-
public abstract class IOutlierRemoval
3+
public interface IOutlierRemoval<T>
44
{
5-
internal abstract (double[], double[]) RemoveOutliers(double[] rawInputs, double[] rawOutputs);
6-
7-
internal abstract (double[][], double[]) RemoveOutliers(double[][] rawInputs, double[] rawOutputs);
8-
9-
internal abstract (double[][], double[][]) RemoveOutliers(double[][] rawInputs, double[][] rawOutputs);
10-
11-
internal IQuartile? Quartile { get; set; }
5+
(Matrix<T> CleanedInputs, Vector<T> CleanedOutputs) RemoveOutliers(Matrix<T> inputs, Vector<T> outputs);
126
}

src/Interfaces/IPredictionModelBuilder.cs

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ public interface IPredictionModelBuilder<T>
1010
IPredictionModelBuilder<T> WithRegression(IRegression<T> regression);
1111
IPredictionModelBuilder<T> WithOptimizer(IOptimizationAlgorithm<T> optimizationAlgorithm, OptimizationAlgorithmOptions optimizationOptions);
1212
IPredictionModelBuilder<T> WithDataPreprocessor(IDataPreprocessor<T> dataPreprocessor);
13+
IPredictionModelBuilder<T> WithOutlierRemoval(IOutlierRemoval<T> outlierRemoval);
1314
PredictionModelResult<T> Build(Matrix<T> x, Vector<T> y);
1415
Vector<T> Predict(Matrix<T> newData, PredictionModelResult<T> model);
1516
void SaveModel(PredictionModelResult<T> model, string filePath);

src/LinearAlgebra/DataPreprocessor.cs

+6-3
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,15 @@ public class DataPreprocessor<T> : IDataPreprocessor<T>
44
{
55
private readonly INormalizer<T> _normalizer;
66
private readonly IFeatureSelector<T> _featureSelector;
7+
private readonly IOutlierRemoval<T> _outlierRemoval;
78
private readonly bool _normalizeBeforeFeatureSelection;
89
private readonly PredictionModelOptions _options;
910

10-
public DataPreprocessor(INormalizer<T> normalizer, IFeatureSelector<T> featureSelector, PredictionModelOptions options)
11+
/// <summary>
/// Creates a preprocessor from its collaborating components.
/// </summary>
/// <param name="normalizer">Normalizer stored for later use.</param>
/// <param name="featureSelector">Feature selector stored for later use.</param>
/// <param name="outlierRemoval">Outlier-removal step; it is invoked on every preprocessing run, so it must not be null.</param>
/// <param name="options">Model options; <c>NormalizeBeforeFeatureSelection</c> is read from it here.</param>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="outlierRemoval"/> or <paramref name="options"/> is null.
/// </exception>
public DataPreprocessor(INormalizer<T> normalizer, IFeatureSelector<T> featureSelector, IOutlierRemoval<T> outlierRemoval, PredictionModelOptions options)
{
    // Fail fast with a named argument instead of a NullReferenceException: options is
    // dereferenced below, and the outlier-removal step is called unconditionally later.
    if (options is null)
    {
        throw new System.ArgumentNullException(nameof(options));
    }

    if (outlierRemoval is null)
    {
        throw new System.ArgumentNullException(nameof(outlierRemoval));
    }

    _normalizer = normalizer;
    _featureSelector = featureSelector;
    _outlierRemoval = outlierRemoval;
    _options = options;
    _normalizeBeforeFeatureSelection = options.NormalizeBeforeFeatureSelection;
}
@@ -19,6 +21,8 @@ public DataPreprocessor(INormalizer<T> normalizer, IFeatureSelector<T> featureSe
1921
{
2022
NormalizationInfo<T> normInfo = new();
2123

24+
(X, y) = _outlierRemoval.RemoveOutliers(X, y);
25+
2226
if (_normalizeBeforeFeatureSelection)
2327
{
2428
(X, normInfo.XParams) = _normalizer.NormalizeMatrix(X);
@@ -35,8 +39,7 @@ public DataPreprocessor(INormalizer<T> normalizer, IFeatureSelector<T> featureSe
3539
return (X, y, normInfo);
3640
}
3741

38-
public (Matrix<T> XTrain, Vector<T> yTrain, Matrix<T> XValidation, Vector<T> yValidation, Matrix<T> XTest, Vector<T> yTest)
39-
SplitData(Matrix<T> X, Vector<T> y)
42+
public (Matrix<T> XTrain, Vector<T> yTrain, Matrix<T> XValidation, Vector<T> yValidation, Matrix<T> XTest, Vector<T> yTest) SplitData(Matrix<T> X, Vector<T> y)
4043
{
4144
int totalSamples = X.Rows;
4245
int trainSize = (int)(totalSamples * _options.TrainingSplitPercentage);

src/LinearAlgebra/Matrix.cs

+11
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,17 @@ public static Matrix<T> CreateZeros(int rows, int cols, INumericOperations<T>? n
131131
return matrix.Zeros(rows, cols);
132132
}
133133

134+
/// <summary>
/// Creates a square matrix whose side equals the length of <paramref name="diagonal"/> and
/// writes the vector's elements onto the main diagonal.
/// </summary>
/// <param name="diagonal">Values for positions (0,0), (1,1), and so on.</param>
/// <param name="numericOperations">Optional numeric operations forwarded to the matrix constructor.</param>
/// <returns>The new matrix; off-diagonal entries keep whatever the constructor initialised them to.</returns>
public static Matrix<T> CreateDiagonal(Vector<T> diagonal, INumericOperations<T>? numericOperations = null)
{
    var result = new Matrix<T>(diagonal.Length, diagonal.Length, numericOperations);

    int position = 0;
    while (position < diagonal.Length)
    {
        result[position, position] = diagonal[position];
        position++;
    }

    return result;
}
144+
134145
public new static Matrix<T> Empty()
135146
{
136147
return new Matrix<T>(0, 0);

src/LinearAlgebra/MatrixBase.cs

+4-4
Original file line numberDiff line numberDiff line change
@@ -109,16 +109,16 @@ public static MatrixBase<T> Empty()
109109
return new Matrix<T>(0, 0);
110110
}
111111

112-
public virtual VectorBase<T> GetRow(int row)
112+
public virtual Vector<T> GetRow(int row)
113113
{
114114
ValidateIndices(row, 0);
115-
return new Vector<T>(Enumerable.Range(0, cols).Select(col => this[row, col]).ToArray(), ops);
115+
return new Vector<T>([.. Enumerable.Range(0, cols).Select(col => this[row, col])], ops);
116116
}
117117

118-
public virtual VectorBase<T> GetColumn(int col)
118+
public virtual Vector<T> GetColumn(int col)
119119
{
120120
ValidateIndices(0, col);
121-
return new Vector<T>(Enumerable.Range(0, rows).Select(row => this[row, col]).ToArray(), ops);
121+
return new Vector<T>([.. Enumerable.Range(0, rows).Select(row => this[row, col])], ops);
122122
}
123123

124124
public Matrix<T> SubMatrix(int startRow, int startCol, int numRows, int numCols)

src/LinearAlgebra/PredictionModelBuilder.cs

+10-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
global using AiDotNet.Regularization;
44
global using AiDotNet.Optimizers;
55
global using AiDotNet.Normalizers;
6+
global using AiDotNet.OutlierRemoval;
67

78
namespace AiDotNet.LinearAlgebra;
89

@@ -18,6 +19,7 @@ public class PredictionModelBuilder<T> : IPredictionModelBuilder<T>
1819
private IRegression<T>? _regression;
1920
private IOptimizationAlgorithm<T>? _optimizer;
2021
private IDataPreprocessor<T>? _dataPreprocessor;
22+
private IOutlierRemoval<T>? _outlierRemoval;
2123

2224
public PredictionModelBuilder(PredictionModelOptions? options = null)
2325
{
@@ -73,6 +75,12 @@ public IPredictionModelBuilder<T> WithDataPreprocessor(IDataPreprocessor<T> data
7375
return this;
7476
}
7577

78+
/// <summary>
/// Stores the outlier-removal component to be used by this builder.
/// </summary>
/// <param name="outlierRemoval">The component to remember.</param>
/// <returns>This builder, so that calls can be chained.</returns>
public IPredictionModelBuilder<T> WithOutlierRemoval(IOutlierRemoval<T> outlierRemoval)
{
    this._outlierRemoval = outlierRemoval;

    IPredictionModelBuilder<T> self = this;
    return self;
}
83+
7684
public PredictionModelResult<T> Build(Matrix<T> x, Vector<T> y)
7785
{
7886
// Validate inputs
@@ -93,7 +101,8 @@ public PredictionModelResult<T> Build(Matrix<T> x, Vector<T> y)
93101
var fitDetector = _fitDetector ?? new DefaultFitDetector<T>();
94102
var fitnessCalculator = _fitnessCalculator ?? new RSquaredFitnessCalculator<T>();
95103
var regularization = _regularization ?? new NoRegularization<T>();
96-
var dataPreprocessor = _dataPreprocessor ?? new DataPreprocessor<T>(normalizer, featureSelector, _options);
104+
var outlierRemoval = _outlierRemoval ?? new NoOutlierRemoval<T>();
105+
var dataPreprocessor = _dataPreprocessor ?? new DataPreprocessor<T>(normalizer, featureSelector, outlierRemoval, _options);
97106

98107
// Preprocess the data
99108
var (preprocessedX, preprocessedY, normInfo) = dataPreprocessor.PreprocessData(x, y);

src/Models/BasicStats.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ private void CalculateStats(Vector<T> values)
5858
Min = values.Min();
5959
Max = values.Max();
6060
Median = StatisticsHelper<T>.CalculateMedian(values);
61-
(FirstQuartile, ThirdQuartile) = StatisticsHelper<T>.CalculateQuartiles(values);
61+
(FirstQuartile, ThirdQuartile) = StatisticsHelper<T>.CalculateQuantiles(values);
6262
InterquartileRange = NumOps.Subtract(ThirdQuartile, FirstQuartile);
6363
MAD = StatisticsHelper<T>.CalculateMeanAbsoluteDeviation(values, Median);
6464
}

0 commit comments

Comments
 (0)