-
Notifications
You must be signed in to change notification settings - Fork 23
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds Quantile Discretizer #90
base: master
Are you sure you want to change the base?
Changes from all commits
43d267a
c9a7724
d6bfb85
23a95fd
c75024c
b308ffb
ecbab4c
ea3304d
4849dba
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
package io.picnicml.doddlemodel.preprocessing | ||
|
||
import breeze.linalg.DenseVector | ||
import breeze.stats.DescriptiveStats | ||
import cats.syntax.option._ | ||
import io.picnicml.doddlemodel.CrossScalaCompat._ | ||
import io.picnicml.doddlemodel.data.Feature.FeatureIndex | ||
import io.picnicml.doddlemodel.data.Features | ||
import io.picnicml.doddlemodel.syntax.OptionSyntax._ | ||
import io.picnicml.doddlemodel.typeclasses.Transformer | ||
import scala.Double.{MaxValue, MinValue} | ||
|
||
case class QuantileDiscretizer( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like this formatting. Is this scalafmt? We need to add a formatter to the project 😅. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I ran Scalafmt (but didn't PR my setup of it since I didn't know if you wanted it). It's a one line addition to the plugins file and a configurable settings file. I can make a PR with the setup and you can tune the config to your personal preferences! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added this issue: #95. |
||
private val bucketCounts: DenseVector[Double], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I understood correctly, you implemented this with I tried using Could you please check again, deleting |
||
private val featureIndex: FeatureIndex, | ||
private val quantiles: Option[Seq[Seq[(Double, Double)]]] = None | ||
) { | ||
private val numNumeric = featureIndex.numerical.columnIndices.length | ||
require(numNumeric == 0 || numNumeric == bucketCounts.length, "A quantile should be given for every numerical column") | ||
} | ||
|
||
/** An immutable preprocessor that discretizes numerical features into buckets based on quantiles. | ||
* Numerical feature values are converted to categorical features based on their distribution | ||
* */ | ||
object QuantileDiscretizer { | ||
|
||
/** Create a quantile discretizer which splits data into discrete evenly sized buckets. | ||
* | ||
* @param bucketCount The number of quantiles desired | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm nitpicking here but I wouldn't describe |
||
* @param featureIndex feature index associated with features - this is needed so that only numerical features are | ||
* transformed by this preprocessor; could be a subset of columns to be transformed | ||
* | ||
* @example Discretize a matrix into quartiles with two features: one numerical and one categorical. | ||
* {{{ | ||
* import io.picnicml.doddlemodel.preprocessing.QuantileDiscretizer.ev | ||
* import io.picnicml.doddlemodel.syntax.TransformerSyntax._ | ||
* | ||
* val featureIndex = FeatureIndex(List(NumericalFeature, CategoricalFeature)) | ||
* val x = DenseMatrix( | ||
* List(-1.0, 0.0), | ||
* List(0.0, 1.0), | ||
* List(2.0, 0.0), | ||
* List(5.0, 0.0) | ||
* ) | ||
* // equivalently, DenseVector(4) could be used | ||
* val bucketCounts = 4 | ||
* val discretizer = QuantileDiscretizer(bucketCounts, featureIndex).fit(x) | ||
* val xTransformed = discretizer.transform(x) | ||
* }}} | ||
*/ | ||
def apply(bucketCount: Int, featureIndex: FeatureIndex): QuantileDiscretizer = { | ||
val numNumeric = featureIndex.numerical.columnIndices.length | ||
val bucketCountExtended: DenseVector[Double] = DenseVector.fill(numNumeric) { bucketCount.toDouble } | ||
QuantileDiscretizer(bucketCountExtended, featureIndex) | ||
} | ||
|
||
implicit lazy val ev: Transformer[QuantileDiscretizer] = new Transformer[QuantileDiscretizer] { | ||
|
||
Helw150 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
@inline override def isFitted(model: QuantileDiscretizer): Boolean = model.quantiles.isDefined | ||
|
||
override def fit(model: QuantileDiscretizer, x: Features): QuantileDiscretizer = { | ||
val discreteRangeArrays = model.featureIndex.numerical.columnIndices.zipWithIndex.map { | ||
case (colIndex, bucketCountsIndex) => | ||
val colArray = x(::, colIndex).toScalaVector.sorted | ||
computeQuantiles(colArray, model.bucketCounts(bucketCountsIndex).toInt) | ||
} | ||
model.copy(quantiles = discreteRangeArrays.some) | ||
} | ||
|
||
override protected def transformSafe(model: QuantileDiscretizer, x: Features): Features = { | ||
val xCopy = x.copy | ||
model.featureIndex.numerical.columnIndices.zipWithIndex.foreach { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like this, it's elegant. I personally would format it as (just a subjective preference): model.featureIndex.numerical.columnIndices.zipWithIndex.foreach { case (colIndex, bucketsIndex) =>
val buckets = model.quantiles.getOrBreak(bucketsIndex)
(0 until xCopy.rows).foreach { rowIndex =>
xCopy(rowIndex, colIndex) = buckets.indexWhere { case (lowerBound, upperBound) =>
lowerBound <= xCopy(rowIndex, colIndex) && xCopy(rowIndex, colIndex) <= upperBound
}.toDouble
}
} |
||
case (colIndex, bucketsIndex) => | ||
val buckets = model.quantiles.getOrBreak(bucketsIndex) | ||
(0 until xCopy.rows).foreach { rowIndex => | ||
xCopy(rowIndex, colIndex) = buckets | ||
.indexWhere({ | ||
case (lowerBound, upperBound) => | ||
lowerBound <= xCopy(rowIndex, colIndex) && xCopy(rowIndex, colIndex) <= upperBound | ||
}) | ||
.toDouble | ||
} | ||
} | ||
xCopy | ||
} | ||
} | ||
|
||
private def computeQuantiles(target: Seq[Double], bucketCount: Int): Seq[(Double, Double)] = { | ||
val binPercentileWidth = 1.0 / bucketCount | ||
val targetArray = target.toArray | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The reason I wrote the comment about The solution would be to change Hope this makes sense and I'm not making a mistake reading this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Got it! I didn't understand the comment but makes sense now |
||
// NOTE: Adds binPercentileWidth to make the range inclusive of 1 | ||
val rangePairs = | ||
Range | ||
.BigDecimal(0, 1.0 + (binPercentileWidth / 2), binPercentileWidth) | ||
.map(_.toDouble) | ||
.map(DescriptiveStats.percentileInPlace(targetArray, _)) | ||
.sliding(2) | ||
.map({case Seq(lowerBound, upperBound) => (lowerBound, upperBound)}) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This line can be just There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oop yeah, I always use parens but it's personal preference |
||
.toSeq | ||
val headUpdate = rangePairs.headOption.getOrElse((MinValue, MaxValue)).copy(_1 = MinValue) | ||
val lastUpdate = rangePairs.lastOption.getOrElse((MinValue, MaxValue)).copy(_2 = MaxValue) | ||
rangePairs | ||
.updated(0, headUpdate) | ||
.updated(rangePairs.size - 1, lastUpdate) | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
package io.picnicml.doddlemodel.preprocessing | ||
|
||
import breeze.linalg.{DenseMatrix, DenseVector} | ||
import io.picnicml.doddlemodel.TestingUtils | ||
import io.picnicml.doddlemodel.data.Feature.{CategoricalFeature, FeatureIndex, NumericalFeature} | ||
import io.picnicml.doddlemodel.syntax.TransformerSyntax._ | ||
import org.scalatest.{FlatSpec, Matchers} | ||
|
||
class QuantileDiscretizerTest extends FlatSpec with Matchers with TestingUtils { | ||
|
||
private val x = DenseMatrix( | ||
List(-1.0, -100.0, 0.0), | ||
List(0.0, -1.0, 1.0), | ||
List(2.0, 2000.0, 0.0), | ||
List(5.0, 20.0, 0.0) | ||
) | ||
|
||
"QuantileDiscretizer" should "bucket all numerical features into the number of buckets defined in bucketCounts" in { | ||
val featureIndex = FeatureIndex(List(NumericalFeature, NumericalFeature, CategoricalFeature)) | ||
val bucketCounts: DenseVector[Double] = DenseVector(3, 4) | ||
|
||
val quantileDiscretizer = QuantileDiscretizer(bucketCounts, featureIndex).fit(x) | ||
val xQuantizedExpected = DenseMatrix( | ||
List(0.0, 0.0, 0.0), | ||
List(1.0, 1.0, 1.0), | ||
List(1.0, 3.0, 0.0), | ||
List(2.0, 2.0, 0.0) | ||
) | ||
|
||
breezeEqual(quantileDiscretizer.transform(x), xQuantizedExpected) shouldBe true | ||
} | ||
|
||
it should "process all the numerical columns by a single bucketCounts" in { | ||
val featureIndex = FeatureIndex(List(NumericalFeature, NumericalFeature, NumericalFeature)) | ||
val bucketCount: Int = 2 | ||
|
||
val quantileDiscretizer = QuantileDiscretizer(bucketCount, featureIndex).fit(x) | ||
val xQuantizedExpected = DenseMatrix( | ||
List(0.0, 0.0, 0.0), | ||
List(0.0, 0.0, 1.0), | ||
List(1.0, 1.0, 0.0), | ||
List(1.0, 1.0, 0.0) | ||
) | ||
|
||
breezeEqual(quantileDiscretizer.transform(x), xQuantizedExpected) shouldBe true | ||
} | ||
|
||
it should "amount to no-op if there are no numerical features in data" in { | ||
val featureIndex = FeatureIndex(List(CategoricalFeature, CategoricalFeature, CategoricalFeature)) | ||
val bucketCounts: DenseVector[Double] = DenseVector(2, 3) | ||
val bucketCount: Int = 3 | ||
|
||
val quantileDiscretizer1 = QuantileDiscretizer(bucketCounts, featureIndex).fit(x) | ||
val quantileDiscretizer2 = QuantileDiscretizer(bucketCount, featureIndex).fit(x) | ||
|
||
Helw150 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
breezeEqual(quantileDiscretizer1.transform(x), x) shouldBe true | ||
breezeEqual(quantileDiscretizer2.transform(x), x) shouldBe true | ||
} | ||
|
||
it should "fail when the amount of passed bucketCounts is different to number of numerical features in data" in { | ||
val featureIndex = FeatureIndex(List(NumericalFeature, NumericalFeature, NumericalFeature)) | ||
val bucketCounts: DenseVector[Double] = DenseVector(2, 3) | ||
|
||
// 3 numeric columns vs 2 thresholds | ||
an[IllegalArgumentException] should be thrownBy QuantileDiscretizer(bucketCounts, featureIndex) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Another line between the penultimate import and
import scala.Double.{MaxValue, MinValue}
(I usedOptimize imports
in IntelliJ which also reordered some of the imports).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I use Emacs sadly, but I'll open IntelliJ for the import optimization (I have used Scalafix for similar things, but it's a big dependency for something IntelliJ does for free for most folks)