From c16b37eeb8b521e5c6cd45733d3cae581354e727 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Tue, 30 Jul 2024 11:39:18 +0200 Subject: [PATCH 1/6] adds kotlin.dataframe.debug property which runs the #713 checks when enabled. Updated contribution guide too. TC will have to be updated manually --- CONTRIBUTING.md | 6 ++-- build.gradle.kts | 1 + .../dataframe/impl/columns/DataColumnImpl.kt | 31 +++++++++++++++++++ gradle.properties | 5 +++ 4 files changed, 41 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 767fef0490..e7e57cf3a6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -52,8 +52,8 @@ so do familiarize yourself with the following guidelines. ## PR workflow -0. The contributor builds the library locally and runs all unit tests via the Gradle task `dataframe:test` - (see the ["Building"](#building) chapter). +0. The contributor builds the library locally and runs all unit tests via the Gradle task + `dataframe:test -Pkotlin.dataframe.debug=true` (see the ["Building"](#building) chapter). 1. The contributor submits the PR if the local build is successful and the tests are green. 2. The reviewer puts their name in the "Reviewers" section of the proposed PR at the start of the review process. 3. The reviewer leaves comments or marks the PR with the abbreviation "LGTM" (Looks good to me). @@ -103,6 +103,8 @@ This library is built with Gradle. * Run `./gradlew build` to build. It also runs all the tests and checks the linter. * Run `./gradlew :test` to test the module you are looking at to speed things up during development. +* Make sure to pass the extra parameter `-Pkotlin.dataframe.debug=true` to enable debug mode. This flag will + make sure some extra checks are run, which are important but too heavy for production. You can import this project into IDEA, but you have to delegate the build actions to Gradle (in Preferences -> Build, Execution, Deployment -> Build Tools -> Gradle -> Runner) diff --git a/build.gradle.kts b/build.gradle.kts index 6c4ad837d3..fa78e51e24 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -163,6 +163,7 @@ allprojects { packageName = "org.jetbrains.kotlinx.dataframe" className = "BuildConfig" buildConfigField("VERSION", "${project.version}") + buildConfigField("DEBUG", findProperty("kotlin.dataframe.debug")?.toString()?.toBoolean() ?: false) } } catch (_: UnknownDomainObjectException) { logger.warn("Could not set buildConfig on :${this.name}") diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/columns/DataColumnImpl.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/columns/DataColumnImpl.kt index cce956c1c4..c7ef55c5a5 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/columns/DataColumnImpl.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/columns/DataColumnImpl.kt @@ -1,8 +1,13 @@ package org.jetbrains.kotlinx.dataframe.impl.columns +import org.jetbrains.kotlinx.dataframe.BuildConfig import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.api.dataFrameOf +import org.jetbrains.kotlinx.dataframe.impl.isArray +import org.jetbrains.kotlinx.dataframe.impl.isPrimitiveArray +import kotlin.reflect.KClass import kotlin.reflect.KType +import kotlin.reflect.full.isSubclassOf internal abstract class DataColumnImpl( protected val values: List, @@ -12,6 +17,32 @@ internal abstract class DataColumnImpl( ) : DataColumn, DataColumnInternal { + private infix fun T?.matches(type: KType) = + when { + this == null -> type.isMarkedNullable + + this.isPrimitiveArray -> + type.isPrimitiveArray && + this!!::class.qualifiedName == type.classifier?.let { (it as KClass<*>).qualifiedName } + + this.isArray -> type.isArray + + // cannot check the precise type of array + else -> this!!::class.isSubclassOf(type.classifier as KClass<*>) + } + + init { + /* Check for [Issue #713](https://github.com/Kotlin/dataframe/issues/713). + * This only runs with `kotlin.dataframe.debug=true` in gradle.properties + */ + if (BuildConfig.DEBUG) { + require(values.all { it matches type }) { + val types = values.map { if (it == null) "Nothing?" else it!!::class.simpleName }.distinct() + "Values of column '$name' have types '$types' which are not compatible given with column type '$type'" + } + } + } + protected val distinct = distinct ?: lazy { values.toSet() } override fun name() = name diff --git a/gradle.properties b/gradle.properties index faf44dfd03..6fcbca9f35 100644 --- a/gradle.properties +++ b/gradle.properties @@ -11,3 +11,8 @@ org.gradle.jvmargs=-Xmx4G # This makes it mandatory to explicitly apply your own version of the # KSP plugin in the modules that use it. kotlin.dataframe.add.ksp=false + +# Enables debug mode for dataframe. +# This can make certain tests run that should not be run in production. +# It can also be turned on from the command line with `-Pkotlin.dataframe.debug=true` +kotlin.dataframe.debug=false From ba1f3433ca2d1374f32c1fff623f6977f36be15e Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Tue, 30 Jul 2024 14:46:54 +0200 Subject: [PATCH 2/6] removing erroneous use of TODO from `zero()` function, plus small QOL cleanup --- .../org/jetbrains/kotlinx/dataframe/impl/Utils.kt | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/Utils.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/Utils.kt index 08678234e0..4cdbc78eb5 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/Utils.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/Utils.kt @@ -121,7 +121,7 @@ internal fun Iterable.anyNull(): Boolean = any { it == null } internal fun emptyPath(): ColumnPath = ColumnPath(emptyList()) @PublishedApi -internal fun KClass.zero(): T = +internal fun KClass.zeroOrNull(): T? = when (this) { Int::class -> 0 as T Byte::class -> 0.toByte() as T @@ -131,10 +131,14 @@ internal fun KClass.zero(): T = Float::class -> 0.toFloat() as T BigDecimal::class -> BigDecimal.ZERO as T BigInteger::class -> BigInteger.ZERO as T - Number::class -> 0 as T - else -> TODO() + Number::class -> 0 as? T + else -> null } +@PublishedApi +internal fun KClass.zero(): T = + zeroOrNull() ?: throw NotImplementedError("Zero value for $this is not supported") + internal fun catchSilent(body: () -> T): T? = try { body() From 1fc719734860ea6c0ebf400d448cd960e0d7f0e2 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Tue, 30 Jul 2024 15:51:17 +0200 Subject: [PATCH 3/6] Fixed one failing test for #713: convertTo can create a temporary non-nullable column with nulls in it. This is now avoided and the behavior has been simplified/stabilized --- .../kotlinx/dataframe/impl/api/convertTo.kt | 39 ++++++++++--------- .../kotlinx/dataframe/impl/schema/Utils.kt | 27 +++++++++---- 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt index 7e4c1a31fa..5bade30f08 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt @@ -11,13 +11,13 @@ import org.jetbrains.kotlinx.dataframe.api.ConvertSchemaDsl import org.jetbrains.kotlinx.dataframe.api.ConverterScope import org.jetbrains.kotlinx.dataframe.api.ExcessiveColumns import org.jetbrains.kotlinx.dataframe.api.Infer +import org.jetbrains.kotlinx.dataframe.api.add import org.jetbrains.kotlinx.dataframe.api.all import org.jetbrains.kotlinx.dataframe.api.allNulls import org.jetbrains.kotlinx.dataframe.api.asColumnGroup import org.jetbrains.kotlinx.dataframe.api.concat import org.jetbrains.kotlinx.dataframe.api.convertTo import org.jetbrains.kotlinx.dataframe.api.emptyDataFrame -import org.jetbrains.kotlinx.dataframe.api.getColumnPaths import org.jetbrains.kotlinx.dataframe.api.isEmpty import org.jetbrains.kotlinx.dataframe.api.map import org.jetbrains.kotlinx.dataframe.api.name @@ -29,12 +29,14 @@ import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup import org.jetbrains.kotlinx.dataframe.columns.ColumnKind import org.jetbrains.kotlinx.dataframe.columns.ColumnPath import org.jetbrains.kotlinx.dataframe.columns.FrameColumn +import org.jetbrains.kotlinx.dataframe.columns.UnresolvedColumnsPolicy import org.jetbrains.kotlinx.dataframe.columns.toColumnSet import org.jetbrains.kotlinx.dataframe.exceptions.ExcessiveColumnsException import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException import org.jetbrains.kotlinx.dataframe.impl.emptyPath -import org.jetbrains.kotlinx.dataframe.impl.schema.createEmptyColumn +import org.jetbrains.kotlinx.dataframe.impl.getColumnPaths import org.jetbrains.kotlinx.dataframe.impl.schema.createEmptyDataFrame +import org.jetbrains.kotlinx.dataframe.impl.schema.createNullFilledColumn import org.jetbrains.kotlinx.dataframe.impl.schema.extractSchema import org.jetbrains.kotlinx.dataframe.impl.schema.render import org.jetbrains.kotlinx.dataframe.kind @@ -252,22 +254,15 @@ internal fun AnyFrame.convertToImpl( } }.toMutableList() - // when the target is nullable but the source does not contain a column, fill it in with nulls / empty dataframes + // when the target is nullable but the source does not contain a column, + // fill it in with nulls / empty dataframes val size = this.size.nrow schema.columns.forEach { (name, targetColumn) -> - val isNullable = - // like value column of type Int? - targetColumn.nullable || - // like value column of type Int? (backup check) - targetColumn.type.isMarkedNullable || - // like DataRow for a group column (all columns in the group will be nullable) - targetColumn.contentType?.isMarkedNullable == true || - // frame column can be filled with empty dataframes - targetColumn.kind == ColumnKind.Frame - if (name !in visited) { - newColumns += targetColumn.createEmptyColumn(name, size) - if (!isNullable) { + try { + newColumns += targetColumn.createNullFilledColumn(name, size) + } catch (e: IllegalStateException) { + // if this could not be done automatically, they need to be filled manually missingPaths.add(path + name) } } @@ -280,10 +275,16 @@ internal fun AnyFrame.convertToImpl( var result = convertToSchema(marker.schema, emptyPath()) dsl.fillers.forEach { filler -> - val paths = result.getColumnPaths(filler.columns) - missingPaths.removeAll(paths.toSet()) - result = result.update { paths.toColumnSet() }.with { - filler.expr(this, this) + val paths = result.getColumnPaths(UnresolvedColumnsPolicy.Create, filler.columns).toSet() + missingPaths -= paths + val (newPaths, existingPaths) = paths.partition { it in missingPaths } + + // first fill cols that are already in the df + result = result.update { existingPaths.toColumnSet() }.with { filler.expr(this, this) } + + // then create any missing ones by filling + result = newPaths.fold(result) { df, newPath -> + df.add(newPath, Infer.Type) { filler.expr(this, this) } } } diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/schema/Utils.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/schema/Utils.kt index d8c8069538..66505bf93c 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/schema/Utils.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/schema/Utils.kt @@ -102,6 +102,9 @@ internal fun AnyCol.extractSchema(): ColumnSchema = @PublishedApi internal fun getSchema(kClass: KClass<*>): DataFrameSchema = MarkersExtractor.get(kClass).schema +/** + * Create "empty" column based on the toplevel of [this] [ColumnSchema]. + */ internal fun ColumnSchema.createEmptyColumn(name: String): AnyCol = when (this) { is ColumnSchema.Value -> DataColumn.createValueColumn(name, emptyList(), type) @@ -110,14 +113,22 @@ internal fun ColumnSchema.createEmptyColumn(name: String): AnyCol = else -> error("Unexpected ColumnSchema: $this") } -/** Create "empty" column, filled with either null or empty dataframes. */ -internal fun ColumnSchema.createEmptyColumn(name: String, numberOfRows: Int): AnyCol = +/** + * Creates a column based on [this] [ColumnSchema] filled with `null` or empty dataframes. + * @throws IllegalStateException if the column is not nullable and [numberOfRows]` > 0`. + */ +internal fun ColumnSchema.createNullFilledColumn(name: String, numberOfRows: Int): AnyCol = when (this) { - is ColumnSchema.Value -> DataColumn.createValueColumn( - name = name, - values = List(numberOfRows) { null }, - type = type, - ) + is ColumnSchema.Value -> { + if (!type.isMarkedNullable && numberOfRows > 0) { + error("Cannot create a null-filled value column of type $type as it's not nullable.") + } + DataColumn.createValueColumn( + name = name, + values = List(numberOfRows) { null }, + type = type, + ) + } is ColumnSchema.Group -> DataColumn.createColumnGroup( name = name, @@ -143,7 +154,7 @@ internal fun DataFrameSchema.createEmptyDataFrame(numberOfRows: Int): AnyFrame = DataFrame.empty(numberOfRows) } else { columns.map { (name, schema) -> - schema.createEmptyColumn(name, numberOfRows) + schema.createNullFilledColumn(name, numberOfRows) }.toDataFrame() } From 5e01d8635a0c5eefce13e3b66dc9b1c3609b8ea7 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Tue, 30 Jul 2024 16:19:43 +0200 Subject: [PATCH 4/6] fixup! Fixed one failing test for #713: convertTo can create a temporary non-nullable column with nulls in it. This is now avoided and the behavior has been simplified/stabilized --- .../org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt index 5bade30f08..da419ec115 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt @@ -276,8 +276,8 @@ internal fun AnyFrame.convertToImpl( dsl.fillers.forEach { filler -> val paths = result.getColumnPaths(UnresolvedColumnsPolicy.Create, filler.columns).toSet() - missingPaths -= paths val (newPaths, existingPaths) = paths.partition { it in missingPaths } + missingPaths -= paths // first fill cols that are already in the df result = result.update { existingPaths.toColumnSet() }.with { filler.expr(this, this) } From 9d492b1b385e7fe52552ce0c8a3574ce9abaae73 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 31 Jul 2024 14:13:20 +0200 Subject: [PATCH 5/6] clarification with docs as asked in the review --- core/build.gradle.kts | 1 + .../kotlinx/dataframe/impl/api/convertTo.kt | 29 +++++++++++++++++-- .../kotlinx/dataframe/impl/schema/Utils.kt | 2 +- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/core/build.gradle.kts b/core/build.gradle.kts index cfa3a39ade..8ac255e616 100644 --- a/core/build.gradle.kts +++ b/core/build.gradle.kts @@ -73,6 +73,7 @@ dependencies { api(libs.kotlin.datetimeJvm) implementation(libs.kotlinpoet) + implementation(libs.kotlinLogging) testImplementation(libs.junit) testImplementation(libs.kotestAssertions) { diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt index da419ec115..b958fa0508 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convertTo.kt @@ -1,5 +1,6 @@ package org.jetbrains.kotlinx.dataframe.impl.api +import io.github.oshai.kotlinlogging.KotlinLogging import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.AnyRow import org.jetbrains.kotlinx.dataframe.ColumnsSelector @@ -47,6 +48,8 @@ import kotlin.reflect.KType import kotlin.reflect.full.withNullability import kotlin.reflect.jvm.jvmErasure +private val logger = KotlinLogging.logger {} + private open class Converter(val transform: ConverterScope.(Any?) -> Any?, val skipNulls: Boolean) private class Filler(val columns: ColumnsSelector<*, *>, val expr: RowExpression<*, *>) @@ -262,6 +265,7 @@ internal fun AnyFrame.convertToImpl( try { newColumns += targetColumn.createNullFilledColumn(name, size) } catch (e: IllegalStateException) { + logger.debug(e) { "" } // if this could not be done automatically, they need to be filled manually missingPaths.add(path + name) } @@ -274,20 +278,39 @@ internal fun AnyFrame.convertToImpl( val marker = MarkersExtractor.get(clazz) var result = convertToSchema(marker.schema, emptyPath()) + /* + * Here we handle all registered fillers of the user. + * Fillers are registered in the DSL like: + * ```kt + * df.convertTo { + * fill { col1 and col2 }.with { something } + * fill { col3 }.with { somethingElse } + * } + * ``` + * Users can use this to fill up any column that was missing during the conversion. + * They can also fill up and thus overwrite any existing column here. + */ dsl.fillers.forEach { filler -> + // get all paths from the `fill { col1 and col2 }` part val paths = result.getColumnPaths(UnresolvedColumnsPolicy.Create, filler.columns).toSet() + + // split the paths into those that are already in the df and those that are missing val (newPaths, existingPaths) = paths.partition { it in missingPaths } - missingPaths -= paths - // first fill cols that are already in the df + // first fill cols that are already in the df using the `with {}` part of the dsl result = result.update { existingPaths.toColumnSet() }.with { filler.expr(this, this) } - // then create any missing ones by filling + // then create any missing ones by filling using the `with {}` part of the dsl result = newPaths.fold(result) { df, newPath -> df.add(newPath, Infer.Type) { filler.expr(this, this) } } + + // remove the paths that are now filled + missingPaths -= paths } + // Inform the user which target columns could not be created in the conversion + // The user will need to supply extra information for these, like `fill {}` them. if (missingPaths.isNotEmpty()) { throw IllegalArgumentException( "The following columns were not found in DataFrame: ${ diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/schema/Utils.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/schema/Utils.kt index 66505bf93c..48ff46f460 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/schema/Utils.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/schema/Utils.kt @@ -141,7 +141,7 @@ internal fun ColumnSchema.createNullFilledColumn(name: String, numberOfRows: Int schema = lazyOf(schema), ) - else -> error("Unexpected ColumnSchema: $this") + else -> error("Cannot create null-filled column of unexpected ColumnSchema: $this") } internal fun DataFrameSchema.createEmptyDataFrame(): AnyFrame = From 57714be4a7f92f8aae76f017061604d31bb08470 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Thu, 1 Aug 2024 12:06:30 +0200 Subject: [PATCH 6/6] updated kotlinLogging dependency and added sl4j --- core/build.gradle.kts | 1 + dataframe-openapi/build.gradle.kts | 1 + gradle/libs.versions.toml | 2 +- plugins/kotlin-dataframe/build.gradle.kts | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/core/build.gradle.kts b/core/build.gradle.kts index 8ac255e616..0cef9fa5b2 100644 --- a/core/build.gradle.kts +++ b/core/build.gradle.kts @@ -73,6 +73,7 @@ dependencies { api(libs.kotlin.datetimeJvm) implementation(libs.kotlinpoet) + implementation(libs.sl4j) implementation(libs.kotlinLogging) testImplementation(libs.junit) diff --git a/dataframe-openapi/build.gradle.kts b/dataframe-openapi/build.gradle.kts index abf5c58b90..86d113e84a 100644 --- a/dataframe-openapi/build.gradle.kts +++ b/dataframe-openapi/build.gradle.kts @@ -22,6 +22,7 @@ repositories { dependencies { api(project(":core")) + implementation(libs.sl4j) implementation(libs.kotlinLogging) implementation(libs.kotlin.reflect) implementation(libs.kotlinpoet) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 3d752f77d7..18279a5d6c 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -36,7 +36,7 @@ sqlite = "3.45.1.0" jtsCore = "1.18.1" kotlinDatetime = "0.6.0" openapi = "2.1.20" -kotlinLogging = "6.0.3" +kotlinLogging = "7.0.0" sl4j = "2.0.12" junit = "4.13.2" diff --git a/plugins/kotlin-dataframe/build.gradle.kts b/plugins/kotlin-dataframe/build.gradle.kts index 0e2304ce61..a28ebc9fd0 100644 --- a/plugins/kotlin-dataframe/build.gradle.kts +++ b/plugins/kotlin-dataframe/build.gradle.kts @@ -37,6 +37,7 @@ dependencies { testRuntimeOnly("org.jetbrains.kotlin:kotlin-annotations-jvm:$kotlinVersion") implementation(project(":core")) + api(libs.kotlinLogging) api("org.jetbrains.kotlinx:kotlinx-serialization-json:1.4.0-RC") testImplementation("org.jetbrains.kotlin:kotlin-reflect:$kotlinVersion")