diff --git a/src/daft-connect/src/translation/logical_plan/local_relation.rs b/src/daft-connect/src/translation/logical_plan/local_relation.rs
index f5e4a3a3cf..f32e2a476b 100644
--- a/src/daft-connect/src/translation/logical_plan/local_relation.rs
+++ b/src/daft-connect/src/translation/logical_plan/local_relation.rs
@@ -11,6 +11,7 @@ use daft_logical_plan::{
     SourceInfo,
 };
 use daft_micropartition::{python::PyMicroPartition, MicroPartition};
+use daft_schema::dtype::DaftDataType;
 use daft_table::Table;
 use eyre::{bail, ensure, WrapErr};
 use itertools::Itertools;
@@ -56,13 +57,27 @@ pub fn local_relation(plan: spark_connect::LocalRelation) -> eyre::Result
         .map(|daft_field| daft_field.to_arrow())
         .try_collect()?;
 
+    let mut dict_idx = 0;
+
     let ipc_fields: Vec<_> = daft_fields
         .iter()
-        .map(|_| {
+        .map(|field| {
+            let required_dictionary = field.dtype == DaftDataType::Utf8;
+
+            let dictionary_id = match required_dictionary {
+                true => {
+                    let res = dict_idx;
+                    dict_idx += 1;
+                    debug!("using dictionary id {res}");
+                    Some(res)
+                }
+                false => None,
+            };
+
             // For integer columns, we don't need dictionary encoding
             IpcField {
-                fields: vec![],      // No nested fields for primitive types
-                dictionary_id: None, // No dictionary encoding
+                fields: vec![], // No nested fields for primitive types
+                dictionary_id,
             }
         })
         .collect();
diff --git a/tests/connect/conftest.py b/tests/connect/conftest.py
index 60c5ae9986..9144e16a7b 100644
--- a/tests/connect/conftest.py
+++ b/tests/connect/conftest.py
@@ -12,7 +12,11 @@ def spark_session():
 
     This fixture is available to all test files and creates a single
     Spark session for the entire test suite run.
     """
+    from daft.daft import connect_start
+    from daft.logging import setup_debug_logger
+
+    setup_debug_logger()
 
     # Start Daft Connect server
     server = connect_start()
diff --git a/tests/connect/test_create_df.py b/tests/connect/test_create_df.py
index e06944e19f..187f4fbc5a 100644
--- a/tests/connect/test_create_df.py
+++ b/tests/connect/test_create_df.py
@@ -25,3 +25,11 @@ def test_create_df(spark_session):
     assert len(df_two_pandas) == 3, "Two-column DataFrame should have 3 rows"
     assert list(df_two_pandas["num1"]) == [1, 2, 3], "First number column should contain expected values"
     assert list(df_two_pandas["num2"]) == [10, 20, 30], "Second number column should contain expected values"
+
+    # now do boolean
+    print("now testing boolean")
+    boolean_data = [(True,), (False,), (True,)]
+    df_boolean = spark_session.createDataFrame(boolean_data, ["value"])
+    df_boolean_pandas = df_boolean.toPandas()
+    assert len(df_boolean_pandas) == 3, "Boolean DataFrame should have 3 rows"
+    assert list(df_boolean_pandas["value"]) == [True, False, True], "Boolean DataFrame should contain expected values"
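
Editor's note, not part of the diff above: the dictionary-id change in local_relation.rs is what lets Utf8 (string) columns survive the Arrow IPC round trip, yet the new test only covers booleans. A minimal sketch of a string-column test, assuming the existing spark_session fixture and the style of tests/connect/test_create_df.py (the test name and data below are hypothetical), could look like:

def test_create_df_strings(spark_session):
    # Hypothetical follow-up test (not in this change): string columns exercise
    # the new dictionary-id assignment when the local relation is written as Arrow IPC.
    string_data = [("alice",), ("bob",), ("charlie",)]
    df_string = spark_session.createDataFrame(string_data, ["name"])
    df_string_pandas = df_string.toPandas()
    assert len(df_string_pandas) == 3, "String DataFrame should have 3 rows"
    assert list(df_string_pandas["name"]) == ["alice", "bob", "charlie"], "String column should contain expected values"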