Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,18 @@ std::string ParquetVersionToString(ParquetVersion::type ver) {
return "2.4";
case ParquetVersion::PARQUET_2_6:
return "2.6";
case ParquetVersion::PARQUET_2_7:
return "2.7";
case ParquetVersion::PARQUET_2_8:
return "2.8";
case ParquetVersion::PARQUET_2_9:
return "2.9";
case ParquetVersion::PARQUET_2_10:
return "2.10";
case ParquetVersion::PARQUET_2_11:
return "2.11";
case ParquetVersion::PARQUET_2_12:
return "2.12";
}

// This should be unreachable
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/parquet/metadata_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ TEST(Metadata, TestBuildAccess) {
WriterProperties::Builder prop_builder;

std::shared_ptr<WriterProperties> props =
prop_builder.version(ParquetVersion::PARQUET_2_6)->build();
prop_builder.version(ParquetVersion::PARQUET_2_LATEST)->build();

fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED));
fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED));
Expand Down Expand Up @@ -137,7 +137,7 @@ TEST(Metadata, TestBuildAccess) {
ASSERT_EQ(nrows, f_accessors[loop_index]->num_rows());
ASSERT_LE(0, static_cast<int>(f_accessors[loop_index]->size()));
ASSERT_EQ(2, f_accessors[loop_index]->num_row_groups());
ASSERT_EQ(ParquetVersion::PARQUET_2_6, f_accessors[loop_index]->version());
ASSERT_EQ(ParquetVersion::PARQUET_2_LATEST, f_accessors[loop_index]->version());
ASSERT_EQ(DEFAULT_CREATED_BY, f_accessors[loop_index]->created_by());
ASSERT_EQ(3, f_accessors[loop_index]->num_schema_elements());

Expand Down Expand Up @@ -256,7 +256,7 @@ TEST(Metadata, TestBuildAccess) {
ASSERT_EQ(4, f_accessor->num_row_groups());
ASSERT_EQ(nrows * 2, f_accessor->num_rows());
ASSERT_LE(0, static_cast<int>(f_accessor->size()));
ASSERT_EQ(ParquetVersion::PARQUET_2_6, f_accessor->version());
ASSERT_EQ(ParquetVersion::PARQUET_2_LATEST, f_accessor->version());
ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by());
ASSERT_EQ(3, f_accessor->num_schema_elements());

Expand Down
51 changes: 50 additions & 1 deletion cpp/src/parquet/type_fwd.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,60 @@ struct ParquetVersion {
/// Note: Parquet format 2.6.0 was released in September 2018.
PARQUET_2_6,

/// Enable Parquet format 2.7 and earlier features when writing
///
/// This enables bloom filters and encryption in addition to the
/// PARQUET_2_6 features.
///
/// Note: Parquet format 2.7.0 was released in June 2019.
PARQUET_2_7,

/// Enable Parquet format 2.8 and earlier features when writing
///
/// This enables BYTE_STREAM_SPLIT encoding in addition to the
/// PARQUET_2_7 features.
///
/// Note: Parquet format 2.8.0 was released in February 2020.
PARQUET_2_8,

/// Enable Parquet format 2.9 and earlier features when writing
///
/// This enables the interoperable LZ4 codec in addition to the
/// PARQUET_2_8 features.
///
/// Note: Parquet format 2.9.0 was released in January 2021.
PARQUET_2_9,

/// Enable Parquet format 2.10 and earlier features when writing
///
/// This enables the Float16 logical type in addition to the
/// PARQUET_2_9 features.
///
/// Note: Parquet format 2.10.0 was released in October 2022.
PARQUET_2_10,

/// Enable Parquet format 2.11 and earlier features when writing
///
/// This enables the VARIANT logical type, GEOMETRY/GEOGRAPHY types,
/// and extended BYTE_STREAM_SPLIT encoding for INT32/INT64/FIXED_LEN_BYTE_ARRAY
/// in addition to the PARQUET_2_10 features.
///
/// Note: Parquet format 2.11.0 was released in March 2025.
PARQUET_2_11,

/// Enable Parquet format 2.12 and earlier features when writing
///
/// This finalizes the VARIANT logical type specification and shredding
/// in addition to the PARQUET_2_11 features.
///
/// Note: Parquet format 2.12.0 was released in August 2025.
PARQUET_2_12,

/// Enable latest Parquet format 2.x features
///
/// This value is equal to the greatest 2.x version supported by
/// this library.
PARQUET_2_LATEST = PARQUET_2_6
PARQUET_2_LATEST = PARQUET_2_12
};
};

Expand Down
18 changes: 15 additions & 3 deletions python/pyarrow/_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1113,7 +1113,7 @@ cdef class FileMetaData(_Weakrefable):
"""
Parquet format version used in file (str, such as '1.0', '2.4').

If version is missing or unparsable, will default to assuming '2.6'.
If version is missing or unparsable, will default to assuming '2.12'.
"""
cdef ParquetVersion version = self._metadata.version()
if version == ParquetVersion_V1:
Expand All @@ -1122,9 +1122,21 @@ cdef class FileMetaData(_Weakrefable):
return '2.4'
elif version == ParquetVersion_V2_6:
return '2.6'
elif version == ParquetVersion_V2_7:
return '2.7'
elif version == ParquetVersion_V2_8:
return '2.8'
elif version == ParquetVersion_V2_9:
return '2.9'
elif version == ParquetVersion_V2_10:
return '2.10'
elif version == ParquetVersion_V2_11:
return '2.11'
elif version == ParquetVersion_V2_12:
return '2.12'
else:
warnings.warn(f'Unrecognized file version, assuming 2.6: {version}')
return '2.6'
warnings.warn(f'Unrecognized file version, assuming 2.12: {version}')
return '2.12'

@property
def created_by(self):
Expand Down
6 changes: 6 additions & 0 deletions python/pyarrow/includes/libparquet.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,12 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil:
ParquetVersion_V1" parquet::ParquetVersion::PARQUET_1_0"
ParquetVersion_V2_4" parquet::ParquetVersion::PARQUET_2_4"
ParquetVersion_V2_6" parquet::ParquetVersion::PARQUET_2_6"
ParquetVersion_V2_7" parquet::ParquetVersion::PARQUET_2_7"
ParquetVersion_V2_8" parquet::ParquetVersion::PARQUET_2_8"
ParquetVersion_V2_9" parquet::ParquetVersion::PARQUET_2_9"
ParquetVersion_V2_10" parquet::ParquetVersion::PARQUET_2_10"
ParquetVersion_V2_11" parquet::ParquetVersion::PARQUET_2_11"
ParquetVersion_V2_12" parquet::ParquetVersion::PARQUET_2_12"

enum ParquetSortOrder" parquet::SortOrder::type":
ParquetSortOrder_SIGNED" parquet::SortOrder::SIGNED"
Expand Down
6 changes: 3 additions & 3 deletions python/pyarrow/tests/parquet/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def test_parquet_metadata_api():
assert meta.num_rows == len(df)
assert meta.num_columns == ncols + 1 # +1 for index
assert meta.num_row_groups == 1
assert meta.format_version == '2.6'
assert meta.format_version == '2.12'
assert 'parquet-cpp' in meta.created_by
assert isinstance(meta.serialized_size, int)
assert isinstance(meta.metadata, dict)
Expand Down Expand Up @@ -554,12 +554,12 @@ def test_write_metadata(tempdir):
assert b'ARROW:schema' not in schema_as_arrow.metadata

# pass through writer keyword arguments
for version in ["1.0", "2.4", "2.6"]:
for version in ["1.0", "2.4", "2.6", "2.7", "2.8", "2.9", "2.10", "2.11", "2.12"]:
pq.write_metadata(schema, path, version=version)
parquet_meta = pq.read_metadata(path)
# The version is stored as a single integer in the Parquet metadata,
# so it cannot correctly express dotted format versions
expected_version = "1.0" if version == "1.0" else "2.6"
expected_version = "1.0" if version == "1.0" else "2.12"
assert parquet_meta.format_version == expected_version

# metadata_collector: list of FileMetaData objects
Expand Down
4 changes: 2 additions & 2 deletions python/pyarrow/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4928,14 +4928,14 @@ def test_write_dataset_parquet(tempdir):
assert result.equals(table)

# using custom options
for version in ["1.0", "2.4", "2.6"]:
for version in ["1.0", "2.4", "2.6", "2.7", "2.8", "2.9", "2.10", "2.11", "2.12"]:
format = ds.ParquetFileFormat()
opts = format.make_write_options(version=version)
assert "<pyarrow.dataset.ParquetFileWriteOptions" in repr(opts)
base_dir = tempdir / f'parquet_dataset_version{version}'
ds.write_dataset(table, base_dir, format=format, file_options=opts)
meta = pq.read_metadata(base_dir / "part-0.parquet")
expected_version = "1.0" if version == "1.0" else "2.6"
expected_version = "1.0" if version == "1.0" else "2.12"
assert meta.format_version == expected_version

# ensure version is actually honored based on supported datatypes
Expand Down
4 changes: 3 additions & 1 deletion r/R/enums.R
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,9 @@ FileType <- enum("FileType",
#' @export
#' @rdname enums
ParquetVersionType <- enum("ParquetVersionType",
PARQUET_1_0 = 0L, PARQUET_2_4 = 2L, PARQUET_2_6 = 3L
PARQUET_1_0 = 0L, PARQUET_2_4 = 1L, PARQUET_2_6 = 2L,
PARQUET_2_7 = 3L, PARQUET_2_8 = 4L, PARQUET_2_9 = 5L, PARQUET_2_10 = 6L,
PARQUET_2_11 = 7L, PARQUET_2_12 = 8L
Comment on lines +138 to +140
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apparently this change keeps the R enum values in sync with the C++ enum values; I'm not sure about this myself.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jonkeane If we export this enum, does that mean we shouldn't go changing previous values, or that it's actually important to sync up with the C++ values? I'm not confident I fully understand how enums are used here and what matches what.

)

#' @export
Expand Down
8 changes: 7 additions & 1 deletion r/R/parquet.R
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,13 @@ valid_parquet_version <- c(
"1.0" = ParquetVersionType$PARQUET_1_0,
"2.4" = ParquetVersionType$PARQUET_2_4,
"2.6" = ParquetVersionType$PARQUET_2_6,
"latest" = ParquetVersionType$PARQUET_2_6
"2.7" = ParquetVersionType$PARQUET_2_7,
"2.8" = ParquetVersionType$PARQUET_2_8,
"2.9" = ParquetVersionType$PARQUET_2_9,
"2.10" = ParquetVersionType$PARQUET_2_10,
"2.11" = ParquetVersionType$PARQUET_2_11,
"2.12" = ParquetVersionType$PARQUET_2_12,
"latest" = ParquetVersionType$PARQUET_2_12
)

make_valid_parquet_version <- function(version, valid_versions = valid_parquet_version) {
Expand Down
26 changes: 25 additions & 1 deletion r/tests/testthat/test-parquet.R
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,33 @@ test_that("make_valid_parquet_version()", {
make_valid_parquet_version("2.6"),
ParquetVersionType$PARQUET_2_6
)
expect_equal(
make_valid_parquet_version("2.7"),
ParquetVersionType$PARQUET_2_7
)
expect_equal(
make_valid_parquet_version("2.8"),
ParquetVersionType$PARQUET_2_8
)
expect_equal(
make_valid_parquet_version("2.9"),
ParquetVersionType$PARQUET_2_9
)
expect_equal(
make_valid_parquet_version("2.10"),
ParquetVersionType$PARQUET_2_10
)
expect_equal(
make_valid_parquet_version("2.11"),
ParquetVersionType$PARQUET_2_11
)
expect_equal(
make_valid_parquet_version("2.12"),
ParquetVersionType$PARQUET_2_12
)
expect_equal(
make_valid_parquet_version("latest"),
ParquetVersionType$PARQUET_2_6
ParquetVersionType$PARQUET_2_12
)

expect_equal(make_valid_parquet_version(1), ParquetVersionType$PARQUET_1_0)
Expand Down
Loading