diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 4b1822c0dae..a86673e489e 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -83,6 +83,18 @@ std::string ParquetVersionToString(ParquetVersion::type ver) { return "2.4"; case ParquetVersion::PARQUET_2_6: return "2.6"; + case ParquetVersion::PARQUET_2_7: + return "2.7"; + case ParquetVersion::PARQUET_2_8: + return "2.8"; + case ParquetVersion::PARQUET_2_9: + return "2.9"; + case ParquetVersion::PARQUET_2_10: + return "2.10"; + case ParquetVersion::PARQUET_2_11: + return "2.11"; + case ParquetVersion::PARQUET_2_12: + return "2.12"; } // This should be unreachable diff --git a/cpp/src/parquet/metadata_test.cc b/cpp/src/parquet/metadata_test.cc index 572f053179c..1f0101bf298 100644 --- a/cpp/src/parquet/metadata_test.cc +++ b/cpp/src/parquet/metadata_test.cc @@ -94,7 +94,7 @@ TEST(Metadata, TestBuildAccess) { WriterProperties::Builder prop_builder; std::shared_ptr props = - prop_builder.version(ParquetVersion::PARQUET_2_6)->build(); + prop_builder.version(ParquetVersion::PARQUET_2_LATEST)->build(); fields.push_back(parquet::schema::Int32("int_col", Repetition::REQUIRED)); fields.push_back(parquet::schema::Float("float_col", Repetition::REQUIRED)); @@ -137,7 +137,7 @@ TEST(Metadata, TestBuildAccess) { ASSERT_EQ(nrows, f_accessors[loop_index]->num_rows()); ASSERT_LE(0, static_cast(f_accessors[loop_index]->size())); ASSERT_EQ(2, f_accessors[loop_index]->num_row_groups()); - ASSERT_EQ(ParquetVersion::PARQUET_2_6, f_accessors[loop_index]->version()); + ASSERT_EQ(ParquetVersion::PARQUET_2_LATEST, f_accessors[loop_index]->version()); ASSERT_EQ(DEFAULT_CREATED_BY, f_accessors[loop_index]->created_by()); ASSERT_EQ(3, f_accessors[loop_index]->num_schema_elements()); @@ -256,7 +256,7 @@ TEST(Metadata, TestBuildAccess) { ASSERT_EQ(4, f_accessor->num_row_groups()); ASSERT_EQ(nrows * 2, f_accessor->num_rows()); ASSERT_LE(0, static_cast(f_accessor->size())); - ASSERT_EQ(ParquetVersion::PARQUET_2_6, f_accessor->version()); + ASSERT_EQ(ParquetVersion::PARQUET_2_LATEST, f_accessor->version()); ASSERT_EQ(DEFAULT_CREATED_BY, f_accessor->created_by()); ASSERT_EQ(3, f_accessor->num_schema_elements()); diff --git a/cpp/src/parquet/type_fwd.h b/cpp/src/parquet/type_fwd.h index 02e896598bf..24b3c12a21e 100644 --- a/cpp/src/parquet/type_fwd.h +++ b/cpp/src/parquet/type_fwd.h @@ -54,11 +54,60 @@ struct ParquetVersion { /// Note: Parquet format 2.6.0 was released in September 2018. PARQUET_2_6, + /// Enable Parquet format 2.7 and earlier features when writing + /// + /// This enables bloom filters and encryption in addition to the + /// PARQUET_2_6 features. + /// + /// Note: Parquet format 2.7.0 was released in June 2019. + PARQUET_2_7, + + /// Enable Parquet format 2.8 and earlier features when writing + /// + /// This enables BYTE_STREAM_SPLIT encoding in addition to the + /// PARQUET_2_7 features. + /// + /// Note: Parquet format 2.8.0 was released in February 2020. + PARQUET_2_8, + + /// Enable Parquet format 2.9 and earlier features when writing + /// + /// This enables interoperable LZ4 codec in addition to the + /// PARQUET_2_8 features. + /// + /// Note: Parquet format 2.9.0 was released in January 2021. + PARQUET_2_9, + + /// Enable Parquet format 2.10 and earlier features when writing + /// + /// This enables Float16 logical type in addition to the + /// PARQUET_2_9 features. + /// + /// Note: Parquet format 2.10.0 was released in October 2022. + PARQUET_2_10, + + /// Enable Parquet format 2.11 and earlier features when writing + /// + /// This enables VARIANT logical type, GEOMETRY/GEOGRAPHY types, + /// and extended BYTE_STREAM_SPLIT encoding for INT32/INT64/FIXED_LEN_BYTE_ARRAY + /// in addition to the PARQUET_2_10 features. + /// + /// Note: Parquet format 2.11.0 was released in March 2025. + PARQUET_2_11, + + /// Enable Parquet format 2.12 and earlier features when writing + /// + /// This finalizes the VARIANT logical type specification and shredding + /// in addition to the PARQUET_2_11 features. + /// + /// Note: Parquet format 2.12.0 was released in August 2025. + PARQUET_2_12, + /// Enable latest Parquet format 2.x features /// /// This value is equal to the greatest 2.x version supported by /// this library. - PARQUET_2_LATEST = PARQUET_2_6 + PARQUET_2_LATEST = PARQUET_2_12 }; }; diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index d59c70a2744..11ce81934d9 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -1113,7 +1113,7 @@ cdef class FileMetaData(_Weakrefable): """ Parquet format version used in file (str, such as '1.0', '2.4'). - If version is missing or unparsable, will default to assuming '2.6'. + If version is missing or unparsable, will default to assuming '2.12'. """ cdef ParquetVersion version = self._metadata.version() if version == ParquetVersion_V1: @@ -1122,9 +1122,21 @@ cdef class FileMetaData(_Weakrefable): return '2.4' elif version == ParquetVersion_V2_6: return '2.6' + elif version == ParquetVersion_V2_7: + return '2.7' + elif version == ParquetVersion_V2_8: + return '2.8' + elif version == ParquetVersion_V2_9: + return '2.9' + elif version == ParquetVersion_V2_10: + return '2.10' + elif version == ParquetVersion_V2_11: + return '2.11' + elif version == ParquetVersion_V2_12: + return '2.12' else: - warnings.warn(f'Unrecognized file version, assuming 2.6: {version}') - return '2.6' + warnings.warn(f'Unrecognized file version, assuming 2.12: {version}') + return '2.12' @property def created_by(self): diff --git a/python/pyarrow/includes/libparquet.pxd b/python/pyarrow/includes/libparquet.pxd index d9dd9d1aec9..df25b85b9e6 100644 --- a/python/pyarrow/includes/libparquet.pxd +++ b/python/pyarrow/includes/libparquet.pxd @@ -144,6 +144,12 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: ParquetVersion_V1" parquet::ParquetVersion::PARQUET_1_0" ParquetVersion_V2_4" parquet::ParquetVersion::PARQUET_2_4" ParquetVersion_V2_6" parquet::ParquetVersion::PARQUET_2_6" + ParquetVersion_V2_7" parquet::ParquetVersion::PARQUET_2_7" + ParquetVersion_V2_8" parquet::ParquetVersion::PARQUET_2_8" + ParquetVersion_V2_9" parquet::ParquetVersion::PARQUET_2_9" + ParquetVersion_V2_10" parquet::ParquetVersion::PARQUET_2_10" + ParquetVersion_V2_11" parquet::ParquetVersion::PARQUET_2_11" + ParquetVersion_V2_12" parquet::ParquetVersion::PARQUET_2_12" enum ParquetSortOrder" parquet::SortOrder::type": ParquetSortOrder_SIGNED" parquet::SortOrder::SIGNED" diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 148bfebaa67..241177020e3 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -67,7 +67,7 @@ def test_parquet_metadata_api(): assert meta.num_rows == len(df) assert meta.num_columns == ncols + 1 # +1 for index assert meta.num_row_groups == 1 - assert meta.format_version == '2.6' + assert meta.format_version == '2.12' assert 'parquet-cpp' in meta.created_by assert isinstance(meta.serialized_size, int) assert isinstance(meta.metadata, dict) @@ -554,12 +554,12 @@ def test_write_metadata(tempdir): assert b'ARROW:schema' not in schema_as_arrow.metadata # pass through writer keyword arguments - for version in ["1.0", "2.4", "2.6"]: + for version in ["1.0", "2.4", "2.6", "2.7", "2.8", "2.9", "2.10", "2.11", "2.12"]: pq.write_metadata(schema, path, version=version) parquet_meta = pq.read_metadata(path) # The version is stored as a single integer in the Parquet metadata, # so it cannot correctly express dotted format versions - expected_version = "1.0" if version == "1.0" else "2.6" + expected_version = "1.0" if version == "1.0" else "2.12" assert parquet_meta.format_version == expected_version # metadata_collector: list of FileMetaData objects diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 32bcebb28de..658f0d8a65d 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -4928,14 +4928,14 @@ def test_write_dataset_parquet(tempdir): assert result.equals(table) # using custom options - for version in ["1.0", "2.4", "2.6"]: + for version in ["1.0", "2.4", "2.6", "2.7", "2.8", "2.9", "2.10", "2.11", "2.12"]: format = ds.ParquetFileFormat() opts = format.make_write_options(version=version) assert "