@@ -81,25 +81,68 @@ std::shared_ptr<Field> FieldForArray(const std::shared_ptr<Array>& array,
8181}
8282
8383std::vector<WriteConfig> GetWriteConfigurations () {
84+ auto default_properties_builder = [] {
85+ auto builder = WriterProperties::Builder ();
86+ // Override current default of 1MB
87+ builder.data_pagesize (10'000 );
88+ // Reduce max dictionary page size so that fewer pages are dict-encoded.
89+ builder.dictionary_pagesize_limit (1'000 );
90+ // Emit various physical types for decimal columns
91+ builder.enable_store_decimal_as_integer ();
92+ // DataPageV2 has more interesting features such as selective compression
93+ builder.data_page_version (parquet::ParquetDataPageVersion::V2);
94+ return builder;
95+ };
96+
97+ auto default_arrow_properties_builder = [] {
98+ auto builder = ArrowWriterProperties::Builder ();
99+ // Store the Arrow schema so as to exercise more data types when reading
100+ builder.store_schema ();
101+ return builder;
102+ };
103+
84104 // clang-format off
85- auto w_brotli = WriterProperties::Builder ()
86- .disable_dictionary (" no_dict" )
87- ->compression (" compressed" , Compression::BROTLI)
88- // Override current default of 1MB
89- ->data_pagesize (20'000 )
90- // Reduce max dictionary page size so that less pages are dict-encoded.
91- ->dictionary_pagesize_limit (1'000 )
92- // Emit various physical types for decimal columns
93- ->enable_store_decimal_as_integer ()
105+ auto w_uncompressed = default_properties_builder ()
106+ .build ();
107+ // compressed columns with dictionary disabled
108+ auto w_brotli = default_properties_builder ()
109+ .disable_dictionary ()
110+ ->compression (Compression::BROTLI)
111+ ->build ();
112+ auto w_gzip = default_properties_builder ()
113+ .disable_dictionary ()
114+ ->compression (Compression::GZIP)
94115 ->build ();
95- // Store the Arrow schema so as to exercise more data types when reading
96- auto a_default = ArrowWriterProperties::Builder{}
97- . store_schema ( )
116+ auto w_lz4 = default_properties_builder ()
117+ . disable_dictionary ()
118+ -> compression (Compression::LZ4 )
98119 ->build ();
120+ auto w_snappy = default_properties_builder ()
121+ .disable_dictionary ()
122+ ->compression (Compression::SNAPPY)
123+ ->build ();
124+ auto w_zstd = default_properties_builder ()
125+ .disable_dictionary ()
126+ ->compression (Compression::ZSTD)
127+ ->build ();
128+ // v1 data pages
129+ auto w_pages_v1 = default_properties_builder ()
130+ .disable_dictionary ()
131+ ->compression (Compression::LZ4)
132+ ->data_page_version (parquet::ParquetDataPageVersion::V1)
133+ ->build ();
134+
135+ auto a_default = default_arrow_properties_builder ().build ();
99136 // clang-format on
100137
101138 std::vector<WriteConfig> configs;
139+ configs.push_back ({w_uncompressed, a_default});
102140 configs.push_back ({w_brotli, a_default});
141+ configs.push_back ({w_gzip, a_default});
142+ configs.push_back ({w_lz4, a_default});
143+ configs.push_back ({w_snappy, a_default});
144+ configs.push_back ({w_zstd, a_default});
145+ configs.push_back ({w_pages_v1, a_default});
103146 return configs;
104147}
105148
@@ -255,8 +298,6 @@ Result<std::vector<Column>> ExampleColumns(int32_t length,
255298
256299 // TODO extension types: UUID, JSON, GEOMETRY, GEOGRAPHY
257300
258- // A non-dict-encoded column (see GetWriteConfigurations)
259- columns.push_back ({" no_dict" , gen.String (length, 0 , 30 , null_probability)});
260301 // A column that should be quite compressible (see GetWriteConfigurations)
261302 columns.push_back ({" compressed" , gen.Int64 (length, -10 , 10 , null_probability)});
262303
0 commit comments