diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f5bd7db8..ff52c967 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -9,6 +9,7 @@ jobs: go: - '1.17.x' - '1.18.x' + - '1.19.x' tags: - '' - purego @@ -44,7 +45,7 @@ jobs: - name: Setup Go ${{ matrix.go }} uses: actions/setup-go@v3 with: - go-version: 1.18.x + go-version: 1.19.x - name: Validate formatting run: make format diff --git a/bloom/filter.go b/bloom/filter.go index f4699b96..046a2180 100644 --- a/bloom/filter.go +++ b/bloom/filter.go @@ -43,7 +43,6 @@ func MakeSplitBlockFilter(data []byte) SplitBlockFilter { // filters in memory, for example: // // f := make(bloom.SplitBlockFilter, bloom.NumSplitBlocksOf(n, 10)) -// func NumSplitBlocksOf(numValues int64, bitsPerValue uint) int { numBytes := ((uint(numValues) * bitsPerValue) + 7) / 8 numBlocks := (numBytes + (BlockSize - 1)) / BlockSize diff --git a/buffer.go b/buffer.go index f4004150..b6856648 100644 --- a/buffer.go +++ b/buffer.go @@ -36,7 +36,6 @@ type Buffer struct { // buffer := parquet.NewBuffer(config) // ... // } -// func NewBuffer(options ...RowGroupOption) *Buffer { config, err := NewRowGroupConfig(options...) if err != nil { diff --git a/column_buffer_go18.go b/column_buffer_go18.go index 918b0d3e..0afee027 100644 --- a/column_buffer_go18.go +++ b/column_buffer_go18.go @@ -19,9 +19,8 @@ import ( // // - rows is the array of Go values to write to the column buffers. // -// - levels is used to track the column index, repetition and definition levels -// of values when writing optional or repeated columns. -// +// - levels is used to track the column index, repetition and definition levels +// of values when writing optional or repeated columns. 
type writeRowsFunc func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error // writeRowsFuncOf generates a writeRowsFunc function for the given Go type and diff --git a/config.go b/config.go index 6cdddb0e..25c8e098 100644 --- a/config.go +++ b/config.go @@ -28,7 +28,6 @@ const ( // SkipPageIndex: true, // SkipBloomFilters: true, // }) -// type FileConfig struct { SkipPageIndex bool SkipBloomFilters bool @@ -82,7 +81,6 @@ func (c *FileConfig) Validate() error { // reader := parquet.NewReader(output, schema, &parquet.ReaderConfig{ // // ... // }) -// type ReaderConfig struct { Schema *Schema } @@ -131,7 +129,6 @@ func (c *ReaderConfig) Validate() error { // writer := parquet.NewWriter(output, schema, &parquet.WriterConfig{ // CreatedBy: "my test program", // }) -// type WriterConfig struct { CreatedBy string ColumnPageBuffers PageBufferPool @@ -225,7 +222,6 @@ func (c *WriterConfig) Validate() error { // buffer := parquet.NewBuffer(&parquet.RowGroupConfig{ // ColumnBufferCapacity: 10_000, // }) -// type RowGroupConfig struct { ColumnBufferCapacity int SortingColumns []SortingColumn diff --git a/file.go b/file.go index 0fa5785a..7445421d 100644 --- a/file.go +++ b/file.go @@ -147,19 +147,19 @@ func OpenFile(r io.ReaderAt, size int64, options ...FileOption) (*File, error) { // Only leaf columns have indexes, the returned indexes are arranged using the // following layout: // -// + -------------- + -// | col 0: chunk 0 | -// + -------------- + -// | col 1: chunk 0 | -// + -------------- + -// | ... | -// + -------------- + -// | col 0: chunk 1 | -// + -------------- + -// | col 1: chunk 1 | -// + -------------- + -// | ... | -// + -------------- + +// - -------------- + +// | col 0: chunk 0 | +// - -------------- + +// | col 1: chunk 0 | +// - -------------- + +// | ... | +// - -------------- + +// | col 0: chunk 1 | +// - -------------- + +// | col 1: chunk 1 | +// - -------------- + +// | ... 
| +// - -------------- + // // This method is useful in combination with the SkipPageIndex option to delay // reading the page index section until after the file was opened. Note that in diff --git a/format/parquet.go b/format/parquet.go index 04610798..08c1c986 100644 --- a/format/parquet.go +++ b/format/parquet.go @@ -281,11 +281,11 @@ func (t *LogicalType) String() string { // Represents a element inside a schema definition. // -// - if it is a group (inner node) then type is undefined and num_children is -// defined +// - if it is a group (inner node) then type is undefined and num_children is +// defined // -// - if it is a primitive type (leaf) then type is defined and num_children is -// undefined +// - if it is a primitive type (leaf) then type is defined and num_children is +// undefined // // The nodes are listed in depth first traversal order. type SchemaElement struct { diff --git a/go.mod b/go.mod index 690e0d3e..54551e23 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/segmentio/parquet-go -go 1.18 +go 1.19 require ( github.com/andybalholm/brotli v1.0.3 diff --git a/hashprobe/hashprobe.go b/hashprobe/hashprobe.go index 9e4a5971..026ba7e7 100644 --- a/hashprobe/hashprobe.go +++ b/hashprobe/hashprobe.go @@ -149,7 +149,7 @@ func (t *Uint32Table) ProbeArray(keys sparse.Uint32Array, values []int32) int { // // The table uses the following memory layout: // -// [group 0][group 1][...][group N] +// [group 0][group 1][...][group N] // // Each group contains up to 7 key/value pairs, and is exactly 64 bytes in size, // which allows it to fit within a single cache line, and ensures that probes @@ -598,7 +598,7 @@ func (t *Uint128Table) ProbeArray(keys sparse.Uint128Array, values []int32) int // // This table uses the following memory layout: // -// [key A][key B][...][value A][value B][...] +// [key A][key B][...][value A][value B][...] 
// // The table stores values as their actual value plus one, and uses zero as a // sentinel to determine whether a slot is occupied. A linear probing strategy diff --git a/internal/bytealg/count_amd64.go b/internal/bytealg/count_amd64.go index c4263564..b41d3d8d 100644 --- a/internal/bytealg/count_amd64.go +++ b/internal/bytealg/count_amd64.go @@ -12,7 +12,6 @@ package bytealg // name old speed new speed delta // CountByte 49.6GB/s ± 0% 93.2GB/s ± 0% +87.74% (p=0.000 n=10+10) // -// // On systems that do not have AVX-512, the AVX2 version of the code is also // optimized to make use of multiple register lanes, which gives a bit better // throughput than the standard library function: @@ -23,6 +22,5 @@ package bytealg // name old speed new speed delta // CountByte 49.6GB/s ± 0% 67.1GB/s ± 0% +35.21% (p=0.000 n=10+10) // -// //go:noescape func Count(data []byte, value byte) int diff --git a/internal/unsafecast/unsafecast_go18.go b/internal/unsafecast/unsafecast_go18.go index fa869c32..fc31b685 100644 --- a/internal/unsafecast/unsafecast_go18.go +++ b/internal/unsafecast/unsafecast_go18.go @@ -8,8 +8,7 @@ // casting a [][16]byte to a []byte in order to use functions of the standard // bytes package on the slices. // -// With great power comes great responsibility. -// +// With great power comes great responsibility. package unsafecast import ( diff --git a/page_bounds_amd64.go b/page_bounds_amd64.go index 8360382b..9cb513b4 100644 --- a/page_bounds_amd64.go +++ b/page_bounds_amd64.go @@ -23,7 +23,6 @@ package parquet // running more AVX-512 instructions in the tight loops causes more contention // on CPU ports. // -// // Optimizations being trade offs, using min/max functions independently appears // to yield better throughput when the data resides in CPU caches: // diff --git a/reader.go b/reader.go index db68bb8d..1a355b6e 100644 --- a/reader.go +++ b/reader.go @@ -28,7 +28,6 @@ import ( // ... 
// } // -// // For programs building with Go 1.18 or later, the GenericReader[T] type // supersedes this one. type Reader struct { @@ -61,7 +60,6 @@ type Reader struct { // reader := parquet.NewReader(input, config) // ... // } -// func NewReader(input io.ReaderAt, options ...ReaderOption) *Reader { c, err := NewReaderConfig(options...) if err != nil { diff --git a/schema.go b/schema.go index 68e5e28e..0dcf7cb7 100644 --- a/schema.go +++ b/schema.go @@ -59,14 +59,14 @@ type Schema struct { // timestamp | for int64 types use the TIMESTAMP logical type with, by default, millisecond precision // split | for float32/float64, use the BYTE_STREAM_SPLIT encoding // -// The date logical type is an int32 value of the number of days since the unix epoch +// The date logical type is an int32 value of the number of days since the unix epoch. // // The timestamp precision can be changed by defining which precision to use as an argument. // Supported precisions are: nanosecond, millisecond and microsecond. Example: // -// type Message struct { -// TimestrampMicros int64 `parquet:"timestamp_micros,timestamp(microsecond)" -// } +// type Message struct { +// TimestampMicros int64 `parquet:"timestamp_micros,timestamp(microsecond)"` +// } // // The decimal tag must be followed by two integer parameters, the first integer // representing the scale and the second the precision; for example: @@ -90,9 +90,9 @@ type Schema struct { // // For example, the following will set the int64 key of the map to be a timestamp: // -// type Actions struct { -// Action map[int64]string `parquet:"," parquet-key:",timestamp"` -// } +// type Actions struct { +// Action map[int64]string `parquet:"," parquet-key:",timestamp"` +// } // // The schema name is the Go type name of the value. 
func SchemaOf(model interface{}) *Schema { diff --git a/search.go b/search.go index 758b1b39..67ead924 100644 --- a/search.go +++ b/search.go @@ -56,7 +56,6 @@ func Search(index ColumnIndex, value Value, typ Type) int { // pageIndex := parquet.Find(columnIndex, value, // parquet.CompareNullsFirst(typ.Compare), // ) -// func Find(index ColumnIndex, value Value, cmp func(Value, Value) int) int { switch { case index.IsAscending(): diff --git a/sort.go b/sort.go index fdfff59b..5517cc24 100644 --- a/sort.go +++ b/sort.go @@ -10,7 +10,6 @@ package parquet // Descending: true, // NullsFirst: true, // }) -// type SortConfig struct { MaxRepetitionLevel int MaxDefinitionLevel int diff --git a/value.go b/value.go index 81516d54..4f8ea07d 100644 --- a/value.go +++ b/value.go @@ -447,19 +447,19 @@ func (v Value) AppendBytes(b []byte) []byte { // // The following formatting options are supported: // -// %c prints the column index -// %+c prints the column index, prefixed with "C:" -// %d prints the definition level -// %+d prints the definition level, prefixed with "D:" -// %r prints the repetition level -// %+r prints the repetition level, prefixed with "R:" -// %q prints the quoted representation of v -// %+q prints the quoted representation of v, prefixed with "V:" -// %s prints the string representation of v -// %+s prints the string representation of v, prefixed with "V:" -// %v same as %s -// %+v prints a verbose representation of v -// %#v prints a Go value representation of v +// %c prints the column index +// %+c prints the column index, prefixed with "C:" +// %d prints the definition level +// %+d prints the definition level, prefixed with "D:" +// %r prints the repetition level +// %+r prints the repetition level, prefixed with "R:" +// %q prints the quoted representation of v +// %+q prints the quoted representation of v, prefixed with "V:" +// %s prints the string representation of v +// %+s prints the string representation of v, prefixed with "V:" +// %v same as 
%s +// %+v prints a verbose representation of v +// %#v prints a Go value representation of v // // Format satisfies the fmt.Formatter interface. func (v Value) Format(w fmt.State, r rune) { diff --git a/writer.go b/writer.go index 66c6363a..a472fac2 100644 --- a/writer.go +++ b/writer.go @@ -68,7 +68,6 @@ type Writer struct { // writer := parquet.NewWriter(output, config) // ... // } -// func NewWriter(output io.Writer, options ...WriterOption) *Writer { config, err := NewWriterConfig(options...) if err != nil { diff --git a/writer_go18.go b/writer_go18.go index 8fe98346..b1f9598b 100644 --- a/writer_go18.go +++ b/writer_go18.go @@ -12,29 +12,29 @@ import ( // // Using this type over Writer has multiple advantages: // -// - By leveraging type information, the Go compiler can provide greater -// guarantees that the code is correct. For example, the parquet.Writer.Write -// method accepts an argument of type interface{}, which delays type checking -// until runtime. The parquet.GenericWriter[T].Write method ensures at -// compile time that the values it receives will be of type T, reducing the -// risk of introducing errors. +// - By leveraging type information, the Go compiler can provide greater +// guarantees that the code is correct. For example, the parquet.Writer.Write +// method accepts an argument of type interface{}, which delays type checking +// until runtime. The parquet.GenericWriter[T].Write method ensures at +// compile time that the values it receives will be of type T, reducing the +// risk of introducing errors. // -// - Since type information is known at compile time, the implementation of -// parquet.GenericWriter[T] can make safe assumptions, removing the need for -// runtime validation of how the parameters are passed to its methods. -// Optimizations relying on type information are more effective, some of the -// writer's state can be precomputed at initialization, which was not possible -// with parquet.Writer. 
+// - Since type information is known at compile time, the implementation of +// parquet.GenericWriter[T] can make safe assumptions, removing the need for +// runtime validation of how the parameters are passed to its methods. +// Optimizations relying on type information are more effective, some of the +// writer's state can be precomputed at initialization, which was not possible +// with parquet.Writer. // -// - The parquet.GenericWriter[T].Write method uses a data-oriented design, -// accepting an slice of T instead of a single value, creating more -// opportunities to amortize the runtime cost of abstractions. -// This optimization is not available for parquet.Writer because its Write -// method's argument would be of type []interface{}, which would require -// conversions back and forth from concrete types to empty interfaces (since -// a []T cannot be interpreted as []interface{} in Go), would make the API -// more difficult to use and waste compute resources in the type conversions, -// defeating the purpose of the optimization in the first place. +// - The parquet.GenericWriter[T].Write method uses a data-oriented design, +// accepting a slice of T instead of a single value, creating more +// opportunities to amortize the runtime cost of abstractions. +// This optimization is not available for parquet.Writer because its Write +// method's argument would be of type []interface{}, which would require +// conversions back and forth from concrete types to empty interfaces (since +// a []T cannot be interpreted as []interface{} in Go), would make the API +// more difficult to use and waste compute resources in the type conversions, +// defeating the purpose of the optimization in the first place. // // Note that this type is only available when compiling with Go 1.18 or later. type GenericWriter[T any] struct {