Skip to content

Commit

Permalink
GH-37582: [Go][Parquet] Implement Float16 logical type (#37599)
Browse files Browse the repository at this point in the history
### Rationale for this change

There is an active proposal for a Float16 logical type in Parquet (apache/parquet-format#184) with C++/Python implementations in progress (#36073), so we should add one for Go as well.

### What changes are included in this PR?

- [x] Adds `LogicalType` definitions and methods for `Float16`
- [x] Adds support for `Float16` column statistics and comparators
- [x] Adds support for interchange between Parquet and Arrow's half-precision float

### Are these changes tested?

Yes

### Are there any user-facing changes?

Yes

* Closes: #37582

Authored-by: benibus <[email protected]>
Signed-off-by: Matt Topol <[email protected]>
  • Loading branch information
benibus authored Nov 13, 2023
1 parent 1ff43ab commit bff5fb9
Show file tree
Hide file tree
Showing 25 changed files with 1,183 additions and 75 deletions.
46 changes: 41 additions & 5 deletions go/arrow/float16/float16.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package float16

import (
"encoding/binary"
"math"
"strconv"
)
Expand All @@ -29,6 +30,11 @@ type Num struct {
bits uint16
}

var (
// MaxNum has sign bit 0, exponent 0b11110 and an all-ones fraction
// (0x7bff): the largest finite half-precision value, 65504.
MaxNum = Num{bits: 0b0111101111111111}
// MinNum is MaxNum passed through Negate — presumably the most negative
// finite value (-65504); confirm against Negate's implementation.
MinNum = MaxNum.Negate()
)

// New creates a new half-precision floating point value from the provided
// float32 value.
func New(f float32) Num {
Expand Down Expand Up @@ -86,6 +92,11 @@ func (n Num) Div(rhs Num) Num {
return New(n.Float32() / rhs.Float32())
}

// Equal reports whether n and other represent the same numeric value.
//
// Both operands are widened with Float32 and compared with ==, so the result
// follows ordinary floating-point equality rather than a comparison of the
// raw bit patterns.
func (n Num) Equal(other Num) bool {
	lhs, rhs := n.Float32(), other.Float32()
	return lhs == rhs
}

// Greater returns true if the value represented by n is > other
func (n Num) Greater(other Num) bool {
return n.Float32() > other.Float32()
Expand Down Expand Up @@ -152,14 +163,39 @@ func (n Num) Abs() Num {
}

func (n Num) Sign() int {
f := n.Float32()
if f > 0 {
return 1
} else if f == 0 {
if n.IsZero() {
return 0
} else if n.Signbit() {
return -1
}
return -1
return 1
}

// Signbit reports whether the most significant (sign) bit of n is set. It
// looks only at that bit, so it is true for negative zero as well.
func (n Num) Signbit() bool {
	return n.bits>>15 == 1
}

// IsNaN reports whether n is a NaN: all five exponent bits set together with
// a non-zero fraction. The sign bit is ignored.
func (n Num) IsNaN() bool {
	const (
		expMask  = 0x7c00
		fracMask = 0x03ff
	)
	return n.bits&expMask == expMask && n.bits&fracMask != 0
}

// IsZero reports whether n is positive or negative zero, i.e. every bit other
// than the sign bit is clear.
func (n Num) IsZero() bool {
	return n.bits == 0x0000 || n.bits == 0x8000
}

// Uint16 returns the raw 16-bit pattern stored in n, unchanged.
func (n Num) Uint16() uint16 {
	return n.bits
}
// String formats the value returned by Float32 with strconv.FormatFloat,
// using the 'g' format, precision -1 (the fewest digits that identify the
// value uniquely) and a bit size of 32.
func (n Num) String() string {
	widened := float64(n.Float32())
	return strconv.FormatFloat(widened, 'g', -1, 32)
}

// Inf returns the Num with bit pattern 0x7c00: sign bit clear, all exponent
// bits set and a zero fraction, i.e. positive infinity.
func Inf() Num {
	const posInfBits = 0x7c00
	return Num{bits: posInfBits}
}

// NaN returns the Num with bit pattern 0x7fff: sign bit clear, all exponent
// bits set and every fraction bit set, which is a NaN encoding.
func NaN() Num {
	const nanBits = 0x7fff
	return Num{bits: nanBits}
}

// FromBits wraps src as a Num without any conversion: src becomes the stored
// bit pattern verbatim.
func FromBits(src uint16) Num {
	var n Num
	n.bits = src
	return n
}

// FromLEBytes decodes the first two bytes of src as a little-endian 16-bit
// pattern and returns it as a Num. Any further bytes are ignored. It panics
// if src holds fewer than two bytes.
func FromLEBytes(src []byte) Num {
	raw := binary.LittleEndian.Uint16(src)
	return Num{bits: raw}
}

// PutLEBytes writes the 16-bit pattern of n into dst[0] and dst[1] in
// little-endian byte order. Bytes past the first two are left untouched. It
// panics if dst holds fewer than two bytes.
func (n Num) PutLEBytes(dst []byte) {
	raw := n.bits
	binary.LittleEndian.PutUint16(dst, raw)
}

// ToLEBytes returns a freshly allocated two-byte slice filled by PutLEBytes,
// i.e. the little-endian encoding of n's bit pattern.
func (n Num) ToLEBytes() []byte {
	var buf [2]byte
	n.PutLEBytes(buf[:])
	return buf[:]
}
43 changes: 43 additions & 0 deletions go/arrow/float16/float16_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ func TestSign(t *testing.T) {
}{
{Num{bits: 0x4580}, 1}, // 5.5
{Num{bits: 0x0000}, 0}, // 0
{Num{bits: 0x8000}, 0}, // -0
{Num{bits: 0xC580}, -1}, // -5.5
} {
t.Run("sign", func(t *testing.T) {
Expand All @@ -248,3 +249,45 @@ func TestSign(t *testing.T) {
})
}
}

// TestSignbit checks Signbit against positive and negative values, including
// both zeros, where only the raw sign bit distinguishes the two.
func TestSignbit(t *testing.T) {
	type testCase struct {
		n    Num
		want bool
	}
	cases := []testCase{
		{Num{bits: 0x4580}, false}, // 5.5
		{Num{bits: 0x0000}, false}, // 0
		{Num{bits: 0x8000}, true},  // -0
		{Num{bits: 0xC580}, true},  // -5.5
	}
	for _, tc := range cases {
		t.Run("signbit", func(t *testing.T) {
			if got := tc.n.Signbit(); got != tc.want {
				t.Fatalf("invalid value. got=%v, want=%v", got, tc.want)
			}
		})
	}
}

// TestIsNaN checks IsNaN on the canonical NaN and infinities (and their
// negations) plus several explicit NaN bit patterns of either sign.
func TestIsNaN(t *testing.T) {
	type testCase struct {
		n    Num
		want bool
	}
	cases := []testCase{
		{NaN(), true},
		{NaN().Negate(), true},
		{Inf(), false},
		{Inf().Negate(), false},
		{Num{bits: 0x7c01}, true}, // nan
		{Num{bits: 0xfc01}, true}, // -nan
		{Num{bits: 0x7e00}, true}, // nan
		{Num{bits: 0xfe00}, true}, // -nan
	}
	for _, tc := range cases {
		t.Run("isnan", func(t *testing.T) {
			if got := tc.n.IsNaN(); got != tc.want {
				t.Fatalf("invalid value. got=%v, want=%v", got, tc.want)
			}
		})
	}
}
13 changes: 11 additions & 2 deletions go/parquet/file/column_writer_types.gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 22 additions & 6 deletions go/parquet/file/column_writer_types.gen.go.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ package file

import (
"fmt"

"github.com/apache/arrow/go/v15/parquet"
"github.com/apache/arrow/go/v15/parquet/metadata"
"github.com/apache/arrow/go/v15/parquet/internal/encoding"
Expand Down Expand Up @@ -83,7 +83,7 @@ func (w *{{.Name}}ColumnChunkWriter) WriteBatch(values []{{.name}}, defLevels, r
// writes a large number of values, the DataPage size can be much above the limit.
// The purpose of this chunking is to bound this. Even if a user writes large number
// of values, the chunking will ensure the AddDataPage() is called at a reasonable
// pagesize limit
// pagesize limit
var n int64
switch {
case defLevels != nil:
Expand All @@ -107,7 +107,7 @@ func (w *{{.Name}}ColumnChunkWriter) WriteBatch(values []{{.name}}, defLevels, r
valueOffset += toWrite
w.checkDictionarySizeLimit()
})
return
return
}

// WriteBatchSpaced writes a batch of repetition levels, definition levels, and values to the
Expand All @@ -132,7 +132,7 @@ func (w *{{.Name}}ColumnChunkWriter) WriteBatchSpaced(values []{{.name}}, defLev
length = len(values)
}
doBatches(int64(length), w.props.WriteBatchSize(), func(offset, batch int64) {
var vals []{{.name}}
var vals []{{.name}}
info := w.maybeCalculateValidityBits(levelSliceOrNil(defLevels, offset, batch), batch)

w.writeLevelsSpaced(batch, levelSliceOrNil(defLevels, offset, batch), levelSliceOrNil(repLevels, offset, batch))
Expand Down Expand Up @@ -165,7 +165,7 @@ func (w *{{.Name}}ColumnChunkWriter) WriteDictIndices(indices arrow.Array, defLe
}
}
}()

valueOffset := int64(0)
length := len(defLevels)
if defLevels == nil {
Expand Down Expand Up @@ -193,14 +193,22 @@ func (w *{{.Name}}ColumnChunkWriter) WriteDictIndices(indices arrow.Array, defLe

valueOffset += info.numSpaced()
})

return
}

// writeValues passes values to the current encoder and, when page statistics
// are present, updates them with the same values and null count.
func (w *{{.Name}}ColumnChunkWriter) writeValues(values []{{.name}}, numNulls int64) {
w.currentEncoder.(encoding.{{.Name}}Encoder).Put(values)
if w.pageStatistics != nil {
{{- if ne .Name "FixedLenByteArray"}}
w.pageStatistics.(*metadata.{{.Name}}Statistics).Update(values, numNulls)
{{- else}}
// A column whose logical type equals Float16 is expected to hold
// Float16Statistics instead of the generic FixedLenByteArray statistics, so
// the type assertion must match the logical type.
if w.Descr().LogicalType().Equals(schema.Float16LogicalType{}) {
w.pageStatistics.(*metadata.Float16Statistics).Update(values, numNulls)
} else {
w.pageStatistics.(*metadata.{{.Name}}Statistics).Update(values, numNulls)
}
{{- end}}
}
}

Expand All @@ -212,7 +220,15 @@ func (w *{{.Name}}ColumnChunkWriter) writeValuesSpaced(spacedValues []{{.name}},
}
if w.pageStatistics != nil {
nulls := numValues - numRead
{{- if ne .Name "FixedLenByteArray"}}
w.pageStatistics.(*metadata.{{.Name}}Statistics).UpdateSpaced(spacedValues, validBits, validBitsOffset, nulls)
{{- else}}
if w.Descr().LogicalType().Equals(schema.Float16LogicalType{}) {
w.pageStatistics.(*metadata.Float16Statistics).UpdateSpaced(spacedValues, validBits, validBitsOffset, nulls)
} else {
w.pageStatistics.(*metadata.{{.Name}}Statistics).UpdateSpaced(spacedValues, validBits, validBitsOffset, nulls)
}
{{- end}}
}
}

Expand Down
2 changes: 1 addition & 1 deletion go/parquet/internal/gen-go/parquet/GoUnusedProtection__.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 8 additions & 1 deletion go/parquet/internal/gen-go/parquet/parquet-consts.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit bff5fb9

Please sign in to comment.