diff --git a/convert.go b/convert.go index d77a2002..a37f7b3c 100644 --- a/convert.go +++ b/convert.go @@ -1,11 +1,19 @@ package parquet import ( + "encoding/binary" + "encoding/hex" "fmt" "io" + "math" + "math/big" + "strconv" "sync" + "time" + "github.com/segmentio/parquet-go/deprecated" "github.com/segmentio/parquet-go/encoding" + "github.com/segmentio/parquet-go/format" ) // ConvertError is an error type returned by calls to Convert when the conversion @@ -565,3 +573,475 @@ func (c *convertedRows) Schema() *Schema { func (c *convertedRows) SeekToRow(rowIndex int64) error { return c.rows.SeekToRow(rowIndex) } + +var ( + trueBytes = []byte(`true`) + falseBytes = []byte(`false`) + unixEpoch = time.Date(1970, time.January, 1, 0, 0, 0, 0, time.UTC) +) + +func convertBooleanToInt32(v Value) (Value, error) { + return v.convertToInt32(int32(v.byte())), nil +} + +func convertBooleanToInt64(v Value) (Value, error) { + return v.convertToInt64(int64(v.byte())), nil +} + +func convertBooleanToInt96(v Value) (Value, error) { + return v.convertToInt96(deprecated.Int96{0: uint32(v.byte())}), nil +} + +func convertBooleanToFloat(v Value) (Value, error) { + return v.convertToFloat(float32(v.byte())), nil +} + +func convertBooleanToDouble(v Value) (Value, error) { + return v.convertToDouble(float64(v.byte())), nil +} + +func convertBooleanToByteArray(v Value) (Value, error) { + return v.convertToByteArray([]byte{v.byte()}), nil +} + +func convertBooleanToFixedLenByteArray(v Value, size int) (Value, error) { + b := []byte{v.byte()} + c := make([]byte, size) + copy(c, b) + return v.convertToFixedLenByteArray(c), nil +} + +func convertBooleanToString(v Value) (Value, error) { + b := ([]byte)(nil) + if v.boolean() { + b = trueBytes + } else { + b = falseBytes + } + return v.convertToByteArray(b), nil +} + +func convertInt32ToBoolean(v Value) (Value, error) { + return v.convertToBoolean(v.int32() != 0), nil +} + +func convertInt32ToInt64(v Value) (Value, error) { + return v.convertToInt64(int64(v.int32())), nil +} + +func convertInt32ToInt96(v Value) (Value, error) { + return v.convertToInt96(deprecated.Int32ToInt96(v.int32())), nil +} + +func convertInt32ToFloat(v Value) (Value, error) { + return v.convertToFloat(float32(v.int32())), nil +} + +func convertInt32ToDouble(v Value) (Value, error) { + return v.convertToDouble(float64(v.int32())), nil +} + +func convertInt32ToByteArray(v Value) (Value, error) { + b := make([]byte, 4) + binary.LittleEndian.PutUint32(b, v.uint32()) + return v.convertToByteArray(b), nil +} + +func convertInt32ToFixedLenByteArray(v Value, size int) (Value, error) { + b := make([]byte, 4) + c := make([]byte, size) + binary.LittleEndian.PutUint32(b, v.uint32()) + copy(c, b) + return v.convertToFixedLenByteArray(c), nil +} + +func convertInt32ToString(v Value) (Value, error) { + return v.convertToByteArray(strconv.AppendInt(nil, int64(v.int32()), 10)), nil +} + +func convertInt64ToBoolean(v Value) (Value, error) { + return v.convertToBoolean(v.int64() != 0), nil +} + +func convertInt64ToInt32(v Value) (Value, error) { + return v.convertToInt32(int32(v.int64())), nil +} + +func convertInt64ToInt96(v Value) (Value, error) { + return v.convertToInt96(deprecated.Int64ToInt96(v.int64())), nil +} + +func convertInt64ToFloat(v Value) (Value, error) { + return v.convertToFloat(float32(v.int64())), nil +} + +func convertInt64ToDouble(v Value) (Value, error) { + return v.convertToDouble(float64(v.int64())), nil +} + +func convertInt64ToByteArray(v Value) (Value, error) { + b := make([]byte, 8) + binary.LittleEndian.PutUint64(b, v.uint64()) + return v.convertToByteArray(b), nil +} + +func convertInt64ToFixedLenByteArray(v Value, size int) (Value, error) { + b := make([]byte, 8) + c := make([]byte, size) + binary.LittleEndian.PutUint64(b, v.uint64()) + copy(c, b) + return v.convertToFixedLenByteArray(c), nil +} + +func convertInt64ToString(v Value) (Value, error) { + return v.convertToByteArray(strconv.AppendInt(nil, v.int64(), 10)), nil +} + +func convertInt96ToBoolean(v Value) (Value, error) { + return v.convertToBoolean(!v.int96().IsZero()), nil +} + +func convertInt96ToInt32(v Value) (Value, error) { + return v.convertToInt32(v.int96().Int32()), nil +} + +func convertInt96ToInt64(v Value) (Value, error) { + return v.convertToInt64(v.int96().Int64()), nil +} + +func convertInt96ToFloat(v Value) (Value, error) { + return v, invalidConversion(v, "INT96", "FLOAT") +} + +func convertInt96ToDouble(v Value) (Value, error) { + return v, invalidConversion(v, "INT96", "DOUBLE") +} + +func convertInt96ToByteArray(v Value) (Value, error) { + return v.convertToByteArray(v.byteArray()), nil +} + +func convertInt96ToFixedLenByteArray(v Value, size int) (Value, error) { + b := v.byteArray() + if len(b) < size { + c := make([]byte, size) + copy(c, b) + b = c + } else { + b = b[:size] + } + return v.convertToFixedLenByteArray(b), nil +} + +func convertInt96ToString(v Value) (Value, error) { + return v.convertToByteArray([]byte(v.String())), nil +} + +func convertFloatToBoolean(v Value) (Value, error) { + return v.convertToBoolean(v.float() != 0), nil +} + +func convertFloatToInt32(v Value) (Value, error) { + return v.convertToInt32(int32(v.float())), nil +} + +func convertFloatToInt64(v Value) (Value, error) { + return v.convertToInt64(int64(v.float())), nil +} + +func convertFloatToInt96(v Value) (Value, error) { + return v, invalidConversion(v, "FLOAT", "INT96") +} + +func convertFloatToDouble(v Value) (Value, error) { + return v.convertToDouble(float64(v.float())), nil +} + +func convertFloatToByteArray(v Value) (Value, error) { + b := make([]byte, 4) + binary.LittleEndian.PutUint32(b, v.uint32()) + return v.convertToByteArray(b), nil +} + +func convertFloatToFixedLenByteArray(v Value, size int) (Value, error) { + b := make([]byte, 4) + c := make([]byte, size) + binary.LittleEndian.PutUint32(b, v.uint32()) + copy(c, b) + return v.convertToFixedLenByteArray(c), nil +} + +func convertFloatToString(v Value) (Value, error) { + return v.convertToByteArray(strconv.AppendFloat(nil, float64(v.float()), 'g', -1, 32)), nil +} + +func convertDoubleToBoolean(v Value) (Value, error) { + return v.convertToBoolean(v.double() != 0), nil +} + +func convertDoubleToInt32(v Value) (Value, error) { + return v.convertToInt32(int32(v.double())), nil +} + +func convertDoubleToInt64(v Value) (Value, error) { + return v.convertToInt64(int64(v.double())), nil +} + +func convertDoubleToInt96(v Value) (Value, error) { + return v, invalidConversion(v, "FLOAT", "INT96") +} + +func convertDoubleToFloat(v Value) (Value, error) { + return v.convertToFloat(float32(v.double())), nil +} + +func convertDoubleToByteArray(v Value) (Value, error) { + b := make([]byte, 8) + binary.LittleEndian.PutUint64(b, v.uint64()) + return v.convertToByteArray(b), nil +} + +func convertDoubleToFixedLenByteArray(v Value, size int) (Value, error) { + b := make([]byte, 8) + c := make([]byte, size) + binary.LittleEndian.PutUint64(b, v.uint64()) + copy(c, b) + return v.convertToFixedLenByteArray(c), nil +} + +func convertDoubleToString(v Value) (Value, error) { + return v.convertToByteArray(strconv.AppendFloat(nil, v.double(), 'g', -1, 64)), nil +} + +func convertByteArrayToBoolean(v Value) (Value, error) { + return v.convertToBoolean(!isZero(v.byteArray())), nil +} + +func convertByteArrayToInt32(v Value) (Value, error) { + b := make([]byte, 4) + copy(b, v.byteArray()) + return v.convertToInt32(int32(binary.LittleEndian.Uint32(b))), nil +} + +func convertByteArrayToInt64(v Value) (Value, error) { + b := make([]byte, 8) + copy(b, v.byteArray()) + return v.convertToInt64(int64(binary.LittleEndian.Uint64(b))), nil +} + +func convertByteArrayToInt96(v Value) (Value, error) { + b := make([]byte, 12) + copy(b, v.byteArray()) + return v.convertToInt96(deprecated.Int96{ + 0: binary.LittleEndian.Uint32(b[0:4]), + 1: binary.LittleEndian.Uint32(b[4:8]), + 2: binary.LittleEndian.Uint32(b[8:12]), + }), nil +} + +func convertByteArrayToFloat(v Value) (Value, error) { + b := make([]byte, 4) + copy(b, v.byteArray()) + return v.convertToFloat(math.Float32frombits(binary.LittleEndian.Uint32(b))), nil +} + +func convertByteArrayToDouble(v Value) (Value, error) { + b := make([]byte, 8) + copy(b, v.byteArray()) + return v.convertToDouble(math.Float64frombits(binary.LittleEndian.Uint64(b))), nil +} + +func convertByteArrayToFixedLenByteArray(v Value, size int) (Value, error) { + b := v.byteArray() + if len(b) < size { + c := make([]byte, size) + copy(c, b) + b = c + } else { + b = b[:size] + } + return v.convertToFixedLenByteArray(b), nil +} + +func convertFixedLenByteArrayToString(v Value) (Value, error) { + b := v.byteArray() + c := make([]byte, hex.EncodedLen(len(b))) + hex.Encode(c, b) + return v.convertToByteArray(c), nil +} + +func convertStringToBoolean(v Value) (Value, error) { + b, err := strconv.ParseBool(v.string()) + if err != nil { + return v, conversionError(v, "STRING", "BOOLEAN", err) + } + return v.convertToBoolean(b), nil +} + +func convertStringToInt32(v Value) (Value, error) { + i, err := strconv.ParseInt(v.string(), 10, 32) + if err != nil { + return v, conversionError(v, "STRING", "INT32", err) + } + return v.convertToInt32(int32(i)), nil +} + +func convertStringToInt64(v Value) (Value, error) { + i, err := strconv.ParseInt(v.string(), 10, 64) + if err != nil { + return v, conversionError(v, "STRING", "INT64", err) + } + return v.convertToInt64(i), nil +} + +func convertStringToInt96(v Value) (Value, error) { + i, ok := new(big.Int).SetString(v.string(), 10) + if !ok { + return v, conversionError(v, "STRING", "INT96", strconv.ErrSyntax) + } + b := i.Bytes() + c := make([]byte, 12) + copy(c, b) + i96 := deprecated.BytesToInt96(c) + return v.convertToInt96(i96[0]), nil +} + +func convertStringToFloat(v Value) (Value, error) { + f, err := strconv.ParseFloat(v.string(), 32) + if err != nil { + return v, conversionError(v, "STRING", "FLOAT", err) + } + return v.convertToFloat(float32(f)), nil +} + +func convertStringToDouble(v Value) (Value, error) { + f, err := strconv.ParseFloat(v.string(), 64) + if err != nil { + return v, conversionError(v, "STRING", "DOUBLE", err) + } + return v.convertToDouble(f), nil +} + +func convertStringToFixedLenByteArray(v Value, size int) (Value, error) { + b := v.byteArray() + c := make([]byte, size) + _, err := hex.Decode(c, b) + if err != nil { + return v, conversionError(v, "STRING", "BYTE_ARRAY", err) + } + return v.convertToFixedLenByteArray(c), nil +} + +func convertStringToDate(v Value, tz *time.Location) (Value, error) { + t, err := time.ParseInLocation("2006-01-02", v.string(), tz) + if err != nil { + return v, conversionError(v, "STRING", "DATE", err) + } + d := daysSinceUnixEpoch(t) + return v.convertToInt32(int32(d)), nil +} + +func convertStringToTimeMillis(v Value, tz *time.Location) (Value, error) { + t, err := time.ParseInLocation("15:04:05.999", v.string(), tz) + if err != nil { + return v, conversionError(v, "STRING", "TIME", err) + } + m := nearestMidnightLessThan(t) + milliseconds := t.Sub(m).Milliseconds() + return v.convertToInt32(int32(milliseconds)), nil +} + +func convertStringToTimeMicros(v Value, tz *time.Location) (Value, error) { + t, err := time.ParseInLocation("15:04:05.999999", v.string(), tz) + if err != nil { + return v, conversionError(v, "STRING", "TIME", err) + } + m := nearestMidnightLessThan(t) + microseconds := t.Sub(m).Microseconds() + return v.convertToInt64(microseconds), nil +} + +func convertDateToTimestamp(v Value, u format.TimeUnit, tz *time.Location) (Value, error) { + t := unixEpoch.AddDate(0, 0, int(v.int32())) + d := timeUnitDuration(u) + return v.convertToInt64(int64(t.In(tz).Sub(unixEpoch) / d)), nil +} + +func convertDateToString(v Value) (Value, error) { + t := unixEpoch.AddDate(0, 0, int(v.int32())) + b := t.AppendFormat(make([]byte, 0, 10), "2006-01-02") + return v.convertToByteArray(b), nil +} + +func convertTimeMillisToString(v Value, tz *time.Location) (Value, error) { + t := time.UnixMilli(int64(v.int32())).In(tz) + b := t.AppendFormat(make([]byte, 0, 12), "15:04:05.999") + return v.convertToByteArray(b), nil +} + +func convertTimeMicrosToString(v Value, tz *time.Location) (Value, error) { + t := time.UnixMicro(v.int64()).In(tz) + b := t.AppendFormat(make([]byte, 0, 15), "15:04:05.999999") + return v.convertToByteArray(b), nil +} + +func convertTimestampToDate(v Value, u format.TimeUnit, tz *time.Location) (Value, error) { + t := timestamp(v, u, tz) + d := daysSinceUnixEpoch(t) + return v.convertToInt32(int32(d)), nil +} + +func convertTimestampToTimeMillis(v Value, u format.TimeUnit, sourceZone, targetZone *time.Location) (Value, error) { + t := timestamp(v, u, sourceZone) + m := nearestMidnightLessThan(t) + milliseconds := t.In(targetZone).Sub(m).Milliseconds() + return v.convertToInt32(int32(milliseconds)), nil +} + +func convertTimestampToTimeMicros(v Value, u format.TimeUnit, sourceZone, targetZone *time.Location) (Value, error) { + t := timestamp(v, u, sourceZone) + m := nearestMidnightLessThan(t) + microseconds := t.In(targetZone).Sub(m).Microseconds() + return v.convertToInt64(int64(microseconds)), nil +} + +func convertTimestampToTimestamp(v Value, sourceUnit, targetUnit format.TimeUnit) (Value, error) { + sourceScale := timeUnitDuration(sourceUnit).Nanoseconds() + targetScale := timeUnitDuration(targetUnit).Nanoseconds() + targetValue := (v.int64() * sourceScale) / targetScale + return v.convertToInt64(targetValue), nil +} + +const nanosecondsPerDay = 24 * 60 * 60 * 1e9 + +func daysSinceUnixEpoch(t time.Time) int { + return int(t.Sub(unixEpoch).Hours()) / 24 +} + +func nearestMidnightLessThan(t time.Time) time.Time { + y, m, d := t.Date() + return time.Date(y, m, d, 0, 0, 0, 0, t.Location()) +} + +func timestamp(v Value, u format.TimeUnit, tz *time.Location) time.Time { + return unixEpoch.In(tz).Add(time.Duration(v.int64()) * timeUnitDuration(u)) +} + +func timeUnitDuration(unit format.TimeUnit) time.Duration { + switch { + case unit.Millis != nil: + return time.Millisecond + case unit.Micros != nil: + return time.Microsecond + default: + return time.Nanosecond + } +} + +func invalidConversion(value Value, from, to string) error { + return fmt.Errorf("%s to %s: %s: %w", from, to, value, ErrInvalidConversion) +} + +func conversionError(value Value, from, to string, err error) error { + return fmt.Errorf("%s to %s: %q: %s: %w", from, to, value.string(), err, ErrInvalidConversion) +} diff --git a/convert_test.go b/convert_test.go index 9229770b..79872dca 100644 --- a/convert_test.go +++ b/convert_test.go @@ -6,6 +6,7 @@ import ( "time" "github.com/segmentio/parquet-go" + "github.com/segmentio/parquet-go/deprecated" ) type AddressBook1 struct { @@ -481,7 +482,7 @@ func TestConvert(t *testing.T) { func newInt64(i int64) *int64 { return &i } func newString(s string) *string { return &s } -func TestConvertTimestamp(t *testing.T) { +func TestConvertValue(t *testing.T) { now := time.Unix(42, 0) ms := now.UnixMilli() us := now.UnixMicro() @@ -510,60 +511,681 @@ func TestConvertTimestamp(t *testing.T) { fromType parquet.Type fromValue parquet.Value toType parquet.Type - expected int64 + toValue parquet.Value }{ + { + scenario: "true to boolean", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(true), + toType: parquet.BooleanType, + toValue: parquet.BooleanValue(true), + }, + + { + scenario: "true to int32", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(true), + toType: parquet.Int32Type, + toValue: parquet.Int32Value(1), + }, + + { + scenario: "true to int64", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(true), + toType: parquet.Int64Type, + toValue: parquet.Int64Value(1), + }, + + { + scenario: "true to int96", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(true), + toType: parquet.Int96Type, + toValue: parquet.Int96Value(deprecated.Int96{0: 1}), + }, + + { + scenario: "true to float", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(true), + toType: parquet.FloatType, + toValue: parquet.FloatValue(1), + }, + + { + scenario: "true to double", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(true), + toType: parquet.FloatType, + toValue: parquet.FloatValue(1), + }, + + { + scenario: "true to byte array", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(true), + toType: parquet.ByteArrayType, + toValue: parquet.ByteArrayValue([]byte{1}), + }, + + { + scenario: "true to fixed length byte array", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(true), + toType: parquet.FixedLenByteArrayType(4), + toValue: parquet.FixedLenByteArrayValue([]byte{1, 0, 0, 0}), + }, + + { + scenario: "true to string", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(true), + toType: parquet.String().Type(), + toValue: parquet.ByteArrayValue([]byte(`true`)), + }, + + { + scenario: "false to boolean", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(false), + toType: parquet.BooleanType, + toValue: parquet.BooleanValue(false), + }, + + { + scenario: "false to int32", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(false), + toType: parquet.Int32Type, + toValue: parquet.Int32Value(0), + }, + + { + scenario: "false to int64", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(false), + toType: parquet.Int64Type, + toValue: parquet.Int64Value(0), + }, + + { + scenario: "false to int96", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(false), + toType: parquet.Int96Type, + toValue: parquet.Int96Value(deprecated.Int96{}), + }, + + { + scenario: "false to float", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(false), + toType: parquet.FloatType, + toValue: parquet.FloatValue(0), + }, + + { + scenario: "false to double", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(false), + toType: parquet.FloatType, + toValue: parquet.FloatValue(0), + }, + + { + scenario: "false to byte array", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(false), + toType: parquet.ByteArrayType, + toValue: parquet.ByteArrayValue([]byte{0}), + }, + + { + scenario: "false to fixed length byte array", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(false), + toType: parquet.FixedLenByteArrayType(4), + toValue: parquet.FixedLenByteArrayValue([]byte{0, 0, 0, 0}), + }, + + { + scenario: "false to string", + fromType: parquet.BooleanType, + fromValue: parquet.BooleanValue(false), + toType: parquet.String().Type(), + toValue: parquet.ByteArrayValue([]byte(`false`)), + }, + + { + scenario: "int32 to true", + fromType: parquet.Int32Type, + fromValue: parquet.Int32Value(10), + toType: parquet.BooleanType, + toValue: parquet.BooleanValue(true), + }, + + { + scenario: "int32 to false", + fromType: parquet.Int32Type, + fromValue: parquet.Int32Value(0), + toType: parquet.BooleanType, + toValue: parquet.BooleanValue(false), + }, + + { + scenario: "int32 to int32", + fromType: parquet.Int32Type, + fromValue: parquet.Int32Value(42), + toType: parquet.Int32Type, + toValue: parquet.Int32Value(42), + }, + + { + scenario: "int32 to int64", + fromType: parquet.Int32Type, + fromValue: parquet.Int32Value(-21), + toType: parquet.Int64Type, + toValue: parquet.Int64Value(-21), + }, + + { + scenario: "int32 to int96", + fromType: parquet.Int32Type, + fromValue: parquet.Int32Value(123), + toType: parquet.Int96Type, + toValue: parquet.Int96Value(deprecated.Int96{0: 123}), + }, + + { + scenario: "int32 to float", + fromType: parquet.Int32Type, + fromValue: parquet.Int32Value(9), + toType: parquet.FloatType, + toValue: parquet.FloatValue(9), + }, + + { + scenario: "int32 to double", + fromType: parquet.Int32Type, + fromValue: parquet.Int32Value(100), + toType: parquet.DoubleType, + toValue: parquet.DoubleValue(100), + }, + + { + scenario: "int32 to byte array", + fromType: parquet.Int32Type, + fromValue: parquet.Int32Value(1 << 8), + toType: parquet.ByteArrayType, + toValue: parquet.ByteArrayValue([]byte{0, 1, 0, 0}), + }, + + { + scenario: "int32 to fixed length byte array", + fromType: parquet.Int32Type, + fromValue: parquet.Int32Value(1 << 8), + toType: parquet.FixedLenByteArrayType(3), + toValue: parquet.FixedLenByteArrayValue([]byte{0, 1, 0}), + }, + + { + scenario: "int32 to string", + fromType: parquet.Int32Type, + fromValue: parquet.Int32Value(12345), + toType: parquet.String().Type(), + toValue: parquet.ByteArrayValue([]byte(`12345`)), + }, + + { + scenario: "int64 to true", + fromType: parquet.Int64Type, + fromValue: parquet.Int64Value(10), + toType: parquet.BooleanType, + toValue: parquet.BooleanValue(true), + }, + + { + scenario: "int64 to false", + fromType: parquet.Int64Type, + fromValue: parquet.Int64Value(0), + toType: parquet.BooleanType, + toValue: parquet.BooleanValue(false), + }, + + { + scenario: "int64 to int32", + fromType: parquet.Int64Type, + fromValue: parquet.Int64Value(-21), + toType: parquet.Int32Type, + toValue: parquet.Int32Value(-21), + }, + + { + scenario: "int64 to int64", + fromType: parquet.Int64Type, + fromValue: parquet.Int64Value(42), + toType: parquet.Int64Type, + toValue: parquet.Int64Value(42), + }, + + { + scenario: "int64 to int96", + fromType: parquet.Int64Type, + fromValue: parquet.Int64Value(123), + toType: parquet.Int96Type, + toValue: parquet.Int96Value(deprecated.Int96{0: 123}), + }, + + { + scenario: "int64 to float", + fromType: parquet.Int64Type, + fromValue: parquet.Int64Value(9), + toType: parquet.FloatType, + toValue: parquet.FloatValue(9), + }, + + { + scenario: "int64 to double", + fromType: parquet.Int64Type, + fromValue: parquet.Int64Value(100), + toType: parquet.DoubleType, + toValue: parquet.DoubleValue(100), + }, + + { + scenario: "int64 to byte array", + fromType: parquet.Int64Type, + fromValue: parquet.Int64Value(1 << 8), + toType: parquet.ByteArrayType, + toValue: parquet.ByteArrayValue([]byte{0, 1, 0, 0, 0, 0, 0, 0}), + }, + + { + scenario: "int64 to fixed length byte array", + fromType: parquet.Int64Type, + fromValue: parquet.Int64Value(1 << 8), + toType: parquet.FixedLenByteArrayType(3), + toValue: parquet.FixedLenByteArrayValue([]byte{0, 1, 0}), + }, + + { + scenario: "int64 to string", + fromType: parquet.Int64Type, + fromValue: parquet.Int64Value(1234567890), + toType: parquet.String().Type(), + toValue: parquet.ByteArrayValue([]byte(`1234567890`)), + }, + + { + scenario: "float to true", + fromType: parquet.FloatType, + fromValue: parquet.FloatValue(0.1), + toType: parquet.BooleanType, + toValue: parquet.BooleanValue(true), + }, + + { + scenario: "float to false", + fromType: parquet.FloatType, + fromValue: parquet.FloatValue(0), + toType: parquet.BooleanType, + toValue: parquet.BooleanValue(false), + }, + + { + scenario: "float to int32", + fromType: parquet.FloatType, + fromValue: parquet.FloatValue(9.9), + toType: parquet.Int32Type, + toValue: parquet.Int32Value(9), + }, + + { + scenario: "float to int64", + fromType: parquet.FloatType, + fromValue: parquet.FloatValue(-1.5), + toType: parquet.Int64Type, + toValue: parquet.Int64Value(-1), + }, + + { + scenario: "float to float", + fromType: parquet.FloatType, + fromValue: parquet.FloatValue(1.234), + toType: parquet.FloatType, + toValue: parquet.FloatValue(1.234), + }, + + { + scenario: "float to double", + fromType: parquet.FloatType, + fromValue: parquet.FloatValue(-0.5), + toType: parquet.DoubleType, + toValue: parquet.DoubleValue(-0.5), + }, + + { + scenario: "float to string", + fromType: parquet.FloatType, + fromValue: parquet.FloatValue(0.125), + toType: parquet.String().Type(), + toValue: parquet.ByteArrayValue([]byte(`0.125`)), + }, + + { + scenario: "double to true", + fromType: parquet.DoubleType, + fromValue: parquet.DoubleValue(0.1), + toType: parquet.BooleanType, + toValue: parquet.BooleanValue(true), + }, + + { + scenario: "double to false", + fromType: parquet.DoubleType, + fromValue: parquet.DoubleValue(0), + toType: parquet.BooleanType, + toValue: parquet.BooleanValue(false), + }, + + { + scenario: "double to int32", + fromType: parquet.DoubleType, + fromValue: parquet.DoubleValue(9.9), + toType: parquet.Int32Type, + toValue: parquet.Int32Value(9), + }, + + { + scenario: "double to int64", + fromType: parquet.DoubleType, + fromValue: parquet.DoubleValue(-1.5), + toType: parquet.Int64Type, + toValue: parquet.Int64Value(-1), + }, + + { + scenario: "double to float", + fromType: parquet.DoubleType, + fromValue: parquet.DoubleValue(1.234), + toType: parquet.FloatType, + toValue: parquet.FloatValue(1.234), + }, + + { + scenario: "double to double", + fromType: parquet.DoubleType, + fromValue: parquet.DoubleValue(-0.5), + toType: parquet.DoubleType, + toValue: parquet.DoubleValue(-0.5), + }, + + { + scenario: "double to string", + fromType: parquet.DoubleType, + fromValue: parquet.DoubleValue(0.125), + toType: parquet.String().Type(), + toValue: parquet.ByteArrayValue([]byte(`0.125`)), + }, + + { + scenario: "string to true", + fromType: parquet.String().Type(), + fromValue: parquet.ByteArrayValue([]byte(`true`)), + toType: parquet.BooleanType, + toValue: parquet.BooleanValue(true), + }, + + { + scenario: "string to false", + fromType: parquet.String().Type(), + fromValue: parquet.ByteArrayValue([]byte(`false`)), + toType: parquet.BooleanType, + toValue: parquet.BooleanValue(false), + }, + + { + scenario: "string to int32", + fromType: parquet.String().Type(), + fromValue: parquet.ByteArrayValue([]byte(`-21`)), + toType: parquet.Int32Type, + toValue: parquet.Int32Value(-21), + }, + + { + scenario: "string to int64", + fromType: parquet.String().Type(), + fromValue: parquet.ByteArrayValue([]byte(`42`)), + toType: parquet.Int64Type, + toValue: parquet.Int64Value(42), + }, + + { + scenario: "string to int96", + fromType: parquet.String().Type(), + fromValue: parquet.ByteArrayValue([]byte(`123`)), + toType: parquet.Int96Type, + toValue: parquet.Int96Value(deprecated.Int96{0: 123}), + }, + + { + scenario: "string to float", + fromType: parquet.String().Type(), + fromValue: parquet.ByteArrayValue([]byte(`-0.5`)), + toType: parquet.FloatType, + toValue: parquet.FloatValue(-0.5), + }, + + { + scenario: "string to double", + fromType: parquet.String().Type(), + fromValue: parquet.ByteArrayValue([]byte(`0.5`)), + toType: parquet.DoubleType, + toValue: parquet.DoubleValue(0.5), + }, + + { + scenario: "string to byte array", + fromType: parquet.String().Type(), + fromValue: parquet.ByteArrayValue([]byte(`ABC`)), + toType: parquet.ByteArrayType, + toValue: parquet.ByteArrayValue([]byte(`ABC`)), + }, + + { + scenario: "string to fixed length byte array", + fromType: parquet.String().Type(), + fromValue: parquet.ByteArrayValue([]byte(`99B816772522447EBF76821A7C5ADF65`)), + toType: parquet.FixedLenByteArrayType(16), + toValue: parquet.FixedLenByteArrayValue([]byte{ + 0x99, 0xb8, 0x16, 0x77, 0x25, 0x22, 0x44, 0x7e, + 0xbf, 0x76, 0x82, 0x1a, 0x7c, 0x5a, 0xdf, 0x65, + }), + }, + + { + scenario: "string to string", + fromType: parquet.String().Type(), + fromValue: parquet.ByteArrayValue([]byte(`Hello World!`)), + toType: parquet.String().Type(), + toValue: parquet.ByteArrayValue([]byte(`Hello World!`)), + }, + + { + scenario: "string to date", + fromType: parquet.String().Type(), + fromValue: parquet.ByteArrayValue([]byte(`1970-01-03`)), + toType: parquet.Date().Type(), + toValue: parquet.Int32Value(2), + }, + + { + scenario: "string to millisecond time", + fromType: parquet.String().Type(), + fromValue: parquet.ByteArrayValue([]byte(`12:34:56.789`)), + toType: parquet.Time(parquet.Millisecond).Type(), + toValue: parquet.Int32Value(45296789), + }, + + { + scenario: "string to microsecond time", + fromType: parquet.String().Type(), + fromValue: parquet.ByteArrayValue([]byte(`12:34:56.789012`)), + toType: parquet.Time(parquet.Microsecond).Type(), + toValue: parquet.Int64Value(45296789012), + }, + + { + scenario: "date to millisecond timestamp", + fromType: parquet.Date().Type(), + fromValue: parquet.Int32Value(19338), + toType: parquet.Timestamp(parquet.Millisecond).Type(), + toValue: parquet.Int64Value(1670803200000), + }, + + { + scenario: "date to microsecond timestamp", + fromType: parquet.Date().Type(), + fromValue: parquet.Int32Value(19338), + toType: parquet.Timestamp(parquet.Microsecond).Type(), + toValue: parquet.Int64Value(1670803200000000), + }, + + { + scenario: "date to string", + fromType: parquet.Date().Type(), + fromValue: parquet.Int32Value(18995), + toType: parquet.String().Type(), + toValue: parquet.ByteArrayValue([]byte(`2022-01-03`)), + }, + + { + scenario: "millisecond time to string", + fromType: parquet.Time(parquet.Millisecond).Type(), + fromValue: parquet.Int32Value(45296789), + toType: parquet.String().Type(), + toValue: parquet.ByteArrayValue([]byte(`12:34:56.789`)), + }, + + { + scenario: "microsecond time to string", + fromType: parquet.Time(parquet.Microsecond).Type(), + fromValue: parquet.Int64Value(45296789012), + toType: parquet.String().Type(), + toValue: parquet.ByteArrayValue([]byte(`12:34:56.789012`)), + }, + + { + scenario: "millisecond timestamp to date", + fromType: parquet.Timestamp(parquet.Millisecond).Type(), + fromValue: parquet.Int64Value(1670888613000), + toType: parquet.Date().Type(), + toValue: parquet.Int32Value(19338), + }, + + { + scenario: "microsecond timestamp to date", + fromType: parquet.Timestamp(parquet.Microsecond).Type(), + fromValue: parquet.Int64Value(1670888613000123), + toType: parquet.Date().Type(), + toValue: parquet.Int32Value(19338), + }, + + { + scenario: "millisecond timestamp to millisecond time", + fromType: parquet.Timestamp(parquet.Millisecond).Type(), + fromValue: parquet.Int64Value(1670888613123), + toType: parquet.Time(parquet.Millisecond).Type(), + toValue: parquet.Int32Value(85413123), + }, + + { + scenario: "millisecond timestamp to micronsecond time", + fromType: parquet.Timestamp(parquet.Millisecond).Type(), + fromValue: parquet.Int64Value(1670888613123), + toType: parquet.Time(parquet.Microsecond).Type(), + toValue: parquet.Int64Value(85413123000), + }, + + { + scenario: "microsecond timestamp to millisecond time", + fromType: parquet.Timestamp(parquet.Microsecond).Type(), + fromValue: parquet.Int64Value(1670888613123456), + toType: parquet.Time(parquet.Millisecond).Type(), + toValue: parquet.Int32Value(85413123), + }, + + { + scenario: "microsecond timestamp to micronsecond time", + fromType: parquet.Timestamp(parquet.Microsecond).Type(), + fromValue: parquet.Int64Value(1670888613123456), + toType: parquet.Time(parquet.Microsecond).Type(), + toValue: parquet.Int64Value(85413123456), + }, + { scenario: "micros to nanos", fromType: usType, fromValue: usVal, toType: nsType, - expected: ns, + toValue: parquet.Int64Value(ns), }, + { scenario: "millis to nanos", fromType: msType, fromValue: msVal, toType: nsType, - expected: ns, + toValue: parquet.Int64Value(ns), }, + { scenario: "nanos to micros", fromType: nsType, fromValue: nsVal, toType: usType, - expected: us, + toValue: parquet.Int64Value(us), }, + { scenario: "nanos to nanos", fromType: nsType, fromValue: nsVal, toType: nsType, - expected: ns, + toValue: parquet.Int64Value(ns), }, + { scenario: "int64 to nanos", fromType: parquet.Int64Type, fromValue: nsVal, toType: nsType, - expected: ns, + toValue: parquet.Int64Value(ns), }, + { scenario: "int64 to int64", fromType: parquet.Int64Type, fromValue: nsVal, toType: parquet.Int64Type, - expected: ns, + toValue: parquet.Int64Value(ns), }, } for _, test := range timestampConversionTests { t.Run(test.scenario, func(t *testing.T) { - a, err := test.toType.ConvertValue(test.fromValue, test.fromType) + // Set levels to ensure that they are retained by the conversion. + from := test.fromValue.Level(1, 2, 3) + want := test.toValue.Level(1, 2, 3) + + got, err := test.toType.ConvertValue(from, test.fromType) if err != nil { t.Fatal(err) } - if a.Int64() != test.expected { - t.Errorf("converted value mismatch:\nwant = %+v\ngot = %+v", test.expected, a.Int64()) + + if !parquet.DeepEqual(want, got) { + t.Errorf("converted value mismatch:\nwant = %+v\ngot = %+v", want, got) } }) } diff --git a/deprecated/int96.go b/deprecated/int96.go index f82d2d40..1bed7a5d 100644 --- a/deprecated/int96.go +++ b/deprecated/int96.go @@ -9,6 +9,29 @@ import ( // Int96 is an implementation of the deprecated INT96 parquet type. type Int96 [3]uint32 +// Int32ToInt96 converts a int32 value to a Int96. +func Int32ToInt96(value int32) (i96 Int96) { + if value < 0 { + i96[2] = 0xFFFFFFFF + i96[1] = 0xFFFFFFFF + } + i96[0] = uint32(value) + return +} + +// Int64ToInt96 converts a int64 value to Int96. +func Int64ToInt96(value int64) (i96 Int96) { + if value < 0 { + i96[2] = 0xFFFFFFFF + } + i96[1] = uint32(value >> 32) + i96[0] = uint32(value) + return +} + +// IsZero returns true if i is the zero-value. +func (i Int96) IsZero() bool { return i == Int96{} } + // Negative returns true if i is a negative value. func (i Int96) Negative() bool { return (i[2] >> 31) != 0 @@ -48,6 +71,16 @@ func (i Int96) Int() *big.Int { return z } +// Int32 converts i to a int32, potentially truncating the value. +func (i Int96) Int32() int32 { + return int32(i[0]) +} + +// Int64 converts i to a int64, potentially truncating the value. +func (i Int96) Int64() int64 { + return int64(i[1])<<32 | int64(i[0]) +} + // String returns a string representation of i. func (i Int96) String() string { return i.Int().String() diff --git a/errors.go b/errors.go index 1d4c947e..bcef1faf 100644 --- a/errors.go +++ b/errors.go @@ -54,6 +54,11 @@ var ( // ErrTooManyRowGroups is returned when attempting to generate a parquet // file with more than MaxRowGroups row groups. ErrTooManyRowGroups = errors.New("the limit of 32767 row groups has been reached") + + // ErrConversion is used to indicate that a conversion betwen two values + // cannot be done because there are no rules to translate between their + // physical types. + ErrInvalidConversion = errors.New("invalid conversion between parquet values") ) type errno int diff --git a/null.go b/null.go index 9cb2277b..7fb37813 100644 --- a/null.go +++ b/null.go @@ -113,12 +113,3 @@ func nullIndexFuncOfByteArray(n int) nullIndexFunc { } } } - -func isZero(b []byte) bool { - for _, c := range b { - if c != 0 { - return false - } - } - return true -} diff --git a/parquet.go b/parquet.go index 69dd5b8c..d32b02be 100644 --- a/parquet.go +++ b/parquet.go @@ -42,3 +42,12 @@ func typeNameOf(t reflect.Type) string { } return s1 + " (" + s2 + ")" } + +func isZero(b []byte) bool { + for _, c := range b { + if c != 0 { + return false + } + } + return true +} diff --git a/type.go b/type.go index 32f20643..a100c672 100644 --- a/type.go +++ b/type.go @@ -318,12 +318,32 @@ func (t booleanType) AssignValue(dst reflect.Value, src Value) error { default: dst.Set(reflect.ValueOf(v)) } - return nil } func (t booleanType) ConvertValue(val Value, typ Type) (Value, error) { - return val, checkTypeKindEqual(t, typ) + switch typ.(type) { + case *stringType: + return convertStringToBoolean(val) + } + switch typ.Kind() { + case Boolean: + return val, nil + case Int32: + return convertInt32ToBoolean(val) + case Int64: + return convertInt64ToBoolean(val) + case Int96: + return convertInt96ToBoolean(val) + case Float: + return convertFloatToBoolean(val) + case Double: + return convertDoubleToBoolean(val) + case ByteArray, FixedLenByteArray: + return convertByteArrayToBoolean(val) + default: + return makeValueKind(Boolean), nil + } } type int32Type struct{} @@ -381,12 +401,32 @@ func (t int32Type) AssignValue(dst reflect.Value, src Value) error { default: dst.Set(reflect.ValueOf(v)) } - return nil } func (t int32Type) ConvertValue(val Value, typ Type) (Value, error) { - return val, checkTypeKindEqual(t, typ) + switch typ.(type) { + case *stringType: + return convertStringToInt32(val) + } + switch typ.Kind() { + case Boolean: + return convertBooleanToInt32(val) + case Int32: + return val, nil + case Int64: + return convertInt64ToInt32(val) + case Int96: + return convertInt96ToInt32(val) + case Float: + return convertFloatToInt32(val) + case Double: + return convertDoubleToInt32(val) + case ByteArray, FixedLenByteArray: + return convertByteArrayToInt32(val) + default: + return makeValueKind(Int32), nil + } } type int64Type struct{} @@ -448,7 +488,28 @@ func (t int64Type) AssignValue(dst reflect.Value, src Value) error { } func (t int64Type) ConvertValue(val Value, typ Type) (Value, error) { - return val, checkTypeKindEqual(t, typ) + switch typ.(type) { + case *stringType: + return convertStringToInt64(val) + } + switch typ.Kind() { + case Boolean: + return convertBooleanToInt64(val) + case Int32: + return convertInt32ToInt64(val) + case Int64: + return val, nil + case Int96: + return convertInt96ToInt64(val) + case Float: + return convertFloatToInt64(val) + case Double: + return convertDoubleToInt64(val) + case ByteArray, FixedLenByteArray: + return convertByteArrayToInt64(val) + default: + return makeValueKind(Int64), nil + } } type int96Type struct{} @@ -504,7 +565,28 @@ func (t int96Type) AssignValue(dst reflect.Value, src Value) error { } func (t int96Type) ConvertValue(val Value, typ Type) (Value, error) { - return val, checkTypeKindEqual(t, typ) + switch typ.(type) { + case *stringType: + return convertStringToInt96(val) + } + switch typ.Kind() { + case Boolean: + return convertBooleanToInt96(val) + case Int32: + return convertInt32ToInt96(val) + case Int64: + return convertInt64ToInt96(val) + case Int96: + return val, nil + case Float: + return convertFloatToInt96(val) + case Double: + return convertDoubleToInt96(val) + case ByteArray, FixedLenByteArray: + return convertByteArrayToInt96(val) + default: + return makeValueKind(Int96), nil + } } type floatType struct{} @@ -564,7 +646,28 @@ func (t floatType) AssignValue(dst reflect.Value, src Value) error { } func (t floatType) ConvertValue(val Value, typ Type) (Value, error) { - return val, checkTypeKindEqual(t, typ) + switch typ.(type) { + case *stringType: + return convertStringToFloat(val) + } + switch typ.Kind() { + case Boolean: + return convertBooleanToFloat(val) + case Int32: + return convertInt32ToFloat(val) + case Int64: + return convertInt64ToFloat(val) + case Int96: + return convertInt96ToFloat(val) + case Float: + return val, nil + case Double: + return convertDoubleToFloat(val) + case ByteArray, FixedLenByteArray: + return convertByteArrayToFloat(val) + default: + return makeValueKind(Float), nil + } } type doubleType struct{} @@ -620,12 +723,32 @@ func (t doubleType) AssignValue(dst reflect.Value, src Value) error { default: dst.Set(reflect.ValueOf(v)) } - return nil } func (t doubleType) ConvertValue(val Value, typ Type) (Value, error) { - return val, checkTypeKindEqual(t, typ) + switch typ.(type) { + case *stringType: + return convertStringToDouble(val) + } + switch typ.Kind() { + case Boolean: + return convertBooleanToDouble(val) + case Int32: + return convertInt32ToDouble(val) + case Int64: + return convertInt64ToDouble(val) + case Int96: + return convertInt96ToDouble(val) + case Float: + return convertFloatToDouble(val) + case Double: + return val, nil + case ByteArray, FixedLenByteArray: + return convertByteArrayToDouble(val) + default: + return makeValueKind(Double), nil + } } type byteArrayType struct{} @@ -684,12 +807,28 @@ func (t byteArrayType) AssignValue(dst reflect.Value, src Value) error { val := reflect.ValueOf(string(v)) dst.Set(val) } - return nil } func (t byteArrayType) ConvertValue(val Value, typ Type) (Value, error) { - return val, checkTypeKindEqual(t, typ) + switch typ.Kind() { + case Boolean: + return convertBooleanToByteArray(val) + case Int32: + return convertInt32ToByteArray(val) + case Int64: + return convertInt64ToByteArray(val) + case Int96: + return convertInt96ToByteArray(val) + case Float: + return convertFloatToByteArray(val) + case Double: + return convertDoubleToByteArray(val) + case ByteArray, FixedLenByteArray: + return val, nil + default: + return makeValueKind(ByteArray), nil + } } type fixedLenByteArrayType struct{ length int } @@ -776,7 +915,72 @@ func (t fixedLenByteArrayType) AssignValue(dst reflect.Value, src Value) error { } func (t fixedLenByteArrayType) ConvertValue(val Value, typ Type) (Value, error) { - return val, checkTypeKindEqual(t, typ) + switch typ.(type) { + case *stringType: + return convertStringToFixedLenByteArray(val, t.length) + } + switch typ.Kind() { + case Boolean: + return convertBooleanToFixedLenByteArray(val, t.length) + case Int32: + return convertInt32ToFixedLenByteArray(val, t.length) + case Int64: + return convertInt64ToFixedLenByteArray(val, t.length) + case Int96: + return convertInt96ToFixedLenByteArray(val, t.length) + case Float: + return convertFloatToFixedLenByteArray(val, t.length) + case Double: + return convertDoubleToFixedLenByteArray(val, t.length) + case ByteArray, FixedLenByteArray: + return convertByteArrayToFixedLenByteArray(val, t.length) + default: + return makeValueBytes(FixedLenByteArray, make([]byte, t.length)), nil + } +} + +type uint32Type struct{ int32Type } + +func (t uint32Type) Compare(a, b Value) int { + return compareUint32(a.uint32(), b.uint32()) +} + +func (t uint32Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return newUint32ColumnIndexer() +} + +func (t uint32Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return newUint32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) +} + +func (t uint32Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return newUint32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t uint32Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return newUint32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +type uint64Type struct{ int64Type } + +func (t uint64Type) Compare(a, b Value) int { + return compareUint64(a.uint64(), b.uint64()) +} + +func (t uint64Type) NewColumnIndexer(sizeLimit int) ColumnIndexer { + return newUint64ColumnIndexer() +} + +func (t uint64Type) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return newUint64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) +} + +func (t uint64Type) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return newUint64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +} + +func (t uint64Type) NewPage(columnIndex, numValues int, data encoding.Values) Page { + return newUint64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) } // BE128 stands for "big-endian 128 bits". This type is used as a special case @@ -910,16 +1114,26 @@ var unsignedIntTypes = [...]intType{ type intType format.IntType -func (t *intType) String() string { return (*format.IntType)(t).String() } - -func (t *intType) Kind() Kind { - if t.BitWidth == 64 { - return Int64 +func (t *intType) baseType() Type { + if t.IsSigned { + if t.BitWidth == 64 { + return int64Type{} + } else { + return int32Type{} + } } else { - return Int32 + if t.BitWidth == 64 { + return uint64Type{} + } else { + return uint32Type{} + } } } +func (t *intType) String() string { return (*format.IntType)(t).String() } + +func (t *intType) Kind() Kind { return t.baseType().Kind() } + func (t *intType) Length() int { return int(t.BitWidth) } func (t *intType) EstimateSize(n int) int { return (int(t.BitWidth) / 8) * n } @@ -927,6 +1141,9 @@ func (t *intType) EstimateSize(n int) int { return (int(t.BitWidth) / 8) * n } func (t *intType) EstimateNumValues(n int) int { return n / (int(t.BitWidth) / 8) } func (t *intType) Compare(a, b Value) int { + // This code is similar to t.baseType().Compare(a,b) but comparison methods + // tend to be invoked a lot (e.g. when sorting) so avoiding the interface + // indirection in this case yields much better throughput in some cases. if t.BitWidth == 64 { i1 := a.int64() i2 := b.int64() @@ -946,17 +1163,9 @@ func (t *intType) Compare(a, b Value) int { } } -func (t *intType) ColumnOrder() *format.ColumnOrder { - return &typeDefinedColumnOrder -} +func (t *intType) ColumnOrder() *format.ColumnOrder { return t.baseType().ColumnOrder() } -func (t *intType) PhysicalType() *format.Type { - if t.BitWidth == 64 { - return &physicalTypes[Int64] - } else { - return &physicalTypes[Int32] - } -} +func (t *intType) PhysicalType() *format.Type { return t.baseType().PhysicalType() } func (t *intType) LogicalType() *format.LogicalType { return &format.LogicalType{Integer: (*format.IntType)(t)} @@ -973,114 +1182,50 @@ func (t *intType) ConvertedType() *deprecated.ConvertedType { } func (t *intType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - if t.IsSigned { - if t.BitWidth == 64 { - return newInt64ColumnIndexer() - } else { - return newInt32ColumnIndexer() - } - } else { - if t.BitWidth == 64 { - return newUint64ColumnIndexer() - } else { - return newUint32ColumnIndexer() - } - } + return t.baseType().NewColumnIndexer(sizeLimit) } func (t *intType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - if t.IsSigned { - if t.BitWidth == 64 { - return newInt64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) - } else { - return newInt32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) - } - } else { - if t.BitWidth == 64 { - return newUint64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) - } else { - return newUint32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) - } - } + return t.baseType().NewColumnBuffer(columnIndex, numValues) } func (t *intType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - if t.IsSigned { - if t.BitWidth == 64 { - return newInt64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) - } else { - return newInt32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) - } - } else { - if t.BitWidth == 64 { - return newUint64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) - } else { - return newUint32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) - } - } + return t.baseType().NewDictionary(columnIndex, numValues, data) } func (t *intType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - if t.IsSigned { - if t.BitWidth == 64 { - return newInt64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) - } else { - return newInt32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) - } - } else { - if t.BitWidth == 64 { - return newUint64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) - } else { - return newUint32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) - } - } + return t.baseType().NewPage(columnIndex, numValues, data) } -func (t *intType) NewValues(values []byte, _ []uint32) encoding.Values { - if t.BitWidth == 64 { - return encoding.Int64ValuesFromBytes(values) - } else { - return encoding.Int32ValuesFromBytes(values) - } +func (t *intType) NewValues(values []byte, offsets []uint32) encoding.Values { + return t.baseType().NewValues(values, offsets) } func (t *intType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - if t.BitWidth == 64 { - return encoding.EncodeInt64(dst, src, enc) - } else { - return encoding.EncodeInt32(dst, src, enc) - } + return t.baseType().Encode(dst, src, enc) } func (t *intType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - if t.BitWidth == 64 { - return encoding.DecodeInt64(dst, src, enc) - } else { - return encoding.DecodeInt32(dst, src, enc) - } + return t.baseType().Decode(dst, src, enc) } func (t *intType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - if t.BitWidth == 64 { - return Int64Type.EstimateDecodeSize(numValues, src, enc) - } else { - return Int32Type.EstimateDecodeSize(numValues, src, enc) - } + return t.baseType().EstimateDecodeSize(numValues, src, enc) } func (t *intType) AssignValue(dst reflect.Value, src Value) error { if t.BitWidth == 64 { - return Int64Type.AssignValue(dst, src) + return int64Type{}.AssignValue(dst, src) } else { - return Int32Type.AssignValue(dst, src) + return int32Type{}.AssignValue(dst, src) } } func (t *intType) ConvertValue(val Value, typ Type) (Value, error) { if t.BitWidth == 64 { - return Int64Type.ConvertValue(val, typ) + return int64Type{}.ConvertValue(val, typ) } else { - return Int32Type.ConvertValue(val, typ) + return int32Type{}.ConvertValue(val, typ) } } @@ -1131,9 +1276,9 @@ func (t *stringType) Kind() Kind { return ByteArray } func (t *stringType) Length() int { return 0 } -func (t *stringType) EstimateSize(n int) int { return ByteArrayType.EstimateSize(n) } +func (t *stringType) EstimateSize(n int) int { return byteArrayType{}.EstimateSize(n) } -func (t *stringType) EstimateNumValues(n int) int { return ByteArrayType.EstimateNumValues(n) } +func (t *stringType) EstimateNumValues(n int) int { return byteArrayType{}.EstimateNumValues(n) } func (t *stringType) Compare(a, b Value) int { return bytes.Compare(a.byteArray(), b.byteArray()) @@ -1184,15 +1329,45 @@ func (t *stringType) Decode(dst encoding.Values, src []byte, enc encoding.Encodi } func (t *stringType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return ByteArrayType.EstimateDecodeSize(numValues, src, enc) + return byteArrayType{}.EstimateDecodeSize(numValues, src, enc) } func (t *stringType) AssignValue(dst reflect.Value, src Value) error { - return ByteArrayType.AssignValue(dst, src) + return byteArrayType{}.AssignValue(dst, src) } func (t *stringType) ConvertValue(val Value, typ Type) (Value, error) { - return ByteArrayType.ConvertValue(val, typ) + switch t2 := typ.(type) { + case *dateType: + return convertDateToString(val) + case *timeType: + tz := t2.tz() + if t2.Unit.Micros != nil { + return convertTimeMicrosToString(val, tz) + } else { + return convertTimeMillisToString(val, tz) + } + } + switch typ.Kind() { + case Boolean: + return convertBooleanToString(val) + case Int32: + return convertInt32ToString(val) + case Int64: + return convertInt64ToString(val) + case Int96: + return convertInt96ToString(val) + case Float: + return convertFloatToString(val) + case Double: + return convertDoubleToString(val) + case ByteArray: + return val, nil + case FixedLenByteArray: + return convertFixedLenByteArrayToString(val) + default: + return makeValueKind(ByteArray), nil + } } // UUID constructs a leaf node of UUID logical type. @@ -1204,15 +1379,15 @@ type uuidType format.UUIDType func (t *uuidType) String() string { return (*format.UUIDType)(t).String() } -func (t *uuidType) Kind() Kind { return FixedLenByteArray } +func (t *uuidType) Kind() Kind { return be128Type{}.Kind() } -func (t *uuidType) Length() int { return 16 } +func (t *uuidType) Length() int { return be128Type{}.Length() } -func (t *uuidType) EstimateSize(n int) int { return 16 * n } +func (t *uuidType) EstimateSize(n int) int { return be128Type{}.EstimateSize(n) } -func (t *uuidType) EstimateNumValues(n int) int { return n / 16 } +func (t *uuidType) EstimateNumValues(n int) int { return be128Type{}.EstimateNumValues(n) } -func (t *uuidType) Compare(a, b Value) int { return compareBE128(a.be128(), b.be128()) } +func (t *uuidType) Compare(a, b Value) int { return be128Type{}.Compare(a, b) } func (t *uuidType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } @@ -1225,31 +1400,31 @@ func (t *uuidType) LogicalType() *format.LogicalType { func (t *uuidType) ConvertedType() *deprecated.ConvertedType { return nil } func (t *uuidType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newBE128ColumnIndexer() + return be128Type{}.NewColumnIndexer(sizeLimit) } func (t *uuidType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newBE128Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) + return be128Type{}.NewDictionary(columnIndex, numValues, data) } func (t *uuidType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newBE128ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) + return be128Type{}.NewColumnBuffer(columnIndex, numValues) } func (t *uuidType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newBE128Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) + return be128Type{}.NewPage(columnIndex, numValues, data) } -func (t *uuidType) NewValues(values []byte, _ []uint32) encoding.Values { - return encoding.FixedLenByteArrayValues(values, 16) +func (t *uuidType) NewValues(values []byte, offsets []uint32) encoding.Values { + return be128Type{}.NewValues(values, offsets) } func (t *uuidType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return encoding.EncodeFixedLenByteArray(dst, src, enc) + return be128Type{}.Encode(dst, src, enc) } func (t *uuidType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return encoding.DecodeFixedLenByteArray(dst, src, enc) + return be128Type{}.Decode(dst, src, enc) } func (t *uuidType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { @@ -1273,25 +1448,19 @@ type enumType format.EnumType func (t *enumType) String() string { return (*format.EnumType)(t).String() } -func (t *enumType) Kind() Kind { return ByteArray } +func (t *enumType) Kind() Kind { return new(stringType).Kind() } -func (t *enumType) Length() int { return 0 } +func (t *enumType) Length() int { return new(stringType).Length() } -func (t *enumType) EstimateSize(n int) int { return ByteArrayType.EstimateSize(n) } +func (t *enumType) EstimateSize(n int) int { return new(stringType).EstimateSize(n) } -func (t *enumType) EstimateNumValues(n int) int { return ByteArrayType.EstimateNumValues(n) } +func (t *enumType) EstimateNumValues(n int) int { return new(stringType).EstimateNumValues(n) } -func (t *enumType) Compare(a, b Value) int { - return bytes.Compare(a.byteArray(), b.byteArray()) -} +func (t *enumType) Compare(a, b Value) int { return new(stringType).Compare(a, b) } -func (t *enumType) ColumnOrder() *format.ColumnOrder { - return &typeDefinedColumnOrder -} +func (t *enumType) ColumnOrder() *format.ColumnOrder { return new(stringType).ColumnOrder() } -func (t *enumType) PhysicalType() *format.Type { - return &physicalTypes[ByteArray] -} +func (t *enumType) PhysicalType() *format.Type { return new(stringType).PhysicalType() } func (t *enumType) LogicalType() *format.LogicalType { return &format.LogicalType{Enum: (*format.EnumType)(t)} @@ -1302,43 +1471,48 @@ func (t *enumType) ConvertedType() *deprecated.ConvertedType { } func (t *enumType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newByteArrayColumnIndexer(sizeLimit) + return new(stringType).NewColumnIndexer(sizeLimit) } func (t *enumType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) + return new(stringType).NewDictionary(columnIndex, numValues, data) } func (t *enumType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) + return new(stringType).NewColumnBuffer(columnIndex, numValues) } func (t *enumType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) + return new(stringType).NewPage(columnIndex, numValues, data) } func (t *enumType) NewValues(values []byte, offsets []uint32) encoding.Values { - return encoding.ByteArrayValues(values, offsets) + return new(stringType).NewValues(values, offsets) } func (t *enumType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return encoding.EncodeByteArray(dst, src, enc) + return new(stringType).Encode(dst, src, enc) } func (t *enumType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return encoding.DecodeByteArray(dst, src, enc) + return new(stringType).Decode(dst, src, enc) } func (t *enumType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return ByteArrayType.EstimateDecodeSize(numValues, src, enc) + return new(stringType).EstimateDecodeSize(numValues, src, enc) } func (t *enumType) AssignValue(dst reflect.Value, src Value) error { - return ByteArrayType.AssignValue(dst, src) + return new(stringType).AssignValue(dst, src) } func (t *enumType) ConvertValue(val Value, typ Type) (Value, error) { - return ByteArrayType.ConvertValue(val, typ) + switch typ.(type) { + case *byteArrayType, *stringType, *enumType: + return val, nil + default: + return val, invalidConversion(val, "ENUM", typ.String()) + } } // JSON constructs a leaf node of JSON logical type. @@ -1350,25 +1524,19 @@ type jsonType format.JsonType func (t *jsonType) String() string { return (*format.JsonType)(t).String() } -func (t *jsonType) Kind() Kind { return ByteArray } +func (t *jsonType) Kind() Kind { return byteArrayType{}.Kind() } -func (t *jsonType) Length() int { return 0 } +func (t *jsonType) Length() int { return byteArrayType{}.Length() } -func (t *jsonType) EstimateSize(n int) int { return ByteArrayType.EstimateSize(n) } +func (t *jsonType) EstimateSize(n int) int { return byteArrayType{}.EstimateSize(n) } -func (t *jsonType) EstimateNumValues(n int) int { return ByteArrayType.EstimateNumValues(n) } +func (t *jsonType) EstimateNumValues(n int) int { return byteArrayType{}.EstimateNumValues(n) } -func (t *jsonType) Compare(a, b Value) int { - return bytes.Compare(a.byteArray(), b.byteArray()) -} +func (t *jsonType) Compare(a, b Value) int { return byteArrayType{}.Compare(a, b) } -func (t *jsonType) ColumnOrder() *format.ColumnOrder { - return &typeDefinedColumnOrder -} +func (t *jsonType) ColumnOrder() *format.ColumnOrder { return byteArrayType{}.ColumnOrder() } -func (t *jsonType) PhysicalType() *format.Type { - return &physicalTypes[ByteArray] -} +func (t *jsonType) PhysicalType() *format.Type { return byteArrayType{}.PhysicalType() } func (t *jsonType) LogicalType() *format.LogicalType { return &format.LogicalType{Json: (*format.JsonType)(t)} @@ -1379,43 +1547,48 @@ func (t *jsonType) ConvertedType() *deprecated.ConvertedType { } func (t *jsonType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newByteArrayColumnIndexer(sizeLimit) + return byteArrayType{}.NewColumnIndexer(sizeLimit) } func (t *jsonType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) + return byteArrayType{}.NewDictionary(columnIndex, numValues, data) } func (t *jsonType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) + return byteArrayType{}.NewColumnBuffer(columnIndex, numValues) } func (t *jsonType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) + return byteArrayType{}.NewPage(columnIndex, numValues, data) } func (t *jsonType) NewValues(values []byte, offsets []uint32) encoding.Values { - return encoding.ByteArrayValues(values, offsets) + return byteArrayType{}.NewValues(values, offsets) } func (t *jsonType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return encoding.EncodeByteArray(dst, src, enc) + return byteArrayType{}.Encode(dst, src, enc) } func (t *jsonType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return encoding.DecodeByteArray(dst, src, enc) + return byteArrayType{}.Decode(dst, src, enc) } func (t *jsonType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return ByteArrayType.EstimateDecodeSize(numValues, src, enc) + return byteArrayType{}.EstimateDecodeSize(numValues, src, enc) } func (t *jsonType) AssignValue(dst reflect.Value, src Value) error { - return ByteArrayType.AssignValue(dst, src) + return byteArrayType{}.AssignValue(dst, src) } func (t *jsonType) ConvertValue(val Value, typ Type) (Value, error) { - return ByteArrayType.ConvertValue(val, typ) + switch typ.(type) { + case *byteArrayType, *stringType, *jsonType: + return val, nil + default: + return val, invalidConversion(val, "JSON", typ.String()) + } } // BSON constructs a leaf node of BSON logical type. @@ -1427,25 +1600,19 @@ type bsonType format.BsonType func (t *bsonType) String() string { return (*format.BsonType)(t).String() } -func (t *bsonType) Kind() Kind { return ByteArray } +func (t *bsonType) Kind() Kind { return byteArrayType{}.Kind() } -func (t *bsonType) Length() int { return 0 } +func (t *bsonType) Length() int { return byteArrayType{}.Length() } -func (t *bsonType) EstimateSize(n int) int { return ByteArrayType.EstimateSize(n) } +func (t *bsonType) EstimateSize(n int) int { return byteArrayType{}.EstimateSize(n) } -func (t *bsonType) EstimateNumValues(n int) int { return ByteArrayType.EstimateNumValues(n) } +func (t *bsonType) EstimateNumValues(n int) int { return byteArrayType{}.EstimateNumValues(n) } -func (t *bsonType) Compare(a, b Value) int { - return bytes.Compare(a.byteArray(), b.byteArray()) -} +func (t *bsonType) Compare(a, b Value) int { return byteArrayType{}.Compare(a, b) } -func (t *bsonType) ColumnOrder() *format.ColumnOrder { - return &typeDefinedColumnOrder -} +func (t *bsonType) ColumnOrder() *format.ColumnOrder { return byteArrayType{}.ColumnOrder() } -func (t *bsonType) PhysicalType() *format.Type { - return &physicalTypes[ByteArray] -} +func (t *bsonType) PhysicalType() *format.Type { return byteArrayType{}.PhysicalType() } func (t *bsonType) LogicalType() *format.LogicalType { return &format.LogicalType{Bson: (*format.BsonType)(t)} @@ -1456,43 +1623,48 @@ func (t *bsonType) ConvertedType() *deprecated.ConvertedType { } func (t *bsonType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newByteArrayColumnIndexer(sizeLimit) + return byteArrayType{}.NewColumnIndexer(sizeLimit) } func (t *bsonType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newByteArrayDictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) + return byteArrayType{}.NewDictionary(columnIndex, numValues, data) } func (t *bsonType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newByteArrayColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) + return byteArrayType{}.NewColumnBuffer(columnIndex, numValues) } func (t *bsonType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newByteArrayPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) + return byteArrayType{}.NewPage(columnIndex, numValues, data) } func (t *bsonType) NewValues(values []byte, offsets []uint32) encoding.Values { - return encoding.ByteArrayValues(values, offsets) + return byteArrayType{}.NewValues(values, offsets) } func (t *bsonType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return encoding.EncodeByteArray(dst, src, enc) + return byteArrayType{}.Encode(dst, src, enc) } func (t *bsonType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return encoding.DecodeByteArray(dst, src, enc) + return byteArrayType{}.Decode(dst, src, enc) } func (t *bsonType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return ByteArrayType.EstimateDecodeSize(numValues, src, enc) + return byteArrayType{}.EstimateDecodeSize(numValues, src, enc) } func (t *bsonType) AssignValue(dst reflect.Value, src Value) error { - return ByteArrayType.AssignValue(dst, src) + return byteArrayType{}.AssignValue(dst, src) } func (t *bsonType) ConvertValue(val Value, typ Type) (Value, error) { - return ByteArrayType.ConvertValue(val, typ) + switch typ.(type) { + case *byteArrayType, *bsonType: + return val, nil + default: + return val, invalidConversion(val, "BSON", typ.String()) + } } // Date constructs a leaf node of DATE logical type. @@ -1504,21 +1676,19 @@ type dateType format.DateType func (t *dateType) String() string { return (*format.DateType)(t).String() } -func (t *dateType) Kind() Kind { return Int32 } +func (t *dateType) Kind() Kind { return int32Type{}.Kind() } -func (t *dateType) Length() int { return 32 } +func (t *dateType) Length() int { return int32Type{}.Length() } -func (t *dateType) EstimateSize(n int) int { return Int32Type.EstimateSize(n) } +func (t *dateType) EstimateSize(n int) int { return int32Type{}.EstimateSize(n) } -func (t *dateType) EstimateNumValues(n int) int { return Int32Type.EstimateNumValues(n) } +func (t *dateType) EstimateNumValues(n int) int { return int32Type{}.EstimateNumValues(n) } -func (t *dateType) Compare(a, b Value) int { return compareInt32(a.int32(), b.int32()) } +func (t *dateType) Compare(a, b Value) int { return int32Type{}.Compare(a, b) } -func (t *dateType) ColumnOrder() *format.ColumnOrder { - return &typeDefinedColumnOrder -} +func (t *dateType) ColumnOrder() *format.ColumnOrder { return int32Type{}.ColumnOrder() } -func (t *dateType) PhysicalType() *format.Type { return &physicalTypes[Int32] } +func (t *dateType) PhysicalType() *format.Type { return int32Type{}.PhysicalType() } func (t *dateType) LogicalType() *format.LogicalType { return &format.LogicalType{Date: (*format.DateType)(t)} @@ -1529,43 +1699,49 @@ func (t *dateType) ConvertedType() *deprecated.ConvertedType { } func (t *dateType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newInt32ColumnIndexer() + return int32Type{}.NewColumnIndexer(sizeLimit) } -func (t *dateType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newInt32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) +func (t *dateType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return int32Type{}.NewDictionary(columnIndex, numValues, data) } -func (t *dateType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newInt32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +func (t *dateType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return int32Type{}.NewColumnBuffer(columnIndex, numValues) } func (t *dateType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newInt32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) + return int32Type{}.NewPage(columnIndex, numValues, data) } -func (t *dateType) NewValues(values []byte, _ []uint32) encoding.Values { - return encoding.Int32ValuesFromBytes(values) +func (t *dateType) NewValues(values []byte, offsets []uint32) encoding.Values { + return int32Type{}.NewValues(values, offsets) } func (t *dateType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return encoding.EncodeInt32(dst, src, enc) + return int32Type{}.Encode(dst, src, enc) } func (t *dateType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return encoding.DecodeInt32(dst, src, enc) + return int32Type{}.Decode(dst, src, enc) } func (t *dateType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return Int32Type.EstimateDecodeSize(numValues, src, enc) + return int32Type{}.EstimateDecodeSize(numValues, src, enc) } func (t *dateType) AssignValue(dst reflect.Value, src Value) error { - return Int32Type.AssignValue(dst, src) + return int32Type{}.AssignValue(dst, src) } func (t *dateType) ConvertValue(val Value, typ Type) (Value, error) { - return Int32Type.ConvertValue(val, typ) + switch src := typ.(type) { + case *stringType: + return convertStringToDate(val, time.UTC) + case *timestampType: + return convertTimestampToDate(val, src.Unit, src.tz()) + } + return int32Type{}.ConvertValue(val, typ) } // TimeUnit represents units of time in the parquet type system. @@ -1613,69 +1789,41 @@ func Time(unit TimeUnit) Node { type timeType format.TimeType -func (t *timeType) useInt32() bool { - return t.Unit.Millis != nil -} - -func (t *timeType) useInt64() bool { - return t.Unit.Micros != nil -} - -func (t *timeType) String() string { - return (*format.TimeType)(t).String() -} - -func (t *timeType) Kind() Kind { - if t.useInt32() { - return Int32 +func (t *timeType) tz() *time.Location { + if t.IsAdjustedToUTC { + return time.UTC } else { - return Int64 + return time.Local } } -func (t *timeType) Length() int { +func (t *timeType) baseType() Type { if t.useInt32() { - return 32 + return int32Type{} } else { - return 64 + return int64Type{} } } -func (t *timeType) EstimateSize(n int) int { - if t.useInt32() { - return Int32Type.EstimateSize(n) - } else { - return Int64Type.EstimateNumValues(n) - } -} +func (t *timeType) useInt32() bool { return t.Unit.Millis != nil } -func (t *timeType) EstimateNumValues(n int) int { - if t.useInt32() { - return Int32Type.EstimateNumValues(n) - } else { - return Int64Type.EstimateNumValues(n) - } -} +func (t *timeType) useInt64() bool { return t.Unit.Micros != nil } -func (t *timeType) Compare(a, b Value) int { - if t.useInt32() { - return compareInt32(a.int32(), b.int32()) - } else { - return compareInt64(a.int64(), b.int64()) - } -} +func (t *timeType) String() string { return (*format.TimeType)(t).String() } -func (t *timeType) ColumnOrder() *format.ColumnOrder { - return &typeDefinedColumnOrder -} +func (t *timeType) Kind() Kind { return t.baseType().Kind() } -func (t *timeType) PhysicalType() *format.Type { - if t.useInt32() { - return &physicalTypes[Int32] - } else { - return &physicalTypes[Int64] - } -} +func (t *timeType) Length() int { return t.baseType().Length() } + +func (t *timeType) EstimateSize(n int) int { return t.baseType().EstimateSize(n) } + +func (t *timeType) EstimateNumValues(n int) int { return t.baseType().EstimateNumValues(n) } + +func (t *timeType) Compare(a, b Value) int { return t.baseType().Compare(a, b) } + +func (t *timeType) ColumnOrder() *format.ColumnOrder { return t.baseType().ColumnOrder() } + +func (t *timeType) PhysicalType() *format.Type { return t.baseType().PhysicalType() } func (t *timeType) LogicalType() *format.LogicalType { return &format.LogicalType{Time: (*format.TimeType)(t)} @@ -1693,83 +1841,59 @@ func (t *timeType) ConvertedType() *deprecated.ConvertedType { } func (t *timeType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - if t.useInt32() { - return newInt32ColumnIndexer() - } else { - return newInt64ColumnIndexer() - } + return t.baseType().NewColumnIndexer(sizeLimit) } func (t *timeType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - if t.useInt32() { - return newInt32ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) - } else { - return newInt64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) - } + return t.baseType().NewColumnBuffer(columnIndex, numValues) } func (t *timeType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - if t.useInt32() { - return newInt32Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) - } else { - return newInt64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) - } + return t.baseType().NewDictionary(columnIndex, numValues, data) } func (t *timeType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - if t.useInt32() { - return newInt32Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) - } else { - return newInt64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) - } + return t.baseType().NewPage(columnIndex, numValues, data) } -func (t *timeType) NewValues(values []byte, _ []uint32) encoding.Values { - if t.useInt32() { - return encoding.Int32ValuesFromBytes(values) - } else { - return encoding.Int64ValuesFromBytes(values) - } +func (t *timeType) NewValues(values []byte, offset []uint32) encoding.Values { + return t.baseType().NewValues(values, offset) } func (t *timeType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - if t.useInt32() { - return encoding.EncodeInt32(dst, src, enc) - } else { - return encoding.EncodeInt64(dst, src, enc) - } + return t.baseType().Encode(dst, src, enc) } func (t *timeType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - if t.useInt32() { - return encoding.DecodeInt32(dst, src, enc) - } else { - return encoding.DecodeInt64(dst, src, enc) - } + return t.baseType().Decode(dst, src, enc) } func (t *timeType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - if t.useInt32() { - return Int32Type.EstimateDecodeSize(numValues, src, enc) - } else { - return Int64Type.EstimateDecodeSize(numValues, src, enc) - } + return t.baseType().EstimateDecodeSize(numValues, src, enc) } func (t *timeType) AssignValue(dst reflect.Value, src Value) error { - if t.useInt32() { - return Int32Type.AssignValue(dst, src) - } else { - return Int64Type.AssignValue(dst, src) - } + return t.baseType().AssignValue(dst, src) } func (t *timeType) ConvertValue(val Value, typ Type) (Value, error) { - if t.useInt32() { - return Int32Type.ConvertValue(val, typ) - } else { - return Int64Type.ConvertValue(val, typ) + switch src := typ.(type) { + case *stringType: + tz := t.tz() + if t.Unit.Micros != nil { + return convertStringToTimeMicros(val, tz) + } else { + return convertStringToTimeMillis(val, tz) + } + case *timestampType: + tz := t.tz() + if t.Unit.Micros != nil { + return convertTimestampToTimeMicros(val, src.Unit, src.tz(), tz) + } else { + return convertTimestampToTimeMillis(val, src.Unit, src.tz(), tz) + } } + return t.baseType().ConvertValue(val, typ) } // Timestamp constructs of leaf node of TIMESTAMP logical type. @@ -1781,21 +1905,29 @@ func Timestamp(unit TimeUnit) Node { type timestampType format.TimestampType +func (t *timestampType) tz() *time.Location { + if t.IsAdjustedToUTC { + return time.UTC + } else { + return time.Local + } +} + func (t *timestampType) String() string { return (*format.TimestampType)(t).String() } -func (t *timestampType) Kind() Kind { return Int64 } +func (t *timestampType) Kind() Kind { return int64Type{}.Kind() } -func (t *timestampType) Length() int { return 64 } +func (t *timestampType) Length() int { return int64Type{}.Length() } -func (t *timestampType) EstimateSize(n int) int { return Int64Type.EstimateSize(n) } +func (t *timestampType) EstimateSize(n int) int { return int64Type{}.EstimateSize(n) } -func (t *timestampType) EstimateNumValues(n int) int { return Int64Type.EstimateNumValues(n) } +func (t *timestampType) EstimateNumValues(n int) int { return int64Type{}.EstimateNumValues(n) } -func (t *timestampType) Compare(a, b Value) int { return compareInt64(a.int64(), b.int64()) } +func (t *timestampType) Compare(a, b Value) int { return int64Type{}.Compare(a, b) } -func (t *timestampType) ColumnOrder() *format.ColumnOrder { return &typeDefinedColumnOrder } +func (t *timestampType) ColumnOrder() *format.ColumnOrder { return int64Type{}.ColumnOrder() } -func (t *timestampType) PhysicalType() *format.Type { return &physicalTypes[Int64] } +func (t *timestampType) PhysicalType() *format.Type { return int64Type{}.PhysicalType() } func (t *timestampType) LogicalType() *format.LogicalType { return &format.LogicalType{Timestamp: (*format.TimestampType)(t)} @@ -1813,35 +1945,35 @@ func (t *timestampType) ConvertedType() *deprecated.ConvertedType { } func (t *timestampType) NewColumnIndexer(sizeLimit int) ColumnIndexer { - return newInt64ColumnIndexer() + return int64Type{}.NewColumnIndexer(sizeLimit) } -func (t *timestampType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { - return newInt64ColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues)) +func (t *timestampType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { + return int64Type{}.NewDictionary(columnIndex, numValues, data) } -func (t *timestampType) NewDictionary(columnIndex, numValues int, data encoding.Values) Dictionary { - return newInt64Dictionary(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) +func (t *timestampType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer { + return int64Type{}.NewColumnBuffer(columnIndex, numValues) } func (t *timestampType) NewPage(columnIndex, numValues int, data encoding.Values) Page { - return newInt64Page(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data) + return int64Type{}.NewPage(columnIndex, numValues, data) } -func (t *timestampType) NewValues(values []byte, _ []uint32) encoding.Values { - return encoding.Int64ValuesFromBytes(values) +func (t *timestampType) NewValues(values []byte, offsets []uint32) encoding.Values { + return int64Type{}.NewValues(values, offsets) } func (t *timestampType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) { - return encoding.EncodeInt64(dst, src, enc) + return int64Type{}.Encode(dst, src, enc) } func (t *timestampType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) { - return encoding.DecodeInt64(dst, src, enc) + return int64Type{}.Decode(dst, src, enc) } func (t *timestampType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int { - return Int64Type.EstimateDecodeSize(numValues, src, enc) + return int64Type{}.EstimateDecodeSize(numValues, src, enc) } func (t *timestampType) AssignValue(dst reflect.Value, src Value) error { @@ -1865,37 +1997,18 @@ func (t *timestampType) AssignValue(dst reflect.Value, src Value) error { dst.Set(reflect.ValueOf(val)) return nil default: - return Int64Type.AssignValue(dst, src) + return int64Type{}.AssignValue(dst, src) } } func (t *timestampType) ConvertValue(val Value, typ Type) (Value, error) { - var sourceTs *format.TimestampType - if typ.LogicalType() != nil { - sourceTs = typ.LogicalType().Timestamp - } - - // Ignore when source is not a timestamp (i.e., Integer) - if sourceTs == nil { - return val, nil - } - - source := timeUnitDuration(sourceTs.Unit) - target := timeUnitDuration(t.Unit) - converted := val.int64() * source.Nanoseconds() / target.Nanoseconds() - - return ValueOf(converted), nil -} - -func timeUnitDuration(unit format.TimeUnit) time.Duration { - switch { - case unit.Millis != nil: - return time.Millisecond - case unit.Micros != nil: - return time.Microsecond - default: - return time.Nanosecond + switch src := typ.(type) { + case *timestampType: + return convertTimestampToTimestamp(val, src.Unit, t.Unit) + case *dateType: + return convertDateToTimestamp(val, t.Unit, t.tz()) } + return int64Type{}.ConvertValue(val, typ) } // List constructs a node of LIST logical type. diff --git a/value.go b/value.go index 6f7bad5a..e223d98c 100644 --- a/value.go +++ b/value.go @@ -452,9 +452,70 @@ func (v *Value) double() float64 { return math.Float64frombits(uint64(v. func (v *Value) uint32() uint32 { return uint32(v.u64) } func (v *Value) uint64() uint64 { return v.u64 } func (v *Value) byteArray() []byte { return unsafecast.Bytes(v.ptr, int(v.u64)) } +func (v *Value) string() string { return unsafecast.BytesToString(v.byteArray()) } func (v *Value) be128() *[16]byte { return (*[16]byte)(unsafe.Pointer(v.ptr)) } func (v *Value) column() int { return int(^v.columnIndex) } +func (v Value) convertToBoolean(x bool) Value { + v.kind = ^int8(Boolean) + v.ptr = nil + v.u64 = 0 + if x { + v.u64 = 1 + } + return v +} + +func (v Value) convertToInt32(x int32) Value { + v.kind = ^int8(Int32) + v.ptr = nil + v.u64 = uint64(x) + return v +} + +func (v Value) convertToInt64(x int64) Value { + v.kind = ^int8(Int64) + v.ptr = nil + v.u64 = uint64(x) + return v +} + +func (v Value) convertToInt96(x deprecated.Int96) Value { + i96 := makeValueInt96(x) + v.kind = i96.kind + v.ptr = i96.ptr + v.u64 = i96.u64 + return v +} + +func (v Value) convertToFloat(x float32) Value { + v.kind = ^int8(Float) + v.ptr = nil + v.u64 = uint64(math.Float32bits(x)) + return v +} + +func (v Value) convertToDouble(x float64) Value { + v.kind = ^int8(Double) + v.ptr = nil + v.u64 = math.Float64bits(x) + return v +} + +func (v Value) convertToByteArray(x []byte) Value { + v.kind = ^int8(ByteArray) + v.ptr = unsafecast.AddressOfBytes(x) + v.u64 = uint64(len(x)) + return v +} + +func (v Value) convertToFixedLenByteArray(x []byte) Value { + v.kind = ^int8(FixedLenByteArray) + v.ptr = unsafecast.AddressOfBytes(x) + v.u64 = uint64(len(x)) + return v +} + // Kind returns the kind of v, which represents its parquet physical type. func (v Value) Kind() Kind { return ^Kind(v.kind) } @@ -619,7 +680,6 @@ func (v Value) Format(w fmt.State, r rune) { fmt.Fprintf(w, "%+[1]c %+[1]d %+[1]r %+[1]s", v) case w.Flag('#'): v.formatGoString(w) - //fmt.Fprintf(w, "parquet.Value{%+[1]c, %+[1]d, %+[1]r, %+[1]s}", v) default: v.Format(w, 's') }