diff --git a/src/Quadmath.jl b/src/Quadmath.jl index de87b0f..57982b1 100644 --- a/src/Quadmath.jl +++ b/src/Quadmath.jl @@ -82,39 +82,20 @@ end # and passed on the xmm registers, matching the x86_64 ABI for __float128. const Cfloat128 = NTuple{2,VecElement{Float64}} -struct Float128 <: AbstractFloat - data::Cfloat128 - function Float128(data::Cfloat128) - new(data) - end -end +primitive type Float128 <: AbstractFloat 128 end +Float128(data::Cfloat128) = reinterpret(Float128, data) + convert(::Type{Float128}, x::Number) = Float128(x) const ComplexF128 = Complex{Float128} -Base.cconvert(::Type{Cfloat128}, x::Float128) = x.data -Base.cconvert(::Type{Ref{Cfloat128}}, x::Float128) = Ref{Cfloat128}(x.data) - +Base.cconvert(::Type{Cfloat128}, x::Float128) = reinterpret(Cfloat128, x) +Base.cconvert(::Type{Ref{Cfloat128}}, x::Float128) = Ref(Cfloat128(x)) # reinterpret -function reinterpret(::Type{UInt128}, x::Float128) - hi = reinterpret(UInt64, x.data[2].value) - lo = reinterpret(UInt64, x.data[1].value) - UInt128(hi) << 64 | lo -end -function reinterpret(::Type{Float128}, x::UInt128) - fhi = reinterpret(Float64, (x >> 64) % UInt64) - flo = reinterpret(Float64, x % UInt64) - Float128((VecElement(flo), VecElement(fhi))) -end reinterpret(::Type{Unsigned}, x::Float128) = reinterpret(UInt128, x) reinterpret(::Type{Signed}, x::Float128) = reinterpret(Int128, x) -reinterpret(::Type{Int128}, x::Float128) = - reinterpret(Int128, reinterpret(UInt128, x)) -reinterpret(::Type{Float128}, x::Int128) = - reinterpret(Float128, reinterpret(UInt128, x)) - sign_mask(::Type{Float128}) = 0x8000_0000_0000_0000_0000_0000_0000_0000 exponent_mask(::Type{Float128}) = 0x7fff_0000_0000_0000_0000_0000_0000_0000 exponent_one(::Type{Float128}) = 0x3fff_0000_0000_0000_0000_0000_0000_0000 @@ -136,21 +117,31 @@ inttype(::Type{Float128}) = Int128 # conversion Float128(x::Float128) = x +function Float128(x::T) where T <: Base.IEEEFloat + if x===zero(x) + return reinterpret(Float128, zero(UInt128)) + elseif x===-zero(x) + return reinterpret(Float128, sign_mask(Float128)) + else + s = UInt128(signbit(x))<<127 + if !isfinite(x) + xf = reinterpret(Unsigned, x) & significand_mask(T) + d = exponent_mask(Float128) + else + f, e = frexp(x) + xf = reinterpret(Unsigned, f) & significand_mask(T) + d = ((e+exponent_bias(Float128)-1) % UInt128) << significand_bits(Float128) + end + xu = UInt128(xf)<<(significand_bits(Float128)-significand_bits(T)) + return reinterpret(Float128, s|d|xu) + end +end -# Float64 -@assume_effects :foldable Float128(x::Float64) = - Float128(@quad_ccall(quadoplib.__extenddftf2(x::Cdouble)::Cfloat128)) +# truncation @assume_effects :foldable Float64(x::Float128) = @quad_ccall(quadoplib.__trunctfdf2(x::Cfloat128)::Cdouble) - -# Float32 -@assume_effects :foldable Float128(x::Float32) = - Float128(@quad_ccall(quadoplib.__extendsftf2(x::Cfloat)::Cfloat128)) @assume_effects :foldable Float32(x::Float128) = @quad_ccall(quadoplib.__trunctfsf2(x::Cfloat128)::Cfloat) - -# Float16 -Float128(x::Float16) = Float128(Float32(x)) Float16(x::Float128) = Float16(Float64(x)) # TODO: avoid double rounding # TwicePrecision @@ -158,67 +149,28 @@ Float128(x::Base.TwicePrecision{Float64}) = Float128(x.hi) + Float128(x.lo) # integer -> Float128 -@assume_effects :foldable Float128(x::Int32) = - Float128(@quad_ccall(quadoplib.__floatsitf(x::Int32)::Cfloat128)) - -@assume_effects :foldable Float128(x::UInt32) = - Float128(@quad_ccall(quadoplib.__floatunsitf(x::UInt32)::Cfloat128)) - -@assume_effects :foldable Float128(x::Int64) = - Float128(@quad_ccall(quadoplib.__floatditf(x::Int64)::Cfloat128)) - -@assume_effects :foldable Float128(x::UInt64) = - Float128(@quad_ccall(quadoplib.__floatunditf(x::UInt64)::Cfloat128)) - -Float128(x::Int16) = Float128(Int32(x)) -Float128(x::Int8) = Float128(Int32(x)) -Float128(x::UInt16) = Float128(UInt32(x)) -Float128(x::UInt8) = Float128(UInt32(x)) - -function Float128(x::UInt128) - x == 0 && return Float128(0.0) - n = 128-leading_zeros(x) # ndigits0z(x,2) - if n <= 113 +function _Float128_bits(x::T) where T<:Base.BitUnsigned + n = 8*sizeof(T)-leading_zeros(x) # Base.top_set_bit(x) + if n <= 113 # not enough significand to need to care about rounding y = ((x % UInt128) << (113-n)) & significand_mask(Float128) else y = ((x >> (n-114)) % UInt128) & 0x0001_ffff_ffff_ffff_ffff_ffff_ffff_ffff # keep 1 extra bit y = (y+1)>>1 # round, ties up (extra leading bit in case of next exponent) y &= ~UInt128(trailing_zeros(x) == (n-114)) # fix last bit to round to even end - d = ((n+16382) % UInt128) << 112 - # reinterpret(Float128, d + y) - d += y - if Sys.iswindows() - return reinterpret(Float128,d) - else - y1 = reinterpret(Float64,UInt64(d >> 64)) - y2 = reinterpret(Float64,(d % UInt64)) - return Float128((VecElement(y2),VecElement(y1))) - end + d = ((n+exponent_bias(Float128)-1) % UInt128) << significand_bits(Float128) + return d + y +end +function Float128(x::T) where T<:Base.BitUnsigned + iszero(x) && return reinterpret(Float128, UInt128(0)) + reinterpret(Float128, _Float128_bits(x)) end -function Float128(x::Int128) - x == 0 && return 0.0 - s = reinterpret(UInt128,x) & sign_mask(Float128) # sign bit - x = abs(x) % UInt128 - n = 128-leading_zeros(x) # ndigits0z(x,2) - if n <= 113 - y = ((x % UInt128) << (113-n)) & significand_mask(Float128) - else - y = ((x >> (n-114)) % UInt128) & 0x0001_ffff_ffff_ffff_ffff_ffff_ffff_ffff # keep 1 extra bit - y = (y+1)>>1 # round, ties up (extra leading bit in case of next exponent) - y &= ~UInt128(trailing_zeros(x) == (n-114)) # fix last bit to round to even - end - d = ((n+16382) % UInt128) << 112 - # reinterpret(Float128, s | d + y) - d = s | d + y - if Sys.iswindows() - return reinterpret(Float128,d) - else - y1 = reinterpret(Float64,UInt64(d >> 64)) - y2 = reinterpret(Float64,(d % UInt64)) - Float128((VecElement(y2),VecElement(y1))) - end +function Float128(x::T) where T<:Base.BitSigned + iszero(x) && return reinterpret(Float128, UInt128(0)) + s = UInt128(signbit(x)) << 127 # sign bit + ux = abs(x) % Unsigned # the % Unsigned doesn't care that abs(typemin) == typemin + reinterpret(Float128, s|_Float128_bits(ux)) end # Float128 -> integer requires arithmetic, so is below @@ -246,79 +198,87 @@ Float128(x::Bool) = x ? Float128(1) : Float128(0) @assume_effects :foldable (/)(x::Float128, y::Float128) = Float128(@quad_ccall(quadoplib.__divtf3(x::Cfloat128, y::Cfloat128)::Cfloat128)) -@assume_effects :foldable (-)(x::Float128) = - Float128(@quad_ccall(quadoplib.__negtf2(x::Cfloat128)::Cfloat128)) +function (-)(x::Float128) + # xor by sign + reinterpret(Float128, reinterpret(UInt128, x) ⊻ sign_mask(Float128)) +end # Float128 -> Integer -@assume_effects :foldable unsafe_trunc(::Type{Int32}, x::Float128) = - @quad_ccall(quadoplib.__fixtfsi(x::Cfloat128)::Int32) - -@assume_effects :foldable unsafe_trunc(::Type{Int64}, x::Float128) = - @quad_ccall(quadoplib.__fixtfdi(x::Cfloat128)::Int64) - -@assume_effects :foldable unsafe_trunc(::Type{UInt32}, x::Float128) = - @quad_ccall(quadoplib.__fixunstfsi(x::Cfloat128)::UInt32) - -@assume_effects :foldable unsafe_trunc(::Type{UInt64}, x::Float128) = - @quad_ccall(quadoplib.__fixunstfdi(x::Cfloat128)::UInt64) - -function unsafe_trunc(::Type{UInt128}, x::Float128) +function unsafe_trunc(::Type{T}, x::Float128) where T<:Base.BitUnsigned xu = reinterpret(UInt128,x) k = (Int64(xu >> 112) & 0x07fff) - 16382 - 113 xu = (xu & significand_mask(Float128)) | 0x0001_0000_0000_0000_0000_0000_0000_0000 if k <= 0 - UInt128(xu >> -k) + (xu >> -k) % T else - UInt128(xu) << k + (xu % T) << k end end -function unsafe_trunc(::Type{Int128}, x::Float128) - copysign(unsafe_trunc(UInt128,x) % Int128, x) +function unsafe_trunc(::Type{T}, x::Float128) where T<:Base.BitSigned + copysign(unsafe_trunc(unsigned(T),x) % T, x) end trunc(::Type{Signed}, x::Float128) = trunc(Int,x) trunc(::Type{Unsigned}, x::Float128) = trunc(Int,x) trunc(::Type{Integer}, x::Float128) = trunc(Int,x) -for Ti in (Int32, Int64, Int128, UInt32, UInt64, UInt128) - if Ti <: Unsigned || sizeof(Ti) < sizeof(Float128) - # Here `Float128(typemin(Ti))-1` is exact, so we can compare the lower-bound - # directly. `Float128(typemax(Ti))+1` is either always exactly representable, or - # rounded to `Inf` (e.g. when `Ti==UInt128 && Float128==Float32`). - @eval begin - function trunc(::Type{$Ti},x::Float128) - if Float128(typemin($Ti)) - one(Float128) < x < Float128(typemax($Ti)) + one(Float128) - return unsafe_trunc($Ti,x) - else - throw(InexactError(:trunc, $Ti, x)) - end +for Ti in (UInt8, UInt16, UInt32, UInt64, UInt128) + # Here `Float128(typemin(Ti))` is 0, so we can check the sign of the float + # `Float128(typemax(Ti))+1` is exactly representable + @eval begin + function trunc(::Type{$Ti},x::Float128) + if !signbit(x) && x < Float128(typemax($Ti)) + one(Float128) + return unsafe_trunc($Ti,x) + else + throw(InexactError(:trunc, $Ti, x)) end - function (::Type{$Ti})(x::Float128) - if (Float128(typemin($Ti)) <= x <= Float128(typemax($Ti))) && (round(x, RoundToZero) == x) - return unsafe_trunc($Ti,x) - else - throw(InexactError($(Expr(:quote,Ti.name.name)), $Ti, x)) - end + end + function (::Type{$Ti})(x::Float128) + if (!signbit(x) && x <= Float128(typemax($Ti))) && (round(x, RoundToZero) == x) + return unsafe_trunc($Ti,x) + else + throw(InexactError($(Expr(:quote,Ti.name.name)), $Ti, x)) end end - else - # Here `eps(Float128(typemin(Ti))) > 1`, so the only value which can be truncated to - # `Float128(typemin(Ti)` is itself. Similarly, `Float128(typemax(Ti))` is inexact and will - # be rounded up. This assumes that `Float128(typemin(Ti)) > -Inf`, which is true for - # these types, but not for `Float16` or larger integer types. - @eval begin - function trunc(::Type{$Ti},x::Float128) - if Float128(typemin($Ti)) <= x < Float128(typemax($Ti)) - return unsafe_trunc($Ti,x) - else - throw(InexactError(:trunc, $Ti, x)) - end + end +end +for Ti in (Int8, Int16, Int32, Int64) + # Here `Float128(typemin(Ti))-1` is exact, so we can compare the lower-bound + # directly. `Float128(typemax(Ti))+1` is always exactly representable + @eval begin + function trunc(::Type{$Ti},x::Float128) + if Float128(typemin($Ti)) - one(Float128) < x < Float128(typemax($Ti)) + one(Float128) + return unsafe_trunc($Ti,x) + else + throw(InexactError(:trunc, $Ti, x)) end - function (::Type{$Ti})(x::Float128) - if (Float128(typemin($Ti)) <= x < Float128(typemax($Ti))) && (round(x, RoundToZero) == x) - return unsafe_trunc($Ti,x) - else - throw(InexactError($(Expr(:quote,Ti.name.name)), $Ti, x)) - end + end + function (::Type{$Ti})(x::Float128) + if (Float128(typemin($Ti)) <= x <= Float128(typemax($Ti))) && (round(x, RoundToZero) == x) + return unsafe_trunc($Ti,x) + else + throw(InexactError($(Expr(:quote,Ti.name.name)), $Ti, x)) + end + end + end +end +for Ti in (Int128,) + # Here `eps(Float128(typemin(Ti))) > 1`, so the only value which can be truncated to + # `Float128(typemin(Ti)` is itself. Similarly, `Float128(typemax(Ti))` is inexact and will + # be rounded up. This assumes that `Float128(typemin(Ti)) > -Inf`, which is true for + # Int128, but possibly not for larger integer types. + @eval begin + function trunc(::Type{$Ti},x::Float128) + if Float128(typemin($Ti)) <= x < Float128(typemax($Ti)) + return unsafe_trunc($Ti,x) + else + throw(InexactError(:trunc, $Ti, x)) + end + end + function (::Type{$Ti})(x::Float128) + if (Float128(typemin($Ti)) <= x < Float128(typemax($Ti))) && (round(x, RoundToZero) == x) + return unsafe_trunc($Ti,x) + else + throw(InexactError($(Expr(:quote,Ti.name.name)), $Ti, x)) end end end @@ -336,7 +296,10 @@ for f in (:acos, :acosh, :asin, :asinh, :atan, :atanh, :cosh, :cos, end end -@assume_effects :foldable abs(x::Float128) = Float128(@quad_ccall(libquadmath.fabsq(x::Cfloat128)::Cfloat128)) +function abs(x::Float128) + # mask out sign + reinterpret(Float128, reinterpret(UInt128, x)&(~sign_mask(Float128))) +end @assume_effects :foldable round(x::Float128) = Float128(@quad_ccall(libquadmath.rintq(x::Cfloat128)::Cfloat128)) round(x::Float128, r::RoundingMode{:Down}) = floor(x) round(x::Float128, r::RoundingMode{:Up}) = ceil(x) @@ -368,6 +331,16 @@ Base.Integer(x::Float128) = Int(x) sincos(x::Float128) = (sin(x), cos(x)) ## misc + +""" + Inf128 + +Positive infinity of type [`Float128`](@ref). +""" +const Inf128 = reinterpret(Float128, exponent_mask(Float128)) +typemax(::Type{Float128}) = Inf128 +typemin(::Type{Float128}) = -Inf128 + @static if !Sys.iswindows() # disable fma on Windows until rounding mode issue fixed # https://github.com/JuliaMath/Quadmath.jl/issues/31 @@ -375,9 +348,14 @@ sincos(x::Float128) = (sin(x), cos(x)) Float128(@quad_ccall(libquadmath.fmaq(x::Cfloat128, y::Cfloat128, z::Cfloat128)::Cfloat128)) end -@assume_effects :foldable isnan(x::Float128) = 0 != @quad_ccall(libquadmath.isnanq(x::Cfloat128)::Cint) -@assume_effects :foldable isinf(x::Float128) = 0 != @quad_ccall(libquadmath.isinfq(x::Cfloat128)::Cint) -@assume_effects :foldable isfinite(x::Float128) = 0 != @quad_ccall(libquadmath.finiteq(x::Cfloat128)::Cint) +function isinf(x::Float128) + return x===Inf128 || x===-Inf128 +end +function isfinite(x::Float128) + xu = reinterpret(UInt128, x) + return xu>>significand_bits(Float128)&0x7fff != 0x7fff +end +isnan(x::Float128) = !(isfinite(x) || isinf(x)) isinteger(x::Float128) = isfinite(x) && x === trunc(x) @@ -392,15 +370,6 @@ floatmin(::Type{Float128}) = reinterpret(Float128, 0x0001_0000_0000_0000_0000_00 floatmax(::Type{Float128}) = reinterpret(Float128, 0x7ffe_ffff_ffff_ffff_ffff_ffff_ffff_ffff) maxintfloat(::Type{Float128}) = Float128(0x0002_0000_0000_0000_0000_0000_0000_0000) -""" - Inf128 - -Positive infinity of type [`Float128`](@ref). -""" -const Inf128 = reinterpret(Float128, 0x7fff_0000_0000_0000_0000_0000_0000_0000) -typemax(::Type{Float128}) = Inf128 -typemin(::Type{Float128}) = -Inf128 - @assume_effects :foldable ldexp(x::Float128, n::Cint) = Float128(@quad_ccall(libquadmath.ldexpq(x::Cfloat128, n::Cint)::Cfloat128)) ldexp(x::Float128, n::Integer) = @@ -481,20 +450,10 @@ end function BigFloat(x::Float128; precision=precision(BigFloat)) if !isfinite(x) || iszero(x) - @static if VERSION < v"1.1" - return BigFloat(Float64(x), precision) - else - return BigFloat(Float64(x), precision=precision) - end + return BigFloat(Float64(x), precision=precision) end - @static if VERSION < v"1.1" - b = setprecision(BigFloat, max(precision,113)) do - BigFloat() - end - else - b = BigFloat(precision=max(precision,113)) - end + b = BigFloat(precision=max(precision,113)) y, k = frexp(x) b.exp = Clong(k) @@ -514,16 +473,7 @@ function BigFloat(x::Float128; precision=precision(BigFloat)) end if precision < 113 - @static if VERSION < v"1.1" - b2 = setprecision(BigFloat, precision) do - BigFloat() - end - ccall((:mpfr_set, :libmpfr), Int32, (Ref{BigFloat}, Ref{BigFloat}, Int32), - b2, b, MPFR.ROUNDING_MODE[]) - return b2 - else - return BigFloat(b, precision=precision) - end + return BigFloat(b, precision=precision) else return b end @@ -545,16 +495,7 @@ function Float128(x::BigFloat) z = reinterpret(Float128, UInt128(0)) end - @static if VERSION < v"1.1" - y = setprecision(BigFloat, prec) do - BigFloat() - end - ccall((:mpfr_set, :libmpfr), Int32, (Ref{BigFloat}, Ref{BigFloat}, Int32), - y, x, MPFR.ROUNDING_MODE[]) - else - y = BigFloat(x, precision=prec) - end - + y = BigFloat(x, precision=prec) u = zero(UInt128) i = cld(prec, sizeof(MPFR.Limb)*8) j = 113 @@ -574,13 +515,7 @@ function BigInt(x::Float128) BigInt(BigFloat(x, precision=precision(Float128))) end function Float128(x::BigInt) - @static if VERSION < v"1.1" - y = setprecision(BigFloat, precision(Float128)) do - BigFloat(x) - end - else - y = BigFloat(x, precision=precision(Float128)) - end + y = BigFloat(x, precision=precision(Float128)) Float128(y) end diff --git a/src/printf.jl b/src/printf.jl index a9a1e0e..5b7ff4c 100644 --- a/src/printf.jl +++ b/src/printf.jl @@ -1,60 +1,3 @@ - - -if VERSION < v"1.6.0-beta" - import Printf: ini_dec, fix_dec, ini_hex, ini_HEX - - function fp128_printf(out, d::Float128, flags::String, width::Int, precision::Int, c::Char, digits) - fmt_len = sizeof(flags)+4 - if width > 0 - fmt_len += ndigits(width) - end - if precision >= 0 - fmt_len += ndigits(precision)+1 - end - fmt = IOBuffer(maxsize=fmt_len) - print(fmt, '%') - print(fmt, flags) - if width > 0 - print(fmt, width) - end - if precision == 0 - print(fmt, '.') - print(fmt, '0') - elseif precision > 0 - print(fmt, '.') - print(fmt, precision) - end - print(fmt, 'Q') - print(fmt, c) - write(fmt, UInt8(0)) - printf_fmt = take!(fmt) - @assert length(printf_fmt) == fmt_len - bufsiz = length(digits) - lng = @ccall(libquadmath.quadmath_snprintf(digits::Ptr{UInt8}, bufsiz::Csize_t, printf_fmt::Ptr{UInt8}, d::(Cfloat128...))::Cint) - lng > 0 || error("invalid printf formatting for Float128") - unsafe_write(out, pointer(digits), min(lng, bufsiz-1)) - return (false, ()) - end - - if VERSION < v"1.1" - using Base.Grisu: DIGITSs - fix_dec(out, d::Float128, flags::String, width::Int, precision::Int, c::Char) = fp128_printf(out, d, flags, width, precision, c, DIGITSs[Threads.threadid()]) - ini_dec(out, d::Float128, ndigits::Int, flags::String, width::Int, precision::Int, c::Char) = fp128_printf(out, d, flags, width, precision, c, DIGITSs[Threads.threadid()]) - ini_hex(out, d::Float128, ndigits::Int, flags::String, width::Int, precision::Int, c::Char) = fp128_printf(out, d, flags, width, precision, c, DIGITSs[Threads.threadid()]) - ini_HEX(out, d::Float128, ndigits::Int, flags::String, width::Int, precision::Int, c::Char) = fp128_printf(out, d, flags, width, precision, c, DIGITSs[Threads.threadid()]) - ini_hex(out, d::Float128, flags::String, width::Int, precision::Int, c::Char) = fp128_printf(out, d, flags, width, precision, c, DIGITSs[Threads.threadid()]) - ini_HEX(out, d::Float128, flags::String, width::Int, precision::Int, c::Char) = fp128_printf(out, d, flags, width, precision, c, DIGITSs[Threads.threadid()]) - else - fix_dec(out, d::Float128, flags::String, width::Int, precision::Int, c::Char, digits) = fp128_printf(out, d, flags, width, precision, c, digits) - ini_dec(out, d::Float128, ndigits::Int, flags::String, width::Int, precision::Int, c::Char, digits) = fp128_printf(out, d, flags, width, precision, c, digits) - ini_hex(out, d::Float128, ndigits::Int, flags::String, width::Int, precision::Int, c::Char, digits) = fp128_printf(out, d, flags, width, precision, c, digits) - ini_HEX(out, d::Float128, ndigits::Int, flags::String, width::Int, precision::Int, c::Char, digits) = fp128_printf(out, d, flags, width, precision, c, digits) - ini_hex(out, d::Float128, flags::String, width::Int, precision::Int, c::Char, digits) = fp128_printf(out, d, flags, width, precision, c, digits) - ini_HEX(out, d::Float128, flags::String, width::Int, precision::Int, c::Char, digits) = fp128_printf(out, d, flags, width, precision, c, digits) - end -else - # Julia v1.6+ - # placeholder - import Printf - Printf.tofloat(x::Float128) = BigFloat(x) -end +# placeholder +import Printf +Printf.tofloat(x::Float128) = BigFloat(x)