diff --git a/src/binary16.rs b/src/binary16.rs index bda9147..41affe5 100644 --- a/src/binary16.rs +++ b/src/binary16.rs @@ -54,10 +54,13 @@ impl f16 { /// Exponents that underflow the minimum 16-bit exponent will result in /// 16-bit subnormals or ±0. All other values are truncated and rounded /// to the nearest representable 16-bit value. + /// + /// This will prefer correctness over speed. Currently, this always + /// uses an intrinsic if available. #[inline] #[must_use] pub fn from_f32(value: f32) -> f16 { - f16(arch::f32_to_f16(value)) + Self::from_f32_instrinsic(value) } /// Constructs a 16-bit floating point value from a 32-bit floating point @@ -80,6 +83,21 @@ impl f16 { f16(arch::f32_to_f16_fallback(value)) } + /// Constructs a 16-bit floating point value from a 32-bit floating point + /// value. + /// + /// This operation is lossy. If the 32-bit value is too large to fit in + /// 16-bits, ±∞ will result. NaN values are preserved. 32-bit subnormal + /// values are too tiny to be represented in 16-bits and result in ±0. + /// Exponents that underflow the minimum 16-bit exponent will result in + /// 16-bit subnormals or ±0. All other values are truncated and rounded + /// to the nearest representable 16-bit value. + #[inline] + #[must_use] + pub fn from_f32_instrinsic(value: f32) -> f16 { + f16(arch::f32_to_f16(value)) + } + /// Constructs a 16-bit floating point value from a 64-bit floating point /// value. /// @@ -89,10 +107,18 @@ impl f16 { /// Exponents that underflow the minimum 16-bit exponent will result in /// 16-bit subnormals or ±0. All other values are truncated and rounded /// to the nearest representable 16-bit value. + /// + /// This will prefer correctness over speed: on x86 systems, this currently + /// uses a software rather than an intrinsic implementation. #[inline] #[must_use] pub fn from_f64(value: f64) -> f16 { - f16(arch::f64_to_f16(value)) + // FIXME: Once `_mm_cvtpd_ph` is stabilized, move to using the intrinsic. 
+ if cfg!(any(target_arch = "x86", target_arch = "x86_64")) { + Self::from_f64_const(value) + } else { + Self::from_f64_instrinsic(value) + } } /// Constructs a 16-bit floating point value from a 64-bit floating point @@ -115,6 +141,25 @@ impl f16 { f16(arch::f64_to_f16_fallback(value)) } + /// Constructs a 16-bit floating point value from a 64-bit floating point + /// value. + /// + /// This operation is lossy. If the 64-bit value is too large to fit in + /// 16-bits, ±∞ will result. NaN values are preserved. 64-bit subnormal + /// values are too tiny to be represented in 16-bits and result in ±0. + /// Exponents that underflow the minimum 16-bit exponent will result in + /// 16-bit subnormals or ±0. All other values are truncated and rounded + /// to the nearest representable 16-bit value. + /// + /// This prefers to use vendor intrinsics if possible; otherwise, it + /// goes to a fallback. On x86 and x86_64, this can be more lossy than + /// `from_f64`. + #[inline] + #[must_use] + pub fn from_f64_instrinsic(value: f64) -> f16 { + f16(arch::f64_to_f16(value)) + } + /// Converts a [`struct@f16`] into the underlying bit representation. #[inline] #[must_use] @@ -238,10 +283,13 @@ impl f16 { /// /// This conversion is lossless as all 16-bit floating point values can be /// represented exactly in 32-bit floating point. + /// + /// This will prefer correctness over speed. Currently, this always + /// uses an intrinsic if available. #[inline] #[must_use] pub fn to_f32(self) -> f32 { - arch::f16_to_f32(self.0) + self.to_f32_intrinsic() } /// Converts a [`struct@f16`] value into a `f32` value. @@ -259,6 +307,16 @@ impl f16 { arch::f16_to_f32_fallback(self.0) } + /// Converts a [`struct@f16`] value into a `f32` value. + /// + /// This conversion is lossless as all 16-bit floating point values can be + /// represented exactly in 32-bit floating point. 
+ #[inline] + #[must_use] + pub fn to_f32_intrinsic(self) -> f32 { + arch::f16_to_f32(self.0) + } + /// Convert the data to an `f32` type, used for numerical operations. #[inline(always)] pub fn as_f32(self) -> f32 { @@ -275,10 +333,13 @@ impl f16 { /// /// This conversion is lossless as all 16-bit floating point values can be /// represented exactly in 64-bit floating point. + /// + /// This will prefer correctness over speed: on x86 systems, this currently + /// uses a software rather than an intrinsic implementation. #[inline] #[must_use] pub fn to_f64(self) -> f64 { - arch::f16_to_f64(self.0) + self.to_f64_const() } /// Converts a [`struct@f16`] value into a `f64` value. @@ -296,6 +357,16 @@ impl f16 { arch::f16_to_f64_fallback(self.0) } + /// Converts a [`struct@f16`] value into a `f64` value. + /// + /// This conversion is lossless as all 16-bit floating point values can be + /// represented exactly in 64-bit floating point. + #[inline] + #[must_use] + pub fn to_f64_intrinsic(self) -> f64 { + arch::f16_to_f64(self.0) + } + /// Convert the data to an `f64` type, used for numerical operations. #[inline(always)] pub fn as_f64(self) -> f64 { @@ -1395,8 +1466,8 @@ const fn ge(lhs: f16, rhs: f16) -> bool { #[allow(clippy::cognitive_complexity, clippy::float_cmp, clippy::neg_cmp_op_on_partial_ord)] #[cfg(test)] mod test { - #[allow(unused_imports)] use core::cmp::Ordering; + use core::mem; use super::*; @@ -1808,4 +1879,33 @@ mod test { assert_eq!(f16::from_f32(4.) / f16::from_f32(2.), f16::from_f32(2.)); assert_eq!(f16::from_f32(4.) % f16::from_f32(3.), f16::from_f32(1.)); } + + #[test] + fn issue_116() { + // SEE: https://github.com/starkat99/half-rs/issues/116 + // This is lossy until `_mm_cvtpd_ph` is stable on x86. 
+ let max_diff = if cfg!(any(target_arch = "x86", target_arch = "x86_64")) { + 1 + } else { + 0 + }; + + // from the round-to-even section of the test case + let x: f64 = unsafe { mem::transmute(0x3f0ffbfffffffffcu64) }; + let bits = f16::from_f64(x).to_bits(); + let const_bits = f16::from_f64_const(x).to_bits(); + let inst_bits = f16::from_f64_instrinsic(x).to_bits(); + assert_eq!(const_bits, bits); + assert!(inst_bits.abs_diff(bits) <= max_diff); + + // from the double rounding section of the test case + // comment from the cpython test case: should be 2047, if double-rounded + // 64>32>16, becomes 2048 + let x: f64 = unsafe { mem::transmute(0x409ffdffffff0000u64) }; + let bits = f16::from_f64(x).to_bits(); + let const_bits = f16::from_f64_const(x).to_bits(); + let inst_bits = f16::from_f64_instrinsic(x).to_bits(); + assert_eq!(const_bits, bits); + assert!(inst_bits.abs_diff(bits) <= max_diff); + } } diff --git a/src/binary16/arch.rs b/src/binary16/arch.rs index 3aa533d..bf59b33 100644 --- a/src/binary16/arch.rs +++ b/src/binary16/arch.rs @@ -81,7 +81,7 @@ pub(crate) fn f32_to_f16(f: f32) -> u16 { pub(crate) fn f64_to_f16(f: f64) -> u16 { convert_fn! { if x86_feature("f16c") { - unsafe { x86::f32_to_f16_x86_f16c(f as f32) } + unsafe { x86::f64_to_f16_x86_f16c(f) } } else if aarch64_feature("fp16") { unsafe { aarch64::f64_to_f16_fp16(f) } } else { @@ -107,7 +107,7 @@ pub(crate) fn f16_to_f32(i: u16) -> f32 { pub(crate) fn f16_to_f64(i: u16) -> f64 { convert_fn! 
{ if x86_feature("f16c") { - unsafe { x86::f16_to_f32_x86_f16c(i) as f64 } + unsafe { x86::f16_to_f64_x86_f16c(i) } } else if aarch64_feature("fp16") { unsafe { aarch64::f16_to_f64_fp16(i) } } else { diff --git a/src/binary16/arch/x86.rs b/src/binary16/arch/x86.rs index 8fe781b..4f7373f 100644 --- a/src/binary16/arch/x86.rs +++ b/src/binary16/arch/x86.rs @@ -41,6 +41,13 @@ pub(super) unsafe fn f16_to_f32_x86_f16c(i: u16) -> f32 { *(&retval as *const __m128).cast() } +#[inline] +#[target_feature(enable = "f16c")] +pub(super) unsafe fn f16_to_f64_x86_f16c(i: u16) -> f64 { + // FIXME: Change to use `_mm_cvtph_pd` when stable. + f16_to_f32_x86_f16c(i) as f64 +} + #[inline] #[target_feature(enable = "f16c")] pub(super) unsafe fn f32_to_f16_x86_f16c(f: f32) -> u16 { @@ -50,6 +57,13 @@ pub(super) unsafe fn f32_to_f16_x86_f16c(f: f32) -> u16 { *(&retval as *const __m128i).cast() } +#[inline] +#[target_feature(enable = "f16c")] +pub(super) unsafe fn f64_to_f16_x86_f16c(f: f64) -> u16 { + // FIXME: Change to use `_mm_cvtpd_ph` when stable. + f32_to_f16_x86_f16c(f as f32) +} + #[inline] #[target_feature(enable = "f16c")] pub(super) unsafe fn f16x4_to_f32x4_x86_f16c(v: &[u16; 4]) -> [f32; 4] {