diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 00a4414..95a5364 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -11,7 +11,7 @@ on:
 env:
   RUSTFLAGS: -Dwarnings
   RUST_BACKTRACE: 1
-  nightly: nightly-2021-04-13
+  nightly: nightly-2022-11-12
 
 defaults:
   run:
@@ -23,11 +23,11 @@ jobs:
     name: rustfmt
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Install Rust
-        run: rustup update stable && rustup default stable
+        run: rustup update stable
       - name: Check formatting
-        run: cargo fmt --all -- --check
+        run: cargo fmt --all --check
 
   # TODO
   #
   # Apply clippy lints
@@ -35,7 +35,7 @@ jobs:
   #   name: clippy
   #   runs-on: ubuntu-latest
   #   steps:
-  #     - uses: actions/checkout@v2
+  #     - uses: actions/checkout@v4
   #     - name: Apply clippy lints
   #       run: cargo clippy --all-features
 
@@ -48,11 +48,11 @@
     name: minrust
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - name: Install Rust
-        run: rustup update 1.39.0 && rustup default 1.39.0
+      - uses: actions/checkout@v4
+      - name: Install cargo-hack
+        uses: taiki-e/install-action@cargo-hack
       - name: Check
-        run: . ci/test-stable.sh check
+        run: ci/test-stable.sh check
 
   # Stable
   stable:
@@ -65,23 +65,23 @@
         - windows-latest
     runs-on: ${{ matrix.os }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Install Rust
         # --no-self-update is necessary because the windows environment cannot self-update rustup.exe.
-        run: rustup update stable --no-self-update && rustup default stable
+        run: rustup update stable --no-self-update
       - name: Test
-        run: . ci/test-stable.sh test
+        run: ci/test-stable.sh test
 
   # Nightly
   nightly:
     name: nightly
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Install Rust
         run: rustup update $nightly && rustup default $nightly
       - name: Test
-        run: . ci/test-stable.sh test
+        run: ci/test-stable.sh test
 
   # Run tests on some extra platforms
   cross:
@@ -96,9 +96,9 @@
         - wasm32-unknown-unknown
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Install Rust
-        run: rustup update stable && rustup default stable
+        run: rustup update stable
       - name: cross build --target ${{ matrix.target }}
        run: |
          cargo install cross
@@ -111,23 +111,61 @@
          cargo build --target ${{ matrix.target }}
        if: matrix.target == 'wasm32-unknown-unknown'
 
+  # Build for no_std environment.
+  no-std:
+    strategy:
+      fail-fast: false
+      matrix:
+        # thumbv7m-none-eabi supports atomic CAS.
+        # thumbv6m-none-eabi supports atomic, but not atomic CAS.
+        # riscv32i-unknown-none-elf does not support atomic at all.
+        target:
+          - thumbv7m-none-eabi
+          - thumbv6m-none-eabi
+          - riscv32i-unknown-none-elf
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Rust
+        run: rustup update stable
+      - name: Install cargo-hack
+        run: cargo install cargo-hack
+      - run: rustup target add ${{ matrix.target }}
+      # * --optional-deps is needed for serde feature
+      # * --no-dev-deps is needed to avoid https://github.com/rust-lang/cargo/issues/4866
+      - run: cargo hack build --target ${{ matrix.target }} --feature-powerset --skip std,default --optional-deps --no-dev-deps
+
+  # When this job fails, run ci/no_atomic_cas.sh and commit the resulting changes.
+  # TODO(taiki-e): Ideally, this should be automated using a bot that creates
+  #                a PR when it fails, but there is no bandwidth to implement it
+  #                right now...
+  codegen:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Rust
+        run: rustup update nightly && rustup default nightly
+      - run: ci/no_atomic_cas.sh
+      - run: git diff --exit-code
+
   # Sanitizers
   tsan:
     name: tsan
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Install Rust
         run: rustup update $nightly && rustup default $nightly
       - name: Install rust-src
         run: rustup component add rust-src
       - name: ASAN / TSAN
-        run: . ci/tsan.sh
+        run: ci/tsan.sh
+
   miri:
     name: miri
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Miri
         run: ci/miri.sh
 
@@ -136,7 +174,7 @@
     name: loom
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Install Rust
         run: rustup update $nightly && rustup default $nightly
       - name: Loom tests
@@ -155,9 +193,9 @@
         - loom
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Install Rust
-        run: rustup update stable && rustup default stable
+        run: rustup update $nightly && rustup default $nightly
       - name: Build documentation
         run: cargo doc --no-deps --all-features
         env:
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..7ada586
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/rt-bytes.iml b/.idea/rt-bytes.iml
new file mode 100644
index 0000000..2052624
--- /dev/null
+++ b/.idea/rt-bytes.iml
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 99975bf..2335717 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,135 @@
+# 1.6.0 (March 22, 2024)
+
+### Added
+
+- Add `Bytes::is_unique` (#643)
+
+### Documented
+
+- Fix changelog typo (#628)
+- Fix some spelling mistakes (#633)
+- Typo fix (#637)
+- Fix broken links (#639)
+- Add security policy (#649)
+
+### Internal changes
+
+- Move comment to correct constant (#629)
+- Various cleanup (#635)
+- Simplify `UninitSlice::as_uninit_slice_mut()` logic (#644)
+- Use `self.` instead of `Self::` (#642)
+- `BytesMut`: Assert alignment of `Shared` (#652)
+- Remove unnecessary namespace qualifier (#660)
+- Remove an unnecessary else branch (#662)
+- Remove unreachable else branch (#661)
+- make parameter mut in `From` (#667)
+- Restore commented tests (#665)
+- Use `sub` instead of `offset` (#668)
+- Calculate original capacity only if necessary (#666)
+- `set_vec_pos` does not need a second parameter (#672)
+- `get_vec_pos`: use `&self` instead of `&mut self` (#670)
+- Refactor `split_at`/`split_to` (#663)
+- Use `Iterator` from the prelude (#673)
+- `copy_to_bytes`: Add panic section to docs (#676)
+- Remove redundant reserve call (#674)
+- Use `ManuallyDrop` instead of `mem::forget` (#675)
+
+
+# 1.5.0 (September 7, 2023)
+
+### Added
+
+- Add `UninitSlice::{new,uninit}` (#598, #599)
+- Implement `BufMut` for `&mut [MaybeUninit<u8>]` (#597)
+
+### Changed
+
+- Mark `BytesMut::extend_from_slice` as inline (#595)
+
+# 1.4.0 (January 31, 2023)
+
+### Added
+
+- Make `IntoIter` constructor public (#581)
+
+### Fixed
+
+- Avoid large reallocations when freezing `BytesMut` (#592)
+
+### Documented
+
+- Document which functions require `std` (#591)
+- Fix duplicate "the the" typos (#585)
+
+# 1.3.0 (November 20, 2022)
+
+### Added
+
+- Rename and expose `BytesMut::spare_capacity_mut` (#572)
+- Implement native-endian get and put functions for `Buf` and `BufMut` (#576)
+
+### Fixed
+
+- Don't have important data in unused capacity when calling reserve (#563)
+
+### Documented
+
+- `Bytes::new` etc should return `Self` not `Bytes` (#568)
+
+# 1.2.1 (July 30, 2022)
+
+### Fixed
+
+- Fix unbounded memory growth when using `reserve` (#560)
+
+# 1.2.0 (July 19, 2022)
+
+### Added
+
+- Add `BytesMut::zeroed` (#517)
+- Implement `Extend<Bytes>` for `BytesMut` (#527)
+- Add conversion from `BytesMut` to `Vec<u8>` (#543, #554)
+- Add conversion from `Bytes` to `Vec<u8>` (#547)
+- Add `UninitSlice::as_uninit_slice_mut()` (#548)
+- Add const to `Bytes::{len,is_empty}` (#514)
+
+### Changed
+
+- Reuse vector in `BytesMut::reserve` (#539, #544)
+
+### Fixed
+
+- Make miri happy (#515, #523, #542, #545, #553)
+- Make tsan happy (#541)
+- Fix `remaining_mut()` on chain (#488)
+- Fix amortized asymptotics of `BytesMut` (#555)
+
+### Documented
+
+- Redraw layout diagram with box drawing characters (#539)
+- Clarify `BytesMut::unsplit` docs (#535)
+
+# 1.1.0 (August 25, 2021)
+
+### Added
+
+- `BufMut::put_bytes(self, val, cnt)` (#487)
+- Implement `From<Box<[u8]>>` for `Bytes` (#504)
+
+### Changed
+
+- Override `put_slice` for `&mut [u8]` (#483)
+- Panic on integer overflow in `Chain::remaining` (#482)
+- Add inline tags to `UninitSlice` methods (#443)
+- Override `copy_to_bytes` for Chain and Take (#481)
+- Keep capacity when unsplit on empty other buf (#502)
+
+### Documented
+
+- Clarify `BufMut` allocation guarantees (#501)
+- Clarify `BufMut::put_int` behavior (#486)
+- Clarify actions of `clear` and `truncate`. (#508)
+
 # 1.0.1 (January 11, 2021)
 
 ### Changed
diff --git a/Cargo.toml b/Cargo.toml
index 34d70f8..13a8c4f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,8 +3,10 @@
 name = "bytes"
 # When releasing to crates.io:
 # - Update CHANGELOG.md.
-# - Create "v1.0.x" git tag.
-version = "1.0.1"
+# - Create "v1.x.y" git tag.
+version = "1.6.0"
+edition = "2018"
+rust-version = "1.39"
 license = "MIT"
 authors = [
     "Carl Lerche <me@carllerche.com>",
@@ -15,7 +17,6 @@
 repository = "https://github.com/tokio-rs/bytes"
 readme = "README.md"
 keywords = ["buffers", "zero-copy", "io"]
 categories = ["network-programming", "data-structures"]
-edition = "2018"
 
 [features]
 default = ["std"]
@@ -23,12 +24,13 @@
 std = []
 
 [dependencies]
 serde = { version = "1.0.60", optional = true, default-features = false, features = ["alloc"] }
+portable-atomic = { version = "1.6.0", default-features = false }
 
 [dev-dependencies]
 serde_test = "1.0"
 
 [target.'cfg(loom)'.dev-dependencies]
-loom = "0.5"
+loom = "0.7"
 
 [package.metadata.docs.rs]
 rustdoc-args = ["--cfg", "docsrs"]
diff --git a/README.md b/README.md
index 468485d..9e42a4d 100644
--- a/README.md
+++ b/README.md
@@ -1,47 +1,13 @@
-# Bytes
+# Patch
 
-A utility library for working with bytes.
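The `portable-atomic` dependency added to `Cargo.toml` above is the heart of the patch: on targets whose build lacks atomic compare-and-swap, API-compatible types from `portable-atomic` stand in for the ones in `core::sync::atomic`. A minimal sketch of how such a polyfill is typically wired up, using the `bytes_no_atomic_cas` cfg that `build.rs` (later in this diff) emits; the fork's actual module layout is an assumption here, not shown in the patch:

```rust
// Hypothetical alias module; real code would live in something like src/loom.rs.
// `bytes_no_atomic_cas` is emitted by build.rs for targets without CAS support.
#[cfg(not(bytes_no_atomic_cas))]
pub(crate) use core::sync::atomic::{AtomicPtr, AtomicUsize, Ordering};
#[cfg(bytes_no_atomic_cas)]
pub(crate) use portable_atomic::{AtomicPtr, AtomicUsize, Ordering};

pub(crate) fn ref_count_inc(refcount: &AtomicUsize) -> usize {
    // A read-modify-write like fetch_add is exactly what thumbv6m-none-eabi
    // cannot do natively; portable-atomic supplies it through its
    // critical-section or single-core fallbacks.
    refcount.fetch_add(1, Ordering::Relaxed)
}
```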
+> **_NOTE:_** This patch adds an atomic-CAS polyfill to `bytes` for targets like thumbv6m-none-eabi until the following PR/issue is resolved:
+>
+> * https://github.com/tokio-rs/bytes/pull/467
+> * https://github.com/tokio-rs/bytes/issues/461
 
-[![Crates.io][crates-badge]][crates-url]
-[![Build Status][ci-badge]][ci-url]
+The patch can be applied by adding the following segment to the root `Cargo.toml`:
 
-[crates-badge]: https://img.shields.io/crates/v/bytes.svg
-[crates-url]: https://crates.io/crates/bytes
-[ci-badge]: https://github.com/tokio-rs/bytes/workflows/CI/badge.svg
-[ci-url]: https://github.com/tokio-rs/bytes/actions
-
-[Documentation](https://docs.rs/bytes)
-
-## Usage
-
-To use `bytes`, first add this to your `Cargo.toml`:
-
-```toml
-[dependencies]
-bytes = "1"
-```
-
-Next, add this to your crate:
-
-```rust
-use bytes::{Bytes, BytesMut, Buf, BufMut};
-```
-
-## Serde support
-
-Serde support is optional and disabled by default. To enable use the feature `serde`.
-
-```toml
-[dependencies]
-bytes = { version = "1", features = ["serde"] }
-```
-
-## License
-
-This project is licensed under the [MIT license](LICENSE).
-
-### Contribution
-
-Unless you explicitly state otherwise, any contribution intentionally submitted
-for inclusion in `bytes` by you, shall be licensed as MIT, without any additional
-terms or conditions.
+```toml
+[patch.crates-io]
+bytes = { git = "ssh://git@github.com/pegasus-aero/rt-bytes.git", branch = "cfg_target_has_atomic_v1.6.0" }
+```
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 0000000..b74a831
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,9 @@
+# Security Policy
+
+Bytes is part of the Tokio project and uses the same security policy as [Tokio][tokio-security].
+
+## Report a security issue
+
+The process for reporting an issue is the same as for [Tokio][tokio-security]. This includes private reporting via security@tokio.rs.
+
+[tokio-security]: https://github.com/tokio-rs/tokio/security/policy
diff --git a/benches/buf.rs b/benches/buf.rs
index 6dc8516..616d187 100644
--- a/benches/buf.rs
+++ b/benches/buf.rs
@@ -46,7 +46,7 @@ impl TestBuf {
 }
 impl Buf for TestBuf {
     fn remaining(&self) -> usize {
-        return self.buf.len() - self.pos;
+        self.buf.len() - self.pos
     }
     fn advance(&mut self, cnt: usize) {
         self.pos += cnt;
diff --git a/benches/bytes.rs b/benches/bytes.rs
index c5b8412..8782d00 100644
--- a/benches/bytes.rs
+++ b/benches/bytes.rs
@@ -47,7 +47,7 @@ fn clone_static(b: &mut Bencher) {
 
     b.iter(|| {
         for _ in 0..1024 {
-            test::black_box(&bytes.clone());
+            test::black_box(test::black_box(&bytes).clone());
         }
     })
 }
@@ -58,7 +58,7 @@ fn clone_shared(b: &mut Bencher) {
 
     b.iter(|| {
         for _ in 0..1024 {
-            test::black_box(&bytes.clone());
+            test::black_box(test::black_box(&bytes).clone());
         }
     })
 }
@@ -70,7 +70,7 @@ fn clone_arc_vec(b: &mut Bencher) {
 
     b.iter(|| {
         for _ in 0..1024 {
-            test::black_box(&bytes.clone());
+            test::black_box(test::black_box(&bytes).clone());
         }
     })
 }
@@ -88,6 +88,7 @@ fn from_long_slice(b: &mut Bencher) {
 #[bench]
 fn slice_empty(b: &mut Bencher) {
     b.iter(|| {
+        // `clone` is to convert to ARC
        let b = Bytes::from(vec![17; 1024]).clone();
        for i in 0..1000 {
            test::black_box(b.slice(i % 100..i % 100));
diff --git a/build.rs b/build.rs
new file mode 100644
index 0000000..b71e3ca
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,31 @@
+#![warn(rust_2018_idioms)]
+
+use std::env;
+
+include!("no_atomic_cas.rs");
+
+// The rustc-cfg strings below are *not* public API.
Please let us know by +// opening a GitHub issue if your build environment requires some way to enable +// these cfgs other than by executing our build script. +fn main() { + let target = match env::var("TARGET") { + Ok(target) => target, + Err(e) => { + println!( + "cargo:warning=bytes: unable to get TARGET environment variable: {}", + e + ); + return; + } + }; + + // Note that this is `no_*`, not `has_*`. This allows treating + // `cfg(target_has_atomic = "ptr")` as true when the build script doesn't + // run. This is needed for compatibility with non-cargo build systems that + // don't run the build script. + if NO_ATOMIC_CAS.contains(&&*target) { + println!("cargo:rustc-cfg=bytes_no_atomic_cas"); + } + + println!("cargo:rerun-if-changed=no_atomic_cas.rs"); +} diff --git a/ci/miri.sh b/ci/miri.sh index 88d2b6a..0158756 100755 --- a/ci/miri.sh +++ b/ci/miri.sh @@ -1,11 +1,11 @@ #!/bin/bash set -e -MIRI_NIGHTLY=nightly-$(curl -s https://rust-lang.github.io/rustup-components-history/x86_64-unknown-linux-gnu/miri) -echo "Installing latest nightly with Miri: $MIRI_NIGHTLY" -rustup set profile minimal -rustup default "$MIRI_NIGHTLY" -rustup component add miri +rustup toolchain install nightly --component miri +rustup override set nightly +cargo miri setup + +export MIRIFLAGS="-Zmiri-strict-provenance" cargo miri test cargo miri test --target mips64-unknown-linux-gnuabi64 diff --git a/ci/no_atomic_cas.sh b/ci/no_atomic_cas.sh new file mode 100755 index 0000000..bc2e350 --- /dev/null +++ b/ci/no_atomic_cas.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# Update the list of targets that do not support atomic CAS operations. +# +# Usage: +# ./ci/no_atomic_cas.sh + +set -euo pipefail +IFS=$'\n\t' + +cd "$(cd "$(dirname "$0")" && pwd)"/.. + +file="no_atomic_cas.rs" + +{ + echo "// This file is @generated by $(basename "$0")." + echo "// It is not intended for manual editing." + echo "" +} >"$file" + +echo "const NO_ATOMIC_CAS: &[&str] = &[" >>"$file" +for target in $(rustc --print target-list); do + res=$(rustc --print target-spec-json -Z unstable-options --target "$target" \ + | jq -r "select(.\"atomic-cas\" == false)") + [[ -z "$res" ]] || echo " \"$target\"," >>"$file" +done +echo "];" >>"$file" diff --git a/ci/test-stable.sh b/ci/test-stable.sh old mode 100644 new mode 100755 index 01a32f5..ad97574 --- a/ci/test-stable.sh +++ b/ci/test-stable.sh @@ -4,9 +4,6 @@ set -ex cmd="${1:-test}" -# Install cargo-hack for feature flag test -cargo install cargo-hack - # Run with each feature # * --each-feature includes both default/no-default features # * --optional-deps is needed for serde feature @@ -14,14 +11,15 @@ cargo hack "${cmd}" --each-feature --optional-deps # Run with all features cargo "${cmd}" --all-features -cargo doc --no-deps --all-features - if [[ "${RUST_VERSION}" == "nightly"* ]]; then # Check benchmarks cargo check --benches # Check minimal versions - cargo clean - cargo update -Zminimal-versions + # Remove dev-dependencies from Cargo.toml to prevent the next `cargo update` + # from determining minimal versions based on dev-dependencies. + cargo hack --remove-dev-deps --workspace + # Update Cargo.lock to minimal version dependencies. 
+ cargo update -Z minimal-versions cargo check --all-features fi diff --git a/ci/tsan.sh b/ci/tsan.sh old mode 100644 new mode 100755 diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 0000000..53095b1 --- /dev/null +++ b/clippy.toml @@ -0,0 +1 @@ +msrv = "1.39" diff --git a/no_atomic_cas.rs b/no_atomic_cas.rs new file mode 100644 index 0000000..9b05d4b --- /dev/null +++ b/no_atomic_cas.rs @@ -0,0 +1,13 @@ +// This file is @generated by no_atomic_cas.sh. +// It is not intended for manual editing. + +const NO_ATOMIC_CAS: &[&str] = &[ + "avr-unknown-gnu-atmega328", + "bpfeb-unknown-none", + "bpfel-unknown-none", + "msp430-none-elf", + "riscv32i-unknown-none-elf", + "riscv32imc-unknown-none-elf", + "thumbv4t-none-eabi", + "thumbv6m-none-eabi", +]; diff --git a/src/buf/buf_impl.rs b/src/buf/buf_impl.rs index a33c8a4..79cbf62 100644 --- a/src/buf/buf_impl.rs +++ b/src/buf/buf_impl.rs @@ -1,8 +1,9 @@ #[cfg(feature = "std")] use crate::buf::{reader, Reader}; use crate::buf::{take, Chain, Take}; - -use core::{cmp, mem, ptr}; +#[cfg(feature = "std")] +use crate::{min_u64_usize, saturating_sub_usize_u64}; +use crate::{panic_advance, panic_does_not_fit}; #[cfg(feature = "std")] use std::io::IoSlice; @@ -11,7 +12,12 @@ use alloc::boxed::Box; macro_rules! buf_get_impl { ($this:ident, $typ:tt::$conv:tt) => {{ - const SIZE: usize = mem::size_of::<$typ>(); + const SIZE: usize = core::mem::size_of::<$typ>(); + + if $this.remaining() < SIZE { + panic_advance(SIZE, $this.remaining()); + } + // try to convert directly from the bytes // this Option trick is to avoid keeping a borrow on self // when advance() is called (mut borrow) and to call bytes() only once @@ -32,19 +38,30 @@ macro_rules! buf_get_impl { } }}; (le => $this:ident, $typ:tt, $len_to_read:expr) => {{ - debug_assert!(mem::size_of::<$typ>() >= $len_to_read); + const SIZE: usize = core::mem::size_of::<$typ>(); // The same trick as above does not improve the best case speed. 
// It seems to be linked to the way the method is optimised by the compiler - let mut buf = [0; (mem::size_of::<$typ>())]; - $this.copy_to_slice(&mut buf[..($len_to_read)]); + let mut buf = [0; SIZE]; + + let subslice = match buf.get_mut(..$len_to_read) { + Some(subslice) => subslice, + None => panic_does_not_fit(SIZE, $len_to_read), + }; + + $this.copy_to_slice(subslice); return $typ::from_le_bytes(buf); }}; (be => $this:ident, $typ:tt, $len_to_read:expr) => {{ - debug_assert!(mem::size_of::<$typ>() >= $len_to_read); + const SIZE: usize = core::mem::size_of::<$typ>(); + + let slice_at = match SIZE.checked_sub($len_to_read) { + Some(slice_at) => slice_at, + None => panic_does_not_fit(SIZE, $len_to_read), + }; - let mut buf = [0; (mem::size_of::<$typ>())]; - $this.copy_to_slice(&mut buf[mem::size_of::<$typ>() - ($len_to_read)..]); + let mut buf = [0; SIZE]; + $this.copy_to_slice(&mut buf[slice_at..]); return $typ::from_be_bytes(buf); }}; } @@ -160,6 +177,7 @@ pub trait Buf { /// /// [`writev`]: http://man7.org/linux/man-pages/man2/readv.2.html #[cfg(feature = "std")] + #[cfg_attr(docsrs, doc(cfg(feature = "std")))] fn chunks_vectored<'a>(&'a self, dst: &mut [IoSlice<'a>]) -> usize { if dst.is_empty() { return 0; @@ -246,23 +264,18 @@ pub trait Buf { /// /// # Panics /// - /// This function panics if `self.remaining() < dst.len()` - fn copy_to_slice(&mut self, dst: &mut [u8]) { - let mut off = 0; - - assert!(self.remaining() >= dst.len()); - - while off < dst.len() { - let cnt; - - unsafe { - let src = self.chunk(); - cnt = cmp::min(src.len(), dst.len() - off); + /// This function panics if `self.remaining() < dst.len()`. + fn copy_to_slice(&mut self, mut dst: &mut [u8]) { + if self.remaining() < dst.len() { + panic_advance(dst.len(), self.remaining()); + } - ptr::copy_nonoverlapping(src.as_ptr(), dst[off..].as_mut_ptr(), cnt); + while !dst.is_empty() { + let src = self.chunk(); + let cnt = usize::min(src.len(), dst.len()); - off += cnt; - } + dst[..cnt].copy_from_slice(&src[..cnt]); + dst = &mut dst[cnt..]; self.advance(cnt); } @@ -285,7 +298,9 @@ pub trait Buf { /// /// This function panics if there is no more remaining data in `self`. fn get_u8(&mut self) -> u8 { - assert!(self.remaining() >= 1); + if self.remaining() < 1 { + panic_advance(1, 0); + } let ret = self.chunk()[0]; self.advance(1); ret @@ -308,7 +323,9 @@ pub trait Buf { /// /// This function panics if there is no more remaining data in `self`. fn get_i8(&mut self) -> i8 { - assert!(self.remaining() >= 1); + if self.remaining() < 1 { + panic_advance(1, 0); + } let ret = self.chunk()[0] as i8; self.advance(1); ret @@ -354,6 +371,29 @@ pub trait Buf { buf_get_impl!(self, u16::from_le_bytes); } + /// Gets an unsigned 16 bit integer from `self` in native-endian byte order. + /// + /// The current position is advanced by 2. + /// + /// # Examples + /// + /// ``` + /// use bytes::Buf; + /// + /// let mut buf: &[u8] = match cfg!(target_endian = "big") { + /// true => b"\x08\x09 hello", + /// false => b"\x09\x08 hello", + /// }; + /// assert_eq!(0x0809, buf.get_u16_ne()); + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining data in `self`. + fn get_u16_ne(&mut self) -> u16 { + buf_get_impl!(self, u16::from_ne_bytes); + } + /// Gets a signed 16 bit integer from `self` in big-endian byte order. /// /// The current position is advanced by 2. @@ -394,6 +434,29 @@ pub trait Buf { buf_get_impl!(self, i16::from_le_bytes); } + /// Gets a signed 16 bit integer from `self` in native-endian byte order. 
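All of the `_ne` getters introduced here share one contract: bytes are interpreted in the host's byte order, which is why the doc-tests above pick their input with `cfg!(target_endian = "big")`. The same property can be checked portably by generating the input with `to_ne_bytes`; a small check that runs on either endianness (assumes bytes ≥ 1.3, where the `_ne` methods landed):

```rust
use bytes::Buf;

fn main() {
    // to_ne_bytes and get_u16_ne agree on byte order by definition, so this
    // round-trips identically on big- and little-endian hosts.
    let raw = 0x0809u16.to_ne_bytes();
    let mut buf: &[u8] = &raw;
    assert_eq!(0x0809, buf.get_u16_ne());
    assert_eq!(0, buf.remaining()); // the cursor advanced by 2
}
```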
+ /// + /// The current position is advanced by 2. + /// + /// # Examples + /// + /// ``` + /// use bytes::Buf; + /// + /// let mut buf: &[u8] = match cfg!(target_endian = "big") { + /// true => b"\x08\x09 hello", + /// false => b"\x09\x08 hello", + /// }; + /// assert_eq!(0x0809, buf.get_i16_ne()); + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining data in `self`. + fn get_i16_ne(&mut self) -> i16 { + buf_get_impl!(self, i16::from_ne_bytes); + } + /// Gets an unsigned 32 bit integer from `self` in the big-endian byte order. /// /// The current position is advanced by 4. @@ -434,6 +497,29 @@ pub trait Buf { buf_get_impl!(self, u32::from_le_bytes); } + /// Gets an unsigned 32 bit integer from `self` in native-endian byte order. + /// + /// The current position is advanced by 4. + /// + /// # Examples + /// + /// ``` + /// use bytes::Buf; + /// + /// let mut buf: &[u8] = match cfg!(target_endian = "big") { + /// true => b"\x08\x09\xA0\xA1 hello", + /// false => b"\xA1\xA0\x09\x08 hello", + /// }; + /// assert_eq!(0x0809A0A1, buf.get_u32_ne()); + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining data in `self`. + fn get_u32_ne(&mut self) -> u32 { + buf_get_impl!(self, u32::from_ne_bytes); + } + /// Gets a signed 32 bit integer from `self` in big-endian byte order. /// /// The current position is advanced by 4. @@ -474,6 +560,29 @@ pub trait Buf { buf_get_impl!(self, i32::from_le_bytes); } + /// Gets a signed 32 bit integer from `self` in native-endian byte order. + /// + /// The current position is advanced by 4. + /// + /// # Examples + /// + /// ``` + /// use bytes::Buf; + /// + /// let mut buf: &[u8] = match cfg!(target_endian = "big") { + /// true => b"\x08\x09\xA0\xA1 hello", + /// false => b"\xA1\xA0\x09\x08 hello", + /// }; + /// assert_eq!(0x0809A0A1, buf.get_i32_ne()); + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining data in `self`. + fn get_i32_ne(&mut self) -> i32 { + buf_get_impl!(self, i32::from_ne_bytes); + } + /// Gets an unsigned 64 bit integer from `self` in big-endian byte order. /// /// The current position is advanced by 8. @@ -514,6 +623,29 @@ pub trait Buf { buf_get_impl!(self, u64::from_le_bytes); } + /// Gets an unsigned 64 bit integer from `self` in native-endian byte order. + /// + /// The current position is advanced by 8. + /// + /// # Examples + /// + /// ``` + /// use bytes::Buf; + /// + /// let mut buf: &[u8] = match cfg!(target_endian = "big") { + /// true => b"\x01\x02\x03\x04\x05\x06\x07\x08 hello", + /// false => b"\x08\x07\x06\x05\x04\x03\x02\x01 hello", + /// }; + /// assert_eq!(0x0102030405060708, buf.get_u64_ne()); + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining data in `self`. + fn get_u64_ne(&mut self) -> u64 { + buf_get_impl!(self, u64::from_ne_bytes); + } + /// Gets a signed 64 bit integer from `self` in big-endian byte order. /// /// The current position is advanced by 8. @@ -554,6 +686,29 @@ pub trait Buf { buf_get_impl!(self, i64::from_le_bytes); } + /// Gets a signed 64 bit integer from `self` in native-endian byte order. + /// + /// The current position is advanced by 8. 
+ /// + /// # Examples + /// + /// ``` + /// use bytes::Buf; + /// + /// let mut buf: &[u8] = match cfg!(target_endian = "big") { + /// true => b"\x01\x02\x03\x04\x05\x06\x07\x08 hello", + /// false => b"\x08\x07\x06\x05\x04\x03\x02\x01 hello", + /// }; + /// assert_eq!(0x0102030405060708, buf.get_i64_ne()); + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining data in `self`. + fn get_i64_ne(&mut self) -> i64 { + buf_get_impl!(self, i64::from_ne_bytes); + } + /// Gets an unsigned 128 bit integer from `self` in big-endian byte order. /// /// The current position is advanced by 16. @@ -594,6 +749,29 @@ pub trait Buf { buf_get_impl!(self, u128::from_le_bytes); } + /// Gets an unsigned 128 bit integer from `self` in native-endian byte order. + /// + /// The current position is advanced by 16. + /// + /// # Examples + /// + /// ``` + /// use bytes::Buf; + /// + /// let mut buf: &[u8] = match cfg!(target_endian = "big") { + /// true => b"\x01\x02\x03\x04\x05\x06\x07\x08\x09\x10\x11\x12\x13\x14\x15\x16 hello", + /// false => b"\x16\x15\x14\x13\x12\x11\x10\x09\x08\x07\x06\x05\x04\x03\x02\x01 hello", + /// }; + /// assert_eq!(0x01020304050607080910111213141516, buf.get_u128_ne()); + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining data in `self`. + fn get_u128_ne(&mut self) -> u128 { + buf_get_impl!(self, u128::from_ne_bytes); + } + /// Gets a signed 128 bit integer from `self` in big-endian byte order. /// /// The current position is advanced by 16. @@ -634,6 +812,29 @@ pub trait Buf { buf_get_impl!(self, i128::from_le_bytes); } + /// Gets a signed 128 bit integer from `self` in native-endian byte order. + /// + /// The current position is advanced by 16. + /// + /// # Examples + /// + /// ``` + /// use bytes::Buf; + /// + /// let mut buf: &[u8] = match cfg!(target_endian = "big") { + /// true => b"\x01\x02\x03\x04\x05\x06\x07\x08\x09\x10\x11\x12\x13\x14\x15\x16 hello", + /// false => b"\x16\x15\x14\x13\x12\x11\x10\x09\x08\x07\x06\x05\x04\x03\x02\x01 hello", + /// }; + /// assert_eq!(0x01020304050607080910111213141516, buf.get_i128_ne()); + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining data in `self`. + fn get_i128_ne(&mut self) -> i128 { + buf_get_impl!(self, i128::from_ne_bytes); + } + /// Gets an unsigned n-byte integer from `self` in big-endian byte order. /// /// The current position is advanced by `nbytes`. @@ -674,6 +875,34 @@ pub trait Buf { buf_get_impl!(le => self, u64, nbytes); } + /// Gets an unsigned n-byte integer from `self` in native-endian byte order. + /// + /// The current position is advanced by `nbytes`. + /// + /// # Examples + /// + /// ``` + /// use bytes::Buf; + /// + /// let mut buf: &[u8] = match cfg!(target_endian = "big") { + /// true => b"\x01\x02\x03 hello", + /// false => b"\x03\x02\x01 hello", + /// }; + /// assert_eq!(0x010203, buf.get_uint_ne(3)); + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining data in `self`, or + /// if `nbytes` is greater than 8. + fn get_uint_ne(&mut self, nbytes: usize) -> u64 { + if cfg!(target_endian = "big") { + self.get_uint(nbytes) + } else { + self.get_uint_le(nbytes) + } + } + /// Gets a signed n-byte integer from `self` in big-endian byte order. /// /// The current position is advanced by `nbytes`. @@ -689,7 +918,8 @@ pub trait Buf { /// /// # Panics /// - /// This function panics if there is not enough remaining data in `self`. 
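`panic_advance` and `panic_does_not_fit`, called throughout these hunks, are defined in `src/lib.rs` and therefore never appear in this diff. A sketch of their assumed shape, matching the `(requested, available)` and `(size, nbytes)` argument orders at the getters' call sites; the crate's exact messages may differ:

```rust
// Cold panic shims keep the panic-formatting machinery out of the
// get_*/put_* hot paths (assumed shape of the helpers in src/lib.rs).
#[cold]
fn panic_advance(requested: usize, available: usize) -> ! {
    panic!(
        "advance out of bounds: the len is {} but advancing by {}",
        available, requested
    );
}

#[cold]
fn panic_does_not_fit(size: usize, nbytes: usize) -> ! {
    panic!(
        "size too large: the integer type can fit {} bytes, but nbytes is {}",
        size, nbytes
    );
}
```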
+ /// This function panics if there is not enough remaining data in `self`, or + /// if `nbytes` is greater than 8. fn get_int(&mut self, nbytes: usize) -> i64 { buf_get_impl!(be => self, i64, nbytes); } @@ -709,11 +939,40 @@ pub trait Buf { /// /// # Panics /// - /// This function panics if there is not enough remaining data in `self`. + /// This function panics if there is not enough remaining data in `self`, or + /// if `nbytes` is greater than 8. fn get_int_le(&mut self, nbytes: usize) -> i64 { buf_get_impl!(le => self, i64, nbytes); } + /// Gets a signed n-byte integer from `self` in native-endian byte order. + /// + /// The current position is advanced by `nbytes`. + /// + /// # Examples + /// + /// ``` + /// use bytes::Buf; + /// + /// let mut buf: &[u8] = match cfg!(target_endian = "big") { + /// true => b"\x01\x02\x03 hello", + /// false => b"\x03\x02\x01 hello", + /// }; + /// assert_eq!(0x010203, buf.get_int_ne(3)); + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining data in `self`, or + /// if `nbytes` is greater than 8. + fn get_int_ne(&mut self, nbytes: usize) -> i64 { + if cfg!(target_endian = "big") { + self.get_int(nbytes) + } else { + self.get_int_le(nbytes) + } + } + /// Gets an IEEE754 single-precision (4 bytes) floating point number from /// `self` in big-endian byte order. /// @@ -732,7 +991,7 @@ pub trait Buf { /// /// This function panics if there is not enough remaining data in `self`. fn get_f32(&mut self) -> f32 { - f32::from_bits(Self::get_u32(self)) + f32::from_bits(self.get_u32()) } /// Gets an IEEE754 single-precision (4 bytes) floating point number from @@ -753,7 +1012,31 @@ pub trait Buf { /// /// This function panics if there is not enough remaining data in `self`. fn get_f32_le(&mut self) -> f32 { - f32::from_bits(Self::get_u32_le(self)) + f32::from_bits(self.get_u32_le()) + } + + /// Gets an IEEE754 single-precision (4 bytes) floating point number from + /// `self` in native-endian byte order. + /// + /// The current position is advanced by 4. + /// + /// # Examples + /// + /// ``` + /// use bytes::Buf; + /// + /// let mut buf: &[u8] = match cfg!(target_endian = "big") { + /// true => b"\x3F\x99\x99\x9A hello", + /// false => b"\x9A\x99\x99\x3F hello", + /// }; + /// assert_eq!(1.2f32, buf.get_f32_ne()); + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining data in `self`. + fn get_f32_ne(&mut self) -> f32 { + f32::from_bits(self.get_u32_ne()) } /// Gets an IEEE754 double-precision (8 bytes) floating point number from @@ -774,7 +1057,7 @@ pub trait Buf { /// /// This function panics if there is not enough remaining data in `self`. fn get_f64(&mut self) -> f64 { - f64::from_bits(Self::get_u64(self)) + f64::from_bits(self.get_u64()) } /// Gets an IEEE754 double-precision (8 bytes) floating point number from @@ -795,7 +1078,31 @@ pub trait Buf { /// /// This function panics if there is not enough remaining data in `self`. fn get_f64_le(&mut self) -> f64 { - f64::from_bits(Self::get_u64_le(self)) + f64::from_bits(self.get_u64_le()) + } + + /// Gets an IEEE754 double-precision (8 bytes) floating point number from + /// `self` in native-endian byte order. + /// + /// The current position is advanced by 8. 
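The float getters stay thin wrappers over the integer reads; the diff only switches them from `Self::get_u32(self)` to plain method-call syntax and adds `_ne` variants built the same way. The equivalence is directly observable:

```rust
use bytes::Buf;

fn main() {
    let raw = 1.2f32.to_be_bytes();
    let (mut a, mut b): (&[u8], &[u8]) = (&raw, &raw);
    // get_f32 is defined as f32::from_bits(self.get_u32()).
    assert_eq!(a.get_f32(), f32::from_bits(b.get_u32()));
}
```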
+ /// + /// # Examples + /// + /// ``` + /// use bytes::Buf; + /// + /// let mut buf: &[u8] = match cfg!(target_endian = "big") { + /// true => b"\x3F\xF3\x33\x33\x33\x33\x33\x33 hello", + /// false => b"\x33\x33\x33\x33\x33\x33\xF3\x3F hello", + /// }; + /// assert_eq!(1.2f64, buf.get_f64_ne()); + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining data in `self`. + fn get_f64_ne(&mut self) -> f64 { + f64::from_bits(self.get_u64_ne()) } /// Consumes `len` bytes inside self and returns new instance of `Bytes` @@ -813,10 +1120,16 @@ pub trait Buf { /// let bytes = (&b"hello world"[..]).copy_to_bytes(5); /// assert_eq!(&bytes[..], &b"hello"[..]); /// ``` + /// + /// # Panics + /// + /// This function panics if `len > self.remaining()`. fn copy_to_bytes(&mut self, len: usize) -> crate::Bytes { use super::BufMut; - assert!(len <= self.remaining(), "`len` greater than remaining"); + if self.remaining() < len { + panic_advance(len, self.remaining()); + } let mut ret = crate::BytesMut::with_capacity(len); ret.put(self.take(len)); @@ -897,6 +1210,7 @@ pub trait Buf { /// assert_eq!(&dst[..11], &b"hello world"[..]); /// ``` #[cfg(feature = "std")] + #[cfg_attr(docsrs, doc(cfg(feature = "std")))] fn reader(self) -> Reader where Self: Sized, @@ -907,103 +1221,169 @@ pub trait Buf { macro_rules! deref_forward_buf { () => { + #[inline] fn remaining(&self) -> usize { (**self).remaining() } + #[inline] fn chunk(&self) -> &[u8] { (**self).chunk() } #[cfg(feature = "std")] + #[inline] fn chunks_vectored<'b>(&'b self, dst: &mut [IoSlice<'b>]) -> usize { (**self).chunks_vectored(dst) } + #[inline] fn advance(&mut self, cnt: usize) { (**self).advance(cnt) } + #[inline] fn has_remaining(&self) -> bool { (**self).has_remaining() } + #[inline] fn copy_to_slice(&mut self, dst: &mut [u8]) { (**self).copy_to_slice(dst) } + #[inline] fn get_u8(&mut self) -> u8 { (**self).get_u8() } + #[inline] fn get_i8(&mut self) -> i8 { (**self).get_i8() } + #[inline] fn get_u16(&mut self) -> u16 { (**self).get_u16() } + #[inline] fn get_u16_le(&mut self) -> u16 { (**self).get_u16_le() } + #[inline] + fn get_u16_ne(&mut self) -> u16 { + (**self).get_u16_ne() + } + + #[inline] fn get_i16(&mut self) -> i16 { (**self).get_i16() } + #[inline] fn get_i16_le(&mut self) -> i16 { (**self).get_i16_le() } + #[inline] + fn get_i16_ne(&mut self) -> i16 { + (**self).get_i16_ne() + } + + #[inline] fn get_u32(&mut self) -> u32 { (**self).get_u32() } + #[inline] fn get_u32_le(&mut self) -> u32 { (**self).get_u32_le() } + #[inline] + fn get_u32_ne(&mut self) -> u32 { + (**self).get_u32_ne() + } + + #[inline] fn get_i32(&mut self) -> i32 { (**self).get_i32() } + #[inline] fn get_i32_le(&mut self) -> i32 { (**self).get_i32_le() } + #[inline] + fn get_i32_ne(&mut self) -> i32 { + (**self).get_i32_ne() + } + + #[inline] fn get_u64(&mut self) -> u64 { (**self).get_u64() } + #[inline] fn get_u64_le(&mut self) -> u64 { (**self).get_u64_le() } + #[inline] + fn get_u64_ne(&mut self) -> u64 { + (**self).get_u64_ne() + } + + #[inline] fn get_i64(&mut self) -> i64 { (**self).get_i64() } + #[inline] fn get_i64_le(&mut self) -> i64 { (**self).get_i64_le() } + #[inline] + fn get_i64_ne(&mut self) -> i64 { + (**self).get_i64_ne() + } + + #[inline] fn get_uint(&mut self, nbytes: usize) -> u64 { (**self).get_uint(nbytes) } + #[inline] fn get_uint_le(&mut self, nbytes: usize) -> u64 { (**self).get_uint_le(nbytes) } + #[inline] + fn get_uint_ne(&mut self, nbytes: usize) -> u64 { + (**self).get_uint_ne(nbytes) + } + + 
#[inline]
     fn get_int(&mut self, nbytes: usize) -> i64 {
         (**self).get_int(nbytes)
     }
 
+    #[inline]
     fn get_int_le(&mut self, nbytes: usize) -> i64 {
         (**self).get_int_le(nbytes)
     }
 
+    #[inline]
+    fn get_int_ne(&mut self, nbytes: usize) -> i64 {
+        (**self).get_int_ne(nbytes)
+    }
+
+    #[inline]
     fn copy_to_bytes(&mut self, len: usize) -> crate::Bytes {
         (**self).copy_to_bytes(len)
     }
@@ -1031,41 +1411,52 @@ impl Buf for &[u8] {
     #[inline]
     fn advance(&mut self, cnt: usize) {
+        if self.len() < cnt {
+            panic_advance(cnt, self.len());
+        }
+
         *self = &self[cnt..];
     }
+
+    #[inline]
+    fn copy_to_slice(&mut self, dst: &mut [u8]) {
+        if self.len() < dst.len() {
+            panic_advance(dst.len(), self.len());
+        }
+
+        dst.copy_from_slice(&self[..dst.len()]);
+        self.advance(dst.len());
+    }
 }
 
 #[cfg(feature = "std")]
 impl<T: AsRef<[u8]>> Buf for std::io::Cursor<T> {
+    #[inline]
     fn remaining(&self) -> usize {
-        let len = self.get_ref().as_ref().len();
-        let pos = self.position();
-
-        if pos >= len as u64 {
-            return 0;
-        }
-
-        len - pos as usize
+        saturating_sub_usize_u64(self.get_ref().as_ref().len(), self.position())
     }
 
+    #[inline]
     fn chunk(&self) -> &[u8] {
+        let slice = self.get_ref().as_ref();
+        let pos = min_u64_usize(self.position(), slice.len());
+        &slice[pos..]
+    }
+
+    #[inline]
+    fn advance(&mut self, cnt: usize) {
         let len = self.get_ref().as_ref().len();
         let pos = self.position();
 
-        if pos >= len as u64 {
-            return &[];
+        // We intentionally allow `cnt == 0` here even if `pos > len`.
+        let max_cnt = saturating_sub_usize_u64(len, pos);
+        if cnt > max_cnt {
+            panic_advance(cnt, max_cnt);
         }
 
-        &self.get_ref().as_ref()[pos as usize..]
-    }
-
-    fn advance(&mut self, cnt: usize) {
-        let pos = (self.position() as usize)
-            .checked_add(cnt)
-            .expect("overflow");
-
-        assert!(pos <= self.get_ref().as_ref().len());
-        self.set_position(pos as u64);
+        // This will not overflow because either `cnt == 0` or the sum is not
+        // greater than `len`.
+        self.set_position(pos + cnt as u64);
     }
 }
diff --git a/src/buf/buf_mut.rs b/src/buf/buf_mut.rs
index bf33fe6..304e11b 100644
--- a/src/buf/buf_mut.rs
+++ b/src/buf/buf_mut.rs
@@ -1,8 +1,9 @@
 use crate::buf::{limit, Chain, Limit, UninitSlice};
 #[cfg(feature = "std")]
 use crate::buf::{writer, Writer};
+use crate::{panic_advance, panic_does_not_fit};
 
-use core::{cmp, mem, ptr, usize};
+use core::{mem, ptr, usize};
 
 use alloc::{boxed::Box, vec::Vec};
 
@@ -56,6 +57,10 @@ pub unsafe trait BufMut {
     /// Implementations of `remaining_mut` should ensure that the return value
     /// does not change unless a call is made to `advance_mut` or any other
     /// function that is documented to change the `BufMut`'s current position.
+    ///
+    /// # Note
+    ///
+    /// `remaining_mut` may return a value smaller than the actual available space.
     fn remaining_mut(&self) -> usize;
 
     /// Advance the internal cursor of the BufMut
     ///
     /// The next call to `chunk_mut` will return a slice starting `cnt` bytes
     /// further into the underlying buffer.
     ///
-    /// This function is unsafe because there is no guarantee that the bytes
-    /// being advanced past have been initialized.
+    /// # Safety
+    ///
+    /// The caller must ensure that the next `cnt` bytes of `chunk` are
+    /// initialized.
/// /// # Examples /// @@ -117,6 +124,7 @@ pub unsafe trait BufMut { /// /// assert!(!buf.has_remaining_mut()); /// ``` + #[inline] fn has_remaining_mut(&self) -> bool { self.remaining_mut() > 0 } @@ -190,27 +198,25 @@ pub unsafe trait BufMut { /// # Panics /// /// Panics if `self` does not have enough capacity to contain `src`. + #[inline] fn put(&mut self, mut src: T) where Self: Sized, { - assert!(self.remaining_mut() >= src.remaining()); + if self.remaining_mut() < src.remaining() { + panic_advance(src.remaining(), self.remaining_mut()); + } while src.has_remaining() { - let l; + let s = src.chunk(); + let d = self.chunk_mut(); + let cnt = usize::min(s.len(), d.len()); - unsafe { - let s = src.chunk(); - let d = self.chunk_mut(); - l = cmp::min(s.len(), d.len()); + d[..cnt].copy_from_slice(&s[..cnt]); - ptr::copy_nonoverlapping(s.as_ptr(), d.as_mut_ptr() as *mut u8, l); - } - - src.advance(l); - unsafe { - self.advance_mut(l); - } + // SAFETY: We just initialized `cnt` bytes in `self`. + unsafe { self.advance_mut(cnt) }; + src.advance(cnt); } } @@ -233,31 +239,63 @@ pub unsafe trait BufMut { /// /// assert_eq!(b"hello\0", &dst); /// ``` - fn put_slice(&mut self, src: &[u8]) { - let mut off = 0; - - assert!( - self.remaining_mut() >= src.len(), - "buffer overflow; remaining = {}; src = {}", - self.remaining_mut(), - src.len() - ); + #[inline] + fn put_slice(&mut self, mut src: &[u8]) { + if self.remaining_mut() < src.len() { + panic_advance(src.len(), self.remaining_mut()); + } - while off < src.len() { - let cnt; + while !src.is_empty() { + let dst = self.chunk_mut(); + let cnt = usize::min(src.len(), dst.len()); - unsafe { - let dst = self.chunk_mut(); - cnt = cmp::min(dst.len(), src.len() - off); + dst[..cnt].copy_from_slice(&src[..cnt]); + src = &src[cnt..]; - ptr::copy_nonoverlapping(src[off..].as_ptr(), dst.as_mut_ptr() as *mut u8, cnt); + // SAFETY: We just initialized `cnt` bytes in `self`. + unsafe { self.advance_mut(cnt) }; + } + } - off += cnt; - } + /// Put `cnt` bytes `val` into `self`. + /// + /// Logically equivalent to calling `self.put_u8(val)` `cnt` times, but may work faster. + /// + /// `self` must have at least `cnt` remaining capacity. + /// + /// ``` + /// use bytes::BufMut; + /// + /// let mut dst = [0; 6]; + /// + /// { + /// let mut buf = &mut dst[..]; + /// buf.put_bytes(b'a', 4); + /// + /// assert_eq!(2, buf.remaining_mut()); + /// } + /// + /// assert_eq!(b"aaaa\0\0", &dst); + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining capacity in + /// `self`. + #[inline] + fn put_bytes(&mut self, val: u8, mut cnt: usize) { + if self.remaining_mut() < cnt { + panic_advance(cnt, self.remaining_mut()); + } - unsafe { - self.advance_mut(cnt); - } + while cnt > 0 { + let dst = self.chunk_mut(); + let dst_len = usize::min(dst.len(), cnt); + // SAFETY: The pointer is valid for `dst_len <= dst.len()` bytes. + unsafe { core::ptr::write_bytes(dst.as_mut_ptr(), val, dst_len) }; + // SAFETY: We just initialized `dst_len` bytes in `self`. + unsafe { self.advance_mut(dst_len) }; + cnt -= dst_len; } } @@ -279,6 +317,7 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_u8(&mut self, n: u8) { let src = [n]; self.put_slice(&src); @@ -302,6 +341,7 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. 
+ #[inline] fn put_i8(&mut self, n: i8) { let src = [n as u8]; self.put_slice(&src) @@ -325,6 +365,7 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_u16(&mut self, n: u16) { self.put_slice(&n.to_be_bytes()) } @@ -347,10 +388,38 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_u16_le(&mut self, n: u16) { self.put_slice(&n.to_le_bytes()) } + /// Writes an unsigned 16 bit integer to `self` in native-endian byte order. + /// + /// The current position is advanced by 2. + /// + /// # Examples + /// + /// ``` + /// use bytes::BufMut; + /// + /// let mut buf = vec![]; + /// buf.put_u16_ne(0x0809); + /// if cfg!(target_endian = "big") { + /// assert_eq!(buf, b"\x08\x09"); + /// } else { + /// assert_eq!(buf, b"\x09\x08"); + /// } + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining capacity in + /// `self`. + #[inline] + fn put_u16_ne(&mut self, n: u16) { + self.put_slice(&n.to_ne_bytes()) + } + /// Writes a signed 16 bit integer to `self` in big-endian byte order. /// /// The current position is advanced by 2. @@ -369,6 +438,7 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_i16(&mut self, n: i16) { self.put_slice(&n.to_be_bytes()) } @@ -391,10 +461,38 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_i16_le(&mut self, n: i16) { self.put_slice(&n.to_le_bytes()) } + /// Writes a signed 16 bit integer to `self` in native-endian byte order. + /// + /// The current position is advanced by 2. + /// + /// # Examples + /// + /// ``` + /// use bytes::BufMut; + /// + /// let mut buf = vec![]; + /// buf.put_i16_ne(0x0809); + /// if cfg!(target_endian = "big") { + /// assert_eq!(buf, b"\x08\x09"); + /// } else { + /// assert_eq!(buf, b"\x09\x08"); + /// } + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining capacity in + /// `self`. + #[inline] + fn put_i16_ne(&mut self, n: i16) { + self.put_slice(&n.to_ne_bytes()) + } + /// Writes an unsigned 32 bit integer to `self` in big-endian byte order. /// /// The current position is advanced by 4. @@ -413,6 +511,7 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_u32(&mut self, n: u32) { self.put_slice(&n.to_be_bytes()) } @@ -435,10 +534,38 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_u32_le(&mut self, n: u32) { self.put_slice(&n.to_le_bytes()) } + /// Writes an unsigned 32 bit integer to `self` in native-endian byte order. + /// + /// The current position is advanced by 4. + /// + /// # Examples + /// + /// ``` + /// use bytes::BufMut; + /// + /// let mut buf = vec![]; + /// buf.put_u32_ne(0x0809A0A1); + /// if cfg!(target_endian = "big") { + /// assert_eq!(buf, b"\x08\x09\xA0\xA1"); + /// } else { + /// assert_eq!(buf, b"\xA1\xA0\x09\x08"); + /// } + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining capacity in + /// `self`. + #[inline] + fn put_u32_ne(&mut self, n: u32) { + self.put_slice(&n.to_ne_bytes()) + } + /// Writes a signed 32 bit integer to `self` in big-endian byte order. /// /// The current position is advanced by 4. 
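The `_ne` writers mirror the `_ne` readers added to `Buf` earlier in this diff, so data written in native order reads back without either side ever naming an endianness. A round-trip check (assumes bytes ≥ 1.3 for the `_ne` methods):

```rust
use bytes::{Buf, BufMut};

fn main() {
    let mut buf = Vec::new();
    buf.put_u32_ne(0x0809A0A1); // write in host byte order...
    let mut rd: &[u8] = &buf;
    assert_eq!(0x0809A0A1, rd.get_u32_ne()); // ...read back in host order
}
```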
@@ -457,6 +584,7 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_i32(&mut self, n: i32) { self.put_slice(&n.to_be_bytes()) } @@ -479,10 +607,38 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_i32_le(&mut self, n: i32) { self.put_slice(&n.to_le_bytes()) } + /// Writes a signed 32 bit integer to `self` in native-endian byte order. + /// + /// The current position is advanced by 4. + /// + /// # Examples + /// + /// ``` + /// use bytes::BufMut; + /// + /// let mut buf = vec![]; + /// buf.put_i32_ne(0x0809A0A1); + /// if cfg!(target_endian = "big") { + /// assert_eq!(buf, b"\x08\x09\xA0\xA1"); + /// } else { + /// assert_eq!(buf, b"\xA1\xA0\x09\x08"); + /// } + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining capacity in + /// `self`. + #[inline] + fn put_i32_ne(&mut self, n: i32) { + self.put_slice(&n.to_ne_bytes()) + } + /// Writes an unsigned 64 bit integer to `self` in the big-endian byte order. /// /// The current position is advanced by 8. @@ -501,6 +657,7 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_u64(&mut self, n: u64) { self.put_slice(&n.to_be_bytes()) } @@ -523,10 +680,38 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_u64_le(&mut self, n: u64) { self.put_slice(&n.to_le_bytes()) } + /// Writes an unsigned 64 bit integer to `self` in native-endian byte order. + /// + /// The current position is advanced by 8. + /// + /// # Examples + /// + /// ``` + /// use bytes::BufMut; + /// + /// let mut buf = vec![]; + /// buf.put_u64_ne(0x0102030405060708); + /// if cfg!(target_endian = "big") { + /// assert_eq!(buf, b"\x01\x02\x03\x04\x05\x06\x07\x08"); + /// } else { + /// assert_eq!(buf, b"\x08\x07\x06\x05\x04\x03\x02\x01"); + /// } + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining capacity in + /// `self`. + #[inline] + fn put_u64_ne(&mut self, n: u64) { + self.put_slice(&n.to_ne_bytes()) + } + /// Writes a signed 64 bit integer to `self` in the big-endian byte order. /// /// The current position is advanced by 8. @@ -545,6 +730,7 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_i64(&mut self, n: i64) { self.put_slice(&n.to_be_bytes()) } @@ -567,10 +753,38 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_i64_le(&mut self, n: i64) { self.put_slice(&n.to_le_bytes()) } + /// Writes a signed 64 bit integer to `self` in native-endian byte order. + /// + /// The current position is advanced by 8. + /// + /// # Examples + /// + /// ``` + /// use bytes::BufMut; + /// + /// let mut buf = vec![]; + /// buf.put_i64_ne(0x0102030405060708); + /// if cfg!(target_endian = "big") { + /// assert_eq!(buf, b"\x01\x02\x03\x04\x05\x06\x07\x08"); + /// } else { + /// assert_eq!(buf, b"\x08\x07\x06\x05\x04\x03\x02\x01"); + /// } + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining capacity in + /// `self`. 
+ #[inline] + fn put_i64_ne(&mut self, n: i64) { + self.put_slice(&n.to_ne_bytes()) + } + /// Writes an unsigned 128 bit integer to `self` in the big-endian byte order. /// /// The current position is advanced by 16. @@ -589,6 +803,7 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_u128(&mut self, n: u128) { self.put_slice(&n.to_be_bytes()) } @@ -611,10 +826,38 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_u128_le(&mut self, n: u128) { self.put_slice(&n.to_le_bytes()) } + /// Writes an unsigned 128 bit integer to `self` in native-endian byte order. + /// + /// The current position is advanced by 16. + /// + /// # Examples + /// + /// ``` + /// use bytes::BufMut; + /// + /// let mut buf = vec![]; + /// buf.put_u128_ne(0x01020304050607080910111213141516); + /// if cfg!(target_endian = "big") { + /// assert_eq!(buf, b"\x01\x02\x03\x04\x05\x06\x07\x08\x09\x10\x11\x12\x13\x14\x15\x16"); + /// } else { + /// assert_eq!(buf, b"\x16\x15\x14\x13\x12\x11\x10\x09\x08\x07\x06\x05\x04\x03\x02\x01"); + /// } + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining capacity in + /// `self`. + #[inline] + fn put_u128_ne(&mut self, n: u128) { + self.put_slice(&n.to_ne_bytes()) + } + /// Writes a signed 128 bit integer to `self` in the big-endian byte order. /// /// The current position is advanced by 16. @@ -633,6 +876,7 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_i128(&mut self, n: i128) { self.put_slice(&n.to_be_bytes()) } @@ -655,10 +899,38 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_i128_le(&mut self, n: i128) { self.put_slice(&n.to_le_bytes()) } + /// Writes a signed 128 bit integer to `self` in native-endian byte order. + /// + /// The current position is advanced by 16. + /// + /// # Examples + /// + /// ``` + /// use bytes::BufMut; + /// + /// let mut buf = vec![]; + /// buf.put_i128_ne(0x01020304050607080910111213141516); + /// if cfg!(target_endian = "big") { + /// assert_eq!(buf, b"\x01\x02\x03\x04\x05\x06\x07\x08\x09\x10\x11\x12\x13\x14\x15\x16"); + /// } else { + /// assert_eq!(buf, b"\x16\x15\x14\x13\x12\x11\x10\x09\x08\x07\x06\x05\x04\x03\x02\x01"); + /// } + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining capacity in + /// `self`. + #[inline] + fn put_i128_ne(&mut self, n: i128) { + self.put_slice(&n.to_ne_bytes()) + } + /// Writes an unsigned n-byte integer to `self` in big-endian byte order. /// /// The current position is advanced by `nbytes`. @@ -676,9 +948,15 @@ pub unsafe trait BufMut { /// # Panics /// /// This function panics if there is not enough remaining capacity in - /// `self`. + /// `self` or if `nbytes` is greater than 8. + #[inline] fn put_uint(&mut self, n: u64, nbytes: usize) { - self.put_slice(&n.to_be_bytes()[mem::size_of_val(&n) - nbytes..]); + let start = match mem::size_of_val(&n).checked_sub(nbytes) { + Some(start) => start, + None => panic_does_not_fit(nbytes, mem::size_of_val(&n)), + }; + + self.put_slice(&n.to_be_bytes()[start..]); } /// Writes an unsigned n-byte integer to `self` in the little-endian byte order. 
@@ -698,12 +976,19 @@ pub unsafe trait BufMut { /// # Panics /// /// This function panics if there is not enough remaining capacity in - /// `self`. + /// `self` or if `nbytes` is greater than 8. + #[inline] fn put_uint_le(&mut self, n: u64, nbytes: usize) { - self.put_slice(&n.to_le_bytes()[0..nbytes]); + let slice = n.to_le_bytes(); + let slice = match slice.get(..nbytes) { + Some(slice) => slice, + None => panic_does_not_fit(nbytes, slice.len()), + }; + + self.put_slice(slice); } - /// Writes a signed n-byte integer to `self` in big-endian byte order. + /// Writes an unsigned n-byte integer to `self` in the native-endian byte order. /// /// The current position is advanced by `nbytes`. /// @@ -713,19 +998,56 @@ pub unsafe trait BufMut { /// use bytes::BufMut; /// /// let mut buf = vec![]; - /// buf.put_int(0x010203, 3); + /// buf.put_uint_ne(0x010203, 3); + /// if cfg!(target_endian = "big") { + /// assert_eq!(buf, b"\x01\x02\x03"); + /// } else { + /// assert_eq!(buf, b"\x03\x02\x01"); + /// } + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining capacity in + /// `self` or if `nbytes` is greater than 8. + #[inline] + fn put_uint_ne(&mut self, n: u64, nbytes: usize) { + if cfg!(target_endian = "big") { + self.put_uint(n, nbytes) + } else { + self.put_uint_le(n, nbytes) + } + } + + /// Writes low `nbytes` of a signed integer to `self` in big-endian byte order. + /// + /// The current position is advanced by `nbytes`. + /// + /// # Examples + /// + /// ``` + /// use bytes::BufMut; + /// + /// let mut buf = vec![]; + /// buf.put_int(0x0504010203, 3); /// assert_eq!(buf, b"\x01\x02\x03"); /// ``` /// /// # Panics /// /// This function panics if there is not enough remaining capacity in - /// `self`. + /// `self` or if `nbytes` is greater than 8. + #[inline] fn put_int(&mut self, n: i64, nbytes: usize) { - self.put_slice(&n.to_be_bytes()[mem::size_of_val(&n) - nbytes..]); + let start = match mem::size_of_val(&n).checked_sub(nbytes) { + Some(start) => start, + None => panic_does_not_fit(nbytes, mem::size_of_val(&n)), + }; + + self.put_slice(&n.to_be_bytes()[start..]); } - /// Writes a signed n-byte integer to `self` in little-endian byte order. + /// Writes low `nbytes` of a signed integer to `self` in little-endian byte order. /// /// The current position is advanced by `nbytes`. /// @@ -735,16 +1057,54 @@ pub unsafe trait BufMut { /// use bytes::BufMut; /// /// let mut buf = vec![]; - /// buf.put_int_le(0x010203, 3); + /// buf.put_int_le(0x0504010203, 3); /// assert_eq!(buf, b"\x03\x02\x01"); /// ``` /// /// # Panics /// /// This function panics if there is not enough remaining capacity in - /// `self`. + /// `self` or if `nbytes` is greater than 8. + #[inline] fn put_int_le(&mut self, n: i64, nbytes: usize) { - self.put_slice(&n.to_le_bytes()[0..nbytes]); + let slice = n.to_le_bytes(); + let slice = match slice.get(..nbytes) { + Some(slice) => slice, + None => panic_does_not_fit(nbytes, slice.len()), + }; + + self.put_slice(slice); + } + + /// Writes low `nbytes` of a signed integer to `self` in native-endian byte order. + /// + /// The current position is advanced by `nbytes`. 
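The doc change from "a signed n-byte integer" to "low `nbytes` of a signed integer" is the substantive clarification in these hunks: a value wider than `nbytes` is truncated to its low bytes, not rejected. The updated doc-test's `0x0504010203` makes the truncation visible; spelled out:

```rust
use bytes::BufMut;

fn main() {
    let mut buf = vec![];
    // i64::to_be_bytes(0x0504010203) is [0, 0, 0, 5, 4, 1, 2, 3];
    // put_int keeps only the trailing `nbytes` of that array, the low 3 bytes.
    buf.put_int(0x0504010203, 3);
    assert_eq!(buf, b"\x01\x02\x03");
}
```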
+ /// + /// # Examples + /// + /// ``` + /// use bytes::BufMut; + /// + /// let mut buf = vec![]; + /// buf.put_int_ne(0x010203, 3); + /// if cfg!(target_endian = "big") { + /// assert_eq!(buf, b"\x01\x02\x03"); + /// } else { + /// assert_eq!(buf, b"\x03\x02\x01"); + /// } + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining capacity in + /// `self` or if `nbytes` is greater than 8. + #[inline] + fn put_int_ne(&mut self, n: i64, nbytes: usize) { + if cfg!(target_endian = "big") { + self.put_int(n, nbytes) + } else { + self.put_int_le(n, nbytes) + } } /// Writes an IEEE754 single-precision (4 bytes) floating point number to @@ -766,6 +1126,7 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_f32(&mut self, n: f32) { self.put_u32(n.to_bits()); } @@ -789,10 +1150,39 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_f32_le(&mut self, n: f32) { self.put_u32_le(n.to_bits()); } + /// Writes an IEEE754 single-precision (4 bytes) floating point number to + /// `self` in native-endian byte order. + /// + /// The current position is advanced by 4. + /// + /// # Examples + /// + /// ``` + /// use bytes::BufMut; + /// + /// let mut buf = vec![]; + /// buf.put_f32_ne(1.2f32); + /// if cfg!(target_endian = "big") { + /// assert_eq!(buf, b"\x3F\x99\x99\x9A"); + /// } else { + /// assert_eq!(buf, b"\x9A\x99\x99\x3F"); + /// } + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining capacity in + /// `self`. + #[inline] + fn put_f32_ne(&mut self, n: f32) { + self.put_u32_ne(n.to_bits()); + } + /// Writes an IEEE754 double-precision (8 bytes) floating point number to /// `self` in big-endian byte order. /// @@ -812,6 +1202,7 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_f64(&mut self, n: f64) { self.put_u64(n.to_bits()); } @@ -835,10 +1226,39 @@ pub unsafe trait BufMut { /// /// This function panics if there is not enough remaining capacity in /// `self`. + #[inline] fn put_f64_le(&mut self, n: f64) { self.put_u64_le(n.to_bits()); } + /// Writes an IEEE754 double-precision (8 bytes) floating point number to + /// `self` in native-endian byte order. + /// + /// The current position is advanced by 8. + /// + /// # Examples + /// + /// ``` + /// use bytes::BufMut; + /// + /// let mut buf = vec![]; + /// buf.put_f64_ne(1.2f64); + /// if cfg!(target_endian = "big") { + /// assert_eq!(buf, b"\x3F\xF3\x33\x33\x33\x33\x33\x33"); + /// } else { + /// assert_eq!(buf, b"\x33\x33\x33\x33\x33\x33\xF3\x3F"); + /// } + /// ``` + /// + /// # Panics + /// + /// This function panics if there is not enough remaining capacity in + /// `self`. + #[inline] + fn put_f64_ne(&mut self, n: f64) { + self.put_u64_ne(n.to_bits()); + } + /// Creates an adaptor which can write at most `limit` bytes to `self`. 
/// /// # Examples @@ -852,6 +1272,7 @@ pub unsafe trait BufMut { /// let dst = arr.limit(10); /// assert_eq!(dst.remaining_mut(), 10); /// ``` + #[inline] fn limit(self, limit: usize) -> Limit where Self: Sized, @@ -882,6 +1303,8 @@ pub unsafe trait BufMut { /// assert_eq!(*buf, b"hello world"[..]); /// ``` #[cfg(feature = "std")] + #[cfg_attr(docsrs, doc(cfg(feature = "std")))] + #[inline] fn writer(self) -> Writer where Self: Sized, @@ -909,6 +1332,7 @@ pub unsafe trait BufMut { /// assert_eq!(&a[..], b"hello"); /// assert_eq!(&b[..], b" world"); /// ``` + #[inline] fn chain_mut(self, next: U) -> Chain where Self: Sized, @@ -919,77 +1343,125 @@ pub unsafe trait BufMut { macro_rules! deref_forward_bufmut { () => { + #[inline] fn remaining_mut(&self) -> usize { (**self).remaining_mut() } + #[inline] fn chunk_mut(&mut self) -> &mut UninitSlice { (**self).chunk_mut() } + #[inline] unsafe fn advance_mut(&mut self, cnt: usize) { (**self).advance_mut(cnt) } + #[inline] fn put_slice(&mut self, src: &[u8]) { (**self).put_slice(src) } + #[inline] fn put_u8(&mut self, n: u8) { (**self).put_u8(n) } + #[inline] fn put_i8(&mut self, n: i8) { (**self).put_i8(n) } + #[inline] fn put_u16(&mut self, n: u16) { (**self).put_u16(n) } + #[inline] fn put_u16_le(&mut self, n: u16) { (**self).put_u16_le(n) } + #[inline] + fn put_u16_ne(&mut self, n: u16) { + (**self).put_u16_ne(n) + } + + #[inline] fn put_i16(&mut self, n: i16) { (**self).put_i16(n) } + #[inline] fn put_i16_le(&mut self, n: i16) { (**self).put_i16_le(n) } + #[inline] + fn put_i16_ne(&mut self, n: i16) { + (**self).put_i16_ne(n) + } + + #[inline] fn put_u32(&mut self, n: u32) { (**self).put_u32(n) } + #[inline] fn put_u32_le(&mut self, n: u32) { (**self).put_u32_le(n) } + #[inline] + fn put_u32_ne(&mut self, n: u32) { + (**self).put_u32_ne(n) + } + + #[inline] fn put_i32(&mut self, n: i32) { (**self).put_i32(n) } + #[inline] fn put_i32_le(&mut self, n: i32) { (**self).put_i32_le(n) } + #[inline] + fn put_i32_ne(&mut self, n: i32) { + (**self).put_i32_ne(n) + } + + #[inline] fn put_u64(&mut self, n: u64) { (**self).put_u64(n) } + #[inline] fn put_u64_le(&mut self, n: u64) { (**self).put_u64_le(n) } + #[inline] + fn put_u64_ne(&mut self, n: u64) { + (**self).put_u64_ne(n) + } + + #[inline] fn put_i64(&mut self, n: i64) { (**self).put_i64(n) } + #[inline] fn put_i64_le(&mut self, n: i64) { (**self).put_i64_le(n) } + + #[inline] + fn put_i64_ne(&mut self, n: i64) { + (**self).put_i64_ne(n) + } }; } @@ -1009,12 +1481,15 @@ unsafe impl BufMut for &mut [u8] { #[inline] fn chunk_mut(&mut self) -> &mut UninitSlice { - // UninitSlice is repr(transparent), so safe to transmute - unsafe { &mut *(*self as *mut [u8] as *mut _) } + UninitSlice::new(self) } #[inline] unsafe fn advance_mut(&mut self, cnt: usize) { + if self.len() < cnt { + panic_advance(cnt, self.len()); + } + // Lifetime dance taken from `impl Write for &mut [u8]`. let (_, b) = core::mem::replace(self, &mut []).split_at_mut(cnt); *self = b; @@ -1022,11 +1497,76 @@ unsafe impl BufMut for &mut [u8] { #[inline] fn put_slice(&mut self, src: &[u8]) { + if self.len() < src.len() { + panic_advance(src.len(), self.len()); + } + self[..src.len()].copy_from_slice(src); + // SAFETY: We just initialized `src.len()` bytes. + unsafe { self.advance_mut(src.len()) }; + } + + #[inline] + fn put_bytes(&mut self, val: u8, cnt: usize) { + if self.len() < cnt { + panic_advance(cnt, self.len()); + } + + // SAFETY: We just checked that the pointer is valid for `cnt` bytes. 
unsafe { + ptr::write_bytes(self.as_mut_ptr(), val, cnt); + self.advance_mut(cnt); + } + } +} + +unsafe impl BufMut for &mut [core::mem::MaybeUninit<u8>] { + #[inline] + fn remaining_mut(&self) -> usize { + self.len() + } + + #[inline] + fn chunk_mut(&mut self) -> &mut UninitSlice { + UninitSlice::uninit(self) + } + + #[inline] + unsafe fn advance_mut(&mut self, cnt: usize) { + if self.len() < cnt { + panic_advance(cnt, self.len()); + } + + // Lifetime dance taken from `impl Write for &mut [u8]`. + let (_, b) = core::mem::replace(self, &mut []).split_at_mut(cnt); + *self = b; + } + + #[inline] + fn put_slice(&mut self, src: &[u8]) { + if self.len() < src.len() { + panic_advance(src.len(), self.len()); + } + + // SAFETY: We just checked that the pointer is valid for `src.len()` bytes. + unsafe { + ptr::copy_nonoverlapping(src.as_ptr(), self.as_mut_ptr().cast(), src.len()); self.advance_mut(src.len()); } } + + #[inline] + fn put_bytes(&mut self, val: u8, cnt: usize) { + if self.len() < cnt { + panic_advance(cnt, self.len()); + } + + // SAFETY: We just checked that the pointer is valid for `cnt` bytes. + unsafe { + ptr::write_bytes(self.as_mut_ptr() as *mut u8, val, cnt); + self.advance_mut(cnt); + } + } } unsafe impl BufMut for Vec<u8> { @@ -1041,13 +1581,11 @@ unsafe impl BufMut for Vec<u8> { let len = self.len(); let remaining = self.capacity() - len; - assert!( - cnt <= remaining, - "cannot advance past `remaining_mut`: {:?} <= {:?}", - cnt, - remaining - ); + if remaining < cnt { + panic_advance(cnt, remaining); + } + + // Addition will not overflow since the sum is at most the capacity. self.set_len(len + cnt); } @@ -1061,28 +1599,26 @@ unsafe impl BufMut for Vec<u8> { let len = self.len(); let ptr = self.as_mut_ptr(); - unsafe { &mut UninitSlice::from_raw_parts_mut(ptr, cap)[len..] } + // SAFETY: Since `ptr` is valid for `cap` bytes, `ptr.add(len)` must be + // valid for `cap - len` bytes. The subtraction will not underflow since + // `len <= cap`. + unsafe { UninitSlice::from_raw_parts_mut(ptr.add(len), cap - len) } } // Specialize these methods so they can skip checking `remaining_mut` // and `advance_mut`. + #[inline] fn put<T: crate::Buf>(&mut self, mut src: T) where Self: Sized, { - // In case the src isn't contiguous, reserve upfront + // In case the src isn't contiguous, reserve upfront. self.reserve(src.remaining()); while src.has_remaining() { - let l; - - // a block to contain the src.bytes() borrow - { - let s = src.chunk(); - l = s.len(); - self.extend_from_slice(s); - } - + let s = src.chunk(); + let l = s.len(); + self.extend_from_slice(s); src.advance(l); } } @@ -1091,6 +1627,13 @@ unsafe impl BufMut for Vec<u8> { fn put_slice(&mut self, src: &[u8]) { self.extend_from_slice(src); } + + #[inline] + fn put_bytes(&mut self, val: u8, cnt: usize) { + // If the addition overflows, then the `resize` will fail.
+ let new_len = self.len().saturating_add(cnt); + self.resize(new_len, val); + } } // The existence of this function makes the compiler catch if the BufMut diff --git a/src/buf/chain.rs b/src/buf/chain.rs index 9ce5f23..c8bc36d 100644 --- a/src/buf/chain.rs +++ b/src/buf/chain.rs @@ -1,5 +1,5 @@ use crate::buf::{IntoIter, UninitSlice}; -use crate::{Buf, BufMut, Bytes}; +use crate::{Buf, BufMut}; #[cfg(feature = "std")] use std::io::IoSlice; @@ -25,9 +25,7 @@ use std::io::IoSlice; /// assert_eq!(full[..], b"hello world"[..]); /// ``` /// -/// [`Buf::chain`]: trait.Buf.html#method.chain -/// [`Buf`]: trait.Buf.html -/// [`BufMut`]: trait.BufMut.html +/// [`Buf::chain`]: Buf::chain #[derive(Debug)] pub struct Chain { a: T, @@ -135,7 +133,7 @@ where U: Buf, { fn remaining(&self) -> usize { - self.a.remaining().checked_add(self.b.remaining()).unwrap() + self.a.remaining().saturating_add(self.b.remaining()) } fn chunk(&self) -> &[u8] { @@ -171,7 +169,7 @@ where n } - fn copy_to_bytes(&mut self, len: usize) -> Bytes { + fn copy_to_bytes(&mut self, len: usize) -> crate::Bytes { let a_rem = self.a.remaining(); if a_rem >= len { self.a.copy_to_bytes(len) @@ -198,8 +196,7 @@ where fn remaining_mut(&self) -> usize { self.a .remaining_mut() - .checked_add(self.b.remaining_mut()) - .unwrap() + .saturating_add(self.b.remaining_mut()) } fn chunk_mut(&mut self) -> &mut UninitSlice { diff --git a/src/buf/iter.rs b/src/buf/iter.rs index 8914a40..74f9b99 100644 --- a/src/buf/iter.rs +++ b/src/buf/iter.rs @@ -2,8 +2,6 @@ use crate::Buf; /// Iterator over the bytes contained by the buffer. /// -/// This struct is created by the [`iter`] method on [`Buf`]. -/// /// # Examples /// /// Basic usage: @@ -19,9 +17,6 @@ use crate::Buf; /// assert_eq!(iter.next(), Some(b'c')); /// assert_eq!(iter.next(), None); /// ``` -/// -/// [`iter`]: trait.Buf.html#method.iter -/// [`Buf`]: trait.Buf.html #[derive(Debug)] pub struct IntoIter { inner: T, @@ -43,7 +38,7 @@ impl IntoIter { /// assert_eq!(iter.next(), Some(b'c')); /// assert_eq!(iter.next(), None); /// ``` - pub(crate) fn new(inner: T) -> IntoIter { + pub fn new(inner: T) -> IntoIter { IntoIter { inner } } diff --git a/src/buf/mod.rs b/src/buf/mod.rs index c4c0a57..1bf0a47 100644 --- a/src/buf/mod.rs +++ b/src/buf/mod.rs @@ -13,8 +13,6 @@ //! See [`Buf`] and [`BufMut`] for more details. //! //! [rope]: https://en.wikipedia.org/wiki/Rope_(data_structure) -//! [`Buf`]: trait.Buf.html -//! [`BufMut`]: trait.BufMut.html mod buf_impl; mod buf_mut; diff --git a/src/buf/reader.rs b/src/buf/reader.rs index f2b4d98..5214949 100644 --- a/src/buf/reader.rs +++ b/src/buf/reader.rs @@ -5,7 +5,7 @@ use std::{cmp, io}; /// A `Buf` adapter which implements `io::Read` for the inner value. /// /// This struct is generally created by calling `reader()` on `Buf`. See -/// documentation of [`reader()`](trait.Buf.html#method.reader) for more +/// documentation of [`reader()`](Buf::reader) for more /// details. #[derive(Debug)] pub struct Reader { diff --git a/src/buf/take.rs b/src/buf/take.rs index d3cb10a..e74b064 100644 --- a/src/buf/take.rs +++ b/src/buf/take.rs @@ -1,11 +1,11 @@ -use crate::{Buf, Bytes}; +use crate::Buf; use core::cmp; /// A `Buf` adapter which limits the bytes read from an underlying buffer. /// /// This struct is generally created by calling `take()` on `Buf`. See -/// documentation of [`take()`](trait.Buf.html#method.take) for more details. +/// documentation of [`take()`](Buf::take) for more details. 
#[derive(Debug)] pub struct Take { inner: T, @@ -145,7 +145,7 @@ impl Buf for Take { self.limit -= cnt; } - fn copy_to_bytes(&mut self, len: usize) -> Bytes { + fn copy_to_bytes(&mut self, len: usize) -> crate::Bytes { assert!(len <= self.remaining(), "`len` greater than remaining"); let r = self.inner.copy_to_bytes(len); diff --git a/src/buf/uninit_slice.rs b/src/buf/uninit_slice.rs index fb67c0a..82ebdbb 100644 --- a/src/buf/uninit_slice.rs +++ b/src/buf/uninit_slice.rs @@ -22,6 +22,44 @@ use core::ops::{ pub struct UninitSlice([MaybeUninit]); impl UninitSlice { + /// Creates a `&mut UninitSlice` wrapping a slice of initialised memory. + /// + /// # Examples + /// + /// ``` + /// use bytes::buf::UninitSlice; + /// + /// let mut buffer = [0u8; 64]; + /// let slice = UninitSlice::new(&mut buffer[..]); + /// ``` + #[inline] + pub fn new(slice: &mut [u8]) -> &mut UninitSlice { + unsafe { &mut *(slice as *mut [u8] as *mut [MaybeUninit] as *mut UninitSlice) } + } + + /// Creates a `&mut UninitSlice` wrapping a slice of uninitialised memory. + /// + /// # Examples + /// + /// ``` + /// use bytes::buf::UninitSlice; + /// use core::mem::MaybeUninit; + /// + /// let mut buffer = [MaybeUninit::uninit(); 64]; + /// let slice = UninitSlice::uninit(&mut buffer[..]); + /// + /// let mut vec = Vec::with_capacity(1024); + /// let spare: &mut UninitSlice = vec.spare_capacity_mut().into(); + /// ``` + #[inline] + pub fn uninit(slice: &mut [MaybeUninit]) -> &mut UninitSlice { + unsafe { &mut *(slice as *mut [MaybeUninit] as *mut UninitSlice) } + } + + fn uninit_ref(slice: &[MaybeUninit]) -> &UninitSlice { + unsafe { &*(slice as *const [MaybeUninit] as *const UninitSlice) } + } + /// Create a `&mut UninitSlice` from a pointer and a length. /// /// # Safety @@ -44,7 +82,7 @@ impl UninitSlice { pub unsafe fn from_raw_parts_mut<'a>(ptr: *mut u8, len: usize) -> &'a mut UninitSlice { let maybe_init: &mut [MaybeUninit] = core::slice::from_raw_parts_mut(ptr as *mut _, len); - &mut *(maybe_init as *mut [MaybeUninit] as *mut UninitSlice) + Self::uninit(maybe_init) } /// Write a single byte at the specified offset. @@ -124,6 +162,32 @@ impl UninitSlice { self.0.as_mut_ptr() as *mut _ } + /// Return a `&mut [MaybeUninit]` to this slice's buffer. + /// + /// # Safety + /// + /// The caller **must not** read from the referenced memory and **must not** write + /// **uninitialized** bytes to the slice either. This is because `BufMut` implementation + /// that created the `UninitSlice` knows which parts are initialized. Writing uninitialized + /// bytes to the slice may cause the `BufMut` to read those bytes and trigger undefined + /// behavior. + /// + /// # Examples + /// + /// ``` + /// use bytes::BufMut; + /// + /// let mut data = [0, 1, 2]; + /// let mut slice = &mut data[..]; + /// unsafe { + /// let uninit_slice = BufMut::chunk_mut(&mut slice).as_uninit_slice_mut(); + /// }; + /// ``` + #[inline] + pub unsafe fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit] { + &mut self.0 + } + /// Returns the number of bytes in the slice. /// /// # Examples @@ -149,6 +213,18 @@ impl fmt::Debug for UninitSlice { } } +impl<'a> From<&'a mut [u8]> for &'a mut UninitSlice { + fn from(slice: &'a mut [u8]) -> Self { + UninitSlice::new(slice) + } +} + +impl<'a> From<&'a mut [MaybeUninit]> for &'a mut UninitSlice { + fn from(slice: &'a mut [MaybeUninit]) -> Self { + UninitSlice::uninit(slice) + } +} + macro_rules! impl_index { ($($t:ty),*) => { $( @@ -157,16 +233,14 @@ macro_rules! 
impl_index { #[inline] fn index(&self, index: $t) -> &UninitSlice { - let maybe_uninit: &[MaybeUninit] = &self.0[index]; - unsafe { &*(maybe_uninit as *const [MaybeUninit] as *const UninitSlice) } + UninitSlice::uninit_ref(&self.0[index]) } } impl IndexMut<$t> for UninitSlice { #[inline] fn index_mut(&mut self, index: $t) -> &mut UninitSlice { - let maybe_uninit: &mut [MaybeUninit] = &mut self.0[index]; - unsafe { &mut *(maybe_uninit as *mut [MaybeUninit] as *mut UninitSlice) } + UninitSlice::uninit(&mut self.0[index]) } } )* diff --git a/src/buf/writer.rs b/src/buf/writer.rs index 261d7cd..08f15d2 100644 --- a/src/buf/writer.rs +++ b/src/buf/writer.rs @@ -5,7 +5,7 @@ use std::{cmp, io}; /// A `BufMut` adapter which implements `io::Write` for the inner value. /// /// This struct is generally created by calling `writer()` on `BufMut`. See -/// documentation of [`writer()`](trait.BufMut.html#method.writer) for more +/// documentation of [`writer()`](BufMut::writer) for more /// details. #[derive(Debug)] pub struct Writer { diff --git a/src/bytes.rs b/src/bytes.rs index b1b35ea..e0c33b3 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -1,14 +1,21 @@ use core::iter::FromIterator; +use core::mem::{self, ManuallyDrop}; use core::ops::{Deref, RangeBounds}; -use core::{cmp, fmt, hash, mem, ptr, slice, usize}; - -use alloc::{borrow::Borrow, boxed::Box, string::String, vec::Vec}; +use core::{cmp, fmt, hash, ptr, slice, usize}; + +use alloc::{ + alloc::{dealloc, Layout}, + borrow::Borrow, + boxed::Box, + string::String, + vec::Vec, +}; use crate::buf::IntoIter; #[allow(unused)] use crate::loom::sync::atomic::AtomicMut; -use crate::loom::sync::atomic::{self, AtomicPtr, AtomicUsize, Ordering}; -use crate::Buf; +use crate::loom::sync::atomic::{AtomicPtr, AtomicUsize, Ordering}; +use crate::{offset_from, Buf, BytesMut}; /// A cheaply cloneable and sliceable chunk of contiguous memory. /// @@ -26,7 +33,7 @@ use crate::Buf; /// All `Bytes` implementations must fulfill the following requirements: /// - They are cheaply cloneable and thereby shareable between an unlimited amount /// of components, for example by modifying a reference count. -/// - Instances can be sliced to refer to a subset of the the original buffer. +/// - Instances can be sliced to refer to a subset of the original buffer. /// /// ``` /// use bytes::Bytes; @@ -55,17 +62,17 @@ use crate::Buf; /// # Sharing /// /// `Bytes` contains a vtable, which allows implementations of `Bytes` to define -/// how sharing/cloneing is implemented in detail. +/// how sharing/cloning is implemented in detail. /// When `Bytes::clone()` is called, `Bytes` will call the vtable function for -/// cloning the backing storage in order to share it behind between multiple -/// `Bytes` instances. +/// cloning the backing storage in order to share it behind multiple `Bytes` +/// instances. /// /// For `Bytes` implementations which refer to constant memory (e.g. created /// via `Bytes::from_static()`) the cloning implementation will be a no-op. /// /// For `Bytes` implementations which point to a reference counted shared storage /// (e.g. an `Arc<[u8]>`), sharing will be implemented by increasing the -/// the reference count. +/// reference count. /// /// Due to this mechanism, multiple `Bytes` instances may point to the same /// shared memory region. 
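Aside on the sharing model described in the doc comment above: a minimal usage sketch (editorial, not part of the patch; it uses only the public `Bytes` API) showing that cloning and slicing reuse one allocation rather than copying it.

```rust
use bytes::Bytes;

fn main() {
    let a = Bytes::from(vec![1u8, 2, 3, 4, 5]);
    let b = a.clone(); // bumps a reference count; no bytes are copied
    let tail = a.slice(2..); // a narrower view into the same buffer

    // All three handles point into the same shared memory region.
    assert_eq!(a.as_ptr(), b.as_ptr());
    assert_eq!(tail.as_ptr(), unsafe { a.as_ptr().add(2) });
}
```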
@@ -78,18 +85,18 @@ use crate::Buf; /// /// ```text /// -/// Arc ptrs +---------+ -/// ________________________ / | Bytes 2 | -/// / +---------+ -/// / +-----------+ | | -/// |_________/ | Bytes 1 | | | -/// | +-----------+ | | +/// Arc ptrs ┌─────────┐ +/// ________________________ / │ Bytes 2 │ +/// / └─────────┘ +/// / ┌───────────┐ | | +/// |_________/ │ Bytes 1 │ | | +/// | └───────────┘ | | /// | | | ___/ data | tail /// | data | tail |/ | /// v v v v -/// +-----+---------------------------------+-----+ -/// | Arc | | | | | -/// +-----+---------------------------------+-----+ +/// ┌─────┬─────┬───────────┬───────────────┬─────┐ +/// │ Arc │ │ │ │ │ +/// └─────┴─────┴───────────┴───────────────┴─────┘ /// ``` pub struct Bytes { ptr: *const u8, @@ -103,6 +110,13 @@ pub(crate) struct Vtable { /// fn(data, ptr, len) pub clone: unsafe fn(&AtomicPtr<()>, *const u8, usize) -> Bytes, /// fn(data, ptr, len) + /// + /// takes `Bytes` to value + pub to_vec: unsafe fn(&AtomicPtr<()>, *const u8, usize) -> Vec<u8>, + pub to_mut: unsafe fn(&AtomicPtr<()>, *const u8, usize) -> BytesMut, + /// fn(data) + pub is_unique: unsafe fn(&AtomicPtr<()>) -> bool, + /// fn(data, ptr, len) pub drop: unsafe fn(&mut AtomicPtr<()>, *const u8, usize), } @@ -121,7 +135,7 @@ impl Bytes { /// ``` #[inline] #[cfg(not(all(loom, test)))] - pub const fn new() -> Bytes { + pub const fn new() -> Self { // Make it a named const to work around // "unsizing casts are not allowed in const fn" const EMPTY: &[u8] = &[]; @@ -129,7 +143,7 @@ impl Bytes { } #[cfg(all(loom, test))] - pub fn new() -> Bytes { + pub fn new() -> Self { const EMPTY: &[u8] = &[]; Bytes::from_static(EMPTY) } @@ -149,7 +163,7 @@ impl Bytes { /// ``` #[inline] #[cfg(not(all(loom, test)))] - pub const fn from_static(bytes: &'static [u8]) -> Bytes { + pub const fn from_static(bytes: &'static [u8]) -> Self { Bytes { ptr: bytes.as_ptr(), len: bytes.len(), @@ -159,7 +173,7 @@ impl Bytes { } #[cfg(all(loom, test))] - pub fn from_static(bytes: &'static [u8]) -> Bytes { + pub fn from_static(bytes: &'static [u8]) -> Self { Bytes { ptr: bytes.as_ptr(), len: bytes.len(), @@ -179,7 +193,7 @@ impl Bytes { /// assert_eq!(b.len(), 5); /// ``` #[inline] - pub fn len(&self) -> usize { + pub const fn len(&self) -> usize { self.len } @@ -194,10 +208,32 @@ impl Bytes { /// assert!(b.is_empty()); /// ``` #[inline] - pub fn is_empty(&self) -> bool { + pub const fn is_empty(&self) -> bool { self.len == 0 } + /// Returns true if this is the only reference to the data. + /// + /// Always returns false if the data is backed by a static slice. + /// + /// The result of this method may be invalidated immediately if another + /// thread clones this value while this is being called. Ensure you have + /// unique access to this value (`&mut Bytes`) first if you need to be + /// certain the result is valid (i.e. for safety reasons). + /// + /// # Examples + /// + /// ``` + /// use bytes::Bytes; + /// + /// let a = Bytes::from(vec![1, 2, 3]); + /// assert!(a.is_unique()); + /// let b = a.clone(); + /// assert!(!a.is_unique()); + /// ``` + pub fn is_unique(&self) -> bool { + unsafe { (self.vtable.is_unique)(&self.data) } + } + /// Creates `Bytes` instance from slice, by copying it. pub fn copy_from_slice(data: &[u8]) -> Self { data.to_vec().into() @@ -225,14 +261,14 @@ impl Bytes { /// /// Requires that `begin <= end` and `end <= self.len()`, otherwise slicing /// will panic.
- pub fn slice(&self, range: impl RangeBounds) -> Bytes { + pub fn slice(&self, range: impl RangeBounds) -> Self { use core::ops::Bound; let len = self.len(); let begin = match range.start_bound() { Bound::Included(&n) => n, - Bound::Excluded(&n) => n + 1, + Bound::Excluded(&n) => n.checked_add(1).expect("out of range"), Bound::Unbounded => 0, }; @@ -262,7 +298,7 @@ impl Bytes { let mut ret = self.clone(); ret.len = end - begin; - ret.ptr = unsafe { ret.ptr.offset(begin as isize) }; + ret.ptr = unsafe { ret.ptr.add(begin) }; ret } @@ -292,7 +328,7 @@ impl Bytes { /// /// Requires that the given `sub` slice is in fact contained within the /// `Bytes` buffer; otherwise this function will panic. - pub fn slice_ref(&self, subset: &[u8]) -> Bytes { + pub fn slice_ref(&self, subset: &[u8]) -> Self { // Empty slice and empty Bytes may have their pointers reset // so explicitly allow empty slice to be a subslice of any slice. if subset.is_empty() { @@ -308,15 +344,15 @@ impl Bytes { assert!( sub_p >= bytes_p, "subset pointer ({:p}) is smaller than self pointer ({:p})", - sub_p as *const u8, - bytes_p as *const u8, + subset.as_ptr(), + self.as_ptr(), ); assert!( sub_p + sub_len <= bytes_p + bytes_len, "subset is out of bounds: self = ({:p}, {}), subset = ({:p}, {})", - bytes_p as *const u8, + self.as_ptr(), bytes_len, - sub_p as *const u8, + subset.as_ptr(), sub_len, ); @@ -349,14 +385,7 @@ impl Bytes { /// /// Panics if `at > len`. #[must_use = "consider Bytes::truncate if you don't need the other half"] - pub fn split_off(&mut self, at: usize) -> Bytes { - assert!( - at <= self.len(), - "split_off out of bounds: {:?} <= {:?}", - at, - self.len(), - ); - + pub fn split_off(&mut self, at: usize) -> Self { if at == self.len() { return Bytes::new(); } @@ -365,6 +394,13 @@ impl Bytes { return mem::replace(self, Bytes::new()); } + assert!( + at <= self.len(), + "split_off out of bounds: {:?} <= {:?}", + at, + self.len(), + ); + let mut ret = self.clone(); self.len = at; @@ -398,14 +434,7 @@ impl Bytes { /// /// Panics if `at > len`. #[must_use = "consider Bytes::advance if you don't need the other half"] - pub fn split_to(&mut self, at: usize) -> Bytes { - assert!( - at <= self.len(), - "split_to out of bounds: {:?} <= {:?}", - at, - self.len(), - ); - + pub fn split_to(&mut self, at: usize) -> Self { if at == self.len() { return mem::replace(self, Bytes::new()); } @@ -414,6 +443,13 @@ impl Bytes { return Bytes::new(); } + assert!( + at <= self.len(), + "split_to out of bounds: {:?} <= {:?}", + at, + self.len(), + ); + let mut ret = self.clone(); unsafe { self.inc_start(at) }; @@ -428,7 +464,7 @@ impl Bytes { /// If `len` is greater than the buffer's current length, this has no /// effect. /// - /// The [`split_off`] method can emulate `truncate`, but this causes the + /// The [split_off](`Self::split_off()`) method can emulate `truncate`, but this causes the /// excess bytes to be returned instead of dropped. /// /// # Examples @@ -440,8 +476,6 @@ impl Bytes { /// buf.truncate(5); /// assert_eq!(buf, b"hello"[..]); /// ``` - /// - /// [`split_off`]: #method.split_off #[inline] pub fn truncate(&mut self, len: usize) { if len < self.len { @@ -474,6 +508,29 @@ impl Bytes { self.truncate(0); } + /// Try to convert self into `BytesMut`. + /// + /// If `self` is unique for the entire original buffer, this will succeed + /// and return a `BytesMut` with the contents of `self` without copying. + /// If `self` is not unique for the entire original buffer, this will fail + /// and return self. 
+ /// + /// # Examples + /// + /// ``` + /// use bytes::{Bytes, BytesMut}; + /// + /// let bytes = Bytes::from(b"hello".to_vec()); + /// assert_eq!(bytes.try_into_mut(), Ok(BytesMut::from(&b"hello"[..]))); + /// ``` + pub fn try_into_mut(self) -> Result<BytesMut, Bytes> { + if self.is_unique() { + Ok(self.into()) + } else { + Err(self) + } + } + #[inline] pub(crate) unsafe fn with_vtable( ptr: *const u8, @@ -501,7 +558,7 @@ impl Bytes { // should already be asserted, but debug assert for tests debug_assert!(self.len >= by, "internal: inc_start out of bounds"); self.len -= by; - self.ptr = self.ptr.offset(by as isize); + self.ptr = self.ptr.add(by); } } @@ -548,14 +605,8 @@ impl Buf for Bytes { } } - fn copy_to_bytes(&mut self, len: usize) -> crate::Bytes { - if len == self.remaining() { - core::mem::replace(self, Bytes::new()) - } else { - let ret = self.slice(..len); - self.advance(len); - ret - } + fn copy_to_bytes(&mut self, len: usize) -> Self { + self.split_to(len) } } @@ -604,7 +655,7 @@ impl<'a> IntoIterator for &'a Bytes { type IntoIter = core::slice::Iter<'a, u8>; fn into_iter(self) -> Self::IntoIter { - self.as_slice().into_iter() + self.as_slice().iter() } } @@ -686,7 +737,7 @@ impl PartialOrd<Bytes> for str { impl PartialEq<Vec<u8>> for Bytes { fn eq(&self, other: &Vec<u8>) -> bool { - *self == &other[..] + *self == other[..] } } @@ -710,7 +761,7 @@ impl PartialOrd<Bytes> for Vec<u8> { impl PartialEq<String> for Bytes { fn eq(&self, other: &String) -> bool { - *self == &other[..] + *self == other[..] } } @@ -797,42 +848,105 @@ impl From<&'static str> for Bytes { impl From<Vec<u8>> for Bytes { fn from(vec: Vec<u8>) -> Bytes { - // into_boxed_slice doesn't return a heap allocation for empty vectors, + let mut vec = ManuallyDrop::new(vec); + let ptr = vec.as_mut_ptr(); + let len = vec.len(); + let cap = vec.capacity(); + + // Avoid an extra allocation if possible. + if len == cap { + let vec = ManuallyDrop::into_inner(vec); + return Bytes::from(vec.into_boxed_slice()); + } + + let shared = Box::new(Shared { + buf: ptr, + cap, + ref_cnt: AtomicUsize::new(1), + }); + + let shared = Box::into_raw(shared); + // The pointer should be aligned, so this assert should + // always succeed. + debug_assert!( + 0 == (shared as usize & KIND_MASK), + "internal: Box should have an aligned pointer", + ); + Bytes { + ptr, + len, + data: AtomicPtr::new(shared as _), + vtable: &SHARED_VTABLE, + } + } +} + +impl From<Box<[u8]>> for Bytes { + fn from(slice: Box<[u8]>) -> Bytes { + // Box<[u8]> doesn't contain a heap allocation for empty slices, + // so the pointer isn't aligned enough for the KIND_VEC stashing to + // work. - if vec.is_empty() { + if slice.is_empty() { return Bytes::new(); } - let slice = vec.into_boxed_slice(); let len = slice.len(); let ptr = Box::into_raw(slice) as *mut u8; if ptr as usize & 0x1 == 0 { - let data = ptr as usize | KIND_VEC; + let data = ptr_map(ptr, |addr| addr | KIND_VEC); Bytes { ptr, len, - data: AtomicPtr::new(data as *mut _), + data: AtomicPtr::new(data.cast()), vtable: &PROMOTABLE_EVEN_VTABLE, } } else { Bytes { ptr, len, - data: AtomicPtr::new(ptr as *mut _), + data: AtomicPtr::new(ptr.cast()), vtable: &PROMOTABLE_ODD_VTABLE, } } } } +impl From<Bytes> for BytesMut { + /// Convert self into `BytesMut`. + /// + /// If `bytes` is unique for the entire original buffer, this will return a + /// `BytesMut` with the contents of `bytes` without copying. + /// If `bytes` is not unique for the entire original buffer, this will make + /// a copy of the subset of the original buffer that `bytes` references, in + /// a new `BytesMut`.
+ /// + /// # Examples + /// + /// ``` + /// use bytes::{Bytes, BytesMut}; + /// + /// let bytes = Bytes::from(b"hello".to_vec()); + /// assert_eq!(BytesMut::from(bytes), BytesMut::from(&b"hello"[..])); + /// ``` + fn from(bytes: Bytes) -> Self { + let bytes = ManuallyDrop::new(bytes); + unsafe { (bytes.vtable.to_mut)(&bytes.data, bytes.ptr, bytes.len) } + } +} + impl From for Bytes { fn from(s: String) -> Bytes { Bytes::from(s.into_bytes()) } } +impl From for Vec { + fn from(bytes: Bytes) -> Vec { + let bytes = ManuallyDrop::new(bytes); + unsafe { (bytes.vtable.to_vec)(&bytes.data, bytes.ptr, bytes.len) } + } +} + // ===== impl Vtable ===== impl fmt::Debug for Vtable { @@ -848,6 +962,9 @@ impl fmt::Debug for Vtable { const STATIC_VTABLE: Vtable = Vtable { clone: static_clone, + to_vec: static_to_vec, + to_mut: static_to_mut, + is_unique: static_is_unique, drop: static_drop, }; @@ -856,6 +973,20 @@ unsafe fn static_clone(_: &AtomicPtr<()>, ptr: *const u8, len: usize) -> Bytes { Bytes::from_static(slice) } +unsafe fn static_to_vec(_: &AtomicPtr<()>, ptr: *const u8, len: usize) -> Vec { + let slice = slice::from_raw_parts(ptr, len); + slice.to_vec() +} + +unsafe fn static_to_mut(_: &AtomicPtr<()>, ptr: *const u8, len: usize) -> BytesMut { + let slice = slice::from_raw_parts(ptr, len); + BytesMut::from(slice) +} + +fn static_is_unique(_: &AtomicPtr<()>) -> bool { + false +} + unsafe fn static_drop(_: &mut AtomicPtr<()>, _: *const u8, _: usize) { // nothing to drop for &'static [u8] } @@ -864,11 +995,17 @@ unsafe fn static_drop(_: &mut AtomicPtr<()>, _: *const u8, _: usize) { static PROMOTABLE_EVEN_VTABLE: Vtable = Vtable { clone: promotable_even_clone, + to_vec: promotable_even_to_vec, + to_mut: promotable_even_to_mut, + is_unique: promotable_is_unique, drop: promotable_even_drop, }; static PROMOTABLE_ODD_VTABLE: Vtable = Vtable { clone: promotable_odd_clone, + to_vec: promotable_odd_to_vec, + to_mut: promotable_odd_to_mut, + is_unique: promotable_is_unique, drop: promotable_odd_drop, }; @@ -877,25 +1014,92 @@ unsafe fn promotable_even_clone(data: &AtomicPtr<()>, ptr: *const u8, len: usize let kind = shared as usize & KIND_MASK; if kind == KIND_ARC { - shallow_clone_arc(shared as _, ptr, len) + shallow_clone_arc(shared.cast(), ptr, len) } else { debug_assert_eq!(kind, KIND_VEC); - let buf = (shared as usize & !KIND_MASK) as *mut u8; + let buf = ptr_map(shared.cast(), |addr| addr & !KIND_MASK); shallow_clone_vec(data, shared, buf, ptr, len) } } +unsafe fn promotable_to_vec( + data: &AtomicPtr<()>, + ptr: *const u8, + len: usize, + f: fn(*mut ()) -> *mut u8, +) -> Vec { + let shared = data.load(Ordering::Acquire); + let kind = shared as usize & KIND_MASK; + + if kind == KIND_ARC { + shared_to_vec_impl(shared.cast(), ptr, len) + } else { + // If Bytes holds a Vec, then the offset must be 0. + debug_assert_eq!(kind, KIND_VEC); + + let buf = f(shared); + + let cap = offset_from(ptr, buf) + len; + + // Copy back buffer + ptr::copy(ptr, buf, len); + + Vec::from_raw_parts(buf, len, cap) + } +} + +unsafe fn promotable_to_mut( + data: &AtomicPtr<()>, + ptr: *const u8, + len: usize, + f: fn(*mut ()) -> *mut u8, +) -> BytesMut { + let shared = data.load(Ordering::Acquire); + let kind = shared as usize & KIND_MASK; + + if kind == KIND_ARC { + shared_to_mut_impl(shared.cast(), ptr, len) + } else { + // KIND_VEC is a view of an underlying buffer at a certain offset. + // The ptr + len always represents the end of that buffer. + // Before truncating it, it is first promoted to KIND_ARC. 
+ // Thus, we can safely reconstruct a Vec from it without leaking memory. + debug_assert_eq!(kind, KIND_VEC); + + let buf = f(shared); + let off = offset_from(ptr, buf); + let cap = off + len; + let v = Vec::from_raw_parts(buf, cap, cap); + + let mut b = BytesMut::from_vec(v); + b.advance_unchecked(off); + b + } +} + +unsafe fn promotable_even_to_vec(data: &AtomicPtr<()>, ptr: *const u8, len: usize) -> Vec { + promotable_to_vec(data, ptr, len, |shared| { + ptr_map(shared.cast(), |addr| addr & !KIND_MASK) + }) +} + +unsafe fn promotable_even_to_mut(data: &AtomicPtr<()>, ptr: *const u8, len: usize) -> BytesMut { + promotable_to_mut(data, ptr, len, |shared| { + ptr_map(shared.cast(), |addr| addr & !KIND_MASK) + }) +} + unsafe fn promotable_even_drop(data: &mut AtomicPtr<()>, ptr: *const u8, len: usize) { data.with_mut(|shared| { let shared = *shared; let kind = shared as usize & KIND_MASK; if kind == KIND_ARC { - release_shared(shared as *mut Shared); + release_shared(shared.cast()); } else { debug_assert_eq!(kind, KIND_VEC); - let buf = (shared as usize & !KIND_MASK) as *mut u8; - drop(rebuild_boxed_slice(buf, ptr, len)); + let buf = ptr_map(shared.cast(), |addr| addr & !KIND_MASK); + free_boxed_slice(buf, ptr, len); } }); } @@ -908,38 +1112,65 @@ unsafe fn promotable_odd_clone(data: &AtomicPtr<()>, ptr: *const u8, len: usize) shallow_clone_arc(shared as _, ptr, len) } else { debug_assert_eq!(kind, KIND_VEC); - shallow_clone_vec(data, shared, shared as *mut u8, ptr, len) + shallow_clone_vec(data, shared, shared.cast(), ptr, len) } } +unsafe fn promotable_odd_to_vec(data: &AtomicPtr<()>, ptr: *const u8, len: usize) -> Vec { + promotable_to_vec(data, ptr, len, |shared| shared.cast()) +} + +unsafe fn promotable_odd_to_mut(data: &AtomicPtr<()>, ptr: *const u8, len: usize) -> BytesMut { + promotable_to_mut(data, ptr, len, |shared| shared.cast()) +} + unsafe fn promotable_odd_drop(data: &mut AtomicPtr<()>, ptr: *const u8, len: usize) { data.with_mut(|shared| { let shared = *shared; let kind = shared as usize & KIND_MASK; if kind == KIND_ARC { - release_shared(shared as *mut Shared); + release_shared(shared.cast()); } else { debug_assert_eq!(kind, KIND_VEC); - drop(rebuild_boxed_slice(shared as *mut u8, ptr, len)); + free_boxed_slice(shared.cast(), ptr, len); } }); } -unsafe fn rebuild_boxed_slice(buf: *mut u8, offset: *const u8, len: usize) -> Box<[u8]> { - let cap = (offset as usize - buf as usize) + len; - Box::from_raw(slice::from_raw_parts_mut(buf, cap)) +unsafe fn promotable_is_unique(data: &AtomicPtr<()>) -> bool { + let shared = data.load(Ordering::Acquire); + let kind = shared as usize & KIND_MASK; + + if kind == KIND_ARC { + let ref_cnt = (*shared.cast::()).ref_cnt.load(Ordering::Relaxed); + ref_cnt == 1 + } else { + true + } +} + +unsafe fn free_boxed_slice(buf: *mut u8, offset: *const u8, len: usize) { + let cap = offset_from(offset, buf) + len; + dealloc(buf, Layout::from_size_align(cap, 1).unwrap()) } // ===== impl SharedVtable ===== struct Shared { - // holds vec for drop, but otherwise doesnt access it - _vec: Vec, + // Holds arguments to dealloc upon Drop, but otherwise doesn't use them + buf: *mut u8, + cap: usize, ref_cnt: AtomicUsize, } +impl Drop for Shared { + fn drop(&mut self) { + unsafe { dealloc(self.buf, Layout::from_size_align(self.cap, 1).unwrap()) } + } +} + // Assert that the alignment of `Shared` is divisible by 2. // This is a necessary invariant since we depend on allocating `Shared` a // shared object to implicitly carry the `KIND_ARC` flag in its pointer. 
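Aside: the alignment assertion above is what makes the `KIND_ARC`/`KIND_VEC` low-bit tagging sound. A self-contained sketch of the trick follows; the names `tag_as_vec` and `kind_of` are illustrative, not crate APIs.

```rust
// Because `Shared` is at least 2-byte aligned, the least significant bit
// of a `Box<Shared>` pointer is always 0, leaving room for a 1-bit flag.
const KIND_ARC: usize = 0b0;
const KIND_VEC: usize = 0b1;
const KIND_MASK: usize = 0b1;

fn tag_as_vec(ptr: *mut u8) -> *mut () {
    // Only sound when `ptr` is 2-byte aligned, i.e. its low bit is 0.
    debug_assert_eq!(ptr as usize & KIND_MASK, 0);
    (ptr as usize | KIND_VEC) as *mut ()
}

fn kind_of(data: *mut ()) -> usize {
    // Recover the flag without disturbing the pointer bits above it.
    data as usize & KIND_MASK
}

fn main() {
    let mut slot = 0u16; // a u16 guarantees 2-byte alignment
    let tagged = tag_as_vec(&mut slot as *mut u16 as *mut u8);
    assert_eq!(kind_of(tagged), KIND_VEC);
    assert_ne!(kind_of(tagged), KIND_ARC);
}
```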
@@ -948,6 +1179,9 @@ const _: [(); 0 - mem::align_of::() % 2] = []; // Assert that the alignm static SHARED_VTABLE: Vtable = Vtable { clone: shared_clone, + to_vec: shared_to_vec, + to_mut: shared_to_mut, + is_unique: shared_is_unique, drop: shared_drop, }; @@ -960,9 +1194,87 @@ unsafe fn shared_clone(data: &AtomicPtr<()>, ptr: *const u8, len: usize) -> Byte shallow_clone_arc(shared as _, ptr, len) } +unsafe fn shared_to_vec_impl(shared: *mut Shared, ptr: *const u8, len: usize) -> Vec { + // Check that the ref_cnt is 1 (unique). + // + // If it is unique, then it is set to 0 with AcqRel fence for the same + // reason in release_shared. + // + // Otherwise, we take the other branch and call release_shared. + if (*shared) + .ref_cnt + .compare_exchange(1, 0, Ordering::AcqRel, Ordering::Relaxed) + .is_ok() + { + // Deallocate the `Shared` instance without running its destructor. + let shared = *Box::from_raw(shared); + let shared = ManuallyDrop::new(shared); + let buf = shared.buf; + let cap = shared.cap; + + // Copy back buffer + ptr::copy(ptr, buf, len); + + Vec::from_raw_parts(buf, len, cap) + } else { + let v = slice::from_raw_parts(ptr, len).to_vec(); + release_shared(shared); + v + } +} + +unsafe fn shared_to_vec(data: &AtomicPtr<()>, ptr: *const u8, len: usize) -> Vec { + shared_to_vec_impl(data.load(Ordering::Relaxed).cast(), ptr, len) +} + +unsafe fn shared_to_mut_impl(shared: *mut Shared, ptr: *const u8, len: usize) -> BytesMut { + // The goal is to check if the current handle is the only handle + // that currently has access to the buffer. This is done by + // checking if the `ref_cnt` is currently 1. + // + // The `Acquire` ordering synchronizes with the `Release` as + // part of the `fetch_sub` in `release_shared`. The `fetch_sub` + // operation guarantees that any mutations done in other threads + // are ordered before the `ref_cnt` is decremented. As such, + // this `Acquire` will guarantee that those mutations are + // visible to the current thread. + // + // Otherwise, we take the other branch, copy the data and call `release_shared`. + if (*shared).ref_cnt.load(Ordering::Acquire) == 1 { + // Deallocate the `Shared` instance without running its destructor. + let shared = *Box::from_raw(shared); + let shared = ManuallyDrop::new(shared); + let buf = shared.buf; + let cap = shared.cap; + + // Rebuild Vec + let off = offset_from(ptr, buf); + let v = Vec::from_raw_parts(buf, len + off, cap); + + let mut b = BytesMut::from_vec(v); + b.advance_unchecked(off); + b + } else { + // Copy the data from Shared in a new Vec, then release it + let v = slice::from_raw_parts(ptr, len).to_vec(); + release_shared(shared); + BytesMut::from_vec(v) + } +} + +unsafe fn shared_to_mut(data: &AtomicPtr<()>, ptr: *const u8, len: usize) -> BytesMut { + shared_to_mut_impl(data.load(Ordering::Relaxed).cast(), ptr, len) +} + +pub(crate) unsafe fn shared_is_unique(data: &AtomicPtr<()>) -> bool { + let shared = data.load(Ordering::Acquire); + let ref_cnt = (*shared.cast::()).ref_cnt.load(Ordering::Relaxed); + ref_cnt == 1 +} + unsafe fn shared_drop(data: &mut AtomicPtr<()>, _ptr: *const u8, _len: usize) { data.with_mut(|shared| { - release_shared(*shared as *mut Shared); + release_shared(shared.cast()); }); } @@ -1000,9 +1312,9 @@ unsafe fn shallow_clone_vec( // updated and since the buffer hasn't been promoted to an // `Arc`, those three fields still are the components of the // vector. 
- let vec = rebuild_boxed_slice(buf, offset, len).into_vec(); let shared = Box::new(Shared { - _vec: vec, + buf, + cap: offset_from(offset, buf) + len, // Initialize refcount to 2. One for this reference, and one // for the new clone that will be returned from // `shallow_clone`. @@ -1076,10 +1388,40 @@ unsafe fn release_shared(ptr: *mut Shared) { // > "acquire" operation before deleting the object. // // [1]: (www.boost.org/doc/libs/1_55_0/doc/html/atomic/usage_examples.html) - atomic::fence(Ordering::Acquire); + // + // Thread sanitizer does not support atomic fences. Use an atomic load + // instead. + (*ptr).ref_cnt.load(Ordering::Acquire); // Drop the data - Box::from_raw(ptr); + drop(Box::from_raw(ptr)); +} + +// Ideally we would always use this version of `ptr_map` since it is strict +// provenance compatible, but it results in worse codegen. We will however still +// use it on miri because it gives better diagnostics for people who test bytes +// code with miri. +// +// See https://github.com/tokio-rs/bytes/pull/545 for more info. +#[cfg(miri)] +fn ptr_map(ptr: *mut u8, f: F) -> *mut u8 +where + F: FnOnce(usize) -> usize, +{ + let old_addr = ptr as usize; + let new_addr = f(old_addr); + let diff = new_addr.wrapping_sub(old_addr); + ptr.wrapping_add(diff) +} + +#[cfg(not(miri))] +fn ptr_map(ptr: *mut u8, f: F) -> *mut u8 +where + F: FnOnce(usize) -> usize, +{ + let old_addr = ptr as usize; + let new_addr = f(old_addr); + new_addr as *mut u8 } // compile-fails diff --git a/src/bytes/promotable.rs b/src/bytes/promotable.rs new file mode 100644 index 0000000..e69de29 diff --git a/src/bytes_mut.rs b/src/bytes_mut.rs index 8e42079..537f01a 100644 --- a/src/bytes_mut.rs +++ b/src/bytes_mut.rs @@ -1,5 +1,5 @@ -use core::iter::{FromIterator, Iterator}; -use core::mem::{self, ManuallyDrop}; +use core::iter::FromIterator; +use core::mem::{self, ManuallyDrop, MaybeUninit}; use core::ops::{Deref, DerefMut}; use core::ptr::{self, NonNull}; use core::{cmp, fmt, hash, isize, slice, usize}; @@ -8,6 +8,7 @@ use alloc::{ borrow::{Borrow, BorrowMut}, boxed::Box, string::String, + vec, vec::Vec, }; @@ -15,8 +16,8 @@ use crate::buf::{IntoIter, UninitSlice}; use crate::bytes::Vtable; #[allow(unused)] use crate::loom::sync::atomic::AtomicMut; -use crate::loom::sync::atomic::{self, AtomicPtr, AtomicUsize, Ordering}; -use crate::{Buf, BufMut, Bytes}; +use crate::loom::sync::atomic::{AtomicPtr, AtomicUsize, Ordering}; +use crate::{offset_from, Buf, BufMut, Bytes}; /// A unique reference to a contiguous slice of memory. /// @@ -79,6 +80,12 @@ struct Shared { ref_count: AtomicUsize, } +// Assert that the alignment of `Shared` is divisible by 2. +// This is a necessary invariant since we depend on allocating `Shared` a +// shared object to implicitly carry the `KIND_ARC` flag in its pointer. +// This flag is set when the LSB is 0. +const _: [(); 0 - mem::align_of::() % 2] = []; // Assert that the alignment of `Shared` is divisible by 2. + // Buffer storage strategy flags. const KIND_ARC: usize = 0b0; const KIND_VEC: usize = 0b1; @@ -95,11 +102,11 @@ const MIN_ORIGINAL_CAPACITY_WIDTH: usize = 10; const ORIGINAL_CAPACITY_MASK: usize = 0b11100; const ORIGINAL_CAPACITY_OFFSET: usize = 2; +const VEC_POS_OFFSET: usize = 5; // When the storage is in the `Vec` representation, the pointer can be advanced // at most this value. This is due to the amount of storage available to track // the offset is usize - number of KIND bits and number of ORIGINAL_CAPACITY // bits. 
-const VEC_POS_OFFSET: usize = 5; const MAX_VEC_POS: usize = usize::MAX >> VEC_POS_OFFSET; const NOT_VEC_POS_MASK: usize = 0b11111; @@ -236,28 +243,43 @@ impl BytesMut { /// th.join().unwrap(); /// ``` #[inline] - pub fn freeze(mut self) -> Bytes { - if self.kind() == KIND_VEC { + pub fn freeze(self) -> Bytes { + let bytes = ManuallyDrop::new(self); + if bytes.kind() == KIND_VEC { // Just re-use `Bytes` internal Vec vtable unsafe { - let (off, _) = self.get_vec_pos(); - let vec = rebuild_vec(self.ptr.as_ptr(), self.len, self.cap, off); - mem::forget(self); + let off = bytes.get_vec_pos(); + let vec = rebuild_vec(bytes.ptr.as_ptr(), bytes.len, bytes.cap, off); let mut b: Bytes = vec.into(); b.advance(off); b } } else { - debug_assert_eq!(self.kind(), KIND_ARC); + debug_assert_eq!(bytes.kind(), KIND_ARC); - let ptr = self.ptr.as_ptr(); - let len = self.len; - let data = AtomicPtr::new(self.data as _); - mem::forget(self); + let ptr = bytes.ptr.as_ptr(); + let len = bytes.len; + let data = AtomicPtr::new(bytes.data.cast()); unsafe { Bytes::with_vtable(ptr, len, data, &SHARED_VTABLE) } } } + /// Creates a new `BytesMut`, which is initialized with zero. + /// + /// # Examples + /// + /// ``` + /// use bytes::BytesMut; + /// + /// let zeros = BytesMut::zeroed(42); + /// + /// assert_eq!(zeros.len(), 42); + /// zeros.into_iter().for_each(|x| assert_eq!(x, 0)); + /// ``` + pub fn zeroed(len: usize) -> BytesMut { + BytesMut::from_vec(vec![0; len]) + } + /// Splits the bytes into two at the given index. /// /// Afterwards `self` contains elements `[0, at)`, and the returned @@ -294,8 +316,10 @@ impl BytesMut { ); unsafe { let mut other = self.shallow_clone(); - other.set_start(at); - self.set_end(at); + // SAFETY: We've checked that `at` <= `self.capacity()` above. + other.advance_unchecked(at); + self.cap = at; + self.len = cmp::min(self.len, at); other } } @@ -325,7 +349,7 @@ impl BytesMut { /// /// assert_eq!(other, b"hello world"[..]); /// ``` - #[must_use = "consider BytesMut::advance(len()) if you don't need the other half"] + #[must_use = "consider BytesMut::clear if you don't need the other half"] pub fn split(&mut self) -> BytesMut { let len = self.len(); self.split_to(len) @@ -368,8 +392,11 @@ impl BytesMut { unsafe { let mut other = self.shallow_clone(); - other.set_end(at); - self.set_start(at); + // SAFETY: We've checked that `at` <= `self.len()` and we know that `self.len()` <= + // `self.capacity()`. + self.advance_unchecked(at); + other.cap = at; + other.len = at; other } } @@ -380,7 +407,9 @@ impl BytesMut { /// If `len` is greater than the buffer's current length, this has no /// effect. /// - /// The [`split_off`] method can emulate `truncate`, but this causes the + /// Existing underlying capacity is preserved. + /// + /// The [split_off](`Self::split_off()`) method can emulate `truncate`, but this causes the /// excess bytes to be returned instead of dropped. /// /// # Examples @@ -392,17 +421,14 @@ impl BytesMut { /// buf.truncate(5); /// assert_eq!(buf, b"hello"[..]); /// ``` - /// - /// [`split_off`]: #method.split_off pub fn truncate(&mut self, len: usize) { if len <= self.len() { - unsafe { - self.set_len(len); - } + // SAFETY: Shrinking the buffer cannot expose uninitialized bytes. + unsafe { self.set_len(len) }; } } - /// Clears the buffer, removing all data. + /// Clears the buffer, removing all data. Existing capacity is preserved. 
/// /// # Examples /// @@ -414,7 +440,8 @@ impl BytesMut { /// assert!(buf.is_empty()); /// ``` pub fn clear(&mut self) { - self.truncate(0); + // SAFETY: Setting the length to zero cannot expose uninitialized bytes. + unsafe { self.set_len(0) }; } /// Resizes the buffer so that `len` is equal to `new_len`. @@ -440,18 +467,26 @@ impl BytesMut { /// assert_eq!(&buf[..], &[0x1, 0x1, 0x3, 0x3]); /// ``` pub fn resize(&mut self, new_len: usize, value: u8) { - let len = self.len(); - if new_len > len { - let additional = new_len - len; - self.reserve(additional); - unsafe { - let dst = self.chunk_mut().as_mut_ptr(); - ptr::write_bytes(dst, value, additional); - self.set_len(new_len); - } + let additional = if let Some(additional) = new_len.checked_sub(self.len()) { + additional } else { self.truncate(new_len); + return; + }; + + if additional == 0 { + return; } + + self.reserve(additional); + let dst = self.spare_capacity_mut().as_mut_ptr(); + // SAFETY: `spare_capacity_mut` returns a valid, properly aligned pointer and we've + // reserved enough space to write `additional` bytes. + unsafe { ptr::write_bytes(dst, value, additional) }; + + // SAFETY: There are at least `new_len` initialized bytes in the buffer so no + // uninitialized bytes are being exposed. + unsafe { self.set_len(new_len) }; } /// Sets the length of the buffer. @@ -492,11 +527,20 @@ impl BytesMut { /// reallocations. A call to `reserve` may result in an allocation. /// /// Before allocating new buffer space, the function will attempt to reclaim - /// space in the existing buffer. If the current handle references a small - /// view in the original buffer and all other handles have been dropped, - /// and the requested capacity is less than or equal to the existing - /// buffer's capacity, then the current view will be copied to the front of - /// the buffer and the handle will take ownership of the full buffer. + /// space in the existing buffer. If the current handle references a view + /// into a larger original buffer, and all other handles referencing part + /// of the same original buffer have been dropped, then the current view + /// can be copied/shifted to the front of the buffer and the handle can take + /// ownership of the full buffer, provided that the full buffer is large + /// enough to fit the requested additional capacity. + /// + /// This optimization will only happen if shifting the data from the current + /// view to the front of the buffer is not too expensive in terms of the + /// (amortized) time required. The precise condition is subject to change; + /// as of now, the length of the data being shifted needs to be at least as + /// large as the distance that it's shifted by. If the current view is empty + /// and the original buffer is large enough to fit the requested additional + /// capacity, then reallocations will never happen. /// /// # Examples /// @@ -560,33 +604,51 @@ impl BytesMut { // space. // // Otherwise, since backed by a vector, use `Vec::reserve` + // + // We need to make sure that this optimization does not kill the + // amortized runtimes of BytesMut's operations. unsafe { - let (off, prev) = self.get_vec_pos(); + let off = self.get_vec_pos(); // Only reuse space if we can satisfy the requested additional space. - if self.capacity() - self.len() + off >= additional { - // There's space - reuse it + // + // Also check if the value of `off` suggests that enough bytes + // have been read to account for the overhead of shifting all + // the data (in an amortized analysis). 
+ // Hence the condition `off >= self.len()`. + // + // This condition also already implies that the buffer is going + // to be (at least) half-empty in the end; so we do not break + // the (amortized) runtime with future resizes of the underlying + // `Vec`. + // + // [For more details check issue #524, and PR #525.] + if self.capacity() - self.len() + off >= additional && off >= self.len() { + // There's enough space, and it's not too much overhead: + // reuse the space! // // Just move the pointer back to the start after copying // data back. - let base_ptr = self.ptr.as_ptr().offset(-(off as isize)); - ptr::copy(self.ptr.as_ptr(), base_ptr, self.len); + let base_ptr = self.ptr.as_ptr().sub(off); + // Since `off >= self.len()`, the two regions don't overlap. + ptr::copy_nonoverlapping(self.ptr.as_ptr(), base_ptr, self.len); self.ptr = vptr(base_ptr); - self.set_vec_pos(0, prev); + self.set_vec_pos(0); // Length stays constant, but since we moved backwards we // can gain capacity back. self.cap += off; } else { - // No space - allocate more + // Not enough space, or reusing might be too much overhead: + // allocate more space! let mut v = ManuallyDrop::new(rebuild_vec(self.ptr.as_ptr(), self.len, self.cap, off)); v.reserve(additional); // Update the info - self.ptr = vptr(v.as_mut_ptr().offset(off as isize)); - self.len = v.len() - off; + self.ptr = vptr(v.as_mut_ptr().add(off)); self.cap = v.capacity() - off; + debug_assert_eq!(self.len, v.len() - off); } return; @@ -594,7 +656,7 @@ impl BytesMut { } debug_assert_eq!(kind, KIND_ARC); - let shared: *mut Shared = self.data as _; + let shared: *mut Shared = self.data; // Reserving involves abandoning the currently shared buffer and // allocating a new vector with the requested capacity. @@ -602,13 +664,7 @@ impl BytesMut { // Compute the new capacity let mut new_cap = len.checked_add(additional).expect("overflow"); - let original_capacity; - let original_capacity_repr; - unsafe { - original_capacity_repr = (*shared).original_capacity_repr; - original_capacity = original_capacity_from_repr(original_capacity_repr); - // First, try to reclaim the buffer. This is possible if the current // handle is the only outstanding handle pointing to the buffer. if (*shared).is_unique() { @@ -617,34 +673,73 @@ impl BytesMut { // sure that the vector has enough capacity. let v = &mut (*shared).vec; - if v.capacity() >= new_cap { - // The capacity is sufficient, reclaim the buffer - let ptr = v.as_mut_ptr(); + let v_capacity = v.capacity(); + let ptr = v.as_mut_ptr(); + + let offset = offset_from(self.ptr.as_ptr(), ptr); + + // Compare the condition in the `kind == KIND_VEC` case above + // for more details. + if v_capacity >= new_cap + offset { + self.cap = new_cap; + // no copy is necessary + } else if v_capacity >= new_cap && offset >= len { + // The capacity is sufficient, and copying is not too much + // overhead: reclaim the buffer! - ptr::copy(self.ptr.as_ptr(), ptr, len); + // `offset >= len` means: no overlap + ptr::copy_nonoverlapping(self.ptr.as_ptr(), ptr, len); self.ptr = vptr(ptr); self.cap = v.capacity(); + } else { + // calculate offset + let off = (self.ptr.as_ptr() as usize) - (v.as_ptr() as usize); - return; - } + // new_cap is calculated in terms of `BytesMut`, not the underlying + // `Vec`, so it does not take the offset into account. + // + // Thus we have to manually add it here. + new_cap = new_cap.checked_add(off).expect("overflow"); - // The vector capacity is not sufficient. 
The reserve request is - // asking for more than the initial buffer capacity. Allocate more - // than requested if `new_cap` is not much bigger than the current - // capacity. - // - // There are some situations, using `reserve_exact` that the - // buffer capacity could be below `original_capacity`, so do a - // check. - let double = v.capacity().checked_shl(1).unwrap_or(new_cap); + // The vector capacity is not sufficient. The reserve request is + // asking for more than the initial buffer capacity. Allocate more + // than requested if `new_cap` is not much bigger than the current + // capacity. + // + // There are some situations, using `reserve_exact` that the + // buffer capacity could be below `original_capacity`, so do a + // check. + let double = v.capacity().checked_shl(1).unwrap_or(new_cap); - new_cap = cmp::max(cmp::max(double, new_cap), original_capacity); - } else { - new_cap = cmp::max(new_cap, original_capacity); + new_cap = cmp::max(double, new_cap); + + // No space - allocate more + // + // The length field of `Shared::vec` is not used by the `BytesMut`; + // instead we use the `len` field in the `BytesMut` itself. However, + // when calling `reserve`, it doesn't guarantee that data stored in + // the unused capacity of the vector is copied over to the new + // allocation, so we need to ensure that we don't have any data we + // care about in the unused capacity before calling `reserve`. + debug_assert!(off + len <= v.capacity()); + v.set_len(off + len); + v.reserve(new_cap - v.len()); + + // Update the info + self.ptr = vptr(v.as_mut_ptr().add(off)); + self.cap = v.capacity() - off; + } + + return; } } + let original_capacity_repr = unsafe { (*shared).original_capacity_repr }; + let original_capacity = original_capacity_from_repr(original_capacity_repr); + + new_cap = cmp::max(new_cap, original_capacity); + // Create a new vector to store the data let mut v = ManuallyDrop::new(Vec::with_capacity(new_cap)); @@ -657,10 +752,10 @@ impl BytesMut { // Update self let data = (original_capacity_repr << ORIGINAL_CAPACITY_OFFSET) | KIND_VEC; - self.data = data as _; + self.data = invalid_ptr(data); self.ptr = vptr(v.as_mut_ptr()); - self.len = v.len(); self.cap = v.capacity(); + debug_assert_eq!(self.len, v.len()); } /// Appends given bytes to this `BytesMut`. @@ -679,16 +774,17 @@ impl BytesMut { /// /// assert_eq!(b"aaabbbcccddd", &buf[..]); /// ``` + #[inline] pub fn extend_from_slice(&mut self, extend: &[u8]) { let cnt = extend.len(); self.reserve(cnt); unsafe { - let dst = self.uninit_slice(); + let dst = self.spare_capacity_mut(); // Reserved above debug_assert!(dst.len() >= cnt); - ptr::copy_nonoverlapping(extend.as_ptr(), dst.as_mut_ptr() as *mut u8, cnt); + ptr::copy_nonoverlapping(extend.as_ptr(), dst.as_mut_ptr().cast(), cnt); } unsafe { @@ -698,10 +794,11 @@ impl BytesMut { /// Absorbs a `BytesMut` that was previously split off. /// - /// If the two `BytesMut` objects were previously contiguous, i.e., if - /// `other` was created by calling `split_off` on this `BytesMut`, then - /// this is an `O(1)` operation that just decreases a reference - /// count and sets a few indices. Otherwise this method degenerates to + /// If the two `BytesMut` objects were previously contiguous and not mutated + /// in a way that causes re-allocation i.e., if `other` was created by + /// calling `split_off` on this `BytesMut`, then this is an `O(1)` operation + /// that just decreases a reference count and sets a few indices. 
+ /// Otherwise this method degenerates to /// `self.extend_from_slice(other.as_ref())`. /// /// # Examples @@ -739,11 +836,11 @@ impl BytesMut { // internal change could make a simple pattern (`BytesMut::from(vec)`) // suddenly a lot more expensive. #[inline] - pub(crate) fn from_vec(mut vec: Vec) -> BytesMut { + pub(crate) fn from_vec(vec: Vec) -> BytesMut { + let mut vec = ManuallyDrop::new(vec); let ptr = vptr(vec.as_mut_ptr()); let len = vec.len(); let cap = vec.capacity(); - mem::forget(vec); let original_capacity_repr = original_capacity_to_repr(cap); let data = (original_capacity_repr << ORIGINAL_CAPACITY_OFFSET) | KIND_VEC; @@ -752,7 +849,7 @@ impl BytesMut { ptr, len, cap, - data: data as *mut _, + data: invalid_ptr(data), } } @@ -766,14 +863,19 @@ impl BytesMut { unsafe { slice::from_raw_parts_mut(self.ptr.as_ptr(), self.len) } } - unsafe fn set_start(&mut self, start: usize) { + /// Advance the buffer without bounds checking. + /// + /// # SAFETY + /// + /// The caller must ensure that `count` <= `self.cap`. + pub(crate) unsafe fn advance_unchecked(&mut self, count: usize) { // Setting the start to 0 is a no-op, so return early if this is the // case. - if start == 0 { + if count == 0 { return; } - debug_assert!(start <= self.cap, "internal: set_start out of bounds"); + debug_assert!(count <= self.cap, "internal: set_start out of bounds"); let kind = self.kind(); @@ -782,11 +884,10 @@ impl BytesMut { // complicated. First, we have to track how far ahead the // "start" of the byte buffer from the beginning of the vec. We // also have to ensure that we don't exceed the maximum shift. - let (mut pos, prev) = self.get_vec_pos(); - pos += start; + let pos = self.get_vec_pos() + count; if pos <= MAX_VEC_POS { - self.set_vec_pos(pos, prev); + self.set_vec_pos(pos); } else { // The repr must be upgraded to ARC. This will never happen // on 64 bit systems and will only happen on 32 bit systems @@ -799,23 +900,9 @@ impl BytesMut { // Updating the start of the view is setting `ptr` to point to the // new start and updating the `len` field to reflect the new length // of the view. - self.ptr = vptr(self.ptr.as_ptr().offset(start as isize)); - - if self.len >= start { - self.len -= start; - } else { - self.len = 0; - } - - self.cap -= start; - } - - unsafe fn set_end(&mut self, end: usize) { - debug_assert_eq!(self.kind(), KIND_ARC); - assert!(end <= self.cap, "set_end out of bounds"); - - self.cap = end; - self.len = cmp::min(self.len, end); + self.ptr = vptr(self.ptr.as_ptr().add(count)); + self.len = self.len.checked_sub(count).unwrap_or(0); + self.cap -= count; } fn try_unsplit(&mut self, other: BytesMut) -> Result<(), BytesMut> { @@ -823,7 +910,7 @@ impl BytesMut { return Ok(()); } - let ptr = unsafe { self.ptr.as_ptr().offset(self.len as isize) }; + let ptr = unsafe { self.ptr.as_ptr().add(self.len) }; if ptr == other.ptr.as_ptr() && self.kind() == KIND_ARC && other.kind() == KIND_ARC @@ -873,7 +960,7 @@ impl BytesMut { // always succeed. debug_assert_eq!(shared as usize & KIND_MASK, KIND_ARC); - self.data = shared as _; + self.data = shared; } /// Makes an exact shallow clone of `self`. 
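Aside: the `try_unsplit` fast path above is observable through the public `unsplit` API. A minimal sketch (editorial, not part of the patch):

```rust
use bytes::BytesMut;

fn main() {
    let mut buf = BytesMut::with_capacity(64);
    buf.extend_from_slice(b"hello world");

    let tail = buf.split_off(5); // buf = b"hello", tail = b" world"
    buf.unsplit(tail); // halves are still contiguous: O(1), no copy

    assert_eq!(&buf[..], b"hello world");
}
```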
@@ -894,28 +981,56 @@ impl BytesMut { } #[inline] - unsafe fn get_vec_pos(&mut self) -> (usize, usize) { + unsafe fn get_vec_pos(&self) -> usize { debug_assert_eq!(self.kind(), KIND_VEC); - let prev = self.data as usize; - (prev >> VEC_POS_OFFSET, prev) + self.data as usize >> VEC_POS_OFFSET } #[inline] - unsafe fn set_vec_pos(&mut self, pos: usize, prev: usize) { + unsafe fn set_vec_pos(&mut self, pos: usize) { debug_assert_eq!(self.kind(), KIND_VEC); debug_assert!(pos <= MAX_VEC_POS); - self.data = ((pos << VEC_POS_OFFSET) | (prev & NOT_VEC_POS_MASK)) as *mut _; + self.data = invalid_ptr((pos << VEC_POS_OFFSET) | (self.data as usize & NOT_VEC_POS_MASK)); } + /// Returns the remaining spare capacity of the buffer as a slice of `MaybeUninit`. + /// + /// The returned slice can be used to fill the buffer with data (e.g. by + /// reading from a file) before marking the data as initialized using the + /// [`set_len`] method. + /// + /// [`set_len`]: BytesMut::set_len + /// + /// # Examples + /// + /// ``` + /// use bytes::BytesMut; + /// + /// // Allocate buffer big enough for 10 bytes. + /// let mut buf = BytesMut::with_capacity(10); + /// + /// // Fill in the first 3 elements. + /// let uninit = buf.spare_capacity_mut(); + /// uninit[0].write(0); + /// uninit[1].write(1); + /// uninit[2].write(2); + /// + /// // Mark the first 3 bytes of the buffer as being initialized. + /// unsafe { + /// buf.set_len(3); + /// } + /// + /// assert_eq!(&buf[..], &[0, 1, 2]); + /// ``` #[inline] - fn uninit_slice(&mut self) -> &mut UninitSlice { + pub fn spare_capacity_mut(&mut self) -> &mut [MaybeUninit] { unsafe { - let ptr = self.ptr.as_ptr().offset(self.len as isize); + let ptr = self.ptr.as_ptr().add(self.len); let len = self.cap - self.len; - UninitSlice::from_raw_parts_mut(ptr, len) + slice::from_raw_parts_mut(ptr.cast(), len) } } } @@ -926,13 +1041,13 @@ impl Drop for BytesMut { if kind == KIND_VEC { unsafe { - let (off, _) = self.get_vec_pos(); + let off = self.get_vec_pos(); // Vector storage, free the vector let _ = rebuild_vec(self.ptr.as_ptr(), self.len, self.cap, off); } } else if kind == KIND_ARC { - unsafe { release_shared(self.data as _) }; + unsafe { release_shared(self.data) }; } } } @@ -950,6 +1065,13 @@ impl Buf for BytesMut { #[inline] fn advance(&mut self, cnt: usize) { + // Advancing by the length is the same as resetting the length to 0, + // except this way we get to reuse the full capacity. + if cnt == self.remaining() { + self.clear(); + return; + } + assert!( cnt <= self.remaining(), "cannot advance past `remaining`: {:?} <= {:?}", @@ -957,11 +1079,13 @@ impl Buf for BytesMut { self.remaining(), ); unsafe { - self.set_start(cnt); + // SAFETY: We've checked that `cnt` <= `self.remaining()` and we know that + // `self.remaining()` <= `self.cap`. + self.advance_unchecked(cnt); } } - fn copy_to_bytes(&mut self, len: usize) -> crate::Bytes { + fn copy_to_bytes(&mut self, len: usize) -> Bytes { self.split_to(len).freeze() } } @@ -974,14 +1098,12 @@ unsafe impl BufMut for BytesMut { #[inline] unsafe fn advance_mut(&mut self, cnt: usize) { - let new_len = self.len() + cnt; - assert!( - new_len <= self.cap, - "new_len = {}; capacity = {}", - new_len, - self.cap - ); - self.len = new_len; + let remaining = self.cap - self.len(); + if cnt > remaining { + super::panic_advance(cnt, remaining); + } + // Addition won't overflow since it is at most `self.cap`. 
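+        // (The check above guarantees `cnt <= cap - len`, so
+        // `len + cnt <= cap`, and an allocation's capacity never
+        // exceeds `isize::MAX`.)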
+ self.len = self.len() + cnt; } #[inline] @@ -989,13 +1111,13 @@ unsafe impl BufMut for BytesMut { if self.capacity() == self.len() { self.reserve(64); } - self.uninit_slice() + self.spare_capacity_mut().into() } // Specialize these methods so they can skip checking `remaining_mut` // and `advance_mut`. - fn put(&mut self, mut src: T) + fn put(&mut self, mut src: T) where Self: Sized, { @@ -1010,6 +1132,19 @@ unsafe impl BufMut for BytesMut { fn put_slice(&mut self, src: &[u8]) { self.extend_from_slice(src); } + + fn put_bytes(&mut self, val: u8, cnt: usize) { + self.reserve(cnt); + unsafe { + let dst = self.spare_capacity_mut(); + // Reserved above + debug_assert!(dst.len() >= cnt); + + ptr::write_bytes(dst.as_mut_ptr(), val, cnt); + + self.advance_mut(cnt); + } + } } impl AsRef<[u8]> for BytesMut { @@ -1146,7 +1281,7 @@ impl<'a> IntoIterator for &'a BytesMut { type IntoIter = core::slice::Iter<'a, u8>; fn into_iter(self) -> Self::IntoIter { - self.as_ref().into_iter() + self.as_ref().iter() } } @@ -1162,9 +1297,7 @@ impl Extend for BytesMut { // TODO: optimize // 1. If self.kind() == KIND_VEC, use Vec::extend - // 2. Make `reserve` inline-able for b in iter { - self.reserve(1); self.put_u8(b); } } @@ -1175,7 +1308,18 @@ impl<'a> Extend<&'a u8> for BytesMut { where T: IntoIterator, { - self.extend(iter.into_iter().map(|b| *b)) + self.extend(iter.into_iter().copied()) + } +} + +impl Extend for BytesMut { + fn extend(&mut self, iter: T) + where + T: IntoIterator, + { + for bytes in iter { + self.extend_from_slice(&bytes) + } } } @@ -1187,7 +1331,7 @@ impl FromIterator for BytesMut { impl<'a> FromIterator<&'a u8> for BytesMut { fn from_iter>(into_iter: T) -> Self { - BytesMut::from_iter(into_iter.into_iter().map(|b| *b)) + BytesMut::from_iter(into_iter.into_iter().copied()) } } @@ -1228,10 +1372,13 @@ unsafe fn release_shared(ptr: *mut Shared) { // > "acquire" operation before deleting the object. // // [1]: (www.boost.org/doc/libs/1_55_0/doc/html/atomic/usage_examples.html) - atomic::fence(Ordering::Acquire); + // + // Thread sanitizer does not support atomic fences. Use an atomic load + // instead. 
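+    // This `Acquire` load synchronizes with the final `Release` decrement
+    // of `ref_count`, giving the same ordering guarantee the fence provided.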
+ (*ptr).ref_count.load(Ordering::Acquire); // Drop the data - Box::from_raw(ptr); + drop(Box::from_raw(ptr)); } impl Shared { @@ -1267,56 +1414,59 @@ fn original_capacity_from_repr(repr: usize) -> usize { 1 << (repr + (MIN_ORIGINAL_CAPACITY_WIDTH - 1)) } -/* -#[test] -fn test_original_capacity_to_repr() { - assert_eq!(original_capacity_to_repr(0), 0); +#[cfg(test)] +mod tests { + use super::*; - let max_width = 32; + #[test] + fn test_original_capacity_to_repr() { + assert_eq!(original_capacity_to_repr(0), 0); - for width in 1..(max_width + 1) { - let cap = 1 << width - 1; + let max_width = 32; - let expected = if width < MIN_ORIGINAL_CAPACITY_WIDTH { - 0 - } else if width < MAX_ORIGINAL_CAPACITY_WIDTH { - width - MIN_ORIGINAL_CAPACITY_WIDTH - } else { - MAX_ORIGINAL_CAPACITY_WIDTH - MIN_ORIGINAL_CAPACITY_WIDTH - }; + for width in 1..(max_width + 1) { + let cap = 1 << width - 1; - assert_eq!(original_capacity_to_repr(cap), expected); + let expected = if width < MIN_ORIGINAL_CAPACITY_WIDTH { + 0 + } else if width < MAX_ORIGINAL_CAPACITY_WIDTH { + width - MIN_ORIGINAL_CAPACITY_WIDTH + } else { + MAX_ORIGINAL_CAPACITY_WIDTH - MIN_ORIGINAL_CAPACITY_WIDTH + }; - if width > 1 { - assert_eq!(original_capacity_to_repr(cap + 1), expected); - } + assert_eq!(original_capacity_to_repr(cap), expected); + + if width > 1 { + assert_eq!(original_capacity_to_repr(cap + 1), expected); + } - // MIN_ORIGINAL_CAPACITY_WIDTH must be bigger than 7 to pass tests below - if width == MIN_ORIGINAL_CAPACITY_WIDTH + 1 { - assert_eq!(original_capacity_to_repr(cap - 24), expected - 1); - assert_eq!(original_capacity_to_repr(cap + 76), expected); - } else if width == MIN_ORIGINAL_CAPACITY_WIDTH + 2 { - assert_eq!(original_capacity_to_repr(cap - 1), expected - 1); - assert_eq!(original_capacity_to_repr(cap - 48), expected - 1); + // MIN_ORIGINAL_CAPACITY_WIDTH must be bigger than 7 to pass tests below + if width == MIN_ORIGINAL_CAPACITY_WIDTH + 1 { + assert_eq!(original_capacity_to_repr(cap - 24), expected - 1); + assert_eq!(original_capacity_to_repr(cap + 76), expected); + } else if width == MIN_ORIGINAL_CAPACITY_WIDTH + 2 { + assert_eq!(original_capacity_to_repr(cap - 1), expected - 1); + assert_eq!(original_capacity_to_repr(cap - 48), expected - 1); + } } } -} -#[test] -fn test_original_capacity_from_repr() { - assert_eq!(0, original_capacity_from_repr(0)); + #[test] + fn test_original_capacity_from_repr() { + assert_eq!(0, original_capacity_from_repr(0)); - let min_cap = 1 << MIN_ORIGINAL_CAPACITY_WIDTH; + let min_cap = 1 << MIN_ORIGINAL_CAPACITY_WIDTH; - assert_eq!(min_cap, original_capacity_from_repr(1)); - assert_eq!(min_cap * 2, original_capacity_from_repr(2)); - assert_eq!(min_cap * 4, original_capacity_from_repr(3)); - assert_eq!(min_cap * 8, original_capacity_from_repr(4)); - assert_eq!(min_cap * 16, original_capacity_from_repr(5)); - assert_eq!(min_cap * 32, original_capacity_from_repr(6)); - assert_eq!(min_cap * 64, original_capacity_from_repr(7)); + assert_eq!(min_cap, original_capacity_from_repr(1)); + assert_eq!(min_cap * 2, original_capacity_from_repr(2)); + assert_eq!(min_cap * 4, original_capacity_from_repr(3)); + assert_eq!(min_cap * 8, original_capacity_from_repr(4)); + assert_eq!(min_cap * 16, original_capacity_from_repr(5)); + assert_eq!(min_cap * 32, original_capacity_from_repr(6)); + assert_eq!(min_cap * 64, original_capacity_from_repr(7)); + } } -*/ unsafe impl Send for BytesMut {} unsafe impl Sync for BytesMut {} @@ -1377,7 +1527,7 @@ impl PartialOrd for str { impl PartialEq> for BytesMut { fn 
eq(&self, other: &Vec) -> bool { - *self == &other[..] + *self == other[..] } } @@ -1401,7 +1551,7 @@ impl PartialOrd for Vec { impl PartialEq for BytesMut { fn eq(&self, other: &String) -> bool { - *self == &other[..] + *self == other[..] } } @@ -1467,13 +1617,48 @@ impl PartialOrd for &str { impl PartialEq for Bytes { fn eq(&self, other: &BytesMut) -> bool { - &other[..] == &self[..] + other[..] == self[..] } } impl PartialEq for BytesMut { fn eq(&self, other: &Bytes) -> bool { - &other[..] == &self[..] + other[..] == self[..] + } +} + +impl From for Vec { + fn from(bytes: BytesMut) -> Self { + let kind = bytes.kind(); + let bytes = ManuallyDrop::new(bytes); + + let mut vec = if kind == KIND_VEC { + unsafe { + let off = bytes.get_vec_pos(); + rebuild_vec(bytes.ptr.as_ptr(), bytes.len, bytes.cap, off) + } + } else { + let shared = bytes.data as *mut Shared; + + if unsafe { (*shared).is_unique() } { + let vec = mem::replace(unsafe { &mut (*shared).vec }, Vec::new()); + + unsafe { release_shared(shared) }; + + vec + } else { + return ManuallyDrop::into_inner(bytes).deref().to_vec(); + } + }; + + let len = bytes.len; + + unsafe { + ptr::copy(bytes.ptr.as_ptr(), vec.as_mut_ptr(), len); + vec.set_len(len); + } + + vec } } @@ -1486,8 +1671,20 @@ fn vptr(ptr: *mut u8) -> NonNull { } } +/// Returns a dangling pointer with the given address. This is used to store +/// integer data in pointer fields. +/// +/// It is equivalent to `addr as *mut T`, but this fails on miri when strict +/// provenance checking is enabled. +#[inline] +fn invalid_ptr(addr: usize) -> *mut T { + let ptr = core::ptr::null_mut::().wrapping_add(addr); + debug_assert_eq!(ptr as usize, addr); + ptr.cast::() +} + unsafe fn rebuild_vec(ptr: *mut u8, mut len: usize, mut cap: usize, off: usize) -> Vec { - let ptr = ptr.offset(-(off as isize)); + let ptr = ptr.sub(off); len += off; cap += off; @@ -1498,6 +1695,9 @@ unsafe fn rebuild_vec(ptr: *mut u8, mut len: usize, mut cap: usize, off: usize) static SHARED_VTABLE: Vtable = Vtable { clone: shared_v_clone, + to_vec: shared_v_to_vec, + to_mut: shared_v_to_mut, + is_unique: crate::bytes::shared_is_unique, drop: shared_v_drop, }; @@ -1505,10 +1705,61 @@ unsafe fn shared_v_clone(data: &AtomicPtr<()>, ptr: *const u8, len: usize) -> By let shared = data.load(Ordering::Relaxed) as *mut Shared; increment_shared(shared); - let data = AtomicPtr::new(shared as _); + let data = AtomicPtr::new(shared as *mut ()); Bytes::with_vtable(ptr, len, data, &SHARED_VTABLE) } +unsafe fn shared_v_to_vec(data: &AtomicPtr<()>, ptr: *const u8, len: usize) -> Vec { + let shared: *mut Shared = data.load(Ordering::Relaxed).cast(); + + if (*shared).is_unique() { + let shared = &mut *shared; + + // Drop shared + let mut vec = mem::replace(&mut shared.vec, Vec::new()); + release_shared(shared); + + // Copy back buffer + ptr::copy(ptr, vec.as_mut_ptr(), len); + vec.set_len(len); + + vec + } else { + let v = slice::from_raw_parts(ptr, len).to_vec(); + release_shared(shared); + v + } +} + +unsafe fn shared_v_to_mut(data: &AtomicPtr<()>, ptr: *const u8, len: usize) -> BytesMut { + let shared: *mut Shared = data.load(Ordering::Relaxed).cast(); + + if (*shared).is_unique() { + let shared = &mut *shared; + + // The capacity is always the original capacity of the buffer + // minus the offset from the start of the buffer + let v = &mut shared.vec; + let v_capacity = v.capacity(); + let v_ptr = v.as_mut_ptr(); + let offset = offset_from(ptr as *mut u8, v_ptr); + let cap = v_capacity - offset; + + let ptr = vptr(ptr as *mut 
u8); + + BytesMut { + ptr, + len, + cap, + data: shared, + } + } else { + let v = slice::from_raw_parts(ptr, len).to_vec(); + release_shared(shared); + BytesMut::from_vec(v) + } +} + unsafe fn shared_v_drop(data: &mut AtomicPtr<()>, _ptr: *const u8, _len: usize) { data.with_mut(|shared| { release_shared(*shared as *mut Shared); diff --git a/src/fmt/debug.rs b/src/fmt/debug.rs index a854551..83de695 100644 --- a/src/fmt/debug.rs +++ b/src/fmt/debug.rs @@ -25,7 +25,7 @@ impl Debug for BytesRef<'_> { } else if b == b'\0' { write!(f, "\\0")?; // ASCII printable - } else if b >= 0x20 && b < 0x7f { + } else if (0x20..0x7f).contains(&b) { write!(f, "{}", b as char)?; } else { write!(f, "\\x{:02x}", b)?; @@ -38,12 +38,12 @@ impl Debug for BytesRef<'_> { impl Debug for Bytes { fn fmt(&self, f: &mut Formatter<'_>) -> Result { - Debug::fmt(&BytesRef(&self.as_ref()), f) + Debug::fmt(&BytesRef(self.as_ref()), f) } } impl Debug for BytesMut { fn fmt(&self, f: &mut Formatter<'_>) -> Result { - Debug::fmt(&BytesRef(&self.as_ref()), f) + Debug::fmt(&BytesRef(self.as_ref()), f) } } diff --git a/src/lib.rs b/src/lib.rs index 706735e..1296a13 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,19 +1,19 @@ +#![allow(unknown_lints, unexpected_cfgs)] #![warn(missing_docs, missing_debug_implementations, rust_2018_idioms)] #![doc(test( no_crate_inject, attr(deny(warnings, rust_2018_idioms), allow(dead_code, unused_variables)) ))] #![no_std] +#![cfg_attr(bytes_unstable, feature(cfg_target_has_atomic))] +#![cfg_attr(docsrs, feature(doc_cfg))] //! Provides abstractions for working with bytes. //! //! The `bytes` crate provides an efficient byte buffer structure -//! ([`Bytes`](struct.Bytes.html)) and traits for working with buffer +//! ([`Bytes`]) and traits for working with buffer //! implementations ([`Buf`], [`BufMut`]). //! -//! [`Buf`]: trait.Buf.html -//! [`BufMut`]: trait.BufMut.html -//! //! # `Bytes` //! //! `Bytes` is an efficient container for storing and operating on contiguous @@ -51,9 +51,7 @@ //! `a` and `b` will share the underlying buffer and maintain indices tracking //! the view into the buffer represented by the handle. //! -//! See the [struct docs] for more details. -//! -//! [struct docs]: struct.Bytes.html +//! See the [struct docs](`Bytes`) for more details. //! //! # `Buf`, `BufMut` //! @@ -69,7 +67,7 @@ //! ## Relation with `Read` and `Write` //! //! At first glance, it may seem that `Buf` and `BufMut` overlap in -//! functionality with `std::io::Read` and `std::io::Write`. However, they +//! functionality with [`std::io::Read`] and [`std::io::Write`]. However, they //! serve different purposes. A buffer is the value that is provided as an //! argument to `Read::read` and `Write::write`. `Read` and `Write` may then //! perform a syscall, which has the potential of failing. Operations on `Buf` @@ -91,6 +89,7 @@ pub use crate::bytes::Bytes; pub use crate::bytes_mut::BytesMut; // Optional Serde support +#[cfg(not(bytes_no_atomic_cas))] #[cfg(feature = "serde")] mod serde; @@ -114,3 +113,55 @@ fn abort() -> ! { panic!("abort"); } } + +#[inline(always)] +#[cfg(feature = "std")] +fn saturating_sub_usize_u64(a: usize, b: u64) -> usize { + use core::convert::TryFrom; + match usize::try_from(b) { + Ok(b) => a.saturating_sub(b), + Err(_) => 0, + } +} + +#[inline(always)] +#[cfg(feature = "std")] +fn min_u64_usize(a: u64, b: usize) -> usize { + use core::convert::TryFrom; + match usize::try_from(a) { + Ok(a) => usize::min(a, b), + Err(_) => b, + } +} + +/// Panic with a nice error message. 
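+/// Kept out of line and marked `#[cold]` so the bounds checks that call it
+/// stay small and cheap on the non-panicking path.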
+#[cold] +fn panic_advance(idx: usize, len: usize) -> ! { + panic!( + "advance out of bounds: the len is {} but advancing by {}", + len, idx + ); +} + +#[cold] +fn panic_does_not_fit(size: usize, nbytes: usize) -> ! { + panic!( + "size too large: the integer type can fit {} bytes, but nbytes is {}", + size, nbytes + ); +} + +/// Precondition: dst >= original +/// +/// The following line is equivalent to: +/// +/// ```rust,ignore +/// self.ptr.as_ptr().offset_from(ptr) as usize; +/// ``` +/// +/// But due to min rust is 1.39 and it is only stabilized +/// in 1.47, we cannot use it. +#[inline] +fn offset_from(dst: *const u8, original: *const u8) -> usize { + dst as usize - original as usize +} diff --git a/src/loom.rs b/src/loom.rs index 1cae881..6fd2e76 100644 --- a/src/loom.rs +++ b/src/loom.rs @@ -1,7 +1,15 @@ +#![cfg_attr(not(feature = "std"), allow(unused_imports))] + #[cfg(not(all(test, loom)))] pub(crate) mod sync { pub(crate) mod atomic { - pub(crate) use core::sync::atomic::{fence, AtomicPtr, AtomicUsize, Ordering}; + #[cfg(not(bytes_no_atomic_cas))] + pub(crate) use core::sync::atomic::{AtomicPtr, AtomicUsize, Ordering}; + + #[cfg(bytes_no_atomic_cas)] + pub(crate) use core::sync::atomic::{fence, Ordering}; + #[cfg(bytes_no_atomic_cas)] + pub(crate) use portable_atomic::{AtomicUsize, AtomicPtr}; pub(crate) trait AtomicMut { fn with_mut(&mut self, f: F) -> R @@ -23,7 +31,7 @@ pub(crate) mod sync { #[cfg(all(test, loom))] pub(crate) mod sync { pub(crate) mod atomic { - pub(crate) use loom::sync::atomic::{fence, AtomicPtr, AtomicUsize, Ordering}; + pub(crate) use loom::sync::atomic::{AtomicPtr, AtomicUsize, Ordering}; pub(crate) trait AtomicMut {} } diff --git a/tests/test_buf.rs b/tests/test_buf.rs index fbad003..3940f92 100644 --- a/tests/test_buf.rs +++ b/tests/test_buf.rs @@ -72,6 +72,7 @@ fn test_vec_deque() { assert_eq!(b"world piece", &out[..]); } +#[allow(unused_allocation)] // This is intentional. 
#[test] fn test_deref_buf_forwards() { struct Special; diff --git a/tests/test_buf_mut.rs b/tests/test_buf_mut.rs index f631982..0abeb9f 100644 --- a/tests/test_buf_mut.rs +++ b/tests/test_buf_mut.rs @@ -3,6 +3,7 @@ use bytes::buf::UninitSlice; use bytes::{BufMut, BytesMut}; use core::fmt::Write; +use core::mem::MaybeUninit; use core::usize; #[test] @@ -27,6 +28,14 @@ fn test_vec_as_mut_buf() { assert_eq!(buf.len(), 68); } +#[test] +fn test_vec_put_bytes() { + let mut buf = Vec::new(); + buf.push(17); + buf.put_bytes(19, 2); + assert_eq!([17, 19, 19], &buf[..]); +} + #[test] fn test_put_u8() { let mut buf = Vec::with_capacity(8); @@ -46,7 +55,35 @@ fn test_put_u16() { } #[test] -#[should_panic(expected = "cannot advance")] +fn test_put_int() { + let mut buf = Vec::with_capacity(8); + buf.put_int(0x1020304050607080, 3); + assert_eq!(b"\x60\x70\x80", &buf[..]); +} + +#[test] +#[should_panic] +fn test_put_int_nbytes_overflow() { + let mut buf = Vec::with_capacity(8); + buf.put_int(0x1020304050607080, 9); +} + +#[test] +fn test_put_int_le() { + let mut buf = Vec::with_capacity(8); + buf.put_int_le(0x1020304050607080, 3); + assert_eq!(b"\x80\x70\x60", &buf[..]); +} + +#[test] +#[should_panic] +fn test_put_int_le_nbytes_overflow() { + let mut buf = Vec::with_capacity(8); + buf.put_int_le(0x1020304050607080, 9); +} + +#[test] +#[should_panic(expected = "advance out of bounds: the len is 8 but advancing by 12")] fn test_vec_advance_mut() { // Verify fix for #354 let mut buf = Vec::with_capacity(8); @@ -65,16 +102,123 @@ fn test_clone() { assert!(buf != buf2); } +fn do_test_slice_small(make: impl Fn(&mut [u8]) -> &mut T) +where + for<'r> &'r mut T: BufMut, +{ + let mut buf = [b'X'; 8]; + + let mut slice = make(&mut buf[..]); + slice.put_bytes(b'A', 2); + slice.put_u8(b'B'); + slice.put_slice(b"BCC"); + assert_eq!(2, slice.remaining_mut()); + assert_eq!(b"AABBCCXX", &buf[..]); + + let mut slice = make(&mut buf[..]); + slice.put_u32(0x61626364); + assert_eq!(4, slice.remaining_mut()); + assert_eq!(b"abcdCCXX", &buf[..]); + + let mut slice = make(&mut buf[..]); + slice.put_u32_le(0x30313233); + assert_eq!(4, slice.remaining_mut()); + assert_eq!(b"3210CCXX", &buf[..]); +} + +fn do_test_slice_large(make: impl Fn(&mut [u8]) -> &mut T) +where + for<'r> &'r mut T: BufMut, +{ + const LEN: usize = 100; + const FILL: [u8; LEN] = [b'Y'; LEN]; + + let test = |fill: &dyn Fn(&mut &mut T, usize)| { + for buf_len in 0..LEN { + let mut buf = [b'X'; LEN]; + for fill_len in 0..=buf_len { + let mut slice = make(&mut buf[..buf_len]); + fill(&mut slice, fill_len); + assert_eq!(buf_len - fill_len, slice.remaining_mut()); + let (head, tail) = buf.split_at(fill_len); + assert_eq!(&FILL[..fill_len], head); + assert!(tail.iter().all(|b| *b == b'X')); + } + } + }; + + test(&|slice, fill_len| slice.put_slice(&FILL[..fill_len])); + test(&|slice, fill_len| slice.put_bytes(FILL[0], fill_len)); +} + +fn do_test_slice_put_slice_panics(make: impl Fn(&mut [u8]) -> &mut T) +where + for<'r> &'r mut T: BufMut, +{ + let mut buf = [b'X'; 4]; + let mut slice = make(&mut buf[..]); + slice.put_slice(b"12345"); +} + +fn do_test_slice_put_bytes_panics(make: impl Fn(&mut [u8]) -> &mut T) +where + for<'r> &'r mut T: BufMut, +{ + let mut buf = [b'X'; 4]; + let mut slice = make(&mut buf[..]); + slice.put_bytes(b'1', 5); +} + +#[test] +fn test_slice_buf_mut_small() { + do_test_slice_small(|x| x); +} + +#[test] +fn test_slice_buf_mut_large() { + do_test_slice_large(|x| x); +} + +#[test] +#[should_panic] +fn test_slice_buf_mut_put_slice_overflow() { + 
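+    // `do_test_slice_put_slice_panics` writes five bytes into a four-byte
+    // slice, so this test must panic.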
do_test_slice_put_slice_panics(|x| x); +} + +#[test] +#[should_panic] +fn test_slice_buf_mut_put_bytes_overflow() { + do_test_slice_put_bytes_panics(|x| x); +} + +fn make_maybe_uninit_slice(slice: &mut [u8]) -> &mut [MaybeUninit] { + // SAFETY: [u8] has the same layout as [MaybeUninit]. + unsafe { core::mem::transmute(slice) } +} + #[test] -fn test_mut_slice() { - let mut v = vec![0, 0, 0, 0]; - let mut s = &mut v[..]; - s.put_u32(42); +fn test_maybe_uninit_buf_mut_small() { + do_test_slice_small(make_maybe_uninit_slice); +} - assert_eq!(s.len(), 0); - assert_eq!(&v, &[0, 0, 0, 42]); +#[test] +fn test_maybe_uninit_buf_mut_large() { + do_test_slice_large(make_maybe_uninit_slice); +} + +#[test] +#[should_panic] +fn test_maybe_uninit_buf_mut_put_slice_overflow() { + do_test_slice_put_slice_panics(make_maybe_uninit_slice); +} + +#[test] +#[should_panic] +fn test_maybe_uninit_buf_mut_put_bytes_overflow() { + do_test_slice_put_bytes_panics(make_maybe_uninit_slice); } +#[allow(unused_allocation)] // This is intentional. #[test] fn test_deref_bufmut_forwards() { struct Special; diff --git a/tests/test_bytes.rs b/tests/test_bytes.rs index 2d22fdd..3ac4298 100644 --- a/tests/test_bytes.rs +++ b/tests/test_bytes.rs @@ -4,8 +4,8 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; use std::usize; -const LONG: &'static [u8] = b"mary had a little lamb, little lamb, little lamb"; -const SHORT: &'static [u8] = b"hello world"; +const LONG: &[u8] = b"mary had a little lamb, little lamb, little lamb"; +const SHORT: &[u8] = b"hello world"; fn is_sync() {} fn is_send() {} @@ -411,8 +411,8 @@ fn freeze_after_split_off() { fn fns_defined_for_bytes_mut() { let mut bytes = BytesMut::from(&b"hello world"[..]); - bytes.as_ptr(); - bytes.as_mut_ptr(); + let _ = bytes.as_ptr(); + let _ = bytes.as_mut_ptr(); // Iterator let v: Vec = bytes.as_ref().iter().cloned().collect(); @@ -443,7 +443,7 @@ fn reserve_growth() { let _ = bytes.split(); bytes.reserve(65); - assert_eq!(bytes.capacity(), 128); + assert_eq!(bytes.capacity(), 117); } #[test] @@ -515,6 +515,34 @@ fn reserve_in_arc_unique_doubles() { assert_eq!(2000, bytes.capacity()); } +#[test] +fn reserve_in_arc_unique_does_not_overallocate_after_split() { + let mut bytes = BytesMut::from(LONG); + let orig_capacity = bytes.capacity(); + drop(bytes.split_off(LONG.len() / 2)); + + // now bytes is Arc and refcount == 1 + + let new_capacity = bytes.capacity(); + bytes.reserve(orig_capacity - new_capacity); + assert_eq!(bytes.capacity(), orig_capacity); +} + +#[test] +fn reserve_in_arc_unique_does_not_overallocate_after_multiple_splits() { + let mut bytes = BytesMut::from(LONG); + let orig_capacity = bytes.capacity(); + for _ in 0..10 { + drop(bytes.split_off(LONG.len() / 2)); + + // now bytes is Arc and refcount == 1 + + let new_capacity = bytes.capacity(); + bytes.reserve(orig_capacity - new_capacity); + } + assert_eq!(bytes.capacity(), orig_capacity); +} + #[test] fn reserve_in_arc_nonunique_does_not_overallocate() { let mut bytes = BytesMut::with_capacity(1000); @@ -527,6 +555,25 @@ fn reserve_in_arc_nonunique_does_not_overallocate() { assert_eq!(2001, bytes.capacity()); } +/// This function tests `BytesMut::reserve_inner`, where `BytesMut` holds +/// a unique reference to the shared vector and decide to reuse it +/// by reallocating the `Vec`. 
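+/// It uses `split_off` so that `v.capacity() - self.cap != off`, which
+/// exercises the offset bookkeeping of the reuse path.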
+#[test] +fn reserve_shared_reuse() { + let mut bytes = BytesMut::with_capacity(1000); + bytes.put_slice(b"Hello, World!"); + drop(bytes.split()); + + bytes.put_slice(b"!123ex123,sadchELLO,_wORLD!"); + // Use split_off so that v.capacity() - self.cap != off + drop(bytes.split_off(9)); + assert_eq!(&*bytes, b"!123ex123"); + + bytes.reserve(2000); + assert_eq!(&*bytes, b"!123ex123"); + assert_eq!(bytes.capacity(), 2009); +} + #[test] fn extend_mut() { let mut bytes = BytesMut::with_capacity(0); @@ -544,6 +591,35 @@ fn extend_from_slice_mut() { } } +#[test] +fn extend_mut_from_bytes() { + let mut bytes = BytesMut::with_capacity(0); + bytes.extend([Bytes::from(LONG)]); + assert_eq!(*bytes, LONG[..]); +} + +#[test] +fn extend_past_lower_limit_of_size_hint() { + // See https://github.com/tokio-rs/bytes/pull/674#pullrequestreview-1913035700 + struct Iter(I); + + impl> Iterator for Iter { + type Item = u8; + + fn next(&mut self) -> Option { + self.0.next() + } + + fn size_hint(&self) -> (usize, Option) { + (5, None) + } + } + + let mut bytes = BytesMut::with_capacity(5); + bytes.extend(Iter(std::iter::repeat(0).take(10))); + assert_eq!(bytes.len(), 10); +} + #[test] fn extend_mut_without_size_hint() { let mut bytes = BytesMut::with_capacity(0); @@ -656,97 +732,6 @@ fn partial_eq_bytesmut() { assert!(bytesmut != bytes2); } -/* -#[test] -fn bytes_unsplit_basic() { - let buf = Bytes::from(&b"aaabbbcccddd"[..]); - - let splitted = buf.split_off(6); - assert_eq!(b"aaabbb", &buf[..]); - assert_eq!(b"cccddd", &splitted[..]); - - buf.unsplit(splitted); - assert_eq!(b"aaabbbcccddd", &buf[..]); -} - -#[test] -fn bytes_unsplit_empty_other() { - let buf = Bytes::from(&b"aaabbbcccddd"[..]); - - // empty other - let other = Bytes::new(); - - buf.unsplit(other); - assert_eq!(b"aaabbbcccddd", &buf[..]); -} - -#[test] -fn bytes_unsplit_empty_self() { - // empty self - let mut buf = Bytes::new(); - - let mut other = Bytes::with_capacity(64); - other.extend_from_slice(b"aaabbbcccddd"); - - buf.unsplit(other); - assert_eq!(b"aaabbbcccddd", &buf[..]); -} - -#[test] -fn bytes_unsplit_arc_different() { - let mut buf = Bytes::with_capacity(64); - buf.extend_from_slice(b"aaaabbbbeeee"); - - buf.split_off(8); //arc - - let mut buf2 = Bytes::with_capacity(64); - buf2.extend_from_slice(b"ccccddddeeee"); - - buf2.split_off(8); //arc - - buf.unsplit(buf2); - assert_eq!(b"aaaabbbbccccdddd", &buf[..]); -} - -#[test] -fn bytes_unsplit_arc_non_contiguous() { - let mut buf = Bytes::with_capacity(64); - buf.extend_from_slice(b"aaaabbbbeeeeccccdddd"); - - let mut buf2 = buf.split_off(8); //arc - - let buf3 = buf2.split_off(4); //arc - - buf.unsplit(buf3); - assert_eq!(b"aaaabbbbccccdddd", &buf[..]); -} - -#[test] -fn bytes_unsplit_two_split_offs() { - let mut buf = Bytes::with_capacity(64); - buf.extend_from_slice(b"aaaabbbbccccdddd"); - - let mut buf2 = buf.split_off(8); //arc - let buf3 = buf2.split_off(4); //arc - - buf2.unsplit(buf3); - buf.unsplit(buf2); - assert_eq!(b"aaaabbbbccccdddd", &buf[..]); -} - -#[test] -fn bytes_unsplit_overlapping_references() { - let mut buf = Bytes::with_capacity(64); - buf.extend_from_slice(b"abcdefghijklmnopqrstuvwxyz"); - let mut buf0010 = buf.slice(0..10); - let buf1020 = buf.slice(10..20); - let buf0515 = buf.slice(5..15); - buf0010.unsplit(buf1020); - assert_eq!(b"abcdefghijklmnopqrst", &buf0010[..]); - assert_eq!(b"fghijklmno", &buf0515[..]); -} -*/ - #[test] fn bytes_mut_unsplit_basic() { let mut buf = BytesMut::with_capacity(64); @@ -874,7 +859,7 @@ fn from_iter_no_size_hint() { fn 
test_slice_ref(bytes: &Bytes, start: usize, end: usize, expected: &[u8]) { let slice = &(bytes.as_ref()[start..end]); - let sub = bytes.slice_ref(&slice); + let sub = bytes.slice_ref(slice); assert_eq!(&sub[..], expected); } @@ -894,7 +879,7 @@ fn slice_ref_empty() { let bytes = Bytes::from(&b""[..]); let slice = &(bytes.as_ref()[0..0]); - let sub = bytes.slice_ref(&slice); + let sub = bytes.slice_ref(slice); assert_eq!(&sub[..], b""); } @@ -986,3 +971,315 @@ fn bytes_with_capacity_but_empty() { let vec = Vec::with_capacity(1); let _ = Bytes::from(vec); } + +#[test] +fn bytes_put_bytes() { + let mut bytes = BytesMut::new(); + bytes.put_u8(17); + bytes.put_bytes(19, 2); + assert_eq!([17, 19, 19], bytes.as_ref()); +} + +#[test] +fn box_slice_empty() { + // See https://github.com/tokio-rs/bytes/issues/340 + let empty: Box<[u8]> = Default::default(); + let b = Bytes::from(empty); + assert!(b.is_empty()); +} + +#[test] +fn bytes_into_vec() { + // Test kind == KIND_VEC + let content = b"helloworld"; + + let mut bytes = BytesMut::new(); + bytes.put_slice(content); + + let vec: Vec = bytes.into(); + assert_eq!(&vec, content); + + // Test kind == KIND_ARC, shared.is_unique() == True + let mut bytes = BytesMut::new(); + bytes.put_slice(b"abcdewe23"); + bytes.put_slice(content); + + // Overwrite the bytes to make sure only one reference to the underlying + // Vec exists. + bytes = bytes.split_off(9); + + let vec: Vec = bytes.into(); + assert_eq!(&vec, content); + + // Test kind == KIND_ARC, shared.is_unique() == False + let prefix = b"abcdewe23"; + + let mut bytes = BytesMut::new(); + bytes.put_slice(prefix); + bytes.put_slice(content); + + let vec: Vec = bytes.split_off(prefix.len()).into(); + assert_eq!(&vec, content); + + let vec: Vec = bytes.into(); + assert_eq!(&vec, prefix); +} + +#[test] +fn test_bytes_into_vec() { + // Test STATIC_VTABLE.to_vec + let bs = b"1b23exfcz3r"; + let vec: Vec = Bytes::from_static(bs).into(); + assert_eq!(&*vec, bs); + + // Test bytes_mut.SHARED_VTABLE.to_vec impl + eprintln!("1"); + let mut bytes_mut: BytesMut = bs[..].into(); + + // Set kind to KIND_ARC so that after freeze, Bytes will use bytes_mut.SHARED_VTABLE + eprintln!("2"); + drop(bytes_mut.split_off(bs.len())); + + eprintln!("3"); + let b1 = bytes_mut.freeze(); + eprintln!("4"); + let b2 = b1.clone(); + + eprintln!("{:#?}", (&*b1).as_ptr()); + + // shared.is_unique() = False + eprintln!("5"); + assert_eq!(&*Vec::from(b2), bs); + + // shared.is_unique() = True + eprintln!("6"); + assert_eq!(&*Vec::from(b1), bs); + + // Test bytes_mut.SHARED_VTABLE.to_vec impl where offset != 0 + let mut bytes_mut1: BytesMut = bs[..].into(); + let bytes_mut2 = bytes_mut1.split_off(9); + + let b1 = bytes_mut1.freeze(); + let b2 = bytes_mut2.freeze(); + + assert_eq!(Vec::from(b2), bs[9..]); + assert_eq!(Vec::from(b1), bs[..9]); +} + +#[test] +fn test_bytes_into_vec_promotable_even() { + let vec = vec![33u8; 1024]; + + // Test cases where kind == KIND_VEC + let b1 = Bytes::from(vec.clone()); + assert_eq!(Vec::from(b1), vec); + + // Test cases where kind == KIND_ARC, ref_cnt == 1 + let b1 = Bytes::from(vec.clone()); + drop(b1.clone()); + assert_eq!(Vec::from(b1), vec); + + // Test cases where kind == KIND_ARC, ref_cnt == 2 + let b1 = Bytes::from(vec.clone()); + let b2 = b1.clone(); + assert_eq!(Vec::from(b1), vec); + + // Test cases where vtable = SHARED_VTABLE, kind == KIND_ARC, ref_cnt == 1 + assert_eq!(Vec::from(b2), vec); + + // Test cases where offset != 0 + let mut b1 = Bytes::from(vec.clone()); + let b2 = b1.split_off(20); 
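+    // `b1` keeps bytes `[..20]`; `b2` views the same allocation at offset 20.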
+ + assert_eq!(Vec::from(b2), vec[20..]); + assert_eq!(Vec::from(b1), vec[..20]); +} + +#[test] +fn test_bytes_vec_conversion() { + let mut vec = Vec::with_capacity(10); + vec.extend(b"abcdefg"); + let b = Bytes::from(vec); + let v = Vec::from(b); + assert_eq!(v.len(), 7); + assert_eq!(v.capacity(), 10); + + let mut b = Bytes::from(v); + b.advance(1); + let v = Vec::from(b); + assert_eq!(v.len(), 6); + assert_eq!(v.capacity(), 10); + assert_eq!(v.as_slice(), b"bcdefg"); +} + +#[test] +fn test_bytes_mut_conversion() { + let mut b1 = BytesMut::with_capacity(10); + b1.extend(b"abcdefg"); + let b2 = Bytes::from(b1); + let v = Vec::from(b2); + assert_eq!(v.len(), 7); + assert_eq!(v.capacity(), 10); + + let mut b = Bytes::from(v); + b.advance(1); + let v = Vec::from(b); + assert_eq!(v.len(), 6); + assert_eq!(v.capacity(), 10); + assert_eq!(v.as_slice(), b"bcdefg"); +} + +#[test] +fn test_bytes_capacity_len() { + for cap in 0..100 { + for len in 0..=cap { + let mut v = Vec::with_capacity(cap); + v.resize(len, 0); + let _ = Bytes::from(v); + } + } +} + +#[test] +fn static_is_unique() { + let b = Bytes::from_static(LONG); + assert!(!b.is_unique()); +} + +#[test] +fn vec_is_unique() { + let v: Vec = LONG.to_vec(); + let b = Bytes::from(v); + assert!(b.is_unique()); +} + +#[test] +fn arc_is_unique() { + let v: Vec = LONG.to_vec(); + let b = Bytes::from(v); + let c = b.clone(); + assert!(!b.is_unique()); + drop(c); + assert!(b.is_unique()); +} + +#[test] +fn shared_is_unique() { + let v: Vec = LONG.to_vec(); + let b = Bytes::from(v); + let c = b.clone(); + assert!(!c.is_unique()); + drop(b); + assert!(c.is_unique()); +} + +#[test] +fn test_bytesmut_from_bytes_static() { + let bs = b"1b23exfcz3r"; + + // Test STATIC_VTABLE.to_mut + let bytes_mut = BytesMut::from(Bytes::from_static(bs)); + assert_eq!(bytes_mut, bs[..]); +} + +#[test] +fn test_bytesmut_from_bytes_bytes_mut_vec() { + let bs = b"1b23exfcz3r"; + let bs_long = b"1b23exfcz3r1b23exfcz3r"; + + // Test case where kind == KIND_VEC + let mut bytes_mut: BytesMut = bs[..].into(); + bytes_mut = BytesMut::from(bytes_mut.freeze()); + assert_eq!(bytes_mut, bs[..]); + bytes_mut.extend_from_slice(&bs[..]); + assert_eq!(bytes_mut, bs_long[..]); +} + +#[test] +fn test_bytesmut_from_bytes_bytes_mut_shared() { + let bs = b"1b23exfcz3r"; + + // Set kind to KIND_ARC so that after freeze, Bytes will use bytes_mut.SHARED_VTABLE + let mut bytes_mut: BytesMut = bs[..].into(); + drop(bytes_mut.split_off(bs.len())); + + let b1 = bytes_mut.freeze(); + let b2 = b1.clone(); + + // shared.is_unique() = False + let mut b1m = BytesMut::from(b1); + assert_eq!(b1m, bs[..]); + b1m[0] = b'9'; + + // shared.is_unique() = True + let b2m = BytesMut::from(b2); + assert_eq!(b2m, bs[..]); +} + +#[test] +fn test_bytesmut_from_bytes_bytes_mut_offset() { + let bs = b"1b23exfcz3r"; + + // Test bytes_mut.SHARED_VTABLE.to_mut impl where offset != 0 + let mut bytes_mut1: BytesMut = bs[..].into(); + let bytes_mut2 = bytes_mut1.split_off(9); + + let b1 = bytes_mut1.freeze(); + let b2 = bytes_mut2.freeze(); + + let b1m = BytesMut::from(b1); + let b2m = BytesMut::from(b2); + + assert_eq!(b2m, bs[9..]); + assert_eq!(b1m, bs[..9]); +} + +#[test] +fn test_bytesmut_from_bytes_promotable_even_vec() { + let vec = vec![33u8; 1024]; + + // Test case where kind == KIND_VEC + let b1 = Bytes::from(vec.clone()); + let b1m = BytesMut::from(b1); + assert_eq!(b1m, vec); +} + +#[test] +fn test_bytesmut_from_bytes_promotable_even_arc_1() { + let vec = vec![33u8; 1024]; + + // Test case where kind == KIND_ARC, 
ref_cnt == 1 + let b1 = Bytes::from(vec.clone()); + drop(b1.clone()); + let b1m = BytesMut::from(b1); + assert_eq!(b1m, vec); +} + +#[test] +fn test_bytesmut_from_bytes_promotable_even_arc_2() { + let vec = vec![33u8; 1024]; + + // Test case where kind == KIND_ARC, ref_cnt == 2 + let b1 = Bytes::from(vec.clone()); + let b2 = b1.clone(); + let b1m = BytesMut::from(b1); + assert_eq!(b1m, vec); + + // Test case where vtable = SHARED_VTABLE, kind == KIND_ARC, ref_cnt == 1 + let b2m = BytesMut::from(b2); + assert_eq!(b2m, vec); +} + +#[test] +fn test_bytesmut_from_bytes_promotable_even_arc_offset() { + let vec = vec![33u8; 1024]; + + // Test case where offset != 0 + let mut b1 = Bytes::from(vec.clone()); + let b2 = b1.split_off(20); + let b1m = BytesMut::from(b1); + let b2m = BytesMut::from(b2); + + assert_eq!(b2m, vec[20..]); + assert_eq!(b1m, vec[..20]); +} diff --git a/tests/test_bytes_odd_alloc.rs b/tests/test_bytes_odd_alloc.rs index 04ba7c2..4758dc2 100644 --- a/tests/test_bytes_odd_alloc.rs +++ b/tests/test_bytes_odd_alloc.rs @@ -6,7 +6,7 @@ use std::alloc::{GlobalAlloc, Layout, System}; use std::ptr; -use bytes::Bytes; +use bytes::{Bytes, BytesMut}; #[global_allocator] static ODD: Odd = Odd; @@ -24,8 +24,7 @@ unsafe impl GlobalAlloc for Odd { }; let ptr = System.alloc(new_layout); if !ptr.is_null() { - let ptr = ptr.offset(1); - ptr + ptr.offset(1) } else { ptr } @@ -67,3 +66,82 @@ fn test_bytes_clone_drop() { let b1 = Bytes::from(vec); let _b2 = b1.clone(); } + +#[test] +fn test_bytes_into_vec() { + let vec = vec![33u8; 1024]; + + // Test cases where kind == KIND_VEC + let b1 = Bytes::from(vec.clone()); + assert_eq!(Vec::from(b1), vec); + + // Test cases where kind == KIND_ARC, ref_cnt == 1 + let b1 = Bytes::from(vec.clone()); + drop(b1.clone()); + assert_eq!(Vec::from(b1), vec); + + // Test cases where kind == KIND_ARC, ref_cnt == 2 + let b1 = Bytes::from(vec.clone()); + let b2 = b1.clone(); + assert_eq!(Vec::from(b1), vec); + + // Test cases where vtable = SHARED_VTABLE, kind == KIND_ARC, ref_cnt == 1 + assert_eq!(Vec::from(b2), vec); + + // Test cases where offset != 0 + let mut b1 = Bytes::from(vec.clone()); + let b2 = b1.split_off(20); + + assert_eq!(Vec::from(b2), vec[20..]); + assert_eq!(Vec::from(b1), vec[..20]); +} + +#[test] +fn test_bytesmut_from_bytes_vec() { + let vec = vec![33u8; 1024]; + + // Test case where kind == KIND_VEC + let b1 = Bytes::from(vec.clone()); + let b1m = BytesMut::from(b1); + assert_eq!(b1m, vec); +} + +#[test] +fn test_bytesmut_from_bytes_arc_1() { + let vec = vec![33u8; 1024]; + + // Test case where kind == KIND_ARC, ref_cnt == 1 + let b1 = Bytes::from(vec.clone()); + drop(b1.clone()); + let b1m = BytesMut::from(b1); + assert_eq!(b1m, vec); +} + +#[test] +fn test_bytesmut_from_bytes_arc_2() { + let vec = vec![33u8; 1024]; + + // Test case where kind == KIND_ARC, ref_cnt == 2 + let b1 = Bytes::from(vec.clone()); + let b2 = b1.clone(); + let b1m = BytesMut::from(b1); + assert_eq!(b1m, vec); + + // Test case where vtable = SHARED_VTABLE, kind == KIND_ARC, ref_cnt == 1 + let b2m = BytesMut::from(b2); + assert_eq!(b2m, vec); +} + +#[test] +fn test_bytesmut_from_bytes_arc_offset() { + let vec = vec![33u8; 1024]; + + // Test case where offset != 0 + let mut b1 = Bytes::from(vec.clone()); + let b2 = b1.split_off(20); + let b1m = BytesMut::from(b1); + let b2m = BytesMut::from(b2); + + assert_eq!(b2m, vec[20..]); + assert_eq!(b1m, vec[..20]); +} diff --git a/tests/test_bytes_vec_alloc.rs b/tests/test_bytes_vec_alloc.rs index 418a9cd..107e56e 100644 --- 
a/tests/test_bytes_vec_alloc.rs +++ b/tests/test_bytes_vec_alloc.rs @@ -1,61 +1,87 @@ use std::alloc::{GlobalAlloc, Layout, System}; -use std::{mem, ptr}; +use std::ptr::null_mut; +use std::sync::atomic::{AtomicPtr, AtomicUsize, Ordering}; use bytes::{Buf, Bytes}; #[global_allocator] -static LEDGER: Ledger = Ledger; +static LEDGER: Ledger = Ledger::new(); -struct Ledger; +const LEDGER_LENGTH: usize = 2048; -const USIZE_SIZE: usize = mem::size_of::(); +struct Ledger { + alloc_table: [(AtomicPtr, AtomicUsize); LEDGER_LENGTH], +} -unsafe impl GlobalAlloc for Ledger { - unsafe fn alloc(&self, layout: Layout) -> *mut u8 { - if layout.align() == 1 && layout.size() > 0 { - // Allocate extra space to stash a record of - // how much space there was. - let orig_size = layout.size(); - let size = orig_size + USIZE_SIZE; - let new_layout = match Layout::from_size_align(size, 1) { - Ok(layout) => layout, - Err(_err) => return ptr::null_mut(), - }; - let ptr = System.alloc(new_layout); - if !ptr.is_null() { - (ptr as *mut usize).write(orig_size); - let ptr = ptr.offset(USIZE_SIZE as isize); - ptr - } else { - ptr +impl Ledger { + const fn new() -> Self { + const ELEM: (AtomicPtr, AtomicUsize) = + (AtomicPtr::new(null_mut()), AtomicUsize::new(0)); + let alloc_table = [ELEM; LEDGER_LENGTH]; + + Self { alloc_table } + } + + /// Iterate over our table until we find an open entry, then insert into said entry + fn insert(&self, ptr: *mut u8, size: usize) { + for (entry_ptr, entry_size) in self.alloc_table.iter() { + // SeqCst is good enough here, we don't care about perf, i just want to be correct! + if entry_ptr + .compare_exchange(null_mut(), ptr, Ordering::SeqCst, Ordering::SeqCst) + .is_ok() + { + entry_size.store(size, Ordering::SeqCst); + break; } - } else { - System.alloc(layout) } } - unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { - if layout.align() == 1 && layout.size() > 0 { - let off_ptr = (ptr as *mut usize).offset(-1); - let orig_size = off_ptr.read(); - if orig_size != layout.size() { - panic!( - "bad dealloc: alloc size was {}, dealloc size is {}", - orig_size, - layout.size() - ); + fn remove(&self, ptr: *mut u8) -> usize { + for (entry_ptr, entry_size) in self.alloc_table.iter() { + // set the value to be something that will never try and be deallocated, so that we + // don't have any chance of a race condition + // + // dont worry, LEDGER_LENGTH is really long to compensate for us not reclaiming space + if entry_ptr + .compare_exchange( + ptr, + invalid_ptr(usize::MAX), + Ordering::SeqCst, + Ordering::SeqCst, + ) + .is_ok() + { + return entry_size.load(Ordering::SeqCst); } + } + + panic!("Couldn't find a matching entry for {:x?}", ptr); + } +} + +unsafe impl GlobalAlloc for Ledger { + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + let size = layout.size(); + let ptr = System.alloc(layout); + self.insert(ptr, size); + ptr + } + + unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { + let orig_size = self.remove(ptr); - let new_layout = match Layout::from_size_align(layout.size() + USIZE_SIZE, 1) { - Ok(layout) => layout, - Err(_err) => std::process::abort(), - }; - System.dealloc(off_ptr as *mut u8, new_layout); + if orig_size != layout.size() { + panic!( + "bad dealloc: alloc size was {}, dealloc size is {}", + orig_size, + layout.size() + ); } else { System.dealloc(ptr, layout); } } } + #[test] fn test_bytes_advance() { let mut bytes = Bytes::from(vec![10, 20, 30]); @@ -77,3 +103,41 @@ fn test_bytes_truncate_and_advance() { bytes.advance(1); drop(bytes); } + +/// 
Returns a dangling pointer with the given address. This is used to store +/// integer data in pointer fields. +#[inline] +fn invalid_ptr(addr: usize) -> *mut T { + let ptr = std::ptr::null_mut::().wrapping_add(addr); + debug_assert_eq!(ptr as usize, addr); + ptr.cast::() +} + +#[test] +fn test_bytes_into_vec() { + let vec = vec![33u8; 1024]; + + // Test cases where kind == KIND_VEC + let b1 = Bytes::from(vec.clone()); + assert_eq!(Vec::from(b1), vec); + + // Test cases where kind == KIND_ARC, ref_cnt == 1 + let b1 = Bytes::from(vec.clone()); + drop(b1.clone()); + assert_eq!(Vec::from(b1), vec); + + // Test cases where kind == KIND_ARC, ref_cnt == 2 + let b1 = Bytes::from(vec.clone()); + let b2 = b1.clone(); + assert_eq!(Vec::from(b1), vec); + + // Test cases where vtable = SHARED_VTABLE, kind == KIND_ARC, ref_cnt == 1 + assert_eq!(Vec::from(b2), vec); + + // Test cases where offset != 0 + let mut b1 = Bytes::from(vec.clone()); + let b2 = b1.split_off(20); + + assert_eq!(Vec::from(b2), vec[20..]); + assert_eq!(Vec::from(b1), vec[..20]); +} diff --git a/tests/test_chain.rs b/tests/test_chain.rs index affaf7a..cfda6b8 100644 --- a/tests/test_chain.rs +++ b/tests/test_chain.rs @@ -133,6 +133,28 @@ fn vectored_read() { } } +#[test] +fn chain_growing_buffer() { + let mut buff = [' ' as u8; 10]; + let mut vec = b"wassup".to_vec(); + + let mut chained = (&mut buff[..]).chain_mut(&mut vec).chain_mut(Vec::new()); // Required for potential overflow because remaining_mut for Vec is isize::MAX - vec.len(), but for chain_mut is usize::MAX + + chained.put_slice(b"hey there123123"); + + assert_eq!(&buff, b"hey there1"); + assert_eq!(&vec, b"wassup23123"); +} + +#[test] +fn chain_overflow_remaining_mut() { + let mut chained = Vec::::new().chain_mut(Vec::new()).chain_mut(Vec::new()); + + assert_eq!(chained.remaining_mut(), usize::MAX); + chained.put_slice(&[0; 256]); + assert_eq!(chained.remaining_mut(), usize::MAX); +} + #[test] fn chain_get_bytes() { let mut ab = Bytes::copy_from_slice(b"ab"); diff --git a/tests/test_iter.rs b/tests/test_iter.rs index a5bfddd..bad9018 100644 --- a/tests/test_iter.rs +++ b/tests/test_iter.rs @@ -1,11 +1,11 @@ #![warn(rust_2018_idioms)] -use bytes::Bytes; +use bytes::{buf::IntoIter, Bytes}; #[test] fn iter_len() { let buf = Bytes::from_static(b"hello world"); - let iter = buf.iter(); + let iter = IntoIter::new(buf); assert_eq!(iter.size_hint(), (11, Some(11))); assert_eq!(iter.len(), 11); @@ -13,8 +13,8 @@ fn iter_len() { #[test] fn empty_iter_len() { - let buf = Bytes::from_static(b""); - let iter = buf.iter(); + let buf = Bytes::new(); + let iter = IntoIter::new(buf); assert_eq!(iter.size_hint(), (0, Some(0))); assert_eq!(iter.len(), 0);
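The final hunk swaps the borrowed `buf.iter()` for the owning `bytes::buf::IntoIter` in the iterator tests. A short usage sketch of the owning iterator, written as a standalone program assuming only the `bytes` crate as a dependency:

```rust
use bytes::{buf::IntoIter, Bytes};

fn main() {
    let buf = Bytes::from_static(b"hello world");

    // `IntoIter` takes ownership of the buffer and yields `u8` by value.
    let iter = IntoIter::new(buf);

    // It is an `ExactSizeIterator`, so `len` and `size_hint` agree.
    assert_eq!(iter.size_hint(), (11, Some(11)));
    assert_eq!(iter.len(), 11);

    let collected: Vec<u8> = iter.collect();
    assert_eq!(collected, b"hello world");
}
```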