diff --git a/build.rs b/build.rs
index 766dec05..2d0c01da 100644
--- a/build.rs
+++ b/build.rs
@@ -1,4 +1,4 @@
-use std::env;
+use std::{collections::HashMap, env, sync::atomic::Ordering};
 
 fn main() {
     println!("cargo:rerun-if-changed=build.rs");
@@ -90,6 +90,61 @@ fn main() {
     {
         println!("cargo:rustc-cfg=kernel_user_helpers")
     }
+
+    if llvm_target[0] == "aarch64" {
+        generate_aarch64_outlined_atomics();
+    }
+}
+
+fn aarch64_symbol(ordering: Ordering) -> &'static str {
+    match ordering {
+        Ordering::Relaxed => "relax",
+        Ordering::Acquire => "acq",
+        Ordering::Release => "rel",
+        Ordering::AcqRel => "acq_rel",
+        _ => panic!("unknown symbol for {:?}", ordering),
+    }
+}
+
+/// The `concat_idents` macro is extremely annoying and doesn't allow us to define new items.
+/// Define them from the build script instead.
+/// Note that the majority of the code is still defined in `aarch64.rs` through inline macros.
+fn generate_aarch64_outlined_atomics() {
+    use std::fmt::Write;
+    // #[macro_export] so that we can use this in tests
+    let gen_macro =
+        |name| format!("#[macro_export] macro_rules! foreach_{name} {{ ($macro:path) => {{\n");
+
+    // Generate different macros for add/clr/eor/set so that we can test them separately.
+    let sym_names = ["cas", "ldadd", "ldclr", "ldeor", "ldset", "swp"];
+    let mut macros = HashMap::new();
+    for sym in sym_names {
+        macros.insert(sym, gen_macro(sym));
+    }
+
+    for ordering in [
+        Ordering::Relaxed,
+        Ordering::Acquire,
+        Ordering::Release,
+        Ordering::AcqRel,
+    ] {
+        let sym_ordering = aarch64_symbol(ordering);
+        // TODO: support CAS 16
+        for size in [1, 2, 4, 8 /* , 16*/] {
+            for (sym, macro_) in &mut macros {
+                let name = format!("__aarch64_{sym}{size}_{sym_ordering}");
+                writeln!(macro_, "$macro!( {ordering:?}, {size}, {name} );").unwrap();
+            }
+        }
+    }
+
+    let mut buf = String::new();
+    for macro_def in macros.values() {
+        buf += &macro_def;
+        buf += "}; }";
+    }
+    let dst = std::env::var("OUT_DIR").unwrap() + "/outlined_atomics.rs";
+    std::fs::write(dst, buf).unwrap();
 }
 
 #[cfg(feature = "c")]
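To make the build-script output concrete: for the `ldadd` family, the generated `$OUT_DIR/outlined_atomics.rs` should contain a macro along these lines (reconstructed by hand from the generation loop above, not copied from a real build; `HashMap` iteration order only affects the order of the `foreach_*` definitions, not their contents):

    #[macro_export] macro_rules! foreach_ldadd { ($macro:path) => {
    $macro!( Relaxed, 1, __aarch64_ldadd1_relax );
    $macro!( Relaxed, 2, __aarch64_ldadd2_relax );
    $macro!( Relaxed, 4, __aarch64_ldadd4_relax );
    $macro!( Relaxed, 8, __aarch64_ldadd8_relax );
    $macro!( Acquire, 1, __aarch64_ldadd1_acq );
    $macro!( Acquire, 2, __aarch64_ldadd2_acq );
    $macro!( Acquire, 4, __aarch64_ldadd4_acq );
    $macro!( Acquire, 8, __aarch64_ldadd8_acq );
    $macro!( Release, 1, __aarch64_ldadd1_rel );
    $macro!( Release, 2, __aarch64_ldadd2_rel );
    $macro!( Release, 4, __aarch64_ldadd4_rel );
    $macro!( Release, 8, __aarch64_ldadd8_rel );
    $macro!( AcqRel, 1, __aarch64_ldadd1_acq_rel );
    $macro!( AcqRel, 2, __aarch64_ldadd2_acq_rel );
    $macro!( AcqRel, 4, __aarch64_ldadd4_acq_rel );
    $macro!( AcqRel, 8, __aarch64_ldadd8_acq_rel );
    }; }

Each `foreach_ldadd!(m)` call downstream then stamps out `m!` once per generated symbol.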
diff --git a/src/aarch64.rs b/src/aarch64.rs
new file mode 100644
index 00000000..01888065
--- /dev/null
+++ b/src/aarch64.rs
@@ -0,0 +1,221 @@
+//! AArch64 targets have two possible implementations for atomics:
+//! 1. Load-Linked, Store-Conditional (LL/SC), older and slower.
+//! 2. Large System Extensions (LSE), newer and faster.
+//! To avoid breaking backwards compatibility, C toolchains introduced the concept of "outlined atomics",
+//! where atomic operations call into the compiler runtime, which dispatches between the two depending on
+//! which is supported on the current CPU.
+//! See https://community.arm.com/arm-community-blogs/b/tools-software-ides-blog/posts/making-the-most-of-the-arm-architecture-in-gcc-10#:~:text=out%20of%20line%20atomics for more discussion.
+//!
+//! Currently we only support LL/SC, because LSE requires `getauxval` from libc in order to do runtime detection.
+//! Use the `compiler-rt` intrinsics if you want LSE support.
+//!
+//! Ported from `aarch64/lse.S` in LLVM's compiler-rt.
+//!
+//! Generate functions for each of the following symbols:
+//!  __aarch64_casM_ORDER
+//!  __aarch64_swpN_ORDER
+//!  __aarch64_ldaddN_ORDER
+//!  __aarch64_ldclrN_ORDER
+//!  __aarch64_ldeorN_ORDER
+//!  __aarch64_ldsetN_ORDER
+//! for N = {1, 2, 4, 8}, M = {1, 2, 4, 8}, ORDER = { relax, acq, rel, acq_rel }
+//!
+//! TODO: M = 16
+//!
+//! The original `lse.S` has some truly horrifying code that expects to be compiled multiple times with different constants.
+//! We do something similar, but with macro arguments.
+
+// We don't do runtime dispatch, so we don't have to worry about the global ctor.
+// Apparently macOS uses a different number of underscores in the symbol name (???)
+// #[cfg(target_vendor = "apple")]
+// macro_rules! have_lse {
+//     () => { ___aarch64_have_lse_atomics }
+// }
+
+// #[cfg(not(target_vendor = "apple"))]
+// macro_rules! have_lse {
+//     () => { __aarch64_have_lse_atomics }
+// }
+
+/// Translate a byte size to a Rust type.
+macro_rules! int_ty {
+    (1) => { i8 };
+    (2) => { i16 };
+    (4) => { i32 };
+    (8) => { i64 };
+    (16) => { i128 };
+}
+
+/// Given a byte size and a register number, return a register of the appropriate size.
+///
+/// See <https://developer.arm.com/documentation/102374/0101/Registers-in-AArch64---general-purpose-registers>.
+macro_rules! reg {
+    (1, $num:literal) => { concat!("w", $num) };
+    (2, $num:literal) => { concat!("w", $num) };
+    (4, $num:literal) => { concat!("w", $num) };
+    (8, $num:literal) => { concat!("x", $num) };
+}
+
+/// Given an atomic ordering, translate it to the acquire suffix for the ldxr aarch64 ASM instruction.
+macro_rules! acquire {
+    (Relaxed) => { "" };
+    (Acquire) => { "a" };
+    (Release) => { "" };
+    (AcqRel) => { "a" };
+}
+
+/// Given an atomic ordering, translate it to the release suffix for the stxr aarch64 ASM instruction.
+macro_rules! release {
+    (Relaxed) => { "" };
+    (Acquire) => { "" };
+    (Release) => { "l" };
+    (AcqRel) => { "l" };
+}
+
+/// Given a size in bytes, translate it to the byte suffix for an aarch64 ASM instruction.
+macro_rules! size {
+    (1) => { "b" };
+    (2) => { "h" };
+    (4) => { "" };
+    (8) => { "" };
+    (16) => { "" };
+}
+
+/// Given a byte size, translate it to an Unsigned eXTend instruction
+/// with the correct semantics.
+///
+/// See <https://developer.arm.com/documentation/dui0801/e/A64-General-Instructions/UXTB--UXTH>
+macro_rules! uxt {
+    (1) => { "uxtb" };
+    (2) => { "uxth" };
+    ($_:tt) => { "mov" };
+}
+
+/// Given an atomic ordering and byte size, translate it to a LoaD eXclusive Register instruction
/// with the correct semantics.
+///
+/// See <https://developer.arm.com/documentation/dui0801/e/A64-Data-Transfer-Instructions/LDXR>.
+macro_rules! ldxr {
+    ($ordering:ident, $bytes:tt) => { concat!("ld", acquire!($ordering), "xr", size!($bytes)) }
+}
+
+/// Given an atomic ordering and byte size, translate it to a STore eXclusive Register instruction
+/// with the correct semantics.
+///
+/// See <https://developer.arm.com/documentation/dui0801/e/A64-Data-Transfer-Instructions/STXR>.
+macro_rules! stxr {
+    ($ordering:ident, $bytes:tt) => { concat!("st", release!($ordering), "xr", size!($bytes)) }
+}
+
+/// See <https://doc.rust-lang.org/stable/std/sync/atomic/struct.AtomicI8.html#method.compare_and_swap>.
+macro_rules! compare_and_swap {
+    ($ordering:ident, $bytes:tt, $name:ident) => {
+        intrinsics! {
+            #[maybe_use_optimized_c_shim]
+            #[naked]
+            pub extern "C" fn $name (
+                expected: int_ty!($bytes), desired: int_ty!($bytes), ptr: *mut int_ty!($bytes)
+            ) -> int_ty!($bytes) {
+                // We can't use `AtomicI8::compare_and_swap`; we *are* compare_and_swap.
+                unsafe { core::arch::asm! {
+                    // UXT     s(tmp0), s(0)
+                    concat!(uxt!($bytes), " ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
+                    "0:",
+                    // LDXR    s(0), [x2]
+                    concat!(ldxr!($ordering, $bytes), " ", reg!($bytes, 0), ", [x2]"),
+                    // cmp     s(0), s(tmp0)
+                    concat!("cmp ", reg!($bytes, 0), ", ", reg!($bytes, 16)),
+                    "bne 1f",
+                    // STXR    w(tmp1), s(1), [x2]
+                    concat!(stxr!($ordering, $bytes), " w17, ", reg!($bytes, 1), ", [x2]"),
+                    "cbnz w17, 0b",
+                    "1:",
+                    "ret",
+                    options(noreturn)
+                } }
+            }
+        }
+    }
+}
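+
+// As a concrete example, `compare_and_swap!(Acquire, 4, __aarch64_cas4_acq)`
+// expands to a naked function whose body is roughly the following assembly
+// (hand-expanded sketch, not emitted code):
+//
+//     __aarch64_cas4_acq:
+//         mov   w16, w0        // uxt!(4) is a plain mov: stash the expected value
+//     0:  ldaxr w0, [x2]       // load-acquire exclusive of the current value
+//         cmp   w0, w16
+//         bne   1f             // mismatch: return the current value unchanged
+//         stxr  w17, w1, [x2]  // attempt to store the desired value
+//         cbnz  w17, 0b        // exclusive store failed: retry the LL/SC loop
+//     1:  ret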
+
+/// See <https://doc.rust-lang.org/stable/std/sync/atomic/struct.AtomicI8.html#method.swap>.
+macro_rules! swap {
+    ($ordering:ident, $bytes:tt, $name:ident) => {
+        intrinsics! {
+            #[maybe_use_optimized_c_shim]
+            #[naked]
+            pub extern "C" fn $name (
+                left: int_ty!($bytes), right_ptr: *mut int_ty!($bytes)
+            ) -> int_ty!($bytes) {
+                unsafe { core::arch::asm! {
+                    // mov     s(tmp0), s(0)
+                    concat!("mov ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
+                    "0:",
+                    // LDXR    s(0), [x1]
+                    concat!(ldxr!($ordering, $bytes), " ", reg!($bytes, 0), ", [x1]"),
+                    // STXR    w(tmp1), s(tmp0), [x1]
+                    concat!(stxr!($ordering, $bytes), " w17, ", reg!($bytes, 16), ", [x1]"),
+                    "cbnz w17, 0b",
+                    "ret",
+                    options(noreturn)
+                } }
+            }
+        }
+    }
+}
+
+/// See (e.g.) <https://doc.rust-lang.org/stable/std/sync/atomic/struct.AtomicI8.html#method.fetch_add>.
+macro_rules! fetch_op {
+    ($ordering:ident, $bytes:tt, $name:ident, $op:literal) => {
+        intrinsics! {
+            #[maybe_use_optimized_c_shim]
+            #[naked]
+            pub extern "C" fn $name (
+                val: int_ty!($bytes), ptr: *mut int_ty!($bytes)
+            ) -> int_ty!($bytes) {
+                unsafe { core::arch::asm! {
+                    // mov     s(tmp0), s(0)
+                    concat!("mov ", reg!($bytes, 16), ", ", reg!($bytes, 0)),
+                    "0:",
+                    // LDXR    s(0), [x1]
+                    concat!(ldxr!($ordering, $bytes), " ", reg!($bytes, 0), ", [x1]"),
+                    // OP      s(tmp1), s(0), s(tmp0)
+                    concat!($op, " ", reg!($bytes, 17), ", ", reg!($bytes, 0), ", ", reg!($bytes, 16)),
+                    // STXR    w(tmp2), s(tmp1), [x1]
+                    concat!(stxr!($ordering, $bytes), " w15, ", reg!($bytes, 17), ", [x1]"),
+                    "cbnz w15, 0b",
+                    "ret",
+                    options(noreturn)
+                } }
+            }
+        }
+    }
+}
+
+// We need a single macro to pass to `foreach_ldadd`.
+macro_rules! add {
+    ($ordering:ident, $bytes:tt, $name:ident) => { fetch_op! { $ordering, $bytes, $name, "add" } }
+}
+
+// `ldclr` stores `old & !val`, so it maps to `bic` (bit clear) rather than `and`.
+macro_rules! and {
+    ($ordering:ident, $bytes:tt, $name:ident) => { fetch_op! { $ordering, $bytes, $name, "bic" } }
+}
+
+macro_rules! xor {
+    ($ordering:ident, $bytes:tt, $name:ident) => { fetch_op! { $ordering, $bytes, $name, "eor" } }
+}
+
+macro_rules! or {
+    ($ordering:ident, $bytes:tt, $name:ident) => { fetch_op! { $ordering, $bytes, $name, "orr" } }
+}
+
+// See `generate_aarch64_outlined_atomics` in build.rs.
+include!(concat!(env!("OUT_DIR"), "/outlined_atomics.rs"));
+foreach_cas!(compare_and_swap);
+foreach_swp!(swap);
+foreach_ldadd!(add);
+foreach_ldclr!(and);
+foreach_ldeor!(xor);
+foreach_ldset!(or);
+
+// TODO: CAS 16
diff --git a/src/lib.rs b/src/lib.rs
index 71f249c8..90b21f1f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -57,6 +57,9 @@ pub mod mem;
 #[cfg(target_arch = "arm")]
 pub mod arm;
 
+#[cfg(target_arch = "aarch64")]
+pub mod aarch64;
+
 #[cfg(all(
     kernel_user_helpers,
     any(target_os = "linux", target_os = "android"),
diff --git a/src/macros.rs b/src/macros.rs
index b3becde7..e3a38192 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -419,7 +419,7 @@ macro_rules! intrinsics {
     (
         #[naked]
         $(#[$($attr:tt)*])*
-        pub unsafe extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? {
+        pub $(unsafe)? extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? {
             $($body:tt)*
         }
 
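For callers, each generated symbol takes the operand(s) followed by the pointer and returns the previous memory contents. A hypothetical hand-written caller (`demo` and its values are invented for illustration; in practice these calls are emitted by LLVM when outline atomics are enabled):

    extern "C" {
        // Matches the signature generated by `fetch_op!` for size 4.
        fn __aarch64_ldadd4_relax(val: i32, ptr: *mut i32) -> i32;
    }

    fn demo() {
        let mut x: i32 = 40;
        // Atomically performs `x += 2` and returns the value before the add.
        let old = unsafe { __aarch64_ldadd4_relax(2, &mut x) };
        assert_eq!((old, x), (40, 42));
    }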
diff --git a/testcrate/tests/lse.rs b/testcrate/tests/lse.rs
new file mode 100644
index 00000000..4bade0e8
--- /dev/null
+++ b/testcrate/tests/lse.rs
@@ -0,0 +1,84 @@
+#![cfg(target_arch = "aarch64")]
+#![feature(decl_macro)] // so we can use pub(super)
+
+/// Translate a byte size to a Rust type.
+macro int_ty {
+    (1) => { i8 },
+    (2) => { i16 },
+    (4) => { i32 },
+    (8) => { i64 },
+    (16) => { i128 }
+}
+
+mod cas {
+    pub(super) macro test($_ordering:ident, $bytes:tt, $name:ident) {
+        #[test]
+        fn $name() {
+            testcrate::fuzz_2(10000, |expected: super::int_ty!($bytes), new| {
+                // CAS should fail when `target` doesn't hold the expected value.
+                let mut target = expected.wrapping_add(10);
+                assert_eq!(
+                    unsafe { compiler_builtins::aarch64::$name::$name(expected, new, &mut target) },
+                    expected.wrapping_add(10),
+                    "return value should always be the previous value",
+                );
+                assert_eq!(target, expected.wrapping_add(10), "shouldn't have changed target");
+
+                // CAS should succeed when it does.
+                target = expected;
+                assert_eq!(
+                    unsafe { compiler_builtins::aarch64::$name::$name(expected, new, &mut target) },
+                    expected
+                );
+                assert_eq!(target, new, "should have updated target");
+            });
+        }
+    }
+}
+
+mod swap {
+    pub(super) macro test($_ordering:ident, $bytes:tt, $name:ident) {
+        #[test]
+        fn $name() {
+            testcrate::fuzz_2(10000, |left: super::int_ty!($bytes), mut right| {
+                let orig_right = right;
+                assert_eq!(
+                    unsafe { compiler_builtins::aarch64::$name::$name(left, &mut right) },
+                    orig_right
+                );
+                assert_eq!(left, right);
+            });
+        }
+    }
+}
+
+macro_rules! test_op {
+    ($mod:ident, $( $op:tt )* ) => {
+        mod $mod {
+            pub(super) macro test {
+                ($_ordering:ident, $bytes:tt, $name:ident) => {
+                    #[test]
+                    fn $name() {
+                        testcrate::fuzz_2(10000, |old, val| {
+                            let mut target = old;
+                            let op: fn(super::int_ty!($bytes), super::int_ty!($bytes)) -> _ = $($op)*;
+                            let expected = op(old, val);
+                            assert_eq!(old, unsafe { compiler_builtins::aarch64::$name::$name(val, &mut target) }, "{} should return original value", stringify!($name));
+                            assert_eq!(expected, target, "{} should store to target", stringify!($name));
+                        });
+                    }
+                }
+            }
+        }
+    };
+}
+
+test_op!(add, |left, right| left.wrapping_add(right));
+test_op!(clr, |left, right| left & !right);
+test_op!(xor, std::ops::BitXor::bitxor);
+test_op!(or, std::ops::BitOr::bitor);
+
+compiler_builtins::foreach_cas!(cas::test);
+compiler_builtins::foreach_swp!(swap::test);
+compiler_builtins::foreach_ldadd!(add::test);
+compiler_builtins::foreach_ldclr!(clr::test);
+compiler_builtins::foreach_ldeor!(xor::test);
+compiler_builtins::foreach_ldset!(or::test);
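Each `foreach_*!` invocation above stamps out one `#[test]` per generated symbol. As a sanity check of what that expansion does, the `__aarch64_ldadd4_relax` instance of `foreach_ldadd!(add::test)` is roughly equivalent to the following (hand-expanded sketch; the real expansion goes through the `op` fn pointer):

    #[test]
    fn __aarch64_ldadd4_relax() {
        testcrate::fuzz_2(10000, |old: i32, val: i32| {
            let mut target = old;
            let expected = old.wrapping_add(val);
            // The intrinsic returns the previous value and stores the sum.
            let ret = unsafe {
                compiler_builtins::aarch64::__aarch64_ldadd4_relax::__aarch64_ldadd4_relax(val, &mut target)
            };
            assert_eq!(old, ret, "__aarch64_ldadd4_relax should return original value");
            assert_eq!(expected, target, "__aarch64_ldadd4_relax should store to target");
        });
    }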