diff --git a/.codespell.rc b/.codespell.rc index 914ca295128..7a811dee3ad 100644 --- a/.codespell.rc +++ b/.codespell.rc @@ -1,3 +1,3 @@ [codespell] ignore-words-list = crate -skip = ./.git/**,./.vscode/cspell.dictionaries/**,./target/**,./tests/fixtures/** +skip = ./.git/**,./.vscode/cspell.dictionaries/**,./target/**,./tests/fixtures/** \ No newline at end of file diff --git a/.vscode/cspell.dictionaries/jargon.wordlist.txt b/.vscode/cspell.dictionaries/jargon.wordlist.txt index d1f36618c87..975e8dbabef 100644 --- a/.vscode/cspell.dictionaries/jargon.wordlist.txt +++ b/.vscode/cspell.dictionaries/jargon.wordlist.txt @@ -130,6 +130,7 @@ symlink symlinks syscall syscalls +sysconf tokenize toolchain truthy diff --git a/Cargo.lock b/Cargo.lock index 0f83aa0a645..c94d000ae25 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3951,6 +3951,7 @@ dependencies = [ "fluent", "fnv", "itertools 0.14.0", + "libc", "memchr", "nix 0.30.1", "rand 0.9.2", diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml index a44cc7570ae..aa8e0d650d1 100644 --- a/src/uu/sort/Cargo.toml +++ b/src/uu/sort/Cargo.toml @@ -36,6 +36,7 @@ thiserror = { workspace = true } unicode-width = { workspace = true } uucore = { workspace = true, features = ["fs", "parser", "version-cmp"] } fluent = { workspace = true } +libc = { workspace = true } [target.'cfg(target_os = "linux")'.dependencies] nix = { workspace = true } diff --git a/src/uu/sort/src/sort.rs b/src/uu/sort/src/sort.rs index 1d4948186fe..bd0f0f05a8a 100644 --- a/src/uu/sort/src/sort.rs +++ b/src/uu/sort/src/sort.rs @@ -24,8 +24,10 @@ use clap::{Arg, ArgAction, Command}; use custom_str_cmp::custom_str_cmp; use ext_sort::ext_sort; use fnv::FnvHasher; +#[cfg(target_family = "unix")] +use libc::{_SC_PAGESIZE, _SC_PHYS_PAGES}; #[cfg(target_os = "linux")] -use nix::libc::{RLIMIT_NOFILE, getrlimit, rlimit}; +use libc::{RLIMIT_NOFILE, rlimit}; use numeric_str_cmp::{NumInfo, NumInfoParseSettings, human_numeric_str_cmp, numeric_str_cmp}; use rand::{Rng, rng}; use rayon::prelude::*; @@ -115,10 +117,12 @@ const DECIMAL_PT: u8 = b'.'; const NEGATIVE: &u8 = &b'-'; const POSITIVE: &u8 = &b'+'; -// Choosing a higher buffer size does not result in performance improvements -// (at least not on my machine). TODO: In the future, we should also take the amount of -// available memory into consideration, instead of relying on this constant only. -const DEFAULT_BUF_SIZE: usize = 1_000_000_000; // 1 GB +// The automatic buffer heuristics clamp to this range to avoid +// over-committing memory on constrained systems while still keeping +// reasonably large chunks for typical workloads. +const MIN_AUTOMATIC_BUF_SIZE: usize = 512 * 1024; // 512 KiB +const FALLBACK_AUTOMATIC_BUF_SIZE: usize = 4 * 1024 * 1024; // 4 MiB +const MAX_AUTOMATIC_BUF_SIZE: usize = 1024 * 1024 * 1024; // 1 GiB #[derive(Debug, Error)] pub enum SortError { @@ -359,7 +363,7 @@ impl Default for GlobalSettings { separator: None, threads: String::new(), line_ending: LineEnding::Newline, - buffer_size: DEFAULT_BUF_SIZE, + buffer_size: FALLBACK_AUTOMATIC_BUF_SIZE, compress_prog: None, merge_batch_size: 32, precomputed: Precomputed::default(), @@ -1029,7 +1033,7 @@ fn get_rlimit() -> UResult { rlim_cur: 0, rlim_max: 0, }; - match unsafe { getrlimit(RLIMIT_NOFILE, &raw mut limit) } { + match unsafe { libc::getrlimit(RLIMIT_NOFILE, &mut limit) } { 0 => Ok(limit.rlim_cur as usize), _ => Err(UUsageError::new(2, translate!("sort-failed-fetch-rlimit"))), } @@ -1037,6 +1041,127 @@ fn get_rlimit() -> UResult { const STDIN_FILE: &str = "-"; +fn automatic_buffer_size(files: &[OsString]) -> usize { + let file_hint = file_size_hint(files); + let mem_hint = available_memory_hint(); + + match (file_hint, mem_hint) { + (Some(file), Some(mem)) => file.min(mem), + (Some(file), None) => file, + (None, Some(mem)) => mem, + (None, None) => FALLBACK_AUTOMATIC_BUF_SIZE, + } +} + +fn file_size_hint(files: &[OsString]) -> Option { + let mut total_bytes: u128 = 0; + + for file in files { + if file == STDIN_FILE { + continue; + } + + let Ok(metadata) = std::fs::metadata(file) else { + continue; + }; + + if !metadata.is_file() { + continue; + } + + total_bytes = total_bytes.saturating_add(metadata.len() as u128); + + if total_bytes >= (MAX_AUTOMATIC_BUF_SIZE as u128) * 8 { + break; + } + } + + if total_bytes == 0 { + return None; + } + + let desired_bytes = desired_file_buffer_bytes(total_bytes); + Some(clamp_hint(desired_bytes)) +} + +fn available_memory_hint() -> Option { + #[cfg(target_os = "linux")] + if let Some(bytes) = available_memory_bytes() { + return Some(clamp_hint(bytes / 8)); + } + + physical_memory_bytes().map(|bytes| clamp_hint(bytes / 8)) +} + +fn clamp_hint(bytes: u128) -> usize { + let min = MIN_AUTOMATIC_BUF_SIZE as u128; + let max = MAX_AUTOMATIC_BUF_SIZE as u128; + let clamped = bytes.clamp(min, max); + clamped.min(usize::MAX as u128) as usize +} + +fn desired_file_buffer_bytes(total_bytes: u128) -> u128 { + if total_bytes == 0 { + return 0; + } + + let max = MAX_AUTOMATIC_BUF_SIZE as u128; + + if total_bytes <= max { + let expanded = total_bytes.saturating_mul(12).clamp(total_bytes, max); + return expanded; + } + + let quarter = total_bytes / 4; + quarter.max(max) +} + +#[cfg(target_os = "linux")] +fn available_memory_bytes() -> Option { + let meminfo = std::fs::read_to_string("/proc/meminfo").ok()?; + let mut mem_available = None; + let mut mem_total = None; + + for line in meminfo.lines() { + if let Some(value) = line.strip_prefix("MemAvailable:") { + mem_available = parse_meminfo_value(value); + } else if let Some(value) = line.strip_prefix("MemTotal:") { + mem_total = parse_meminfo_value(value); + } + + if mem_available.is_some() && mem_total.is_some() { + break; + } + } + + mem_available.or(mem_total) +} + +#[cfg(target_os = "linux")] +fn parse_meminfo_value(value: &str) -> Option { + let amount_kib = value.split_whitespace().next()?.parse::().ok()?; + Some(amount_kib * 1024) +} + +fn physical_memory_bytes() -> Option { + #[cfg(target_family = "unix")] + { + let pages = unsafe { libc::sysconf(_SC_PHYS_PAGES) }; + let page_size = unsafe { libc::sysconf(_SC_PAGESIZE) }; + if pages <= 0 || page_size <= 0 { + return None; + } + let pages = u128::try_from(pages).ok()?; + let page_size = u128::try_from(page_size).ok()?; + Some(pages.saturating_mul(page_size)) + } + + #[cfg(not(target_family = "unix"))] + { + None + } +} + #[uucore::main] #[allow(clippy::cognitive_complexity)] pub fn uumain(args: impl uucore::Args) -> UResult<()> { @@ -1157,14 +1282,13 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { } } - settings.buffer_size = - matches - .get_one::(options::BUF_SIZE) - .map_or(Ok(DEFAULT_BUF_SIZE), |s| { - GlobalSettings::parse_byte_count(s).map_err(|e| { - USimpleError::new(2, format_error_message(&e, s, options::BUF_SIZE)) - }) - })?; + settings.buffer_size = matches.get_one::(options::BUF_SIZE).map_or_else( + || Ok(automatic_buffer_size(&files)), + |s| { + GlobalSettings::parse_byte_count(s) + .map_err(|e| USimpleError::new(2, format_error_message(&e, s, options::BUF_SIZE))) + }, + )?; let mut tmp_dir = TmpDirWrapper::new( matches @@ -1983,6 +2107,23 @@ mod tests { use super::*; + #[test] + fn desired_buffer_matches_total_when_small() { + let six_mebibytes = 6 * 1024 * 1024; + let expected = ((six_mebibytes as u128) * 12) + .clamp(six_mebibytes as u128, MAX_AUTOMATIC_BUF_SIZE as u128); + assert_eq!(desired_file_buffer_bytes(six_mebibytes as u128), expected); + } + + #[test] + fn desired_buffer_caps_at_max_for_large_inputs() { + let large = 256 * 1024 * 1024; // 256 MiB + assert_eq!( + desired_file_buffer_bytes(large as u128), + MAX_AUTOMATIC_BUF_SIZE as u128 + ); + } + fn tokenize_helper(line: &[u8], separator: Option) -> Vec { let mut buffer = vec![]; tokenize(line, separator, &mut buffer); diff --git a/src/uu/sort/src/tmp_dir.rs b/src/uu/sort/src/tmp_dir.rs index 474e01ae2f3..7d8222d86d2 100644 --- a/src/uu/sort/src/tmp_dir.rs +++ b/src/uu/sort/src/tmp_dir.rs @@ -2,10 +2,11 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. +use std::sync::atomic::{AtomicBool, Ordering}; use std::{ fs::File, path::{Path, PathBuf}, - sync::{Arc, Mutex}, + sync::{Arc, Mutex, OnceLock}, }; use tempfile::TempDir; @@ -29,6 +30,64 @@ pub struct TmpDirWrapper { lock: Arc>, } +#[derive(Default, Clone)] +struct HandlerRegistration { + lock: Option>>, + path: Option, +} + +fn handler_state() -> Arc> { + static HANDLER_STATE: OnceLock>> = OnceLock::new(); + HANDLER_STATE + .get_or_init(|| Arc::new(Mutex::new(HandlerRegistration::default()))) + .clone() +} + +fn ensure_signal_handler_installed(state: Arc>) -> UResult<()> { + static HANDLER_INSTALLED: AtomicBool = AtomicBool::new(false); + + if HANDLER_INSTALLED + .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) + .is_err() + { + return Ok(()); + } + + let handler_state = state.clone(); + if let Err(e) = ctrlc::set_handler(move || { + let (lock, path) = { + let state = handler_state.lock().unwrap(); + (state.lock.clone(), state.path.clone()) + }; + + if let Some(lock) = lock { + let _guard = lock.lock().unwrap(); + if let Some(path) = path { + if let Err(e) = remove_tmp_dir(&path) { + show_error!( + "{}", + translate!( + "sort-failed-to-delete-temporary-directory", + "error" => e + ) + ); + } + } + } + + std::process::exit(2) + }) { + HANDLER_INSTALLED.store(false, Ordering::Release); + return Err(USimpleError::new( + 2, + translate!("sort-failed-to-set-up-signal-handler", "error" => e), + ) + .into()); + } + + Ok(()) +} + impl TmpDirWrapper { pub fn new(path: PathBuf) -> Self { Self { @@ -52,31 +111,14 @@ impl TmpDirWrapper { ); let path = self.temp_dir.as_ref().unwrap().path().to_owned(); - let lock = self.lock.clone(); - ctrlc::set_handler(move || { - // Take the lock so that `next_file_path` returns no new file path, - // and the program doesn't terminate before the handler has finished - let _lock = lock.lock().unwrap(); - if let Err(e) = remove_tmp_dir(&path) { - show_error!( - "{}", - translate!( - "sort-failed-to-delete-temporary-directory", - "error" => e - ) - ); - } - std::process::exit(2) - }) - .map_err(|e| { - USimpleError::new( - 2, - translate!( - "sort-failed-to-set-up-signal-handler", - "error" => e - ), - ) - }) + let state = handler_state(); + { + let mut guard = state.lock().unwrap(); + guard.lock = Some(self.lock.clone()); + guard.path = Some(path); + } + + ensure_signal_handler_installed(state) } pub fn next_file(&mut self) -> UResult<(File, PathBuf)> { @@ -100,6 +142,22 @@ impl TmpDirWrapper { } } +impl Drop for TmpDirWrapper { + fn drop(&mut self) { + let state = handler_state(); + let mut guard = state.lock().unwrap(); + + if guard + .lock + .as_ref() + .is_some_and(|current| Arc::ptr_eq(current, &self.lock)) + { + guard.lock = None; + guard.path = None; + } + } +} + /// Remove the directory at `path` by deleting its child files and then itself. /// Errors while deleting child files are ignored. fn remove_tmp_dir(path: &Path) -> std::io::Result<()> {