Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .codespell.rc
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[codespell]
ignore-words-list = crate
skip = ./.git/**,./.vscode/cspell.dictionaries/**,./target/**,./tests/fixtures/**
skip = ./.git/**,./.vscode/cspell.dictionaries/**,./target/**,./tests/fixtures/**
1 change: 1 addition & 0 deletions .vscode/cspell.dictionaries/jargon.wordlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ symlink
symlinks
syscall
syscalls
sysconf
tokenize
toolchain
truthy
Expand Down
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions src/uu/sort/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ thiserror = { workspace = true }
unicode-width = { workspace = true }
uucore = { workspace = true, features = ["fs", "parser", "version-cmp"] }
fluent = { workspace = true }
libc = { workspace = true }

[target.'cfg(target_os = "linux")'.dependencies]
nix = { workspace = true }
Expand Down
171 changes: 156 additions & 15 deletions src/uu/sort/src/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@ use clap::{Arg, ArgAction, Command};
use custom_str_cmp::custom_str_cmp;
use ext_sort::ext_sort;
use fnv::FnvHasher;
#[cfg(target_family = "unix")]
use libc::{_SC_PAGESIZE, _SC_PHYS_PAGES};
#[cfg(target_os = "linux")]
use nix::libc::{RLIMIT_NOFILE, getrlimit, rlimit};
use libc::{RLIMIT_NOFILE, rlimit};
use numeric_str_cmp::{NumInfo, NumInfoParseSettings, human_numeric_str_cmp, numeric_str_cmp};
use rand::{Rng, rng};
use rayon::prelude::*;
Expand Down Expand Up @@ -115,10 +117,12 @@ const DECIMAL_PT: u8 = b'.';
const NEGATIVE: &u8 = &b'-';
const POSITIVE: &u8 = &b'+';

// Choosing a higher buffer size does not result in performance improvements
// (at least not on my machine). TODO: In the future, we should also take the amount of
// available memory into consideration, instead of relying on this constant only.
const DEFAULT_BUF_SIZE: usize = 1_000_000_000; // 1 GB
// The automatic buffer heuristics clamp to this range to avoid
// over-committing memory on constrained systems while still keeping
// reasonably large chunks for typical workloads.
const MIN_AUTOMATIC_BUF_SIZE: usize = 512 * 1024; // 512 KiB
const FALLBACK_AUTOMATIC_BUF_SIZE: usize = 4 * 1024 * 1024; // 4 MiB
const MAX_AUTOMATIC_BUF_SIZE: usize = 1024 * 1024 * 1024; // 1 GiB

#[derive(Debug, Error)]
pub enum SortError {
Expand Down Expand Up @@ -359,7 +363,7 @@ impl Default for GlobalSettings {
separator: None,
threads: String::new(),
line_ending: LineEnding::Newline,
buffer_size: DEFAULT_BUF_SIZE,
buffer_size: FALLBACK_AUTOMATIC_BUF_SIZE,
compress_prog: None,
merge_batch_size: 32,
precomputed: Precomputed::default(),
Expand Down Expand Up @@ -1029,14 +1033,135 @@ fn get_rlimit() -> UResult<usize> {
rlim_cur: 0,
rlim_max: 0,
};
match unsafe { getrlimit(RLIMIT_NOFILE, &raw mut limit) } {
match unsafe { libc::getrlimit(RLIMIT_NOFILE, &mut limit) } {
0 => Ok(limit.rlim_cur as usize),
_ => Err(UUsageError::new(2, translate!("sort-failed-fetch-rlimit"))),
}
}

const STDIN_FILE: &str = "-";

fn automatic_buffer_size(files: &[OsString]) -> usize {
let file_hint = file_size_hint(files);
let mem_hint = available_memory_hint();

match (file_hint, mem_hint) {
(Some(file), Some(mem)) => file.min(mem),
(Some(file), None) => file,
(None, Some(mem)) => mem,
(None, None) => FALLBACK_AUTOMATIC_BUF_SIZE,
}
}

fn file_size_hint(files: &[OsString]) -> Option<usize> {
let mut total_bytes: u128 = 0;

for file in files {
if file == STDIN_FILE {
continue;
}

let Ok(metadata) = std::fs::metadata(file) else {
continue;
};

if !metadata.is_file() {
continue;
}

total_bytes = total_bytes.saturating_add(metadata.len() as u128);

if total_bytes >= (MAX_AUTOMATIC_BUF_SIZE as u128) * 8 {
break;
}
}

if total_bytes == 0 {
return None;
}

let desired_bytes = desired_file_buffer_bytes(total_bytes);
Some(clamp_hint(desired_bytes))
}

fn available_memory_hint() -> Option<usize> {
#[cfg(target_os = "linux")]
if let Some(bytes) = available_memory_bytes() {
return Some(clamp_hint(bytes / 8));
}

physical_memory_bytes().map(|bytes| clamp_hint(bytes / 8))
}

fn clamp_hint(bytes: u128) -> usize {
let min = MIN_AUTOMATIC_BUF_SIZE as u128;
let max = MAX_AUTOMATIC_BUF_SIZE as u128;
let clamped = bytes.clamp(min, max);
clamped.min(usize::MAX as u128) as usize
}

fn desired_file_buffer_bytes(total_bytes: u128) -> u128 {
if total_bytes == 0 {
return 0;
}

let max = MAX_AUTOMATIC_BUF_SIZE as u128;

if total_bytes <= max {
let expanded = total_bytes.saturating_mul(12).clamp(total_bytes, max);
return expanded;
}

let quarter = total_bytes / 4;
quarter.max(max)
}

#[cfg(target_os = "linux")]
fn available_memory_bytes() -> Option<u128> {
let meminfo = std::fs::read_to_string("/proc/meminfo").ok()?;
let mut mem_available = None;
let mut mem_total = None;

for line in meminfo.lines() {
if let Some(value) = line.strip_prefix("MemAvailable:") {
mem_available = parse_meminfo_value(value);
} else if let Some(value) = line.strip_prefix("MemTotal:") {
mem_total = parse_meminfo_value(value);
}

if mem_available.is_some() && mem_total.is_some() {
break;
}
}

mem_available.or(mem_total)
}

#[cfg(target_os = "linux")]
fn parse_meminfo_value(value: &str) -> Option<u128> {
let amount_kib = value.split_whitespace().next()?.parse::<u128>().ok()?;
Some(amount_kib * 1024)
}

fn physical_memory_bytes() -> Option<u128> {
#[cfg(target_family = "unix")]
{
let pages = unsafe { libc::sysconf(_SC_PHYS_PAGES) };
let page_size = unsafe { libc::sysconf(_SC_PAGESIZE) };
if pages <= 0 || page_size <= 0 {
return None;
}
let pages = u128::try_from(pages).ok()?;
let page_size = u128::try_from(page_size).ok()?;
Some(pages.saturating_mul(page_size))
}

#[cfg(not(target_family = "unix"))]
{
None
}
}

#[uucore::main]
#[allow(clippy::cognitive_complexity)]
pub fn uumain(args: impl uucore::Args) -> UResult<()> {
Expand Down Expand Up @@ -1157,14 +1282,13 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
}
}

settings.buffer_size =
matches
.get_one::<String>(options::BUF_SIZE)
.map_or(Ok(DEFAULT_BUF_SIZE), |s| {
GlobalSettings::parse_byte_count(s).map_err(|e| {
USimpleError::new(2, format_error_message(&e, s, options::BUF_SIZE))
})
})?;
settings.buffer_size = matches.get_one::<String>(options::BUF_SIZE).map_or_else(
|| Ok(automatic_buffer_size(&files)),
|s| {
GlobalSettings::parse_byte_count(s)
.map_err(|e| USimpleError::new(2, format_error_message(&e, s, options::BUF_SIZE)))
},
)?;

let mut tmp_dir = TmpDirWrapper::new(
matches
Expand Down Expand Up @@ -1983,6 +2107,23 @@ mod tests {

use super::*;

#[test]
fn desired_buffer_matches_total_when_small() {
let six_mebibytes = 6 * 1024 * 1024;
let expected = ((six_mebibytes as u128) * 12)
.clamp(six_mebibytes as u128, MAX_AUTOMATIC_BUF_SIZE as u128);
assert_eq!(desired_file_buffer_bytes(six_mebibytes as u128), expected);
}

#[test]
fn desired_buffer_caps_at_max_for_large_inputs() {
let large = 256 * 1024 * 1024; // 256 MiB
assert_eq!(
desired_file_buffer_bytes(large as u128),
MAX_AUTOMATIC_BUF_SIZE as u128
);
}

fn tokenize_helper(line: &[u8], separator: Option<u8>) -> Vec<Field> {
let mut buffer = vec![];
tokenize(line, separator, &mut buffer);
Expand Down
110 changes: 84 additions & 26 deletions src/uu/sort/src/tmp_dir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
use std::sync::atomic::{AtomicBool, Ordering};
use std::{
fs::File,
path::{Path, PathBuf},
sync::{Arc, Mutex},
sync::{Arc, Mutex, OnceLock},
};

use tempfile::TempDir;
Expand All @@ -29,6 +30,64 @@
lock: Arc<Mutex<()>>,
}

#[derive(Default, Clone)]
struct HandlerRegistration {
lock: Option<Arc<Mutex<()>>>,
path: Option<PathBuf>,
}

fn handler_state() -> Arc<Mutex<HandlerRegistration>> {
static HANDLER_STATE: OnceLock<Arc<Mutex<HandlerRegistration>>> = OnceLock::new();
HANDLER_STATE
.get_or_init(|| Arc::new(Mutex::new(HandlerRegistration::default())))
.clone()
}

fn ensure_signal_handler_installed(state: Arc<Mutex<HandlerRegistration>>) -> UResult<()> {
static HANDLER_INSTALLED: AtomicBool = AtomicBool::new(false);

if HANDLER_INSTALLED
.compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
.is_err()
{
return Ok(());
}

let handler_state = state.clone();
if let Err(e) = ctrlc::set_handler(move || {
let (lock, path) = {
let state = handler_state.lock().unwrap();
(state.lock.clone(), state.path.clone())
};

if let Some(lock) = lock {
let _guard = lock.lock().unwrap();
if let Some(path) = path {
if let Err(e) = remove_tmp_dir(&path) {
show_error!(
"{}",
translate!(
"sort-failed-to-delete-temporary-directory",
"error" => e
)
);
}
}
}

std::process::exit(2)
}) {
HANDLER_INSTALLED.store(false, Ordering::Release);
return Err(USimpleError::new(

Check failure on line 81 in src/uu/sort/src/tmp_dir.rs

View workflow job for this annotation

GitHub Actions / Style and Lint (ubuntu-24.04, unix)

ERROR: `cargo clippy`: useless conversion to the same type: `std::boxed::Box<dyn uucore::error::UError>` (file:'src/uu/sort/src/tmp_dir.rs', line:81)
2,
translate!("sort-failed-to-set-up-signal-handler", "error" => e),
)
.into());
}

Ok(())
}

impl TmpDirWrapper {
pub fn new(path: PathBuf) -> Self {
Self {
Expand All @@ -52,31 +111,14 @@
);

let path = self.temp_dir.as_ref().unwrap().path().to_owned();
let lock = self.lock.clone();
ctrlc::set_handler(move || {
// Take the lock so that `next_file_path` returns no new file path,
// and the program doesn't terminate before the handler has finished
let _lock = lock.lock().unwrap();
if let Err(e) = remove_tmp_dir(&path) {
show_error!(
"{}",
translate!(
"sort-failed-to-delete-temporary-directory",
"error" => e
)
);
}
std::process::exit(2)
})
.map_err(|e| {
USimpleError::new(
2,
translate!(
"sort-failed-to-set-up-signal-handler",
"error" => e
),
)
})
let state = handler_state();
{
let mut guard = state.lock().unwrap();
guard.lock = Some(self.lock.clone());
guard.path = Some(path);
}

ensure_signal_handler_installed(state)
}

pub fn next_file(&mut self) -> UResult<(File, PathBuf)> {
Expand All @@ -100,6 +142,22 @@
}
}

impl Drop for TmpDirWrapper {
fn drop(&mut self) {
let state = handler_state();
let mut guard = state.lock().unwrap();

if guard
.lock
.as_ref()
.is_some_and(|current| Arc::ptr_eq(current, &self.lock))
{
guard.lock = None;
guard.path = None;
}
}
}

/// Remove the directory at `path` by deleting its child files and then itself.
/// Errors while deleting child files are ignored.
fn remove_tmp_dir(path: &Path) -> std::io::Result<()> {
Expand Down
Loading