Skip to content

Commit

Permalink
Merge pull request #207 from pythonspeed/201.disable-oom-detection
Browse files Browse the repository at this point in the history
Option to disable oom detection
  • Loading branch information
itamarst authored Jul 12, 2021
2 parents 2e36a35 + 92d429a commit d2ebb6a
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 49 deletions.
1 change: 1 addition & 0 deletions .changelog/201.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added a `--disable-oom-detection` option to disable the out-of-memory detection heuristic.
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,12 @@ Fil uses three heuristics to determine if the process is close to running out of
* The operating system or memory-limited cgroup (e.g. a Docker container) only has 100MB of RAM available.
* The process swap is larger than available memory, indicating heavy swapping by the process.
In general you want to avoid swapping, and e.g. [explicitly use `mmap()`](https://pythonspeed.com/articles/mmap-vs-zarr-hdf5/) if you expect to be using disk as a backfill for memory.


#### Disabling the out-of-memory detection

Sometimes the out-of-memory detection heuristic will kick in too soon, shutting down the program even though in practice it could finish running.
You can disable the heuristic by running `fil-profile --disable-oom-detection run yourprogram.py`.

## <a name="reducing-memory-usage">Reducing memory usage in your code</a>

You've found where memory usage is coming from—now what?
Expand Down
14 changes: 10 additions & 4 deletions filpreload/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use parking_lot::Mutex;
use pymemprofile_api::memorytracking::{AllocationTracker, CallSiteId, Callstack, FunctionId};
use pymemprofile_api::oom::{MemoryInfo, OutOfMemoryEstimator, RealMemoryInfo};
use pymemprofile_api::oom::{InfiniteMemory, OutOfMemoryEstimator, RealMemoryInfo};
use std::cell::RefCell;
use std::ffi::CStr;
use std::os::raw::{c_char, c_int, c_void};
Expand All @@ -18,14 +18,20 @@ static GLOBAL: Jemalloc = Jemalloc;
thread_local!(static THREAD_CALLSTACK: RefCell<Callstack> = RefCell::new(Callstack::new()));

struct TrackerState {
oom: OutOfMemoryEstimator<RealMemoryInfo>,
oom: OutOfMemoryEstimator,
allocations: AllocationTracker,
}

lazy_static! {
static ref TRACKER_STATE: Mutex<TrackerState> = Mutex::new(TrackerState {
allocations: AllocationTracker::new("/tmp".to_string()),
oom: OutOfMemoryEstimator::new(RealMemoryInfo::new()),
oom: OutOfMemoryEstimator::new(
if std::env::var("__FIL_DISABLE_OOM_DETECTION") == Ok("1".to_string()) {
Box::new(InfiniteMemory {})
} else {
Box::new(RealMemoryInfo::new())
}
),
});
}

Expand Down Expand Up @@ -113,7 +119,7 @@ fn add_allocation(
}
tracker_state.allocations.oom_break_glass();
eprintln!("=fil-profile= WARNING: Detected out-of-memory condition, exiting soon.");
tracker_state.oom.memory_info.print_info();
tracker_state.oom.print_info();
}

let allocations = &mut tracker_state.allocations;
Expand Down
23 changes: 21 additions & 2 deletions filprofiler/_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,12 @@
default="fil-result",
help="Directory where the profiling results written",
)
PARSER.add_argument(
"--disable-oom-detection",
action="store_true",
default=False,
help="Disable the heuristic that tries to catch out-of-memory situations before they occur",
)
PARSER.add_argument(
"--no-browser",
action="store_true",
Expand All @@ -92,13 +98,21 @@
subparsers = PARSER.add_subparsers(help="sub-command help")
parser_run = subparsers.add_parser(
"run",
help="Run a Python script or package",
help="Run a Python script or package with Fil enabled",
prefix_chars=[""],
add_help=False,
)
parser_python = subparsers.add_parser(
"python",
help="Run a Python script or package with Fil initially disabled",
prefix_chars=[""],
add_help=False,
)
parser_run.set_defaults(command="run")
parser_run.add_argument("rest", nargs=REMAINDER)
del subparsers, parser_run
parser_python.set_defaults(command="python")
parser_python.add_argument("rest", nargs=REMAINDER)
del subparsers, parser_run, parser_python

# Can't figure out if this is a standard path _everywhere_, but it definitely
# exists on Ubuntu 18.04 and 20.04, Debian Buster, CentOS 8, and Arch.
Expand Down Expand Up @@ -144,6 +158,11 @@ def stage_1():
PARSER.print_help()
sys.exit(0)

arguments = PARSER.parse_args()
if arguments.disable_oom_detection:
# See filpreload/src/lib.rs:
environ["__FIL_DISABLE_OOM_DETECTION"] = "1"

# Initial status:
environ["__FIL_STATUS"] = "launcher"
# Tracebacks when Rust crashes:
Expand Down
107 changes: 68 additions & 39 deletions memapi/src/oom.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,18 @@ pub trait MemoryInfo {
/// Second, we probably don't want to check every time, that's expensive. So
/// check every 1% of allocations remaining until we run out of available memory
/// (we don't even check for free()s, which just means more frequent checks).
pub struct OutOfMemoryEstimator<M: MemoryInfo> {
pub struct OutOfMemoryEstimator {
// How many bytes it takes until we check again: whenever it's reset, it
// starts as 1% of available memory.
check_threshold_bytes: usize,
// Minimum number of bytes we want to be available at any time.
minimal_required_available_bytes: usize,
// Pluggable way to get memory usage of the system and process.
pub memory_info: M,
pub memory_info: Box<dyn MemoryInfo + Sync + Send>,
}

impl<M: MemoryInfo> OutOfMemoryEstimator<M> {
pub fn new(memory_info: M) -> Self {
impl OutOfMemoryEstimator {
pub fn new(memory_info: Box<dyn MemoryInfo + Sync + Send>) -> Self {
Self {
check_threshold_bytes: 0,
// Either 100MB or 2% of available memory, whatever is bigger.
Expand Down Expand Up @@ -125,6 +125,10 @@ impl<M: MemoryInfo> OutOfMemoryEstimator<M> {
return false;
}
}

/// Print debugging info about memory by delegating to the configured
/// `memory_info` implementation's `print_info()`.
pub fn print_info(&self) {
self.memory_info.print_info();
}
}

#[cfg(target_os = "linux")]
Expand Down Expand Up @@ -266,12 +270,35 @@ impl MemoryInfo for RealMemoryInfo {
}
}

/// A [`MemoryInfo`] implementation used to disable the out-of-memory
/// heuristic.
///
/// Instead of querying the system, it reports a fixed, very large amount of
/// total and available memory (2^48 bytes) and zero resident process memory.
pub struct InfiniteMemory {}

impl InfiniteMemory {
    /// The fixed byte count reported as both total and available memory.
    fn pretend_bytes() -> usize {
        2usize.pow(48u32)
    }
}

impl MemoryInfo for InfiniteMemory {
    /// Always reports 2^48 bytes of total memory.
    fn total_memory(&self) -> usize {
        Self::pretend_bytes()
    }

    /// Always reports 2^48 bytes of available memory.
    fn get_available_memory(&self) -> usize {
        Self::pretend_bytes()
    }

    /// Always reports that the process uses no resident memory.
    fn get_resident_process_memory(&self) -> usize {
        0
    }

    /// Print a note to stderr that out-of-memory detection is disabled.
    fn print_info(&self) {
        eprintln!("=fil-profile= Out of memory detection is disabled.");
    }
}

#[cfg(test)]
mod tests {
use super::{MemoryInfo, OutOfMemoryEstimator};
use proptest::prelude::*;
use std::cell::Ref;
use std::cell::RefCell;
use std::sync::Arc;

struct FakeMemory {
available_memory: RefCell<usize>,
Expand All @@ -280,12 +307,12 @@ mod tests {
}

impl FakeMemory {
fn new() -> Self {
FakeMemory {
fn new() -> Arc<Self> {
Arc::new(FakeMemory {
available_memory: RefCell::new(1_000_000_000),
checks: RefCell::new(vec![]),
swap: RefCell::new(0),
}
})
}

fn allocate(&self, size: usize) {
Expand All @@ -302,11 +329,11 @@ mod tests {
}

fn get_allocated(&self) -> usize {
self.total_memory() - *self.available_memory.borrow()
1_000_000_000 - *self.available_memory.borrow()
}
}

impl MemoryInfo for FakeMemory {
impl MemoryInfo for Arc<FakeMemory> {
fn total_memory(&self) -> usize {
1_000_000_000
}
Expand All @@ -325,19 +352,24 @@ mod tests {
fn print_info(&self) {}
}

fn setup_estimator() -> OutOfMemoryEstimator<FakeMemory> {
unsafe impl Sync for FakeMemory {}

fn setup_estimator() -> (OutOfMemoryEstimator, Arc<FakeMemory>) {
let fake_memory = FakeMemory::new();
OutOfMemoryEstimator::new(fake_memory)
(
OutOfMemoryEstimator::new(Box::new(fake_memory.clone())),
fake_memory,
)
}

proptest! {
// Random allocations don't break invariants
#[test]
fn not_oom(allocated_sizes in prop::collection::vec(1..1000 as usize, 10..2000)) {
let mut estimator = setup_estimator();
let (mut estimator, memory_info) = setup_estimator();
let mut allocated = 0;
for size in allocated_sizes {
estimator.memory_info.allocate(size);
memory_info.allocate(size);
allocated += size;
let too_big = estimator.too_big_allocation(size, allocated);
prop_assert_eq!(too_big, estimator.memory_info.get_available_memory() <= estimator.minimal_required_available_bytes);
Expand All @@ -351,51 +383,48 @@ mod tests {
// We're out of memory if we're below the threshold.
#[test]
fn oom_threshold() {
let mut estimator = setup_estimator();
assert!(!estimator.are_we_oom(estimator.memory_info.get_allocated()));
estimator.memory_info.allocate(500_000_000);
assert!(!estimator.are_we_oom(estimator.memory_info.get_allocated()));
estimator.memory_info.allocate(350_000_000);
assert!(!estimator.are_we_oom(estimator.memory_info.get_allocated()));
estimator.memory_info.allocate(50_000_000);
let (mut estimator, memory_info) = setup_estimator();
assert!(!estimator.are_we_oom(memory_info.get_allocated()));
memory_info.allocate(500_000_000);
assert!(!estimator.are_we_oom(memory_info.get_allocated()));
memory_info.allocate(350_000_000);
assert!(!estimator.are_we_oom(memory_info.get_allocated()));
memory_info.allocate(50_000_000);
// Now that we're below the maximum, we've gone too far:
assert!(estimator.are_we_oom(estimator.memory_info.get_allocated()));
estimator.memory_info.allocate(40_000_000);
assert!(estimator.are_we_oom(estimator.memory_info.get_allocated()));
assert!(estimator.are_we_oom(memory_info.get_allocated()));
memory_info.allocate(40_000_000);
assert!(estimator.are_we_oom(memory_info.get_allocated()));
}

// We're out of memory if swap > available.
#[test]
fn oom_swap() {
let mut estimator = setup_estimator();
estimator.memory_info.allocate(500_000_001);
assert!(!estimator.are_we_oom(estimator.memory_info.get_allocated()));
let (mut estimator, memory_info) = setup_estimator();
memory_info.allocate(500_000_001);
assert!(!estimator.are_we_oom(memory_info.get_allocated()));

estimator.memory_info.add_swap(499_999_999);
assert!(!estimator.are_we_oom(estimator.memory_info.get_allocated()));
memory_info.add_swap(499_999_999);
assert!(!estimator.are_we_oom(memory_info.get_allocated()));

estimator.memory_info.add_swap(2);
assert!(estimator.are_we_oom(estimator.memory_info.get_allocated()));
memory_info.add_swap(2);
assert!(estimator.are_we_oom(memory_info.get_allocated()));
}

// The intervals between checking if out-of-memory shrink as we get closer
// to running out of memory
#[test]
fn oom_estimator_shrinking_intervals() {
let mut estimator = setup_estimator();
let (mut estimator, memory_info) = setup_estimator();
loop {
{
let memory = &mut estimator.memory_info;
memory.allocate(10_000);
}
if estimator.too_big_allocation(10_000, estimator.memory_info.get_allocated()) {
memory_info.allocate(10_000);

if estimator.too_big_allocation(10_000, memory_info.get_allocated()) {
break;
}
// by 100MB we should have detected OOM.
assert!(*(&estimator.memory_info).available_memory.borrow() >= 99_000_000);
assert!(*memory_info.available_memory.borrow() >= 99_000_000);
}
let fake_memory = estimator.memory_info;
let checks = fake_memory.get_checks();
let checks = memory_info.get_checks();
// Each check should come closer than the next:
for pair in checks.windows(2) {
assert!(pair[0] >= pair[1], "{} vs {}", pair[0], pair[1]);
Expand Down
29 changes: 26 additions & 3 deletions tests/test_endtoend.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,27 @@ def test_out_of_memory_slow_leak():
assert match(allocations, {expected_alloc: big}, as_mb) > 100


@pytest.mark.skipif(
shutil.which("systemd-run") is None or glibc_version() < (2, 30),
reason="systemd-run not found, or old systemd probably",
)
def test_out_of_memory_detection_disabled():
"""
If out-of-memory detection is disabled, we won't catch problems, the OS will.
"""
available_memory = psutil.virtual_memory().available
script = TEST_SCRIPTS / "oom-slow.py"
try:
check_call(
get_systemd_run_args(available_memory // 4)
+ ["fil-profile", "--disable-oom-detection", "run", str(script)]
)
except CalledProcessError as e:
assert e.returncode == -9 # killed by OS
else:
assert False, "process succeeded?!"


def get_systemd_run_args(available_memory):
"""
Figure out if we're on system with cgroups v2, or not, and return
Expand All @@ -319,14 +340,16 @@ def get_systemd_run_args(available_memory):
"--gid",
str(os.getegid()),
"-p",
f"MemoryLimit={available_memory // 2}B",
f"MemoryLimit={available_memory // 4}B",
"--scope",
"--same-dir",
]
try:
check_call(args + ["--user", "printf", "hello"])
args += ["--user", "--scope"]
args += ["--user"]
except CalledProcessError:
# cgroups v1 doesn't do --user :(
args = ["sudo", "--preserve-env=PATH"] + args + ["-t", "--same-dir"]
args = ["sudo", "--preserve-env=PATH"] + args
return args


Expand Down

0 comments on commit d2ebb6a

Please sign in to comment.