Skip to content

Commit

Permalink
Add test for fixed storage broker issue (#9311)
Browse files Browse the repository at this point in the history
Adds a test for the (now fixed) storage broker limit issue, see #9268
for the description and #9299 for the fix.

Also fix a race condition with endpoint creation/starts running in parallel,
leading to file not found errors.
  • Loading branch information
arpad-m authored Oct 14, 2024
1 parent 31b7703 commit d92ff57
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 2 deletions.
16 changes: 15 additions & 1 deletion control_plane/src/endpoint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,21 @@ impl ComputeControlPlane {
for endpoint_dir in std::fs::read_dir(env.endpoints_path())
.with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
{
let ep = Endpoint::from_dir_entry(endpoint_dir?, &env)?;
let ep_res = Endpoint::from_dir_entry(endpoint_dir?, &env);
let ep = match ep_res {
Ok(ep) => ep,
Err(e) => match e.downcast::<std::io::Error>() {
Ok(e) => {
// A parallel task could delete an endpoint while we have just scanned the directory
if e.kind() == std::io::ErrorKind::NotFound {
continue;
} else {
Err(e)?
}
}
Err(e) => Err(e)?,
},
};
endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
}

Expand Down
34 changes: 33 additions & 1 deletion test_runner/regress/test_tenants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import concurrent.futures
import os
import threading
import time
from contextlib import closing
from datetime import datetime
Expand All @@ -10,7 +11,7 @@

import pytest
import requests
from fixtures.common_types import Lsn, TenantId
from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.metrics import (
PAGESERVER_GLOBAL_METRICS,
Expand Down Expand Up @@ -476,3 +477,34 @@ def only_int(samples: list[Sample]) -> int:
assert counts
log.info(f"directory counts: {counts}")
assert counts[2] > COUNT_AT_LEAST_EXPECTED


def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv):
    """
    (Relaxed) regression test for issue that led to https://github.com/neondatabase/neon/pull/9268
    Create many endpoints in parallel and then restart them

    Each worker creates and starts an endpoint on its own branch, stops it,
    waits on a shared barrier, and then starts it again so that all restarts
    are issued at (roughly) the same time.

    Any exception raised inside a worker is re-raised in the main thread, so
    a failing endpoint operation fails the test instead of being silently
    dropped with the worker thread.
    """
    env = neon_simple_env

    # This param needs to be 200+ to reproduce the limit issue
    n_threads = 16
    barrier = threading.Barrier(n_threads)

    def test_timeline(branch_name: str, timeline_id: TimelineId):
        # timeline_id is accepted for symmetry with create_branch() but is not
        # needed to drive the endpoint.
        try:
            endpoint = env.endpoints.create_start(branch_name)
            endpoint.stop()
        except BaseException:
            # This worker will never reach the barrier; break it so that the
            # other workers fail fast instead of waiting forever.
            barrier.abort()
            raise
        # Use a barrier to make sure we restart endpoints at the same time
        barrier.wait()
        endpoint.start()

    futures = []
    # max_workers == n_threads: every worker must be able to run concurrently,
    # otherwise the barrier could never be released.
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
        try:
            for i in range(n_threads):
                branch_name = f"branch_{i}"
                timeline_id = env.create_branch(branch_name)
                # Submit right away so endpoint creation overlaps with the
                # creation of the remaining branches.
                futures.append(executor.submit(test_timeline, branch_name, timeline_id))
        except BaseException:
            # Not all workers will be spawned; release the ones already
            # waiting so that leaving the executor does not block.
            barrier.abort()
            raise

    # The executor has been shut down, so every future is finished here.
    failures = [exc for exc in (f.exception() for f in futures) if exc is not None]
    if failures:
        # Prefer the real failure over the BrokenBarrierError it caused in
        # the sibling workers.
        root_causes = [
            exc for exc in failures if not isinstance(exc, threading.BrokenBarrierError)
        ]
        raise (root_causes or failures)[0]

1 comment on commit d92ff57

@github-actions
Copy link

@github-actions github-actions bot commented on d92ff57 Oct 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

5298 tests run: 5081 passed, 0 failed, 217 skipped (full report)


Flaky tests (2)

Postgres 16

Postgres 14

Code coverage* (full report)

  • functions: 31.4% (7544 of 24027 functions)
  • lines: 49.2% (60345 of 122720 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
d92ff57 at 2024-10-14T16:23:49.073Z :recycle:

Please sign in to comment.