Skip to content

Commit 6b3c443

Browse files
committed
nixos/platform: change default nix to 2.18 (prod) or 2.25 (non-prod)
PL-133484

Nix 2.24's inefficient Git tarball cache is a major problem for VMs[1] with low IOPS (default is 250). The way forward is to

* switch back to Nix 2.18 on all production VMs.
* roll out Nix 2.25 as designated successor to all non-production VMs (approach derived from the verification kernel topic from eba8be3).

The challenge here is that `flyingcircus.agent.package` is supposed to be modified by downstream consumers, e.g. for Slurm support. Hence, the option isn't modified directly, but each agent package is post-processed via `apply` to use `config.nix.package`.

A nice side-effect of this is that setting `nix.package` also changes the Nix used by the agent, so `nix.package` behaves as I'd expect it to. Please note that this also means that setting `nix.package` to e.g. Nix 2.26 implies a rebuild of the agent now.

I decided against overriding `pkgs.nix` with an overlay since there are a bunch of packages out there that explicitly require a specific Nix version, so the potential fallout from that is higher than modifying `nix.package`.

Additionally I changed the usage of Nix 2.24 in the following places:

* The PATH of `fc-collect-garbage.service` doesn't have a Nix at all anymore. The agent package already prefers its own Nix, so this had no effect at all.
* The agent isn't built against 2.24 in pkgs/fc: I see no reason to do that since there's zero usage of this. It's now built with Nix 2.18 since that's what the majority of all VMs will use for now. The variant with 2.25 is also built by Hydra because of the VM-test, so staging VMs don't have to build their own agent.
* The sensu check-env uses the default Nix as well.

[1] See PL-133484 for measurements with Nix versions and related upstream bugs.
1 parent a64580d commit 6b3c443

File tree

7 files changed

+179
-4
lines changed

7 files changed

+179
-4
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
<!--
2+
3+
A new changelog entry.
4+
5+
Delete placeholder items that do not apply. Empty sections will be removed
6+
automatically during release.
7+
8+
Leave the XX.XX as is: this is a placeholder and will be automatically filled
9+
correctly during the release and helps when backporting over multiple platform
10+
branches.
11+
12+
-->
13+
14+
### Impact
15+
16+
- Nix: downgrade production VMs to 2.18 (and upgrade the rest to 2.25).
17+
18+
Due to a significant performance regression in 2.24, Nix will be rolled back
19+
to 2.18, the default from 24.05 and 23.11. Staging machines will get Nix 2.25
20+
as a preparation for upgrading the entire platform to 2.25.
21+
22+
### NixOS XX.XX platform
23+
24+
- Restart of `nix-daemon`.

nixos/platform/collect-garbage.nix

+1-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ in {
6666
SuccessExitStatus = [ 1 2 3 ];
6767
TimeoutStartSec = "infinity";
6868
};
69-
path = with pkgs; [ fc.userscan nix glibc util-linux ];
69+
path = with pkgs; [ fc.userscan glibc util-linux ];
7070
environment = {
7171
LANG = "en_US.utf8";
7272
PYTHONUNBUFFERED = "1";

nixos/platform/default.nix

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ in {
3737
./kernel.nix
3838
./monitoring.nix
3939
./network.nix
40+
./nix.nix
4041
./packages.nix
4142
./shell.nix
4243
./static.nix

nixos/platform/nix.nix

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
{ config, lib, pkgs, ... }:
2+
3+
let
4+
location = lib.attrByPath [ "parameters" "location" ] "" config.flyingcircus.enc;
5+
production = lib.attrByPath [ "parameters" "production" ] "" config.flyingcircus.enc;
6+
7+
nixPackage = if config.flyingcircus.nix.useUnstableNix
8+
then pkgs.nixVersions.nix_2_25
9+
else pkgs.nixVersions.nix_2_18;
10+
in {
11+
options.flyingcircus = {
12+
nix.useUnstableNix = lib.mkOption {
13+
default = (location == "dev") || (location == "whq") || (production == false); # on by default for the dev/whq locations and for every non-production VM
14+
defaultText = lib.literalExpression ''(location == "dev") || (location == "whq") || (production == false)'';
15+
type = lib.types.bool;
16+
description = ''
17+
Whether to use a newer, potentially unstable Nix version (i.e. 2.25)
18+
instead of the known stable one (i.e. 2.18).
19+
'';
20+
};
21+
22+
# The option is defined in `<fc-nixos/nixos/platform/agent.nix>`.
23+
# This injects a function that makes sure that the agent uses the correct
24+
# Nix version.
25+
#
26+
# It's not feasible to modify `config.flyingcircus.agent.package` for this,
27+
# since downstream consumers may do that already, e.g. for slurm support.
28+
agent.package = lib.mkOption {
29+
apply = package: package.override { nix = config.nix.package; };
30+
};
31+
};
32+
33+
config = {
34+
nix.package = nixPackage;
35+
};
36+
}

nixos/services/sensu/client.nix

+1-1
Original file line numberDiff line numberDiff line change
@@ -609,7 +609,7 @@ in {
609609
glibc
610610
lm_sensors
611611
monitoring-plugins
612-
nix
612+
config.nix.package
613613
openssl
614614
procps
615615
python3

pkgs/fc/default.nix

+4-2
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
rec {
44
recurseForDerivations = true;
55

6-
agent = pythonPackages.callPackage ./agent {};
7-
agentWithSlurm = pythonPackages.callPackage ./agent { enableSlurm = true; };
6+
agent = pythonPackages.callPackage ./agent {
7+
nix = pkgs.nixVersions.nix_2_18;
8+
};
9+
agentWithSlurm = agent.override { enableSlurm = true; };
810

911
blockdev = callPackage ./blockdev {};
1012

tests/nix-version.nix

+112
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
import ./make-test-python.nix ({ ... }: {
2+
name = "nixversions";
3+
nodes = let
4+
mkTestVM = { location, withSlurm ? false, production ? true }: { lib, pkgs, ... }: {
5+
imports = [ ../nixos ../nixos/roles ];
6+
flyingcircus.agent.package = lib.mkIf withSlurm pkgs.fc.agentWithSlurm;
7+
flyingcircus.enc.parameters = {
8+
inherit location production;
9+
resource_group = "test";
10+
interfaces.srv = {
11+
mac = "52:54:00:12:34:56";
12+
bridged = false;
13+
networks = {
14+
"192.168.101.0/24" = [ "192.168.101.7" ];
15+
"2001:db8:f030:1c3::/64" = [ "2001:db8:f030:1c3::7" ];
16+
};
17+
gateways = {};
18+
};
19+
};
20+
};
21+
in {
22+
production = mkTestVM { location = "rzob"; };
23+
nonProd = mkTestVM { location = "rzob"; production = false; };
24+
slurmOnProduction = mkTestVM { location = "rzob"; withSlurm = true; };
25+
slurmOnNonProd = mkTestVM { location = "rzob"; withSlurm = true; production = false; };
26+
27+
devVM = mkTestVM { location = "dev"; };
28+
whqVM = mkTestVM { location = "whq"; };
29+
whqVMNonProd = mkTestVM { location = "whq"; production = false; };
30+
};
31+
32+
testScript = ''
33+
import os.path
34+
import re
35+
36+
relevant_nix_versions = [
37+
# the default on production
38+
"2.18",
39+
# default Nix in 24.11
40+
"2.24",
41+
# the default on staging
42+
"2.25",
43+
]
44+
45+
def strip_hash(store_path):
46+
basename = os.path.basename(store_path)
47+
return basename.split("-", 1)[1]
48+
49+
def verify_nix_versions(vm, expected_nix="2.25", expect_slurm=False):
50+
vm.start()
51+
version = vm.succeed("nix --version")
52+
assert version.startswith(f"nix (Nix) {expected_nix}."), f"""
53+
Expected Nix version to start with
54+
nix (Nix) {expected_nix}.
55+
Full output:
56+
{version}
57+
"""
58+
nix = os.path.dirname(os.path.dirname(vm.succeed("readlink -f $(type -P nix)")))
59+
fc_agent = vm.succeed("readlink -f $(type -P fc-manage)")
60+
61+
# no `nix why-depends` here since it always gives 0 as exit code and we'd have to
62+
# match the stderr.
63+
agent_closure = vm.succeed(f"nix-store -qR {fc_agent}").rstrip("\n").split("\n")
64+
assert nix in agent_closure, f"""
65+
Expected Nix {expected_nix} (i.e. store-path {nix}) to be in
66+
the agent closure:
67+
68+
{agent_closure}
69+
"""
70+
71+
nix_versions_not_used = [x for x in relevant_nix_versions if x != expected_nix]
72+
assert all(
73+
not strip_hash(p).startswith(f"nix-{majorminor}.")
74+
for p in agent_closure
75+
for majorminor in nix_versions_not_used
76+
), f"""
77+
Expected Nix versions {", ".join(nix_versions_not_used)} to NOT be in the agent closure:
78+
79+
{agent_closure}
80+
"""
81+
82+
assert any(strip_hash(p).startswith("slurm-") for p in agent_closure) == expect_slurm, f"""
83+
Expected `slurm` in agent closure: {expect_slurm}. Actual closure:
84+
85+
{agent_closure}
86+
"""
87+
88+
vm.shutdown()
89+
90+
91+
with subtest("rzob production vm"):
92+
verify_nix_versions(production, "2.18")
93+
94+
with subtest("rzob non-prod vm"):
95+
verify_nix_versions(nonProd, "2.25")
96+
97+
with subtest("rzob prod vm with slurm"):
98+
verify_nix_versions(slurmOnProduction, "2.18", expect_slurm=True)
99+
100+
with subtest("rzob non-prod vm with slurm"):
101+
verify_nix_versions(slurmOnNonProd, "2.25", expect_slurm=True)
102+
103+
with subtest("whq vm"):
104+
verify_nix_versions(whqVM, "2.25")
105+
106+
with subtest("whq vm with non-prod flag"):
107+
verify_nix_versions(whqVMNonProd, "2.25")
108+
109+
with subtest("dev vm"):
110+
verify_nix_versions(devVM, "2.25")
111+
'';
112+
})

0 commit comments

Comments
 (0)