-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlibcontainer.cc
1600 lines (1362 loc) · 48.8 KB
/
libcontainer.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2016 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <syscall.h>
#include <unistd.h>
#include <algorithm>
#include <map>
#include <memory>
#include <ostream>
#include <set>
#include <sstream>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
#include <base/files/file_path.h>
#include <base/files/file_util.h>
#include <base/files/scoped_file.h>
#include <base/functional/bind.h>
#include <base/functional/callback_helpers.h>
#include <base/logging.h>
#include <base/strings/string_util.h>
#include <base/strings/stringprintf.h>
#include <libminijail.h>
#include <scoped_minijail.h>
#include "libcontainer/cgroup.h"
#include "libcontainer/config.h"
#include "libcontainer/libcontainer.h"
#include "libcontainer/libcontainer_util.h"
// Evaluates to a std::string holding |s| wrapped in double quotes. Used below
// when writing string-valued fields to a stream.
#define QUOTE(s) ('"' + std::string(s) + '"')
// Not available in sys/prctl.h yet, but supported on some kernels. This
// fallback definition only takes effect when the header does not already
// provide the constant.
#ifndef PR_SET_CORE_SCHED
#define PR_SET_CORE_SCHED 0x200
#endif
namespace {
using libcontainer::DeviceMapperDetach;
using libcontainer::DeviceMapperSetup;
using libcontainer::GetUsernsOutsideId;
using libcontainer::Loopdev;
using libcontainer::LoopdevDetach;
using libcontainer::LoopdevSetup;
using libcontainer::MakeDir;
using libcontainer::MountExternal;
using libcontainer::TouchFile;
// Capacity of the fixed-size rlimits array held by container_config.
constexpr size_t kMaxRlimits = 32; // Linux defines 15 at the time of writing.
// Describes one filesystem to mount for the container. This is an aggregate
// that is brace-initialized elsewhere in the file, so member order matters.
struct Mount {
// Label for the mount. In the code visible here it is only printed.
std::string name;
// What to mount. A relative path is resolved against the container's runfs
// root for bind mounts, or against the config root (when one is set) for
// loopback mounts; otherwise it is used as given.
base::FilePath source;
// Mount point, resolved against the container's runfs root for external
// mounts and handed to minijail as-is for in-namespace mounts.
base::FilePath destination;
// Filesystem type passed to the mount call.
std::string type;
// Mount data string; an empty string is passed to minijail as nullptr.
std::string data;
// When non-empty, the source is first set up through dm-verity with this
// string.
std::string verity;
// MS_* mount flags.
int flags;
// Owner ids for a created destination. They are translated through the
// uid/gid maps before use.
int uid;
int gid;
// Mode used when the destination has to be created.
int mode;
// True if mount should happen in new vfs ns.
bool mount_in_ns;
// True if target should be created if it doesn't exist.
bool create;
// True if target should be mounted via loopback.
bool loopback;
};
// Describes one device node to create under the container's root.
struct Device {
// 'c' or 'b' for char or block
char type;
// Path of the node, resolved against the container's runfs root.
base::FilePath path;
// Permission bits applied to the node (combined with the node type for
// mknod, then applied again via chmod).
int fs_permissions;
// Device numbers. A negative value after any copying causes the device to be
// skipped.
int major;
int minor;
// Copy the major from existing node, ignores |major|.
bool copy_major;
// Copy the minor from existing node, ignores |minor|.
bool copy_minor;
// Owner ids for the node. They are translated through the uid/gid maps
// before chown.
int uid;
int gid;
};
// One device-cgroup rule. Every field is forwarded to Cgroup::AddDevice().
struct CgroupDevice {
// Whether the rule allows the access (presumably a deny rule when false --
// confirm against Cgroup::AddDevice).
bool allow;
// Device type character, forwarded unchanged.
char type;
// -1 for either major or minor means all.
int major;
int minor;
// Access kinds covered by the rule.
bool read;
bool write;
bool modify;
};
// CPU cgroup parameters. In the code visible here they are only stored and
// dumped; presumably they map to the cpu cgroup's shares, CFS quota/period and
// RT runtime/period settings -- confirm against the code that applies them.
struct CpuCgroup {
int shares;
int quota;
int period;
int rt_runtime;
int rt_period;
};
// One resource limit entry for the contained process.
struct Rlimit {
// Resource identifier (presumably an RLIMIT_* constant -- confirm at the
// point of use).
int type;
// Current and maximum values for the limit.
rlim_t cur;
rlim_t max;
};
} // namespace
// Structure that configures how the container is run. Instances are created
// by container_config_create() and filled in through the container_config_*
// setters.
struct container_config {
// Path to the root of the container itself.
base::FilePath config_root;
// Path to the root of the container's filesystem.
base::FilePath rootfs;
// Flags that will be passed to mount() for the rootfs.
unsigned long rootfs_mount_flags = 0x0;
// Path to where the container will be run.
base::FilePath premounted_runfs;
// Path to the file where the pid should be written.
base::FilePath pid_file_path;
// The program to run and args, e.g. "/sbin/init".
std::vector<std::string> program_argv;
// The uid the container will run as.
uid_t uid = 0;
// Mapping of UIDs in the container, e.g. "0 100000 1024"
std::string uid_map;
// The gid the container will run as.
gid_t gid = 0;
// Mapping of GIDs in the container, e.g. "0 100000 1024"
std::string gid_map;
// The supplementary gids that are attached to the container.
std::vector<gid_t> additional_gids;
// Syscall table to use (presumably left empty when none is set).
std::string alt_syscall_table;
// Filesystems to mount in the new namespace.
std::vector<Mount> mounts;
// Namespaces that should be used for the container.
std::set<std::string> namespaces;
// Device nodes to create.
std::vector<Device> devices;
// Device node cgroup permissions.
std::vector<CgroupDevice> cgroup_devices;
// CPU cgroup params.
CpuCgroup cpu_cgparams;
// Parent dir for cgroup creation
base::FilePath cgroup_parent;
// uid to own the created cgroups
uid_t cgroup_owner = 0;
// gid to own the created cgroups
gid_t cgroup_group = 0;
// Allow the child process to keep open FDs (for stdin/out/err).
bool keep_fds_open = false;
// Array of rlimits for the contained process.
Rlimit rlimits[kMaxRlimits];
// The number of elements in `rlimits`.
int num_rlimits = 0;
// Capability mask settings. The consumer of these fields is outside the code
// visible here.
bool use_capmask = false;
bool use_capmask_ambient = false;
uint64_t capmask = 0x0;
// The mask of securebits to skip when restricting caps.
uint64_t securebits_skip_mask = 0x0;
// Core Scheduling policy
bool core_sched = false;
// Whether the container needs an extra process to be run as init.
bool do_init = false;
// The SELinux context name the container will run under.
std::string selinux_context;
// A function pointer to be called prior to calling execve(2).
minijail_hook_t pre_start_hook = nullptr;
// Parameter that will be passed to pre_start_hook().
void* pre_start_hook_payload = nullptr;
// A list of file descriptors to inherit.
std::vector<int> inherited_fds;
// A list of hooks that will be called upon minijail reaching various states
// of execution.
std::map<minijail_hook_event_t, std::vector<libcontainer::HookCallback>>
hooks;
};
// Container manipulation
// Run-time state of a single container.
struct container {
// Cgroup handle used to configure device access for the container.
std::unique_ptr<libcontainer::Cgroup> cgroup;
// The minijail used for in-namespace mounts.
ScopedMinijail jail;
// Pid of the container's init process, or -1 when none has been recorded.
pid_t init_pid = -1;
// Root used to resolve relative loopback mount sources.
base::FilePath config_root;
// Temporary directory created under |rundir| for this container.
base::FilePath runfs;
// Directory in which |runfs| is created.
base::FilePath rundir;
// "root" directory under |runfs| where the rootfs is bind-mounted.
base::FilePath runfsroot;
// Pid file that is unlinked at teardown when non-empty.
base::FilePath pid_file_path;
// Mounts made outside of the minijail.
std::vector<base::FilePath> ext_mounts;
// Loop devices and device-mapper targets set up for mounts; detached at
// teardown.
std::vector<Loopdev> loopdevs;
std::vector<std::string> device_mappers;
// Container name; used as the prefix of the runfs directory name.
std::string name;
// Hook state paired with its callbacks. The consumer is outside the code
// visible here.
std::vector<std::pair<libcontainer::HookState,
std::vector<libcontainer::HookCallback>>>
hook_states;
};
namespace {
// Renders the MS_* bits set in |flags| as their names joined by " | ", or
// "no flags" when none of the known bits is set.
std::string GetMountFlagsAsString(int flags) {
  struct NamedFlag {
    int mask;
    const char* label;
  };
// Pairs each flag with its own spelling so the two cannot drift apart.
#define MOUNT_FLAG_ENTRY(flag) {flag, #flag}
  static const NamedFlag kKnownFlags[] = {
      MOUNT_FLAG_ENTRY(MS_RDONLY),      MOUNT_FLAG_ENTRY(MS_NOSUID),
      MOUNT_FLAG_ENTRY(MS_NODEV),       MOUNT_FLAG_ENTRY(MS_NOEXEC),
      MOUNT_FLAG_ENTRY(MS_SYNCHRONOUS), MOUNT_FLAG_ENTRY(MS_REMOUNT),
      MOUNT_FLAG_ENTRY(MS_MANDLOCK),    MOUNT_FLAG_ENTRY(MS_DIRSYNC),
      MOUNT_FLAG_ENTRY(MS_NOATIME),     MOUNT_FLAG_ENTRY(MS_NODIRATIME),
      MOUNT_FLAG_ENTRY(MS_BIND),        MOUNT_FLAG_ENTRY(MS_MOVE),
      MOUNT_FLAG_ENTRY(MS_REC),         MOUNT_FLAG_ENTRY(MS_SILENT),
      MOUNT_FLAG_ENTRY(MS_POSIXACL),    MOUNT_FLAG_ENTRY(MS_UNBINDABLE),
      MOUNT_FLAG_ENTRY(MS_PRIVATE),     MOUNT_FLAG_ENTRY(MS_SLAVE),
      MOUNT_FLAG_ENTRY(MS_SHARED),
  };
#undef MOUNT_FLAG_ENTRY

  std::vector<std::string> names;
  for (const NamedFlag& entry : kKnownFlags) {
    if (flags & entry.mask)
      names.emplace_back(entry.label);
  }
  if (names.empty())
    return "no flags";
  return base::JoinString(names, " | ");
}
// Writes a multi-line, human-readable description of |mount| to |stream|.
std::ostream& operator<<(std::ostream& stream, const Mount& mount) {
  stream << "mount:" << std::endl;
  stream << " name: " << QUOTE(mount.name) << std::endl;
  stream << " source: " << QUOTE(mount.source.value()) << std::endl;
  stream << " destination: " << QUOTE(mount.destination.value()) << std::endl;
  stream << " type: " << QUOTE(mount.type) << std::endl;
  stream << " data: " << QUOTE(mount.data) << std::endl;
  stream << " verity: " << QUOTE(mount.verity) << std::endl;
  // Flags are shown both as raw hex and as the decoded list of MS_* names.
  stream << " flags: 0x" << std::hex << mount.flags << std::dec << " ("
         << GetMountFlagsAsString(mount.flags) << ")" << std::endl;
  stream << " uid: " << mount.uid << std::endl;
  stream << " gid: " << mount.gid << std::endl;
  // Mode is printed in octal with a leading 0.
  stream << " mode: 0" << std::oct << mount.mode << std::dec << std::endl;
  stream << " mount_in_ns: " << mount.mount_in_ns << std::endl;
  stream << " create: " << mount.create << std::endl;
  stream << " loopback: " << mount.loopback << std::endl;
  return stream;
}
std::ostream& operator<<(std::ostream& stream, const Device& device) {
stream << "device:" << std::endl
<< " type: " << device.type << std::endl
<< " path: " << QUOTE(device.path.value()) << std::endl
<< " fs_permissions: 0" << std::oct << device.fs_permissions
<< std::dec << std::endl
<< " major: " << device.major << std::endl
<< " minor: " << device.minor << std::endl
<< " copy_minor: " << device.copy_minor << std::endl
<< " uid: " << device.uid << std::endl
<< " gid: " << device.gid << std::endl;
return stream;
}
std::ostream& operator<<(std::ostream& stream,
const CgroupDevice& cgroup_device) {
stream << "cgroup_device:" << std::endl
<< " allow: " << cgroup_device.allow << std::endl
<< " type: " << cgroup_device.type << std::endl
<< " major: " << cgroup_device.major << std::endl
<< " minor: " << cgroup_device.minor << std::endl
<< " read: " << cgroup_device.read << std::endl
<< " write: " << cgroup_device.write << std::endl
<< " modify: " << cgroup_device.modify << std::endl;
return stream;
}
// Writes a multi-line, human-readable description of |cpu_cgroup| to |stream|.
std::ostream& operator<<(std::ostream& stream, const CpuCgroup& cpu_cgroup) {
  stream << "cpu_cgroup:" << std::endl;
  stream << " shares: " << cpu_cgroup.shares << std::endl;
  stream << " quota: " << cpu_cgroup.quota << std::endl;
  stream << " period: " << cpu_cgroup.period << std::endl;
  stream << " rt_runtime: " << cpu_cgroup.rt_runtime << std::endl;
  stream << " rt_period: " << cpu_cgroup.rt_period << std::endl;
  return stream;
}
// Writes a multi-line, human-readable description of |rlimit| to |stream|.
std::ostream& operator<<(std::ostream& stream, const Rlimit& rlimit) {
  stream << "rlimit:" << std::endl;
  stream << " type: " << rlimit.type << std::endl;
  stream << " cur: " << rlimit.cur << std::endl;
  stream << " max: " << rlimit.max << std::endl;
  return stream;
}
// Writes a human-readable dump of |c| to |stream|.
//
// When |sort_vectors| is true, the mounts, devices and cgroup devices are
// written in a stable sorted order (by destination/source/flags, by path, and
// by type/major/minor respectively) instead of insertion order. The config
// itself is never modified; sorting happens on copies.
void DumpConfig(std::ostream* stream,
                const container_config* c,
                bool sort_vectors) {
  *stream << "config_root: " << QUOTE(c->config_root.value()) << std::endl
          << "rootfs: " << QUOTE(c->rootfs.value()) << std::endl
          << "rootfs_mount_flags: 0x" << std::hex << c->rootfs_mount_flags
          << std::dec << " ("
          // The helper takes an int; make the narrowing from unsigned long
          // explicit rather than implicit.
          << GetMountFlagsAsString(static_cast<int>(c->rootfs_mount_flags))
          << ")" << std::endl
          << "premounted_runfs: " << QUOTE(c->premounted_runfs.value())
          << std::endl
          << "pid_file_path: " << QUOTE(c->pid_file_path.value()) << std::endl
          << "program_argv: size=" << c->program_argv.size() << std::endl;
  for (const std::string& argv : c->program_argv)
    *stream << " " << QUOTE(argv) << std::endl;
  *stream << "uid: " << c->uid << std::endl
          << "uid_map: " << QUOTE(c->uid_map) << std::endl
          << "gid: " << c->gid << std::endl
          << "gid_map: " << QUOTE(c->gid_map) << std::endl
          << "alt_syscall_table: " << QUOTE(c->alt_syscall_table) << std::endl
          << "core_sched:" << (c->core_sched ? "enable" : "disable")
          << std::endl;

  auto mount_sorted = c->mounts;
  if (sort_vectors) {
    std::stable_sort(mount_sorted.begin(), mount_sorted.end(),
                     [](const Mount& lhs, const Mount& rhs) {
                       return std::make_tuple(lhs.destination.value(),
                                              lhs.source.value(), lhs.flags) <
                              std::make_tuple(rhs.destination.value(),
                                              rhs.source.value(), rhs.flags);
                     });
  }
  for (const auto& mount : mount_sorted)
    *stream << mount;

  *stream << "namespaces: size=" << c->namespaces.size() << std::endl;
  for (const std::string& ns : c->namespaces)
    *stream << " " << QUOTE(ns) << std::endl;

  auto devices_sorted = c->devices;
  if (sort_vectors) {
    std::stable_sort(devices_sorted.begin(), devices_sorted.end(),
                     [](const Device& lhs, const Device& rhs) {
                       return lhs.path.value() < rhs.path.value();
                     });
  }
  for (const auto& device : devices_sorted)
    *stream << device;

  auto cgroup_devices_sorted = c->cgroup_devices;
  if (sort_vectors) {
    std::stable_sort(cgroup_devices_sorted.begin(), cgroup_devices_sorted.end(),
                     [](const CgroupDevice& lhs, const CgroupDevice& rhs) {
                       return std::make_tuple(lhs.type, lhs.major, lhs.minor) <
                              std::make_tuple(rhs.type, rhs.major, rhs.minor);
                     });
  }
  for (const auto& cgroup_device : cgroup_devices_sorted)
    *stream << cgroup_device;

  *stream << c->cpu_cgparams
          << "cgroup_parent: " << QUOTE(c->cgroup_parent.value()) << std::endl
          << "cgroup_owner: " << c->cgroup_owner << std::endl
          << "cgroup_group: " << c->cgroup_group << std::endl
          << "keep_fds_open: " << c->keep_fds_open << std::endl;

  *stream << "num_rlimits: " << c->num_rlimits << std::endl;
  // |num_rlimits| is a signed int, so iterate with a signed index. Comparing
  // it against an unsigned index would turn a negative count into a huge
  // bound and read past the array.
  for (int i = 0; i < c->num_rlimits; ++i)
    *stream << c->rlimits[i];

  *stream << "use_capmask: " << c->use_capmask << std::endl
          << "use_capmask_ambient: " << c->use_capmask_ambient << std::endl
          << "capmask: 0x" << std::hex << c->capmask << std::dec << std::endl
          << "securebits_skip_mask: 0x" << std::hex << c->securebits_skip_mask
          << std::dec << std::endl
          << "do_init: " << c->do_init << std::endl
          << "selinux_context: " << QUOTE(c->selinux_context) << std::endl
          << "pre_start_hook: " << reinterpret_cast<void*>(c->pre_start_hook)
          << std::endl
          << "pre_start_hook_payload: " << c->pre_start_hook_payload
          << std::endl
          << "inherited_fds: size=" << c->inherited_fds.size() << std::endl;
  for (int fd : c->inherited_fds)
    *stream << " " << fd << std::endl;
  // Only the number of hook events is dumped, not the callbacks themselves.
  *stream << "hooks: size=" << c->hooks.size() << std::endl;
}
// Maps |path_in_container| to the corresponding path in the outer namespace
// by placing it underneath |root|.
base::FilePath GetPathInOuterNamespace(
    const base::FilePath& root, const base::FilePath& path_in_container) {
  if (!path_in_container.IsAbsolute())
    return root.Append(path_in_container);
  // Absolute paths are attached by plain string concatenation of the two
  // path values.
  const std::string joined = root.value() + path_in_container.value();
  return base::FilePath(joined);
}
// Ensures the mount target |dest| exists, creating it when it is missing.
// A missing target becomes a directory when |source| cannot be stat'ed or is
// a directory or block device, and an empty file otherwise. The new entry is
// owned by the mount's uid/gid translated through the config's id maps.
// Returns false if an id cannot be translated or creation fails.
bool SetupMountDestination(const struct container_config* config,
                           const Mount& mount,
                           const base::FilePath& source,
                           const base::FilePath& dest) {
  struct stat dest_stat;
  const bool dest_exists = stat(dest.value().c_str(), &dest_stat) == 0;
  if (dest_exists)
    return true;

  // The destination has to be created; work out who should own it.
  int uid_userns;
  if (!GetUsernsOutsideId(config->uid_map, mount.uid, &uid_userns))
    return false;
  int gid_userns;
  if (!GetUsernsOutsideId(config->gid_map, mount.gid, &gid_userns))
    return false;

  struct stat source_stat;
  const bool needs_directory =
      stat(source.value().c_str(), &source_stat) != 0 ||
      S_ISDIR(source_stat.st_mode) || S_ISBLK(source_stat.st_mode);
  if (needs_directory)
    return MakeDir(dest, uid_userns, gid_userns, mount.mode);
  return TouchFile(dest, uid_userns, gid_userns, mount.mode);
}
// Unmounts anything we mounted in this mount namespace in the opposite order
// that they were mounted.
bool UnmountExternalMounts(struct container* c) {
bool ret = true;
for (auto it = c->ext_mounts.rbegin(); it != c->ext_mounts.rend(); ++it) {
if (umount(it->value().c_str()) != 0) {
PLOG(ERROR) << "Failed to unmount " << it->value();
ret = false;
}
}
c->ext_mounts.clear();
for (auto it = c->loopdevs.rbegin(); it != c->loopdevs.rend(); ++it) {
if (!LoopdevDetach(&(*it)))
ret = false;
}
c->loopdevs.clear();
for (auto it = c->device_mappers.rbegin(); it != c->device_mappers.rend();
++it) {
if (!DeviceMapperDetach(*it))
ret = false;
}
c->device_mappers.clear();
return ret;
}
bool DoContainerMount(struct container* c,
const struct container_config* config,
const Mount& mount) {
base::FilePath dest =
GetPathInOuterNamespace(c->runfsroot, mount.destination);
// If it's a bind mount relative to rootfs, append source to
// rootfs path, otherwise source path is absolute.
base::FilePath source;
if ((mount.flags & MS_BIND) && !mount.source.IsAbsolute()) {
source = GetPathInOuterNamespace(c->runfsroot, mount.source);
} else if (mount.loopback && !mount.source.IsAbsolute() &&
!c->config_root.empty()) {
source = GetPathInOuterNamespace(c->config_root, mount.source);
} else {
source = mount.source;
}
// Only create the destinations for external mounts, minijail will take
// care of those mounted in the new namespace.
if (mount.create && !mount.mount_in_ns) {
if (!SetupMountDestination(config, mount, source, dest))
return false;
}
if (mount.loopback) {
Loopdev loopdev;
if (!LoopdevSetup(source, &loopdev))
return false;
// Replace the mount source with the loopback device path.
source = loopdev.path;
// Save this to cleanup when shutting down.
c->loopdevs.emplace_back(std::move(loopdev));
}
if (!mount.verity.empty()) {
// Set this device up via dm-verity.
std::string dm_name;
base::FilePath dm_source = source;
if (!DeviceMapperSetup(dm_source, mount.verity, &source, &dm_name))
return false;
// Save this to cleanup when shutting down.
c->device_mappers.push_back(dm_name);
}
if (mount.mount_in_ns) {
// We can mount this with minijail.
if (minijail_mount_with_data(
c->jail.get(), source.value().c_str(),
mount.destination.value().c_str(), mount.type.c_str(), mount.flags,
mount.data.empty() ? nullptr : mount.data.c_str()) != 0) {
return false;
}
} else {
// Mount this externally and unmount it on exit.
if (!MountExternal(source.value(), dest.value(), mount.type, mount.flags,
mount.data)) {
return false;
}
// Save this to unmount when shutting down.
c->ext_mounts.push_back(dest);
}
return true;
}
bool DoContainerMounts(struct container* c,
const struct container_config* config) {
UnmountExternalMounts(c);
// This will run in all the error cases.
base::ScopedClosureRunner teardown(base::BindOnce(
base::IgnoreResult(&UnmountExternalMounts), base::Unretained(c)));
for (const auto& mount : config->mounts) {
if (!DoContainerMount(c, config, mount))
return false;
}
// The mounts have been done successfully, no need to tear them down anymore.
teardown.ReplaceClosure(base::DoNothing());
return true;
}
bool ContainerCreateDevice(const struct container* c,
const struct container_config* config,
const Device& dev,
int major,
int minor) {
mode_t mode = dev.fs_permissions;
switch (dev.type) {
case 'b':
mode |= S_IFBLK;
break;
case 'c':
mode |= S_IFCHR;
break;
default:
return false;
}
int uid_userns;
if (!GetUsernsOutsideId(config->uid_map, dev.uid, &uid_userns))
return false;
int gid_userns;
if (!GetUsernsOutsideId(config->gid_map, dev.gid, &gid_userns))
return false;
base::FilePath path = GetPathInOuterNamespace(c->runfsroot, dev.path);
if (!libcontainer::CreateDirectoryOwnedBy(path.DirName(), 0755, uid_userns,
gid_userns)) {
PLOG(ERROR) << "Failed to create parent directory for " << path.value();
return false;
}
if (mknod(path.value().c_str(), mode, makedev(major, minor)) != 0 &&
errno != EEXIST) {
PLOG(ERROR) << "Failed to mknod " << path.value();
return false;
}
if (chown(path.value().c_str(), uid_userns, gid_userns) != 0) {
PLOG(ERROR) << "Failed to chown " << path.value();
return false;
}
if (chmod(path.value().c_str(), dev.fs_permissions) != 0) {
PLOG(ERROR) << "Failed to chmod " << path.value();
return false;
}
return true;
}
// Creates the container's private run directory and bind-mounts the rootfs
// into it.
//
// A fresh "<rundir>/<name>_XXXXXX" directory is created and stored in
// |c->runfs|, handed over to the container's (outside-namespace) uid/gid, and
// a "root" subdirectory is created and stored in |c->runfsroot|. The
// configured rootfs is bind-mounted there, followed by a second mount() call
// that applies the remaining rootfs mount flags.
// Returns false on the first failure; |c->runfs| / |c->runfsroot| keep
// whatever was set up to that point.
bool MountRunfs(struct container* c, const struct container_config* config) {
  {
    std::string runfs_template = base::StringPrintf(
        "%s/%s_XXXXXX", c->rundir.value().c_str(), c->name.c_str());
    // mkdtemp() rewrites the XXXXXX suffix in place, so it must be given the
    // string's writable buffer. Writing through c_str() is not permitted.
    // TODO(lhchavez): Replace this with base::CreateTemporaryDirInDir().
    char* runfs_path = mkdtemp(runfs_template.data());
    if (!runfs_path) {
      PLOG(ERROR) << "Failed to mkdtemp in " << c->rundir.value();
      return false;
    }
    c->runfs = base::FilePath(runfs_path);
  }

  int uid_userns;
  if (!GetUsernsOutsideId(config->uid_map, config->uid, &uid_userns))
    return false;
  int gid_userns;
  if (!GetUsernsOutsideId(config->gid_map, config->gid, &gid_userns))
    return false;

  // Make sure the container uid can access the rootfs.
  if (chmod(c->runfs.value().c_str(), 0700) != 0) {
    PLOG(ERROR) << "Failed to chmod " << c->runfs.value();
    return false;
  }
  if (chown(c->runfs.value().c_str(), uid_userns, gid_userns) != 0) {
    PLOG(ERROR) << "Failed to chown " << c->runfs.value();
    return false;
  }

  c->runfsroot = c->runfs.Append("root");
  constexpr mode_t kRootDirMode = 0660;
  if (mkdir(c->runfsroot.value().c_str(), kRootDirMode) != 0) {
    PLOG(ERROR) << "Failed to mkdir " << c->runfsroot.value();
    return false;
  }
  // mkdir() is subject to the umask; chmod() sets the mode exactly.
  if (chmod(c->runfsroot.value().c_str(), kRootDirMode) != 0) {
    PLOG(ERROR) << "Failed to chmod " << c->runfsroot.value();
    return false;
  }

  if (mount(config->rootfs.value().c_str(), c->runfsroot.value().c_str(), "",
            MS_BIND | (config->rootfs_mount_flags & MS_REC), nullptr) != 0) {
    PLOG(ERROR) << "Failed to bind-mount " << config->rootfs.value();
    return false;
  }
  // MS_BIND ignores any flags passed to it (except MS_REC). We need a
  // second call to mount() to actually set them.
  if (config->rootfs_mount_flags &&
      mount(config->rootfs.value().c_str(), c->runfsroot.value().c_str(), "",
            (config->rootfs_mount_flags & ~MS_REC), nullptr) != 0) {
    PLOG(ERROR) << "Failed to remount " << c->runfsroot.value();
    return false;
  }
  return true;
}
bool CreateDeviceNodes(struct container* c,
const struct container_config* config,
pid_t container_pid) {
for (const auto& dev : config->devices) {
int major = dev.major;
int minor = dev.minor;
if (dev.copy_major || dev.copy_minor) {
struct stat st_buff;
if (stat(dev.path.value().c_str(), &st_buff) != 0)
continue;
if (dev.copy_major)
major = major(st_buff.st_rdev);
if (dev.copy_minor)
minor = minor(st_buff.st_rdev);
}
if (major < 0 || minor < 0)
continue;
if (!ContainerCreateDevice(c, config, dev, major, minor))
return false;
}
return true;
}
bool DeviceSetup(struct container* c, const struct container_config* config) {
c->cgroup->DenyAllDevices();
for (const auto& dev : config->cgroup_devices) {
if (!c->cgroup->AddDevice(dev.allow, dev.major, dev.minor, dev.read,
dev.write, dev.modify, dev.type)) {
return false;
}
}
for (const auto& loopdev : c->loopdevs) {
if (!c->cgroup->AddDevice(1, major(loopdev.info.lo_rdevice),
minor(loopdev.info.lo_rdevice), 1, 0, 0, 'b')) {
return false;
}
}
return true;
}
// Hook that turns on core scheduling for the calling process via prctl().
// |payload| is unused. Returns 0 on success or when the kernel rejects the
// request with EINVAL (treated as "not supported"), and -errno otherwise so
// that minijail aborts the child process.
int SetCoreSched(void* payload) {
  if (prctl(PR_SET_CORE_SCHED, 1) == 0)
    return 0;
  // Unsupported on this kernel: carry on without core scheduling.
  if (errno == EINVAL)
    return 0;
  // Bubble up any other error.
  return -errno;
}
// Hook that writes the SELinux label pointed to by |payload| (a
// NUL-terminated string) to /proc/self/task/<tid>/attr/exec for the calling
// thread. Returns 0 on success or -errno of the failing call.
//
// errno is captured immediately after each failing call, before logging, so
// that the returned code cannot be altered by anything the logging does.
int Setexeccon(void* payload) {
  char* init_domain = reinterpret_cast<char*>(payload);

  pid_t tid = syscall(SYS_gettid);
  if (tid < 0) {
    const int saved_errno = errno;
    PLOG(ERROR) << "Failed to gettid";
    return -saved_errno;
  }

  std::string exec_path =
      base::StringPrintf("/proc/self/task/%d/attr/exec", tid);
  base::ScopedFD fd(open(exec_path.c_str(), O_WRONLY | O_CLOEXEC));
  if (!fd.is_valid()) {
    const int saved_errno = errno;
    PLOG(ERROR) << "Failed to open " << exec_path;
    return -saved_errno;
  }

  if (!base::WriteFileDescriptor(fd.get(), init_domain)) {
    const int saved_errno = errno;
    PLOG(ERROR) << "Failed to write the SELinux label to " << exec_path;
    return -saved_errno;
  }
  return 0;
}
// Releases the container's on-disk state: external mounts, the bind-mounted
// root, the pid file and finally the runfs directory. Each path is reset to
// empty once it has been cleaned up. Returns false at the first step that
// fails, leaving the remaining steps undone. The result of unmounting the
// external mounts is not checked.
bool ContainerTeardown(struct container* c) {
  UnmountExternalMounts(c);

  const bool root_is_set_up = !c->runfsroot.empty() && !c->runfs.empty();
  if (root_is_set_up) {
    const std::string& root = c->runfsroot.value();
    // |c->runfsroot| may have been mounted recursively. MNT_DETACH
    // "immediately disconnect[s] the filesystem and all filesystems mounted
    // below it from each other and from the mount table", which avoids
    // having to unmount every dependent mount individually first.
    if (umount2(root.c_str(), MNT_DETACH) != 0) {
      PLOG(ERROR) << "Failed to detach " << c->runfsroot.value();
      return false;
    }
    if (rmdir(root.c_str()) != 0) {
      PLOG(ERROR) << "Failed to rmdir " << c->runfsroot.value();
      return false;
    }
    c->runfsroot = base::FilePath();
  }

  if (!c->pid_file_path.empty()) {
    const std::string& pid_file = c->pid_file_path.value();
    if (unlink(pid_file.c_str()) != 0) {
      PLOG(ERROR) << "Failed to unlink " << c->pid_file_path.value();
      return false;
    }
    c->pid_file_path = base::FilePath();
  }

  if (!c->runfs.empty()) {
    const std::string& runfs = c->runfs.value();
    if (rmdir(runfs.c_str()) != 0) {
      PLOG(ERROR) << "Failed to rmdir " << c->runfs.value();
      return false;
    }
    c->runfs = base::FilePath();
  }
  return true;
}
// Aborts a container start that is in progress: kills the container if an
// init pid has been recorded, then tears down its on-disk state.
void CancelContainerStart(struct container* c) {
  const bool init_recorded = c->init_pid != -1;
  if (init_recorded) {
    container_kill(c);
  }
  ContainerTeardown(c);
}
} // namespace
// Allocates a value-initialized container_config. Returns nullptr instead of
// throwing when the allocation fails.
struct container_config* container_config_create() {
  container_config* config = new (std::nothrow) container_config();
  return config;
}
// Frees a config obtained from container_config_create(). Passing nullptr is
// allowed and does nothing.
void container_config_destroy(struct container_config* c) {
  // Deleting a null pointer is a no-op, so no explicit check is needed.
  delete c;
}
// Sets the path to the root of the container itself.
// Returns 0 on success. A null |config_root| is rejected: -1 is returned with
// errno set to EINVAL and the config is left unchanged. Building a string
// from a null pointer would otherwise be undefined behaviour.
int container_config_config_root(struct container_config* c,
                                 const char* config_root) {
  if (config_root == nullptr) {
    errno = EINVAL;
    return -1;
  }
  c->config_root = base::FilePath(config_root);
  return 0;
}
// Returns the configured config root as a C string. The pointer refers to
// storage owned by |c|.
const char* container_config_get_config_root(const struct container_config* c) {
  const std::string& config_root = c->config_root.value();
  return config_root.c_str();
}
// Sets the path to the root of the container's filesystem.
// Returns 0 on success. A null |rootfs| is rejected: -1 is returned with
// errno set to EINVAL and the config is left unchanged. Building a string
// from a null pointer would otherwise be undefined behaviour.
int container_config_rootfs(struct container_config* c, const char* rootfs) {
  if (rootfs == nullptr) {
    errno = EINVAL;
    return -1;
  }
  c->rootfs = base::FilePath(rootfs);
  return 0;
}
// Returns the configured rootfs path as a C string. The pointer refers to
// storage owned by |c|.
const char* container_config_get_rootfs(const struct container_config* c) {
  const std::string& rootfs = c->rootfs.value();
  return rootfs.c_str();
}
// Stores the mount flags to apply to the rootfs, always combined with
// MS_REMOUNT and MS_BIND.
void container_config_rootfs_mount_flags(struct container_config* c,
                                         unsigned long rootfs_mount_flags) {
  // MS_REMOUNT is going to be needed anyway; adding it up front means a later
  // "were any flags requested?" check is a plain comparison against zero.
  // MS_BIND goes with it so that the remount applies to the bind mount rather
  // than the original filesystem, since the rootfs is always bind-mounted.
  const unsigned long effective_flags =
      rootfs_mount_flags | MS_REMOUNT | MS_BIND;
  c->rootfs_mount_flags = effective_flags;
}
// Returns the stored rootfs mount flags exactly as kept in |c|.
unsigned long container_config_get_rootfs_mount_flags(
    const struct container_config* c) {
  const unsigned long stored_flags = c->rootfs_mount_flags;
  return stored_flags;
}
// Sets the path of an already-mounted runfs to run the container in.
// Returns 0 on success. A null |runfs| is rejected: -1 is returned with errno
// set to EINVAL and the config is left unchanged. Building a string from a
// null pointer would otherwise be undefined behaviour.
int container_config_premounted_runfs(struct container_config* c,
                                      const char* runfs) {
  if (runfs == nullptr) {
    errno = EINVAL;
    return -1;
  }
  c->premounted_runfs = base::FilePath(runfs);
  return 0;
}
// Returns the configured premounted runfs path as a C string. The pointer
// refers to storage owned by |c|.
const char* container_config_get_premounted_runfs(
    const struct container_config* c) {
  const std::string& runfs = c->premounted_runfs.value();
  return runfs.c_str();
}
// Sets the path of the file the container's pid should be written to.
// Returns 0 on success. A null |path| is rejected: -1 is returned with errno
// set to EINVAL and the config is left unchanged. Building a string from a
// null pointer would otherwise be undefined behaviour.
int container_config_pid_file(struct container_config* c, const char* path) {
  if (path == nullptr) {
    errno = EINVAL;
    return -1;
  }
  c->pid_file_path = base::FilePath(path);
  return 0;
}
// Returns the configured pid file path as a C string. The pointer refers to
// storage owned by |c|.
const char* container_config_get_pid_file(const struct container_config* c) {
  const std::string& pid_file = c->pid_file_path.value();
  return pid_file.c_str();
}
// Replaces the program and arguments to run with copies of the |num_args|
// strings in |argv|.
// Returns 0 on success. Returns -1 with errno set to EINVAL when |num_args|
// is zero, |argv| is null, or any of the first |num_args| entries is null.
// All validation happens before the stored argv is touched, so a rejected
// call leaves the previous value intact.
int container_config_program_argv(struct container_config* c,
                                  const char** argv,
                                  size_t num_args) {
  if (num_args < 1 || argv == nullptr) {
    errno = EINVAL;
    return -1;
  }
  // Constructing a std::string from a null pointer is undefined behaviour, so
  // reject null entries up front.
  for (size_t i = 0; i < num_args; ++i) {
    if (argv[i] == nullptr) {
      errno = EINVAL;
      return -1;
    }
  }
  c->program_argv.clear();
  c->program_argv.reserve(num_args);
  for (size_t i = 0; i < num_args; ++i)
    c->program_argv.emplace_back(argv[i]);
  return 0;
}
// Returns how many entries the configured program argv holds.
size_t container_config_get_num_program_args(const struct container_config* c) {
  const std::vector<std::string>& args = c->program_argv;
  return args.size();
}
// Returns the program argument at |index| as a C string owned by |c|, or
// nullptr when |index| is out of range.
const char* container_config_get_program_arg(const struct container_config* c,
                                             size_t index) {
  const std::vector<std::string>& args = c->program_argv;
  return index < args.size() ? args[index].c_str() : nullptr;
}
// Sets the uid the container will run as.
void container_config_uid(struct container_config* c, uid_t uid) {
  uid_t& target = c->uid;
  target = uid;
}
// Returns the uid the container is configured to run as.
uid_t container_config_get_uid(const struct container_config* c) {
  const uid_t configured_uid = c->uid;
  return configured_uid;
}
// Sets the UID mapping for the container, e.g. "0 100000 1024".
// Returns 0 on success. A null |uid_map| is rejected: -1 is returned with
// errno set to EINVAL and the config is left unchanged. Assigning a null
// pointer to a std::string would otherwise be undefined behaviour.
int container_config_uid_map(struct container_config* c, const char* uid_map) {
  if (uid_map == nullptr) {
    errno = EINVAL;
    return -1;
  }
  c->uid_map = uid_map;
  return 0;
}
// Sets the gid the container will run as.
void container_config_gid(struct container_config* c, gid_t gid) {
  gid_t& target = c->gid;
  target = gid;
}
// Returns the gid the container is configured to run as.
gid_t container_config_get_gid(const struct container_config* c) {
  const gid_t configured_gid = c->gid;
  return configured_gid;
}
// Sets the GID mapping for the container, e.g. "0 100000 1024".
// Returns 0 on success. A null |gid_map| is rejected: -1 is returned with
// errno set to EINVAL and the config is left unchanged. Assigning a null
// pointer to a std::string would otherwise be undefined behaviour.
int container_config_gid_map(struct container_config* c, const char* gid_map) {
  if (gid_map == nullptr) {
    errno = EINVAL;
    return -1;
  }
  c->gid_map = gid_map;
  return 0;
}
// Replaces the container's supplementary gids with a copy of the |num_gids|
// entries starting at |gids|.
void container_config_additional_gids(struct container_config* c,
                                      const gid_t* gids,
                                      size_t num_gids) {
  const gid_t* const first = gids;
  const gid_t* const last = gids + num_gids;
  c->additional_gids.assign(first, last);
}
// Stores a copy of |alt_syscall_table| as the syscall table name to use.
// Always returns 0. |alt_syscall_table| must point to a NUL-terminated
// string.
int container_config_alt_syscall_table(struct container_config* c,
                                       const char* alt_syscall_table) {
  c->alt_syscall_table.assign(alt_syscall_table);
  return 0;
}
int container_config_add_rlimit(struct container_config* c,
int type,
rlim_t cur,
rlim_t max) {
if (c->num_rlimits >= kMaxRlimits) {
errno = ENOMEM;
return -1;
}
c->rlimits[c->num_rlimits].type = type;
c->rlimits[c->num_rlimits].cur = cur;
c->rlimits[c->num_rlimits].max = max;
c->num_rlimits++;
return 0;
}
int container_config_add_mount(struct container_config* c,
const char* name,
const char* source,
const char* destination,
const char* type,
const char* data,
const char* verity,
int flags,
int uid,
int gid,
int mode,
int mount_in_ns,
int create,
int loopback) {
if (name == nullptr || source == nullptr || destination == nullptr ||
type == nullptr) {
errno = EINVAL;
return -1;
}
c->mounts.emplace_back(
Mount{name, base::FilePath(source), base::FilePath(destination), type,